src/5018001/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * be confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifdef op
  99 #undef op
 100 #endif /* op */
 101
 102 #ifdef MSDOS
 103 #  if defined(BUGGY_MSC6)
 104  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 105 #    pragma optimize("a",off)
 106  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 107 #    pragma optimize("w",on )
 108 #  endif /* BUGGY_MSC6 */
 109 #endif /* MSDOS */
 110
 111 #ifndef STATIC
 112 #define STATIC static
 113 #endif
 114
 115
 /* Working state for one regex compilation.  Code in this file normally
  * reaches these members through the RExC_* shorthand macros, each of which
  * expands via a local pointer named pRExC_state. */
 116 typedef struct RExC_state_t {
 117  U32  flags;   /* RXf_* are we folding, multilining? */
 118  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 119  char *precomp;  /* uncompiled string. */
 120  REGEXP *rx_sv;   /* The SV that is the regexp. */
 121  regexp *rx;                    /* perl core regexp structure */
 122  regexp_internal *rxi;           /* internal data for regexp object pprivate field */
 123  char *start;   /* Start of input for compile */
 124  char *end;   /* End of input for compile */
 125  char *parse;   /* Input-scan pointer. */
 126  I32  whilem_seen;  /* number of WHILEM in this expr */
 127  regnode *emit_start;  /* Start of emitted-code area */
 128  regnode *emit_bound;  /* First regnode outside of the allocated space */
 129  regnode *emit;   /* Code-emit pointer; &regdummy = don't = compiling */
 130  I32  naughty;  /* How bad is this pattern? */
 131  I32  sawback;  /* Did we see \1, ...? */
 132  U32  seen;   /* NOTE(review): undocumented; presumably a bitmask of constructs seen so far -- confirm */
 133  I32  size;   /* Code size. */
 134  I32  npar;   /* Capture buffer count, (OPEN). */
 135  I32  cpar;   /* Capture buffer count, (CLOSE). */
 136  I32  nestroot;  /* root parens we are in - used by accept */
 137  I32  extralen;  /* NOTE(review): undocumented; purpose not evident from this declaration */
 138  I32  seen_zerolen;  /* NOTE(review): undocumented; purpose not evident from this declaration */
 139  regnode **open_parens;  /* pointers to open parens */
 140  regnode **close_parens;  /* pointers to close parens */
 141  regnode *opend;   /* END node in program */
 142  I32  utf8;  /* whether the pattern is utf8 or not */
 143  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 144         /* XXX use this for future optimisation of case
 145         * where pattern must be upgraded to utf8. */
 146  I32  uni_semantics; /* If a d charset modifier should use unicode
 147         rules, even if the pattern is not in
 148         utf8 */
 149  HV  *paren_names;  /* Paren names */
 150
 151  regnode **recurse;  /* Recurse regops */
 152  I32  recurse_count;  /* Number of recurse regops */
 153  I32  in_lookbehind;  /* NOTE(review): presumably nonzero while parsing inside a lookbehind -- confirm */
 154  I32  contains_locale;  /* NOTE(review): presumably set when part of the pattern uses locale rules -- confirm */
 155  I32  override_recoding;  /* NOTE(review): undocumented; purpose not evident from this declaration */
 156  I32  in_multi_char_class;  /* NOTE(review): undocumented; purpose not evident from this declaration */
 157  struct reg_code_block *code_blocks; /* positions of literal (?{})
 158            within pattern */
 159  int  num_code_blocks; /* size of code_blocks[] */
 160  int  code_index;  /* next code_blocks[] slot */
 161 #if ADD_TO_REGEXEC
 162  char  *starttry;  /* -Dr: where regtry was called. */
 163 #define RExC_starttry (pRExC_state->starttry)
 164 #endif
 165  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 166 #ifdef DEBUGGING
 /* The three members below, and their accessor macros, exist only in
  * DEBUGGING builds. */
 167  const char  *lastparse;
 168  I32         lastnum;
 169  AV          *paren_name_list;       /* idx -> name */
 170 #define RExC_lastparse (pRExC_state->lastparse)
 171 #define RExC_lastnum (pRExC_state->lastnum)
 172 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 173 #endif
 174 } RExC_state_t;
 175
 /* Shorthand accessors for members of RExC_state_t.  Every one of them
  * expands to an expression on the identifier pRExC_state, so they can only
  * be used where a pointer of that name is in scope.  Each expansion is a
  * parenthesized member access and therefore usable as an lvalue. */
 176 #define RExC_flags (pRExC_state->flags)
 177 #define RExC_pm_flags (pRExC_state->pm_flags)
 178 #define RExC_precomp (pRExC_state->precomp)
 179 #define RExC_rx_sv (pRExC_state->rx_sv)
 180 #define RExC_rx  (pRExC_state->rx)
 181 #define RExC_rxi (pRExC_state->rxi)
 182 #define RExC_start (pRExC_state->start)
 183 #define RExC_end (pRExC_state->end)
 184 #define RExC_parse (pRExC_state->parse)
 185 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 186 #ifdef RE_TRACK_PATTERN_OFFSETS
 187 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* unlike the others: reaches through rxi instead of naming a direct member */
 188 #endif
 189 #define RExC_emit (pRExC_state->emit)
 190 #define RExC_emit_start (pRExC_state->emit_start)
 191 #define RExC_emit_bound (pRExC_state->emit_bound)
 192 #define RExC_naughty (pRExC_state->naughty)
 193 #define RExC_sawback (pRExC_state->sawback)
 194 #define RExC_seen (pRExC_state->seen)
 195 #define RExC_size (pRExC_state->size)
 196 #define RExC_npar (pRExC_state->npar)
 197 #define RExC_nestroot   (pRExC_state->nestroot)
 198 #define RExC_extralen (pRExC_state->extralen)
 199 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 200 #define RExC_utf8 (pRExC_state->utf8)
 201 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 202 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 203 #define RExC_open_parens (pRExC_state->open_parens)
 204 #define RExC_close_parens (pRExC_state->close_parens)
 205 #define RExC_opend (pRExC_state->opend)
 206 #define RExC_paren_names (pRExC_state->paren_names)
 207 #define RExC_recurse (pRExC_state->recurse)
 208 #define RExC_recurse_count (pRExC_state->recurse_count)
 209 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 210 #define RExC_contains_locale (pRExC_state->contains_locale)
 211 #define RExC_override_recoding (pRExC_state->override_recoding)
 212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 213
 214
 /* ISMULT1(c): true if the character c is one of the quantifiers '*', '+', '?'.
  * ISMULT2(s): true if the string pointer s points at one of those characters,
  * or at a '{' that regcurly() accepts.
  * Both macros evaluate their argument more than once, so do not pass an
  * expression with side effects.  The argument of ISMULT2 is parenthesized
  * before being dereferenced so that ISMULT2(p + 1) tests p[1], not *p + 1. */
 215 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 216 #define ISMULT2(s) ((*(s)) == '*' || (*(s)) == '+' || (*(s)) == '?' || \
 217   ((*(s)) == '{' && regcurly((s), FALSE)))
 218
 219 #ifdef SPSTART
 220 #undef SPSTART  /* dratted cpp namespace... */
 221 #endif
 222 /*
 223  * Flags to be passed up and down.
 224  */
 225 #define WORST  0 /* Worst case. */
 226 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 227
 228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 229  * character.  (There needs to be a case: in the switch statement in regexec.c
 230  * for any node marked SIMPLE.)  Note that this is not the same thing as
 231  * REGNODE_SIMPLE */
 232 #define SIMPLE  0x02
 233 #define SPSTART  0x04 /* Starts with * or + */
 234 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 235 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 236 #define RESTART_UTF8    0x20    /* Restart, need to calculate sizes as UTF-8 */
 237
 /* Index of regnode pointer x relative to RExC_emit_start, as an int, or -1
  * when x is NULL.  Evaluates x twice. */
 238 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 239
 240 /* whether trie related optimizations are enabled */
 241 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 242 #define TRIE_STUDY_OPT
 243 #define FULL_TRIE_STUDY
 244 #define TRIE_STCLASS
 245 #endif
 246
 247
 248
 /* Bit-vector helpers: u8str is treated as an array of U8 holding one bit per
  * paren number.
  *   PBYTE(u8str,paren)       - the byte (an lvalue) containing paren's bit
  *   PBITVAL(paren)           - mask for paren's bit within that byte
  *   PAREN_TEST(u8str,paren)  - nonzero if the bit is set
  *   PAREN_SET / PAREN_UNSET  - set / clear the bit
  * Every expansion is fully parenthesized so the macros remain correct when
  * used inside a larger expression.  'paren' is evaluated more than once by
  * the TEST/SET/UNSET forms. */
 249 #define PBYTE(u8str,paren) (((U8*)(u8str))[(paren) >> 3])
 250 #define PBITVAL(paren) (1 << ((paren) & 7))
 251 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 252 #define PAREN_SET(u8str,paren) (PBYTE(u8str,paren) |= PBITVAL(paren))
 253 #define PAREN_UNSET(u8str,paren) (PBYTE(u8str,paren) &= (~PBITVAL(paren)))
 254
 /* If the pattern is not being compiled as UTF-8 (!UTF), store RESTART_UTF8
  * through 'flagp' and return NULL from the *enclosing function*.  Only usable
  * inside a function that has a pointer named flagp in scope and returns a
  * pointer type. */
 255 #define REQUIRE_UTF8 STMT_START {                                       \
 256          if (!UTF) {                           \
 257           *flagp = RESTART_UTF8;            \
 258           return NULL;                      \
 259          }                                     \
 260       } STMT_END
 261
 262 /* This converts the named class defined in regcomp.h to its equivalent class
 263  * number defined in handy.h.  The mapping is an integer division by 2;
      * classnum_to_namedclass() is the reverse multiplication by 2, so a round
      * trip through both macros always yields an even named-class value. */
 264 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 265 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 266
 267 /* About scan_data_t.
 268
 269   During optimisation we recurse through the regexp program performing
 270   various inplace (keyhole style) optimisations. In addition study_chunk
 271   and scan_commit populate this data structure with information about
 272   what strings MUST appear in the pattern. We look for the longest
 273   string that must appear at a fixed location, and we look for the
 274   longest string that may appear at a floating location. So for instance
 275   in the pattern:
 276
 277  /FOO[xX]A.*B[xX]BAR/
 278
 279   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 280   strings (because they follow a .* construct). study_chunk will identify
 281   both FOO and BAR as being the longest fixed and floating strings respectively.
 282
 283   The strings can be composites, for instance
 284
 285  /(f)(o)(o)/
 286
 287   will result in a composite fixed substring 'foo'.
 288
 289   For each string some basic information is maintained:
 290
 291   - offset or min_offset
 292  This is the position the string must appear at, or not before.
 293  It also implicitly (when combined with minlenp) tells us how many
 294  characters must match before the string we are searching for.
 295  Likewise when combined with minlenp and the length of the string it
 296  tells us how many characters must appear after the string we have
 297  found.
 298
 299   - max_offset
 300  Only used for floating strings. This is the rightmost point that
 301  the string can appear at. If set to I32 max it indicates that the
 302  string can occur infinitely far to the right.
 303
 304   - minlenp
 305  A pointer to the minimum number of characters of the pattern that the
 306  string was found inside. This is important as in the case of positive
 307  lookahead or positive lookbehind we can have multiple patterns
 308  involved. Consider
 309
 310  /(?=FOO).*F/
 311
 312  The minimum length of the pattern overall is 3, the minimum length
 313  of the lookahead part is 3, but the minimum length of the part that
 314  will actually match is 1. So 'FOO's minimum length is 3, but the
 315  minimum length for the F is 1. This is important as the minimum length
 316  is used to determine offsets in front of and behind the string being
 317  looked for.  Since strings can be composites this is the length of the
 318  pattern at the time it was committed with a scan_commit. Note that
 319  the length is calculated by study_chunk, so that the minimum lengths
 320  are not known until the full pattern has been compiled, thus the
 321  pointer to the value.
 322
 323   - lookbehind
 324
 325  In the case of lookbehind the string being searched for can be
 326  offset past the start point of the final matching string.
 327  If this value was just blithely removed from the min_offset it would
 328  invalidate some of the calculations for how many chars must match
 329  before or after (as they are derived from min_offset and minlen and
 330  the length of the string being searched for).
 331  When the final pattern is compiled and the data is moved from the
 332  scan_data_t structure into the regexp structure the information
 333  about lookbehind is factored in, with the information that would
 334  have been lost precalculated in the end_shift field for the
 335  associated string.
 336
 337   The fields pos_min and pos_delta are used to store the minimum offset
 338   and the delta to the maximum offset at the current point in the pattern.
 339
 340 */
 341
 342 typedef struct scan_data_t {
 343  /*I32 len_min;      unused */
 344  /*I32 len_delta;    unused */
 345  I32 pos_min;      /* minimum offset at the current point in the pattern */
 346  I32 pos_delta;    /* delta from pos_min to the maximum offset at this point */
 347  SV *last_found;   /* string being accumulated; S_scan_commit() may copy it to *longest, then empties it */
 348  I32 last_end;     /* min value, <0 unless valid. */
 349  I32 last_start_min;     /* used by S_scan_commit() as the (minimum) start offset of last_found */
 350  I32 last_start_max;     /* used by S_scan_commit() as the maximum start offset of last_found */
 351  SV **longest;     /* Either &longest_fixed or &longest_float. */
 352  SV *longest_fixed;      /* longest fixed string found in pattern */
 353  I32 offset_fixed;       /* offset where it starts */
 354  I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 355  I32 lookbehind_fixed;   /* is the position of the string modified by LB */
 356  SV *longest_float;      /* longest floating string found in pattern */
 357  I32 offset_float_min;   /* earliest point in string it can appear */
 358  I32 offset_float_max;   /* latest point in string it can appear */
 359  I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 360  I32 lookbehind_float;   /* is the position of the string modified by LB */
 361  I32 flags;        /* SF_* and SCF_* bits */
 362  I32 whilem_c;     /* NOTE(review): undocumented; presumably a WHILEM counter -- confirm */
 363  I32 *last_closep; /* may be NULL (the debug dump checks before dereferencing) */
 364  struct regnode_charclass_class *start_class;
 365 } scan_data_t;
 366
 367 /*
 368  * Forward declarations for pregcomp()'s friends.
 369  */
 370
 /* An all-zero scan_data_t.  The initializer spells out explicit zeros,
  * intended to match the struct members one-for-one, so it has to be kept in
  * step whenever a member is added to or removed from scan_data_t. */
 371 static const scan_data_t zero_scan_data =
 372   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 373
 /* Bits stored in scan_data_t.flags.  SF_BEFORE_SEOL / SF_BEFORE_MEOL apply to
  * the string currently being accumulated; S_scan_commit() copies them,
  * shifted left by SF_FIX_SHIFT_EOL or SF_FL_SHIFT_EOL, into the SF_FIX_* or
  * SF_FL_* positions for the committed fixed or floating string. */
 374 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 375 #define SF_BEFORE_SEOL  0x0001
 376 #define SF_BEFORE_MEOL  0x0002
 377 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 378 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 379
 /* Both branches define the same shift counts (2 and 4); the first form is
  * for compilers flagged with NO_UNARY_PLUS. */
 380 #ifdef NO_UNARY_PLUS
 381 #  define SF_FIX_SHIFT_EOL (0+2)
 382 #  define SF_FL_SHIFT_EOL  (0+4)
 383 #else
 384 #  define SF_FIX_SHIFT_EOL (+2)
 385 #  define SF_FL_SHIFT_EOL  (+4)
 386 #endif
 387
 388 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) /* 0x04 */
 389 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) /* 0x08 */
 390
 391 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) /* 0x10 */
 392 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 393 #define SF_IS_INF  0x0040
 394 #define SF_HAS_PAR  0x0080
 395 #define SF_IN_PAR  0x0100
 396 #define SF_HAS_EVAL  0x0200
 397 #define SCF_DO_SUBSTR  0x0400
 398 #define SCF_DO_STCLASS_AND 0x0800
 399 #define SCF_DO_STCLASS_OR 0x1000
 400 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 401 #define SCF_WHILEM_VISITED_POS 0x2000
 402
 403 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 404 #define SCF_SEEN_ACCEPT         0x8000
 405
 /* True if the pattern is being compiled as UTF-8 (RExC_utf8 is nonzero). */
 406 #define UTF cBOOL(RExC_utf8)
 407
 408 /* Predicates on the charset selected in RExC_flags.  The AT_LEAST_* forms
      * compare with >=, so they depend on the relative order of the
      * REGEX_*_CHARSET enum values: the enums for all these are ordered so
      * things work out correctly */
 409 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 410 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 411 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 412 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 413 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 414 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 415 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 416
 /* True if the RXf_PMf_FOLD bit is set in RExC_flags. */
 417 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 418
 419 #define OOB_NAMEDCLASS  -1
 420
 421 /* There is no code point that is out-of-bounds, so this is problematic.  But
 422  * its only current use is to initialize a variable that is always set before
 423  * looked at. */
 424 #define OOB_UNICODE  0xDEADBEEF
 425
 /* Length of sv, and distance between two positions a and b, measured in
  * characters: the UTF-8 aware routine is used when the pattern is UTF-8,
  * otherwise the plain byte count / pointer difference.
  * The operands of the subtraction are parenthesized so that an argument such
  * as 'p + 1' in the second position is subtracted as a whole. */
 426 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 427 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : ((a) - (b)))
 428
 429
 430 /* length of regex to show in messages that don't mark a position within */
 431 #define RegexLengthToShowInErrorMessages 127
 432
 433 /*
 434  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 435  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 436  * op/pragma/warn/regcomp.
 437  */
 438 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 439 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 440
 /* Format-string fragment appended to diagnostics.  It consumes three
  * arguments, in order: an int precision and a string for "%.*s" (the part of
  * the pattern before the marker), then a string for "%s" (the remainder). */
 441 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 442
 443 /*
 444  * _FAIL(code): unless SIZE_ONLY is true, first hands RExC_rx_sv to
 445  * SAVEFREESV().  Then runs 'code' with two locals in scope: 'len', the
 446  * length of the pattern (RExC_end - RExC_precomp), and 'ellipses'.  When the
 447  * pattern is longer than RegexLengthToShowInErrorMessages, 'len' is cut to
      * 10 less than that limit and 'ellipses' is "..."; otherwise 'ellipses' is
      * the empty string.
      *
      * FAIL(msg) and FAIL2(msg,arg) pass a Perl_croak() call as 'code'.  FAIL
      * prints msg through "%s"; FAIL2 pastes msg onto the format string, so its
      * msg must be a string literal, with 'arg' as the first format argument.
      */
 448 #define _FAIL(code) STMT_START {     \
 449  const char *ellipses = "";      \
 450  IV len = RExC_end - RExC_precomp;     \
 451                   \
 452  if (!SIZE_ONLY)       \
 453   SAVEFREESV(RExC_rx_sv);      \
 454  if (len > RegexLengthToShowInErrorMessages) {   \
 455   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 456   len = RegexLengthToShowInErrorMessages - 10;   \
 457   ellipses = "...";      \
 458  }         \
 459  code;                                                               \
 460 } STMT_END
 461
 462 #define FAIL(msg) _FAIL(       \
 463  Perl_croak(aTHX_ "%s in regex m/%.*s%s/",     \
 464    msg, (int)len, RExC_precomp, ellipses))
 465
 466 #define FAIL2(msg,arg) _FAIL(       \
 467  Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 468    arg, (int)len, RExC_precomp, ellipses))
 469
 470 /*
 471  * Simple_vFAIL -- like FAIL, but marks the current location in the scan:
      * the offset of RExC_parse from RExC_precomp is used to split the pattern
      * for REPORT_LOCATION.  Calls Perl_croak() with m printed through "%s".
 472  */
 473 #define Simple_vFAIL(m) STMT_START {     \
 474  const IV offset = RExC_parse - RExC_precomp;   \
 475  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 476    m, (int)offset, RExC_precomp, RExC_precomp + offset); \
 477 } STMT_END
 478
 479 /*
 480  * Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then Simple_vFAIL()
 481  */
 482 #define vFAIL(m) STMT_START {    \
 483  if (!SIZE_ONLY)     \
 484   SAVEFREESV(RExC_rx_sv);    \
 485  Simple_vFAIL(m);     \
 486 } STMT_END
 487
 488 /*
 489  * Like Simple_vFAIL(), but takes one extra argument a1, and reports through
      * S_re_croak2(), which receives m and REPORT_LOCATION as separate strings.
 490  */
 491 #define Simple_vFAIL2(m,a1) STMT_START {   \
 492  const IV offset = RExC_parse - RExC_precomp;   \
 493  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,   \
 494    (int)offset, RExC_precomp, RExC_precomp + offset); \
 495 } STMT_END
 496
 497 /*
 498  * Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then Simple_vFAIL2().
 499  */
 500 #define vFAIL2(m,a1) STMT_START {   \
 501  if (!SIZE_ONLY)     \
 502   SAVEFREESV(RExC_rx_sv);    \
 503  Simple_vFAIL2(m, a1);    \
 504 } STMT_END
 505
 506
 507 /*
 508  * Like Simple_vFAIL2(), but with two extra arguments (a1, a2).
 509  */
 510 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 511  const IV offset = RExC_parse - RExC_precomp;  \
 512  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,  \
 513    (int)offset, RExC_precomp, RExC_precomp + offset); \
 514 } STMT_END
 515
 516 /*
 517  * Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then Simple_vFAIL3().
 518  */
 519 #define vFAIL3(m,a1,a2) STMT_START {   \
 520  if (!SIZE_ONLY)     \
 521   SAVEFREESV(RExC_rx_sv);    \
 522  Simple_vFAIL3(m, a1, a2);    \
 523 } STMT_END
 524
 525 /*
 526  * Like Simple_vFAIL2(), but with three extra arguments (a1, a2, a3).
 527  */
 528 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 529  const IV offset = RExC_parse - RExC_precomp;  \
 530  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,  \
 531    (int)offset, RExC_precomp, RExC_precomp + offset); \
 532 } STMT_END
 533
 /*
  * Calls SAVEFREESV(RExC_rx_sv) unless SIZE_ONLY, then Simple_vFAIL4().
  */
 534 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 535  if (!SIZE_ONLY)     \
 536   SAVEFREESV(RExC_rx_sv);    \
 537  Simple_vFAIL4(m, a1, a2, a3);   \
 538 } STMT_END
 539
 /* Warning helpers.  Each computes 'offset', the distance of 'loc' from
  * RExC_precomp, and appends REPORT_LOCATION to the message so the pattern is
  * shown split at that point.  Except in reg_warn_non_literal_string(), 'm' is
  * pasted onto REPORT_LOCATION by string-literal concatenation and therefore
  * must be a string literal; a1..a4 are the arguments for m's own conversions.
  * 'loc' is not parenthesized in the expansion, so pass a simple pointer
  * expression.
  *
  *   Perl_warner:       reg_warn_non_literal_string, vWARN_dep, vWARN3,
  *                      vWARN4, vWARN5
  *   Perl_ck_warner:    ckWARNreg, ckWARN2reg, ckWARN3reg, ckWARN4reg
  *   Perl_ck_warner_d:  ckWARNdep, ckWARNregdep, ckWARN2regdep
  *
  * The category passed is WARN_REGEXP, WARN_DEPRECATED, or both (packWARN2),
  * as written in each macro. */
 540 /* m is not necessarily a "literal string", in this macro */
 541 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 542  const IV offset = loc - RExC_precomp;                               \
 543  Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 544    m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 545 } STMT_END
 546
 547 #define ckWARNreg(loc,m) STMT_START {     \
 548  const IV offset = loc - RExC_precomp;    \
 549  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 550    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 551 } STMT_END
 552
 553 #define vWARN_dep(loc, m) STMT_START {            \
 554  const IV offset = loc - RExC_precomp;    \
 555  Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 556    (int)offset, RExC_precomp, RExC_precomp + offset);         \
 557 } STMT_END
 558
 559 #define ckWARNdep(loc,m) STMT_START {            \
 560  const IV offset = loc - RExC_precomp;    \
 561  Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 562    m REPORT_LOCATION,      \
 563    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 564 } STMT_END
 565
 566 #define ckWARNregdep(loc,m) STMT_START {    \
 567  const IV offset = loc - RExC_precomp;    \
 568  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 569    m REPORT_LOCATION,      \
 570    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 571 } STMT_END
 572
 573 #define ckWARN2regdep(loc,m, a1) STMT_START {    \
 574  const IV offset = loc - RExC_precomp;    \
 575  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 576    m REPORT_LOCATION,      \
 577    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 578 } STMT_END
 579
 580 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 581  const IV offset = loc - RExC_precomp;    \
 582  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 583    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 584 } STMT_END
 585
 586 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 587  const IV offset = loc - RExC_precomp;    \
 588  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 589    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 590 } STMT_END
 591
 592 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 593  const IV offset = loc - RExC_precomp;    \
 594  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 595    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 596 } STMT_END
 597
 598 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 599  const IV offset = loc - RExC_precomp;    \
 600  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 601    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 602 } STMT_END
 603
 604 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 605  const IV offset = loc - RExC_precomp;    \
 606  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 607    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 608 } STMT_END
 609
 610 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 611  const IV offset = loc - RExC_precomp;    \
 612  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 613    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 614 } STMT_END
 615
 616
 617 /* Store c through the pointer s, but only when SIZE_ONLY is false.  When
      * SIZE_ONLY is true nothing is written, yet s is still evaluated (cast to
      * void) so that any side effects in it happen on both paths: allow for
      * side effects in s */
 618 #define REGC(c,s) STMT_START {   \
 619  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 620 } STMT_END
 621
 622 /* Macros for recording node offsets.   20001227 mjd@plover.com
 623  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 624  * element 2*n-1 of the array.  Element #2n holds the byte length of node #n.
 625  * Element 0 holds the number n.
 626  * Position is 1 indexed.
      *
      * Without RE_TRACK_PATTERN_OFFSETS all the recording macros expand to
      * nothing and ProgLen()/SetProgLen() use ri->u.proglen; with it, they use
      * element 0 of ri->u.offsets.  Note that Set_Cur_Node_Length() is defined
      * only in the tracking branch.
 627  */
 628 #ifndef RE_TRACK_PATTERN_OFFSETS
 629 #define Set_Node_Offset_To_R(node,byte)
 630 #define Set_Node_Offset(node,byte)
 631 #define Set_Cur_Node_Offset
 632 #define Set_Node_Length_To_R(node,len)
 633 #define Set_Node_Length(node,len)
 634 #define Set_Node_Cur_Length(node)
 635 #define Node_Offset(n)
 636 #define Node_Length(n)
 637 #define Set_Node_Offset_Length(node,offset,len)
 638 #define ProgLen(ri) ri->u.proglen
 639 #define SetProgLen(ri,x) ri->u.proglen = x
 640 #else
 641 #define ProgLen(ri) ri->u.offsets[0]
 642 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 /* The *_To_R forms take a node *index* (already relative to
  * RExC_emit_start).  They do nothing when SIZE_ONLY is true, and croak if
  * the index is negative. */
 643 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 644  if (! SIZE_ONLY) {       \
 645   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 646      __LINE__, (int)(node), (int)(byte)));  \
 647   if((node) < 0) {      \
 648    Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 649   } else {       \
 650    RExC_offsets[2*(node)-1] = (byte);    \
 651   }        \
 652  }         \
 653 } STMT_END
 654
 /* Pointer-taking wrappers: the node pointer is converted to an index by
  * subtracting RExC_emit_start, the pattern pointer to an offset by
  * subtracting RExC_start. */
 655 #define Set_Node_Offset(node,byte) \
 656  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 657 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 658
 659 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 660  if (! SIZE_ONLY) {       \
 661   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 662     __LINE__, (int)(node), (int)(len)));   \
 663   if((node) < 0) {      \
 664    Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 665   } else {       \
 666    RExC_offsets[2*(node)] = (len);    \
 667   }        \
 668  }         \
 669 } STMT_END
 670
 /* Set_Node_Cur_Length() relies on a variable named parse_start being in
  * scope at the point of use. */
 671 #define Set_Node_Length(node,len) \
 672  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 673 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 674 #define Set_Node_Cur_Length(node) \
 675  Set_Node_Length(node, RExC_parse - parse_start)
 676
 677 /* Get offsets and lengths */
 678 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 679 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 680
 681 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 682  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 683  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 684 } STMT_END
 685 #endif
 686
 687 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 688 #define EXPERIMENTAL_INPLACESCAN
 689 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 690
 /* Debug dump of a scan_data_t, wrapped in DEBUG_OPTIMISE_MORE_r().
  *   str   - prefix; pasted into the format string, so it must be a string
  *           literal
  *   data  - scan_data_t pointer; nothing is printed when it is NULL
  *   depth - indentation level (the indent width is depth*2)
  * The expansion also reads a variable named is_inf from the calling scope.
  * The second printf runs only when data->last_found is non-NULL, and it
  * reads longest_fixed and longest_float through SvPVX_const() without a NULL
  * check of its own.  The macro body already ends in ';', so a call written
  * as DEBUG_STUDYDATA(...); leaves an extra empty statement. */
 691 #define DEBUG_STUDYDATA(str,data,depth)                              \
 692 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 693  PerlIO_printf(Perl_debug_log,                                    \
 694   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 695   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 696   (int)(depth)*2, "",                                          \
 697   (IV)((data)->pos_min),                                       \
 698   (IV)((data)->pos_delta),                                     \
 699   (UV)((data)->flags),                                         \
 700   (IV)((data)->whilem_c),                                      \
 701   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 702   is_inf ? "INF " : ""                                         \
 703  );                                                               \
 704  if ((data)->last_found)                                          \
 705   PerlIO_printf(Perl_debug_log,                                \
 706    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 707    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 708    SvPVX_const((data)->last_found),                         \
 709    (IV)((data)->last_end),                                  \
 710    (IV)((data)->last_start_min),                            \
 711    (IV)((data)->last_start_max),                            \
 712    ((data)->longest &&                                      \
 713    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 714    SvPVX_const((data)->longest_fixed),                      \
 715    (IV)((data)->offset_fixed),                              \
 716    ((data)->longest &&                                      \
 717    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 718    SvPVX_const((data)->longest_float),                      \
 719    (IV)((data)->offset_float_min),                          \
 720    (IV)((data)->offset_float_max)                           \
 721   );                                                           \
 722  PerlIO_printf(Perl_debug_log,"\n");                              \
 723 });
 724
 725 /* Mark that we cannot extend a found fixed substring at this point.
 726    Update the longest found anchored substring and the longest found
 727    floating substrings if needed.

        pRExC_state - compilation state (read through the UTF test and the
                      debug macros only)
        data        - scan data to update; data->last_found and *data->longest
                      must be valid SVs
        minlenp     - stored (not dereferenced) as the minlen pointer of the
                      string that gets committed
        is_inf      - if true, a committed floating string gets
                      offset_float_max == I32_MAX

      If data->last_found is longer (in characters) than *data->longest, or
      equally long while SF_BEFORE_EOL is set, it is copied into
      *data->longest and the matching offset / minlen / lookbehind / EOL-flag
      members are refreshed, for the fixed or the floating string depending
      on which SV *data->longest is.

      In every case data->last_found is then emptied, last_end is set to -1
      and the SF_BEFORE_EOL bits are cleared.

      The maximum float offset for an empty string is pos_min + pos_delta;
      that sum is saturated at I32_MAX *before* it is computed, because
      overflowing a signed I32 is undefined behaviour. */
 728
 729 STATIC void
 730 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 731 {
 732  const STRLEN l = CHR_SVLEN(data->last_found);
 733  const STRLEN old_l = CHR_SVLEN(*data->longest);
 734  GET_RE_DEBUG_FLAGS_DECL;
 735
 736  PERL_ARGS_ASSERT_SCAN_COMMIT;
 737
      /* Commit only a strictly longer string, or an equally long one that is
       * anchored before an end-of-line. */
 738  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 739   SvSetMagicSV(*data->longest, data->last_found);
 740   if (*data->longest == data->longest_fixed) {
 741    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 742    if (data->flags & SF_BEFORE_EOL)
 743     data->flags
 744      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 745    else
 746     data->flags &= ~SF_FIX_BEFORE_EOL;
 747    data->minlen_fixed=minlenp;
 748    data->lookbehind_fixed=0;
 749   }
 750   else { /* *data->longest == data->longest_float */
 751    data->offset_float_min = l ? data->last_start_min : data->pos_min;
          /* Saturate instead of adding when the sum would exceed I32_MAX.
           * The pos_min > 0 guard keeps 'I32_MAX - pos_min' itself from
           * overflowing. */
 752    data->offset_float_max = (l
 753          ? data->last_start_max
 754          : ((data->pos_delta == I32_MAX
                  || (data->pos_min > 0
                      && data->pos_delta > I32_MAX - data->pos_min))
                 ? I32_MAX
                 : data->pos_min + data->pos_delta));
          /* The unsigned comparison also catches a negative value. */
 755    if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 756     data->offset_float_max = I32_MAX;
 757    if (data->flags & SF_BEFORE_EOL)
 758     data->flags
 759      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 760    else
 761     data->flags &= ~SF_FL_BEFORE_EOL;
 762    data->minlen_float=minlenp;
 763    data->lookbehind_float=0;
 764   }
 765  }
      /* Reset the accumulator for the next candidate string. */
 766  SvCUR_set(data->last_found, 0);
 767  {
 768   SV * const sv = data->last_found;
 769   if (SvUTF8(sv) && SvMAGICAL(sv)) {
          /* Zero the length kept in the utf8 magic as well (presumably a
           * cached character length -- confirm), to agree with the now
           * empty string. */
 770    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 771    if (mg)
 772     mg->mg_len = 0;
 773   }
 774  }
 775  data->last_end = -1;
 776  data->flags &= ~SF_BEFORE_EOL;
 777  DEBUG_STUDYDATA("commit: ",data,0);
 778 }
 779
 780 /* These macros set, clear and test whether the synthetic start class ('ssc',
 781  * given by the parameter) matches an empty string (EOS).  This uses the
 782  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 783  * stands alone, so there is never a next_off, so this field is otherwise
 784  * unused.  The EOS information is used only for compilation, but theoretically
 785  * it could be passed on to the execution code.  This could be used to store
 786  * more than one bit of information, but only this one is currently used. */
 787 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 788 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 789 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 790
 791 /* Can match anything (initialization) */
 792 STATIC void
 793 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 794 {
 795  PERL_ARGS_ASSERT_CL_ANYTHING;
 796
 797  ANYOF_BITMAP_SETALL(cl);
 798  cl->flags = ANYOF_UNICODE_ALL;
 799  SET_SSC_EOS(cl);
 800
 801  /* If any portion of the regex is to operate under locale rules,
 802  * initialization includes it.  The reason this isn't done for all regexes
 803  * is that the optimizer was written under the assumption that locale was
 804  * all-or-nothing.  Given the complexity and lack of documentation in the
 805  * optimizer, and that there are inadequate test cases for locale, so many
 806  * parts of it may not work properly, it is safest to avoid locale unless
 807  * necessary. */
 808  if (RExC_contains_locale) {
 809   ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 810   cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 811  }
 812  else {
 813   ANYOF_CLASS_ZERO(cl);     /* Only /l uses class now */
 814  }
 815 }
 816
 817 /* Can match anything (initialization) */
 818 STATIC int
 819 S_cl_is_anything(const struct regnode_charclass_class *cl)
 820 {
 821  int value;
 822
 823  PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 824
 825  for (value = 0; value < ANYOF_MAX; value += 2)
 826   if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 827    return 1;
 828  if (!(cl->flags & ANYOF_UNICODE_ALL))
 829   return 0;
 830  if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 831   return 0;
 832  return 1;
 833 }
 834
/* Initialize 'cl' as a fresh class node that can match anything.
 *
 * The statements below are order-dependent: the structure is zeroed first,
 * then typed as ANYOF, then cl_anything() fills in the match-everything
 * state, and finally the node's argument is set to ANYOF_NONBITMAP_EMPTY
 * (the same value S_cl_and()/S_cl_or() use, via ANYOF_NONBITMAP(), to mean
 * "nothing to match outside the bitmap"). */
STATIC void
S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
 PERL_ARGS_ASSERT_CL_INIT;

 Zero(cl, 1, struct regnode_charclass_class);
 cl->type = ANYOF;
 cl_anything(pRExC_state, cl);
 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
}

/* cl_init_zero is currently a plain alias for S_cl_init: the two do exactly
 * the same thing.  Note that it names the S_-prefixed function directly. */
#define cl_init_zero  S_cl_init
 849
 850 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 851  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 852  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 853 STATIC void
 854 S_cl_and(struct regnode_charclass_class *cl,
 855   const struct regnode_charclass_class *and_with)
 856 {
 857  PERL_ARGS_ASSERT_CL_AND;
 858
 859  assert(PL_regkind[and_with->type] == ANYOF);
 860
 861  /* I (khw) am not sure all these restrictions are necessary XXX */
 862  if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 863   && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 864   && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 865   && !(and_with->flags & ANYOF_LOC_FOLD)
 866   && !(cl->flags & ANYOF_LOC_FOLD)) {
 867   int i;
 868
 869   if (and_with->flags & ANYOF_INVERT)
 870    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 871     cl->bitmap[i] &= ~and_with->bitmap[i];
 872   else
 873    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 874     cl->bitmap[i] &= and_with->bitmap[i];
 875  } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 876
 877  if (and_with->flags & ANYOF_INVERT) {
 878
 879   /* Here, the and'ed node is inverted.  Get the AND of the flags that
 880   * aren't affected by the inversion.  Those that are affected are
 881   * handled individually below */
 882   U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 883   cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 884   cl->flags |= affected_flags;
 885
 886   /* We currently don't know how to deal with things that aren't in the
 887   * bitmap, but we know that the intersection is no greater than what
 888   * is already in cl, so let there be false positives that get sorted
 889   * out after the synthetic start class succeeds, and the node is
 890   * matched for real. */
 891
 892   /* The inversion of these two flags indicate that the resulting
 893   * intersection doesn't have them */
 894   if (and_with->flags & ANYOF_UNICODE_ALL) {
 895    cl->flags &= ~ANYOF_UNICODE_ALL;
 896   }
 897   if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 898    cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 899   }
 900  }
 901  else {   /* and'd node is not inverted */
 902   U8 outside_bitmap_but_not_utf8; /* Temp variable */
 903
 904   if (! ANYOF_NONBITMAP(and_with)) {
 905
 906    /* Here 'and_with' doesn't match anything outside the bitmap
 907    * (except possibly ANYOF_UNICODE_ALL), which means the
 908    * intersection can't either, except for ANYOF_UNICODE_ALL, in
 909    * which case we don't know what the intersection is, but it's no
 910    * greater than what cl already has, so can just leave it alone,
 911    * with possible false positives */
 912    if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 913     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 914     cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 915    }
 916   }
 917   else if (! ANYOF_NONBITMAP(cl)) {
 918
 919    /* Here, 'and_with' does match something outside the bitmap, and cl
 920    * doesn't have a list of things to match outside the bitmap.  If
 921    * cl can match all code points above 255, the intersection will
 922    * be those above-255 code points that 'and_with' matches.  If cl
 923    * can't match all Unicode code points, it means that it can't
 924    * match anything outside the bitmap (since the 'if' that got us
 925    * into this block tested for that), so we leave the bitmap empty.
 926    */
 927    if (cl->flags & ANYOF_UNICODE_ALL) {
 928     ARG_SET(cl, ARG(and_with));
 929
 930     /* and_with's ARG may match things that don't require UTF8.
 931     * And now cl's will too, in spite of this being an 'and'.  See
 932     * the comments below about the kludge */
 933     cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 934    }
 935   }
 936   else {
 937    /* Here, both 'and_with' and cl match something outside the
 938    * bitmap.  Currently we do not do the intersection, so just match
 939    * whatever cl had at the beginning.  */
 940   }
 941
 942
 943   /* Take the intersection of the two sets of flags.  However, the
 944   * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 945   * kludge around the fact that this flag is not treated like the others
 946   * which are initialized in cl_anything().  The way the optimizer works
 947   * is that the synthetic start class (SSC) is initialized to match
 948   * anything, and then the first time a real node is encountered, its
 949   * values are AND'd with the SSC's with the result being the values of
 950   * the real node.  However, there are paths through the optimizer where
 951   * the AND never gets called, so those initialized bits are set
 952   * inappropriately, which is not usually a big deal, as they just cause
 953   * false positives in the SSC, which will just mean a probably
 954   * imperceptible slow down in execution.  However this bit has a
 955   * higher false positive consequence in that it can cause utf8.pm,
 956   * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 957   * bigger slowdown and also causes significant extra memory to be used.
 958   * In order to prevent this, the code now takes a different tack.  The
 959   * bit isn't set unless some part of the regular expression needs it,
 960   * but once set it won't get cleared.  This means that these extra
 961   * modules won't get loaded unless there was some path through the
 962   * pattern that would have required them anyway, and  so any false
 963   * positives that occur by not ANDing them out when they could be
 964   * aren't as severe as they would be if we treated this bit like all
 965   * the others */
 966   outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 967          & ANYOF_NONBITMAP_NON_UTF8;
 968   cl->flags &= and_with->flags;
 969   cl->flags |= outside_bitmap_but_not_utf8;
 970  }
 971 }
 972
 973 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 974  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 975  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 976 STATIC void
 977 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 978 {
 979  PERL_ARGS_ASSERT_CL_OR;
 980
 981  if (or_with->flags & ANYOF_INVERT) {
 982
 983   /* Here, the or'd node is to be inverted.  This means we take the
 984   * complement of everything not in the bitmap, but currently we don't
 985   * know what that is, so give up and match anything */
 986   if (ANYOF_NONBITMAP(or_with)) {
 987    cl_anything(pRExC_state, cl);
 988   }
 989   /* We do not use
 990   * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 991   *   <= (B1 | !B2) | (CL1 | !CL2)
 992   * which is wasteful if CL2 is small, but we ignore CL2:
 993   *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 994   * XXXX Can we handle case-fold?  Unclear:
 995   *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 996   *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 997   */
 998   else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 999    && !(or_with->flags & ANYOF_LOC_FOLD)
1000    && !(cl->flags & ANYOF_LOC_FOLD) ) {
1001    int i;
1002
1003    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1004     cl->bitmap[i] |= ~or_with->bitmap[i];
1005   } /* XXXX: logic is complicated otherwise */
1006   else {
1007    cl_anything(pRExC_state, cl);
1008   }
1009
1010   /* And, we can just take the union of the flags that aren't affected
1011   * by the inversion */
1012   cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1013
1014   /* For the remaining flags:
1015    ANYOF_UNICODE_ALL and inverted means to not match anything above
1016      255, which means that the union with cl should just be
1017      what cl has in it, so can ignore this flag
1018    ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1019      is 127-255 to match them, but then invert that, so the
1020      union with cl should just be what cl has in it, so can
1021      ignore this flag
1022   */
1023  } else {    /* 'or_with' is not inverted */
1024   /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1025   if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1026    && (!(or_with->flags & ANYOF_LOC_FOLD)
1027     || (cl->flags & ANYOF_LOC_FOLD)) ) {
1028    int i;
1029
1030    /* OR char bitmap and class bitmap separately */
1031    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1032     cl->bitmap[i] |= or_with->bitmap[i];
1033    if (or_with->flags & ANYOF_CLASS) {
1034     ANYOF_CLASS_OR(or_with, cl);
1035    }
1036   }
1037   else { /* XXXX: logic is complicated, leave it along for a moment. */
1038    cl_anything(pRExC_state, cl);
1039   }
1040
1041   if (ANYOF_NONBITMAP(or_with)) {
1042
1043    /* Use the added node's outside-the-bit-map match if there isn't a
1044    * conflict.  If there is a conflict (both nodes match something
1045    * outside the bitmap, but what they match outside is not the same
1046    * pointer, and hence not easily compared until XXX we extend
1047    * inversion lists this far), give up and allow the start class to
1048    * match everything outside the bitmap.  If that stuff is all above
1049    * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1050    if (! ANYOF_NONBITMAP(cl)) {
1051     ARG_SET(cl, ARG(or_with));
1052    }
1053    else if (ARG(cl) != ARG(or_with)) {
1054
1055     if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1056      cl_anything(pRExC_state, cl);
1057     }
1058     else {
1059      cl->flags |= ANYOF_UNICODE_ALL;
1060     }
1061    }
1062   }
1063
1064   /* Take the union */
1065   cl->flags |= or_with->flags;
1066  }
1067 }
1068
/* Accessors for the per-state transition list used while a trie is being
 * built.  All of them expect a variable named 'trie' to be in scope at the
 * point of expansion.
 *
 *   TRIE_LIST_ITEM(state,idx) : element 'idx' of the list of 'state' (an
 *                               lvalue).  Element 0 is a header, not a
 *                               transition.
 *   TRIE_LIST_CUR(state)      : the header's 'forid' field; TRIE_LIST_NEW
 *                               starts it at 1 and TRIE_LIST_PUSH stores the
 *                               next entry at this index
 *   TRIE_LIST_LEN(state)      : the header's 'newstate' field, holding the
 *                               allocated length of the list
 *   TRIE_LIST_USED(state)     : number of entries stored after the header,
 *                               or 0 when the state has no list at all
 *
 * TRIE_LIST_USED formerly tested 'trie->states[state]' using whatever
 * variable named 'state' happened to be in the caller's scope instead of its
 * own argument; it now uses the argument throughout.  Arguments are
 * parenthesized so that expression arguments expand safely. */
#define TRIE_LIST_ITEM(state,idx) ((trie->states[(state)].trans.list)[ (idx) ])
#define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( (state), 0 ).forid )
#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( (state), 0 ).newstate )
#define TRIE_LIST_USED(idx)  ( trie->states[(idx)].trans.list ? (TRIE_LIST_CUR( (idx) ) - 1) : 0 )
1073
1074
1075 #ifdef DEBUGGING
1076 /*
1077    dump_trie(trie,widecharmap,revcharmap)
1078    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1079    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1080
1081    These routines dump out a trie in a somewhat readable format.
1082    The _interim_ variants are used for debugging the interim
1083    tables that are used to generate the final compressed
1084    representation which is what dump_trie expects.
1085
1086    Part of the reason for their existence is to provide a form
1087    of documentation as to how the different representations function.
1088
1089 */
1090
1091 /*
1092   Dumps the final compressed table form of the trie to Perl_debug_log.
1093   Used for debugging make_trie().
1094 */
1095
1096 STATIC void
1097 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1098    AV *revcharmap, U32 depth)
1099 {
1100  U32 state;
1101  SV *sv=sv_newmortal();
1102  int colwidth= widecharmap ? 6 : 4;
1103  U16 word;
1104  GET_RE_DEBUG_FLAGS_DECL;
1105
1106  PERL_ARGS_ASSERT_DUMP_TRIE;
1107
1108  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1109   (int)depth * 2 + 2,"",
1110   "Match","Base","Ofs" );
1111
1112  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1113   SV ** const tmp = av_fetch( revcharmap, state, 0);
1114   if ( tmp ) {
1115    PerlIO_printf( Perl_debug_log, "%*s",
1116     colwidth,
1117     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1118        PL_colors[0], PL_colors[1],
1119        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1120        PERL_PV_ESCAPE_FIRSTCHAR
1121     )
1122    );
1123   }
1124  }
1125  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1126   (int)depth * 2 + 2,"");
1127
1128  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1129   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1130  PerlIO_printf( Perl_debug_log, "\n");
1131
1132  for( state = 1 ; state < trie->statecount ; state++ ) {
1133   const U32 base = trie->states[ state ].trans.base;
1134
1135   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1136
1137   if ( trie->states[ state ].wordnum ) {
1138    PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1139   } else {
1140    PerlIO_printf( Perl_debug_log, "%6s", "" );
1141   }
1142
1143   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1144
1145   if ( base ) {
1146    U32 ofs = 0;
1147
1148    while( ( base + ofs  < trie->uniquecharcount ) ||
1149     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1150      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1151      ofs++;
1152
1153    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1154
1155    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1156     if ( ( base + ofs >= trie->uniquecharcount ) &&
1157      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1158      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1159     {
1160     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1161      colwidth,
1162      (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1163     } else {
1164      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1165     }
1166    }
1167
1168    PerlIO_printf( Perl_debug_log, "]");
1169
1170   }
1171   PerlIO_printf( Perl_debug_log, "\n" );
1172  }
1173  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1174  for (word=1; word <= trie->wordcount; word++) {
1175   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1176    (int)word, (int)(trie->wordinfo[word].prev),
1177    (int)(trie->wordinfo[word].len));
1178  }
1179  PerlIO_printf(Perl_debug_log, "\n" );
1180 }
1181 /*
1182   Dumps a fully constructed but uncompressed trie in list form.
1183   List tries normally only are used for construction when the number of
1184   possible chars (trie->uniquecharcount) is very high.
1185   Used for debugging make_trie().
1186 */
1187 STATIC void
1188 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1189       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1190       U32 depth)
1191 {
1192  U32 state;
1193  SV *sv=sv_newmortal();
1194  int colwidth= widecharmap ? 6 : 4;
1195  GET_RE_DEBUG_FLAGS_DECL;
1196
1197  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1198
1199  /* print out the table precompression.  */
1200  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1201   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1202   "------:-----+-----------------\n" );
1203
1204  for( state=1 ; state < next_alloc ; state ++ ) {
1205   U16 charid;
1206
1207   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1208    (int)depth * 2 + 2,"", (UV)state  );
1209   if ( ! trie->states[ state ].wordnum ) {
1210    PerlIO_printf( Perl_debug_log, "%5s| ","");
1211   } else {
1212    PerlIO_printf( Perl_debug_log, "W%4x| ",
1213     trie->states[ state ].wordnum
1214    );
1215   }
1216   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1217    SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1218    if ( tmp ) {
1219     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1220      colwidth,
1221      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1222        PL_colors[0], PL_colors[1],
1223        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1224        PERL_PV_ESCAPE_FIRSTCHAR
1225      ) ,
1226      TRIE_LIST_ITEM(state,charid).forid,
1227      (UV)TRIE_LIST_ITEM(state,charid).newstate
1228     );
1229     if (!(charid % 10))
1230      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1231       (int)((depth * 2) + 14), "");
1232    }
1233   }
1234   PerlIO_printf( Perl_debug_log, "\n");
1235  }
1236 }
1237
1238 /*
1239   Dumps a fully constructed but uncompressed trie in table form.
1240   This is the normal DFA style state transition table, with a few
1241   twists to facilitate compression later.
1242   Used for debugging make_trie().
1243 */
1244 STATIC void
1245 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1246       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1247       U32 depth)
1248 {
1249  U32 state;
1250  U16 charid;
1251  SV *sv=sv_newmortal();
1252  int colwidth= widecharmap ? 6 : 4;
1253  GET_RE_DEBUG_FLAGS_DECL;
1254
1255  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1256
1257  /*
1258  print out the table precompression so that we can do a visual check
1259  that they are identical.
1260  */
1261
1262  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1263
1264  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1265   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1266   if ( tmp ) {
1267    PerlIO_printf( Perl_debug_log, "%*s",
1268     colwidth,
1269     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1270        PL_colors[0], PL_colors[1],
1271        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1272        PERL_PV_ESCAPE_FIRSTCHAR
1273     )
1274    );
1275   }
1276  }
1277
1278  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1279
1280  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1281   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1282  }
1283
1284  PerlIO_printf( Perl_debug_log, "\n" );
1285
1286  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1287
1288   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1289    (int)depth * 2 + 2,"",
1290    (UV)TRIE_NODENUM( state ) );
1291
1292   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1293    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1294    if (v)
1295     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1296    else
1297     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1298   }
1299   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1300    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1301   } else {
1302    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1303    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1304   }
1305  }
1306 }
1307
1308 #endif
1309
1310
1311 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1312   startbranch: the first branch in the whole branch sequence
1313   first      : start branch of sequence of branch-exact nodes.
1314    May be the same as startbranch
1315   last       : Thing following the last branch.
1316    May be the same as tail.
1317   tail       : item following the branch sequence
  word_count : number of words in the sequence
1319   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1320   depth      : indent depth
1321
1322 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1323
1324 A trie is an N'ary tree where the branches are determined by digital
1325 decomposition of the key. IE, at the root node you look up the 1st character and
1326 follow that branch repeat until you find the end of the branches. Nodes can be
1327 marked as "accepting" meaning they represent a complete word. Eg:
1328
1329   /he|she|his|hers/
1330
1331 would convert into the following structure. Numbers represent states, letters
1332 following numbers represent valid transitions on the letter from that state, if
1333 the number is in square brackets it represents an accepting state, otherwise it
1334 will be in parenthesis.
1335
1336  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1337  |    |
1338  |   (2)
1339  |    |
1340  (1)   +-i->(6)-+-s->[7]
1341  |
1342  +-s->(3)-+-h->(4)-+-e->[5]
1343
1344  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1345
1346 This shows that when matching against the string 'hers' we will begin at state 1
1347 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1348 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1349 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1350 single traverse. We store a mapping from accepting to state to which word was
1351 matched, and then when we have multiple possibilities we try to complete the
rest of the regex in the order in which they occurred in the alternation.
1353
1354 The only prior NFA like behaviour that would be changed by the TRIE support is
1355 the silent ignoring of duplicate alternations which are of the form:
1356
1357  / (DUPE|DUPE) X? (?{ ... }) Y /x
1358
1359 Thus EVAL blocks following a trie may be called a different number of times with
1360 and without the optimisation. With the optimisations dupes will be silently
1361 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1362 the following demonstrates:
1363
1364  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1365
1366 which prints out 'word' three times, but
1367
1368  'words'=~/(word|word|word)(?{ print $1 })S/
1369
which doesn't print it out at all. This is due to other optimisations kicking in.
1371
1372 Example of what happens on a structural level:
1373
1374 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1375
1376    1: CURLYM[1] {1,32767}(18)
1377    5:   BRANCH(8)
1378    6:     EXACT <ac>(16)
1379    8:   BRANCH(11)
1380    9:     EXACT <ad>(16)
1381   11:   BRANCH(14)
1382   12:     EXACT <ab>(16)
1383   16:   SUCCEED(0)
1384   17:   NOTHING(18)
1385   18: END(0)
1386
1387 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1388 and should turn into:
1389
1390    1: CURLYM[1] {1,32767}(18)
1391    5:   TRIE(16)
1392   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1393   <ac>
1394   <ad>
1395   <ab>
1396   16:   SUCCEED(0)
1397   17:   NOTHING(18)
1398   18: END(0)
1399
Cases where tail != last would be like /(?:foo|bar)baz/:
1401
1402    1: BRANCH(4)
1403    2:   EXACT <foo>(8)
1404    4: BRANCH(7)
1405    5:   EXACT <bar>(8)
1406    7: TAIL(8)
1407    8: EXACT <baz>(10)
1408   10: END(0)
1409
1410 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1411 and would end up looking like:
1412
1413  1: TRIE(8)
1414  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1415   <foo>
1416   <bar>
1417    7: TAIL(8)
1418    8: EXACT <baz>(10)
1419   10: END(0)
1420
1421  d = uvuni_to_utf8_flags(d, uv, 0);
1422
1423 is the recommended Unicode-aware way of saying
1424
1425  *(d++) = uv;
1426 */
1427
/* Append to 'revcharmap' a new SV holding the character 'val'.
 *
 * Under UTF the character is written as UTF-8 by uvuni_to_utf8() into a
 * freshly created SV, which is then marked POK and UTF8.  Otherwise 'val' is
 * converted to a single char and stored as a one-byte string.
 *
 * 'revcharmap' (and whatever UTF refers to) must be in scope where the macro
 * is expanded.  'val' is parenthesized at each use so that an expression
 * argument is converted as a whole, rather than having the (char) cast bind
 * to its first operand only.  The internal names carry a trailing underscore
 * to keep them from colliding with identifiers in the argument. */
#define TRIE_STORE_REVCHAR(val)                                            \
    STMT_START {                                                           \
        if (UTF) {                                                         \
            SV *revchar_sv_ = newSV(7); /* XXX: optimize me */             \
            unsigned char *revchar_buf_ =                                  \
                (unsigned char *) SvPVX(revchar_sv_);                      \
            unsigned const char *const revchar_end_ =                      \
                uvuni_to_utf8(revchar_buf_, (val));                        \
            SvCUR_set(revchar_sv_, revchar_end_ - revchar_buf_);           \
            SvPOK_on(revchar_sv_);                                         \
            SvUTF8_on(revchar_sv_);                                        \
            av_push(revcharmap, revchar_sv_);                              \
        } else {                                                           \
            char revchar_byte_ = (char)(val);                              \
            av_push(revcharmap, newSVpvn(&revchar_byte_, 1));              \
        }                                                                  \
    } STMT_END
1443
/* Read the next character of the word being scanned into 'uvc' and set
 * 'len'; 'wordlen' is incremented on every expansion.  Every variable used
 * (wordlen, uvc, uc, len, uniflags, folder, foldlen, foldbuf, scan, skiplen)
 * comes from the scope in which the macro is expanded.
 *
 * Three cases:
 *  - UTF: one UTF-8 character is decoded at 'uc'; 'len' receives the length
 *    reported by utf8n_to_uvuni().
 *  - folder == PL_fold_latin1: when no fold is pending (foldlen <= 0) the
 *    byte at 'uc' is folded by _to_fold_latin1() into 'foldbuf', its result
 *    is returned with len = 1, and 'foldlen'/'scan' are left describing
 *    whatever remains of the fold past that first code point.  While
 *    foldlen > 0, subsequent expansions decode those remaining code points
 *    from 'scan' and set len = 0 (presumably so that the caller does not
 *    advance 'uc' for them -- confirm against the callers).
 *  - otherwise: the raw byte at 'uc' is taken, with len = 1.
 */
#define TRIE_READ_CHAR STMT_START {                                                     \
 wordlen++;                                                                          \
 if ( UTF ) {                                                                        \
  /* if it is UTF then it is either already folded, or does not need folding */   \
  uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
 }                                                                                   \
 else if (folder == PL_fold_latin1) {                                                \
  /* if we use this folder we have to obey unicode rules on latin-1 data */       \
  if ( foldlen > 0 ) {                                                            \
  uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
  foldlen -= len;                                                              \
  scan += len;                                                                 \
  len = 0;                                                                     \
  } else {                                                                        \
   len = 1;                                                                    \
   uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
   skiplen = UNISKIP(uvc);                                                     \
   foldlen -= skiplen;                                                         \
   scan = foldbuf + skiplen;                                                   \
  }                                                                               \
 } else {                                                                            \
  /* raw data, will be folded later if needed */                                  \
  uvc = (U32)*uc;                                                                 \
  len = 1;                                                                        \
 }                                                                                   \
} STMT_END
1470
1471
1472
/* Append the transition (fid -> ns) to the transition list of 'state'.
 * When the list is full (its current index has reached its recorded length)
 * the recorded length is doubled and the list is reallocated to that size
 * before the new entry is stored.  A variable named 'trie' must be in scope
 * where the macro is expanded. */
#define TRIE_LIST_PUSH(state,fid,ns)                                        \
    STMT_START {                                                            \
        if ( TRIE_LIST_CUR( state ) >= TRIE_LIST_LEN( state ) ) {           \
            U32 trie_list_grown_len_ = (TRIE_LIST_LEN( state ) *= 2);       \
            Renew( trie->states[ state ].trans.list,                        \
                   trie_list_grown_len_, reg_trie_trans_le );               \
        }                                                                   \
        TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = (fid);      \
        TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = (ns);    \
        TRIE_LIST_CUR( state )++;                                           \
    } STMT_END
1482
/* Give 'state' a new, zero-filled transition list with room for 4 entries.
 * Entry 0 serves as the header: the recorded length is set to 4 and the
 * current index to 1, which is where TRIE_LIST_PUSH will store the first
 * transition.  A variable named 'trie' must be in scope where the macro is
 * expanded. */
#define TRIE_LIST_NEW(state)                                    \
    STMT_START {                                                \
        Newxz( trie->states[ state ].trans.list,                \
               4, reg_trie_trans_le );                          \
        TRIE_LIST_LEN( state ) = 4;                             \
        TRIE_LIST_CUR( state ) = 1;                             \
    } STMT_END
1489
/* Record that the word just read ends in 'state'.
 *
 * Besides 'state', the macro uses these names from the scope in which it is
 * expanded: trie, noper, cur, tail, convert, curword, wordlen, word_count,
 * jumper, nextbranch and (inside DEBUG_r only) trie_words.
 *
 * In order, it:
 *  - remembers in 'dupe' the word number already attached to 'state', if any;
 *  - under DEBUG_r, pushes a copy of the word's text (or an empty string for
 *    a NOTHING node) onto trie_words, for dumping;
 *  - increments 'curword' and fills in its wordinfo entry (prev = 0,
 *    len = wordlen, accept = state);
 *  - if the node following 'noper' lies before 'tail', allocates trie->jump
 *    on first use (word_count + 1 zeroed U16s), stores that node's offset
 *    from 'convert' in trie->jump[curword], and sets 'jumper' and
 *    'nextbranch' if they are still unset;
 *  - finally either links the new word into the wordinfo[].prev chain of the
 *    earlier duplicate, or makes 'curword' the word number of 'state'.
 */
#define TRIE_HANDLE_WORD(state) STMT_START {                    \
 U16 dupe= trie->states[ state ].wordnum;                    \
 regnode * const noper_next = regnext( noper );              \
                \
 DEBUG_r({                                                   \
  /* store the word for dumping */                        \
  SV* tmp;                                                \
  if (OP(noper) != NOTHING)                               \
   tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
  else                                                    \
   tmp = newSVpvn_utf8( "", 0, UTF );   \
  av_push( trie_words, tmp );                             \
 });                                                         \
                \
 curword++;                                                  \
 trie->wordinfo[curword].prev   = 0;                         \
 trie->wordinfo[curword].len    = wordlen;                   \
 trie->wordinfo[curword].accept = state;                     \
                \
 if ( noper_next < tail ) {                                  \
  if (!trie->jump)                                        \
   trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
  trie->jump[curword] = (U16)(noper_next - convert);      \
  if (!jumper)                                            \
   jumper = noper_next;                                \
  if (!nextbranch)                                        \
   nextbranch= regnext(cur);                           \
 }                                                           \
                \
 if ( dupe ) {                                               \
  /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
  /* chain, so that when the bits of chain are later    */\
  /* linked together, the dups appear in the chain      */\
  trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
  trie->wordinfo[dupe].prev = curword;                    \
 } else {                                                    \
  /* we haven't inserted this word yet.                */ \
  trie->states[ state ].wordnum = curword;                \
 }                                                           \
} STMT_END
1530
1531
1532 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  /* next state reached from `state` on `charid`, read from trie->trans[] */ \
1533  ( ( base + charid >=  ucharcount     /* slot index base - ucharcount + charid would not be negative */ \
1534   && base + charid < ubound     /* upper limit; `ubound` (like `trie`) comes from the expansion site, not a parameter */ \
1535   && state == trie->trans[ base - ucharcount + charid ].check /* the slot's .check must name this state */ \
1536   && trie->trans[ base - ucharcount + charid ].next )  /* and its target must be nonzero */ \
1537   ? trie->trans[ base - ucharcount + charid ].next  /* valid transition: yield its target */ \
1538   : ( state==1 ? special : 0 )     /* no transition: `special` for state 1, otherwise 0 */ \
1539  ) /* NOTE: arguments are expanded unparenthesized and more than once; pass simple, side-effect-free expressions */
1540
1541 #define MADE_TRIE       1 /* MADE_* are distinct single bits (1, 2, 4); presumably result codes of S_make_trie -- TODO confirm at its return statements */
1542 #define MADE_JUMP_TRIE  2
1543 #define MADE_EXACT_TRIE 4
1544
1545 STATIC I32
1546 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1547 {
1548  dVAR;
1549  /* first pass, loop through and scan words */
1550  reg_trie_data *trie;
1551  HV *widecharmap = NULL;
1552  AV *revcharmap = newAV();
1553  regnode *cur;
1554  const U32 uniflags = UTF8_ALLOW_DEFAULT;
1555  STRLEN len = 0;
1556  UV uvc = 0;
1557  U16 curword = 0;
1558  U32 next_alloc = 0;
1559  regnode *jumper = NULL;
1560  regnode *nextbranch = NULL;
1561  regnode *convert = NULL;
1562  U32 *prev_states; /* temp array mapping each state to previous one */
1563  /* we just use folder as a flag in utf8 */
1564  const U8 * folder = NULL;
1565
1566 #ifdef DEBUGGING
1567  const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1568  AV *trie_words = NULL;
1569  /* along with revcharmap, this only used during construction but both are
1570  * useful during debugging so we store them in the struct when debugging.
1571  */
1572 #else
1573  const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1574  STRLEN trie_charcount=0;
1575 #endif
1576  SV *re_trie_maxbuff;
1577  GET_RE_DEBUG_FLAGS_DECL;
1578
1579  PERL_ARGS_ASSERT_MAKE_TRIE;
1580 #ifndef DEBUGGING
1581  PERL_UNUSED_ARG(depth);
1582 #endif
1583
1584  switch (flags) {
1585   case EXACT: break;
1586   case EXACTFA:
1587   case EXACTFU_SS:
1588   case EXACTFU_TRICKYFOLD:
1589   case EXACTFU: folder = PL_fold_latin1; break;
1590   case EXACTF:  folder = PL_fold; break;
1591   case EXACTFL: folder = PL_fold_locale; break;
1592   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1593  }
1594
1595  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1596  trie->refcount = 1;
1597  trie->startstate = 1;
1598  trie->wordcount = word_count;
1599  RExC_rxi->data->data[ data_slot ] = (void*)trie;
1600  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1601  if (flags == EXACT)
1602   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1603  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1604      trie->wordcount+1, sizeof(reg_trie_wordinfo));
1605
1606  DEBUG_r({
1607   trie_words = newAV();
1608  });
1609
1610  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1611  if (!SvIOK(re_trie_maxbuff)) {
1612   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1613  }
1614  DEBUG_TRIE_COMPILE_r({
1615     PerlIO_printf( Perl_debug_log,
1616     "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1617     (int)depth * 2 + 2, "",
1618     REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1619     REG_NODE_NUM(last), REG_NODE_NUM(tail),
1620     (int)depth);
1621  });
1622
1623    /* Find the node we are going to overwrite */
1624  if ( first == startbranch && OP( last ) != BRANCH ) {
1625   /* whole branch chain */
1626   convert = first;
1627  } else {
1628   /* branch sub-chain */
1629   convert = NEXTOPER( first );
1630  }
1631
1632  /*  -- First loop and Setup --
1633
1634  We first traverse the branches and scan each word to determine if it
1635  contains widechars, and how many unique chars there are, this is
1636  important as we have to build a table with at least as many columns as we
1637  have unique chars.
1638
1639  We use an array of integers to represent the character codes 0..255
1640  (trie->charmap) and we use an HV* to store Unicode characters. We use the
1641  native representation of the character value as the key and IV's for the
1642  coded index.
1643
1644  *TODO* If we keep track of how many times each character is used we can
1645  remap the columns so that the table compression later on is more
1646  efficient in terms of memory by ensuring the most common value is in the
1647  middle and the least common are on the outside.  IMO this would be better
1648  than a most to least common mapping as there's a decent chance the most
1649  common letter will share a node with the least common, meaning the node
1650  will not be compressible. With a middle is most common approach the worst
1651  case is when we have the least common nodes twice.
1652
1653  */
1654
1655  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1656   regnode *noper = NEXTOPER( cur );
1657   const U8 *uc = (U8*)STRING( noper );
1658   const U8 *e  = uc + STR_LEN( noper );
1659   STRLEN foldlen = 0;
1660   U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1661   STRLEN skiplen = 0;
1662   const U8 *scan = (U8*)NULL;
1663   U32 wordlen      = 0;         /* required init */
1664   STRLEN chars = 0;
1665   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1666
1667   if (OP(noper) == NOTHING) {
1668    regnode *noper_next= regnext(noper);
1669    if (noper_next != tail && OP(noper_next) == flags) {
1670     noper = noper_next;
1671     uc= (U8*)STRING(noper);
1672     e= uc + STR_LEN(noper);
1673     trie->minlen= STR_LEN(noper);
1674    } else {
1675     trie->minlen= 0;
1676     continue;
1677    }
1678   }
1679
1680   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1681    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1682           regardless of encoding */
1683    if (OP( noper ) == EXACTFU_SS) {
1684     /* false positives are ok, so just set this */
1685     TRIE_BITMAP_SET(trie,0xDF);
1686    }
1687   }
1688   for ( ; uc < e ; uc += len ) {
1689    TRIE_CHARCOUNT(trie)++;
1690    TRIE_READ_CHAR;
1691    chars++;
1692    if ( uvc < 256 ) {
1693     if ( folder ) {
1694      U8 folded= folder[ (U8) uvc ];
1695      if ( !trie->charmap[ folded ] ) {
1696       trie->charmap[ folded ]=( ++trie->uniquecharcount );
1697       TRIE_STORE_REVCHAR( folded );
1698      }
1699     }
1700     if ( !trie->charmap[ uvc ] ) {
1701      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1702      TRIE_STORE_REVCHAR( uvc );
1703     }
1704     if ( set_bit ) {
1705      /* store the codepoint in the bitmap, and its folded
1706      * equivalent. */
1707      TRIE_BITMAP_SET(trie, uvc);
1708
1709      /* store the folded codepoint */
1710      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1711
1712      if ( !UTF ) {
1713       /* store first byte of utf8 representation of
1714       variant codepoints */
1715       if (! UNI_IS_INVARIANT(uvc)) {
1716        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1717       }
1718      }
1719      set_bit = 0; /* We've done our bit :-) */
1720     }
1721    } else {
1722     SV** svpp;
1723     if ( !widecharmap )
1724      widecharmap = newHV();
1725
1726     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1727
1728     if ( !svpp )
1729      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1730
1731     if ( !SvTRUE( *svpp ) ) {
1732      sv_setiv( *svpp, ++trie->uniquecharcount );
1733      TRIE_STORE_REVCHAR(uvc);
1734     }
1735    }
1736   }
1737   if( cur == first ) {
1738    trie->minlen = chars;
1739    trie->maxlen = chars;
1740   } else if (chars < trie->minlen) {
1741    trie->minlen = chars;
1742   } else if (chars > trie->maxlen) {
1743    trie->maxlen = chars;
1744   }
1745   if (OP( noper ) == EXACTFU_SS) {
1746    /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1747    if (trie->minlen > 1)
1748     trie->minlen= 1;
1749   }
1750   if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1751    /* XXX: workaround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1752    *        - We assume that any such sequence might match a 2 byte string */
1753    if (trie->minlen > 2 )
1754     trie->minlen= 2;
1755   }
1756
1757  } /* end first pass */
1758  DEBUG_TRIE_COMPILE_r(
1759   PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1760     (int)depth * 2 + 2,"",
1761     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1762     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1763     (int)trie->minlen, (int)trie->maxlen )
1764  );
1765
1766  /*
1767   We now know what we are dealing with in terms of unique chars and
1768   string sizes so we can calculate how much memory a naive
1769   representation using a flat table  will take. If it's over a reasonable
1770   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1771   conservative but potentially much slower representation using an array
1772   of lists.
1773
1774   At the end we convert both representations into the same compressed
1775   form that will be used in regexec.c for matching with. The latter
1776   is a form that cannot be used to construct with but has memory
1777   properties similar to the list form and access properties similar
1778   to the table form making it both suitable for fast searches and
1779   small enough that it's feasible to store for the duration of a program.
1780
1781   See the comment in the code where the compressed table is produced
1782   in place from the flat table representation for an explanation of how
1783   the compression works.
1784
1785  */
1786
1787
1788  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1789  prev_states[1] = 0;
1790
1791  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1792   /*
1793    Second Pass -- Array Of Lists Representation
1794
1795    Each state will be represented by a list of charid:state records
1796    (reg_trie_trans_le) the first such element holds the CUR and LEN
1797    points of the allocated array. (See defines above).
1798
1799    We build the initial structure using the lists, and then convert
1800    it into the compressed table form which allows faster lookups
1801    (but cant be modified once converted).
1802   */
1803
1804   STRLEN transcount = 1;
1805
1806   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1807    "%*sCompiling trie using list compiler\n",
1808    (int)depth * 2 + 2, ""));
1809
1810   trie->states = (reg_trie_state *)
1811    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1812         sizeof(reg_trie_state) );
1813   TRIE_LIST_NEW(1);
1814   next_alloc = 2;
1815
1816   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1817
1818    regnode *noper   = NEXTOPER( cur );
1819    U8 *uc           = (U8*)STRING( noper );
1820    const U8 *e      = uc + STR_LEN( noper );
1821    U32 state        = 1;         /* required init */
1822    U16 charid       = 0;         /* sanity init */
1823    U8 *scan         = (U8*)NULL; /* sanity init */
1824    STRLEN foldlen   = 0;         /* required init */
1825    U32 wordlen      = 0;         /* required init */
1826    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1827    STRLEN skiplen   = 0;
1828
1829    if (OP(noper) == NOTHING) {
1830     regnode *noper_next= regnext(noper);
1831     if (noper_next != tail && OP(noper_next) == flags) {
1832      noper = noper_next;
1833      uc= (U8*)STRING(noper);
1834      e= uc + STR_LEN(noper);
1835     }
1836    }
1837
1838    if (OP(noper) != NOTHING) {
1839     for ( ; uc < e ; uc += len ) {
1840
1841      TRIE_READ_CHAR;
1842
1843      if ( uvc < 256 ) {
1844       charid = trie->charmap[ uvc ];
1845      } else {
1846       SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1847       if ( !svpp ) {
1848        charid = 0;
1849       } else {
1850        charid=(U16)SvIV( *svpp );
1851       }
1852      }
1853      /* charid is now 0 if we dont know the char read, or nonzero if we do */
1854      if ( charid ) {
1855
1856       U16 check;
1857       U32 newstate = 0;
1858
1859       charid--;
1860       if ( !trie->states[ state ].trans.list ) {
1861        TRIE_LIST_NEW( state );
1862       }
1863       for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1864        if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1865         newstate = TRIE_LIST_ITEM( state, check ).newstate;
1866         break;
1867        }
1868       }
1869       if ( ! newstate ) {
1870        newstate = next_alloc++;
1871        prev_states[newstate] = state;
1872        TRIE_LIST_PUSH( state, charid, newstate );
1873        transcount++;
1874       }
1875       state = newstate;
1876      } else {
1877       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1878      }
1879     }
1880    }
1881    TRIE_HANDLE_WORD(state);
1882
1883   } /* end second pass */
1884
1885   /* next alloc is the NEXT state to be allocated */
1886   trie->statecount = next_alloc;
1887   trie->states = (reg_trie_state *)
1888    PerlMemShared_realloc( trie->states,
1889         next_alloc
1890         * sizeof(reg_trie_state) );
1891
1892   /* and now dump it out before we compress it */
1893   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1894               revcharmap, next_alloc,
1895               depth+1)
1896   );
1897
1898   trie->trans = (reg_trie_trans *)
1899    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1900   {
1901    U32 state;
1902    U32 tp = 0;
1903    U32 zp = 0;
1904
1905
1906    for( state=1 ; state < next_alloc ; state ++ ) {
1907     U32 base=0;
1908
1909     /*
1910     DEBUG_TRIE_COMPILE_MORE_r(
1911      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1912     );
1913     */
1914
1915     if (trie->states[state].trans.list) {
1916      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1917      U16 maxid=minid;
1918      U16 idx;
1919
1920      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1921       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1922       if ( forid < minid ) {
1923        minid=forid;
1924       } else if ( forid > maxid ) {
1925        maxid=forid;
1926       }
1927      }
1928      if ( transcount < tp + maxid - minid + 1) {
1929       transcount *= 2;
1930       trie->trans = (reg_trie_trans *)
1931        PerlMemShared_realloc( trie->trans,
1932              transcount
1933              * sizeof(reg_trie_trans) );
1934       Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1935      }
1936      base = trie->uniquecharcount + tp - minid;
1937      if ( maxid == minid ) {
1938       U32 set = 0;
1939       for ( ; zp < tp ; zp++ ) {
1940        if ( ! trie->trans[ zp ].next ) {
1941         base = trie->uniquecharcount + zp - minid;
1942         trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1943         trie->trans[ zp ].check = state;
1944         set = 1;
1945         break;
1946        }
1947       }
1948       if ( !set ) {
1949        trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1950        trie->trans[ tp ].check = state;
1951        tp++;
1952        zp = tp;
1953       }
1954      } else {
1955       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1956        const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1957        trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1958        trie->trans[ tid ].check = state;
1959       }
1960       tp += ( maxid - minid + 1 );
1961      }
1962      Safefree(trie->states[ state ].trans.list);
1963     }
1964     /*
1965     DEBUG_TRIE_COMPILE_MORE_r(
1966      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1967     );
1968     */
1969     trie->states[ state ].trans.base=base;
1970    }
1971    trie->lasttrans = tp + 1;
1972   }
1973  } else {
1974   /*
1975   Second Pass -- Flat Table Representation.
1976
1977   we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1978   We know that we will need Charcount+1 trans at most to store the data
1979   (one row per char at worst case) So we preallocate both structures
1980   assuming worst case.
1981
1982   We then construct the trie using only the .next slots of the entry
1983   structs.
1984
1985   We use the .check field of the first entry of the node temporarily to
1986   make compression both faster and easier by keeping track of how many non
1987   zero fields are in the node.
1988
1989   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1990   transition.
1991
1992   There are two terms at use here: state as a TRIE_NODEIDX() which is a
1993   number representing the first entry of the node, and state as a
1994   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1995   TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1996   are 2 entries per node, e.g.:
1997
1998    A B       A B
1999   1. 2 4    1. 3 7
2000   2. 0 3    3. 0 5
2001   3. 0 0    5. 0 0
2002   4. 0 0    7. 0 0
2003
2004   The table is internally in the right hand, idx form. However as we also
2005   have to deal with the states array which is indexed by nodenum we have to
2006   use TRIE_NODENUM() to convert.
2007
2008   */
2009   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2010    "%*sCompiling trie using table compiler\n",
2011    (int)depth * 2 + 2, ""));
2012
2013   trie->trans = (reg_trie_trans *)
2014    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2015         * trie->uniquecharcount + 1,
2016         sizeof(reg_trie_trans) );
2017   trie->states = (reg_trie_state *)
2018    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2019         sizeof(reg_trie_state) );
2020   next_alloc = trie->uniquecharcount + 1;
2021
2022
2023   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2024
2025    regnode *noper   = NEXTOPER( cur );
2026    const U8 *uc     = (U8*)STRING( noper );
2027    const U8 *e      = uc + STR_LEN( noper );
2028
2029    U32 state        = 1;         /* required init */
2030
2031    U16 charid       = 0;         /* sanity init */
2032    U32 accept_state = 0;         /* sanity init */
2033    U8 *scan         = (U8*)NULL; /* sanity init */
2034
2035    STRLEN foldlen   = 0;         /* required init */
2036    U32 wordlen      = 0;         /* required init */
2037    STRLEN skiplen   = 0;
2038    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2039
2040    if (OP(noper) == NOTHING) {
2041     regnode *noper_next= regnext(noper);
2042     if (noper_next != tail && OP(noper_next) == flags) {
2043      noper = noper_next;
2044      uc= (U8*)STRING(noper);
2045      e= uc + STR_LEN(noper);
2046     }
2047    }
2048
2049    if ( OP(noper) != NOTHING ) {
2050     for ( ; uc < e ; uc += len ) {
2051
2052      TRIE_READ_CHAR;
2053
2054      if ( uvc < 256 ) {
2055       charid = trie->charmap[ uvc ];
2056      } else {
2057       SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2058       charid = svpp ? (U16)SvIV(*svpp) : 0;
2059      }
2060      if ( charid ) {
2061       charid--;
2062       if ( !trie->trans[ state + charid ].next ) {
2063        trie->trans[ state + charid ].next = next_alloc;
2064        trie->trans[ state ].check++;
2065        prev_states[TRIE_NODENUM(next_alloc)]
2066          = TRIE_NODENUM(state);
2067        next_alloc += trie->uniquecharcount;
2068       }
2069       state = trie->trans[ state + charid ].next;
2070      } else {
2071       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2072      }
2073      /* charid is now 0 if we dont know the char read, or nonzero if we do */
2074     }
2075    }
2076    accept_state = TRIE_NODENUM( state );
2077    TRIE_HANDLE_WORD(accept_state);
2078
2079   } /* end second pass */
2080
2081   /* and now dump it out before we compress it */
2082   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2083               revcharmap,
2084               next_alloc, depth+1));
2085
2086   {
2087   /*
2088   * Inplace compress the table.*
2089
2090   For sparse data sets the table constructed by the trie algorithm will
2091   be mostly 0/FAIL transitions or to put it another way mostly empty.
2092   (Note that leaf nodes will not contain any transitions.)
2093
2094   This algorithm compresses the tables by eliminating most such
2095   transitions, at the cost of a modest bit of extra work during lookup:
2096
2097   - Each states[] entry contains a .base field which indicates the
2098   index in the trans[] array where its transition data is stored.
2099
2100   - If .base is 0 there are no valid transitions from that node.
2101
2102   - If .base is nonzero then charid is added to it to find an entry in
2103   the trans array.
2104
2105   -If trans[states[state].base+charid].check!=state then the
2106   transition is taken to be a 0/Fail transition. Thus if there are fail
2107   transitions at the front of the node then the .base offset will point
2108   somewhere inside the previous nodes data (or maybe even into a node
2109   even earlier), but the .check field determines if the transition is
2110   valid.
2111
2112   XXX - wrong maybe?
2113   The following process inplace converts the table to the compressed
2114   table: We first do not compress the root node 1,and mark all its
2115   .check pointers as 1 and set its .base pointer as 1 as well. This
2116   allows us to do a DFA construction from the compressed table later,
2117   and ensures that any .base pointers we calculate later are greater
2118   than 0.
2119
2120   - We set 'pos' to indicate the first entry of the second node.
2121
2122   - We then iterate over the columns of the node, finding the first and
2123   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2124   and set the .check pointers accordingly, and advance pos
2125   appropriately and repeat for the next node. Note that when we copy
2126   the next pointers we have to convert them from the original
2127   NODEIDX form to NODENUM form as the former is not valid post
2128   compression.
2129
2130   - If a node has no transitions used we mark its base as 0 and do not
2131   advance the pos pointer.
2132
2133   - If a node only has one transition we use a second pointer into the
2134   structure to fill in allocated fail transitions from other states.
2135   This pointer is independent of the main pointer and scans forward
2136   looking for null transitions that are allocated to a state. When it
2137   finds one it writes the single transition into the "hole".  If the
2138   pointer doesnt find one the single transition is appended as normal.
2139
2140   - Once compressed we can Renew/realloc the structures to release the
2141   excess space.
2142
2143   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2144   specifically Fig 3.47 and the associated pseudocode.
2145
2146   demq
2147   */
2148   const U32 laststate = TRIE_NODENUM( next_alloc );
2149   U32 state, charid;
2150   U32 pos = 0, zp=0;
2151   trie->statecount = laststate;
2152
2153   for ( state = 1 ; state < laststate ; state++ ) {
2154    U8 flag = 0;
2155    const U32 stateidx = TRIE_NODEIDX( state );
2156    const U32 o_used = trie->trans[ stateidx ].check;
2157    U32 used = trie->trans[ stateidx ].check;
2158    trie->trans[ stateidx ].check = 0;
2159
2160    for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2161     if ( flag || trie->trans[ stateidx + charid ].next ) {
2162      if ( trie->trans[ stateidx + charid ].next ) {
2163       if (o_used == 1) {
2164        for ( ; zp < pos ; zp++ ) {
2165         if ( ! trie->trans[ zp ].next ) {
2166          break;
2167         }
2168        }
2169        trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2170        trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2171        trie->trans[ zp ].check = state;
2172        if ( ++zp > pos ) pos = zp;
2173        break;
2174       }
2175       used--;
2176      }
2177      if ( !flag ) {
2178       flag = 1;
2179       trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2180      }
2181      trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2182      trie->trans[ pos ].check = state;
2183      pos++;
2184     }
2185    }
2186   }
2187   trie->lasttrans = pos + 1;
2188   trie->states = (reg_trie_state *)
2189    PerlMemShared_realloc( trie->states, laststate
2190         * sizeof(reg_trie_state) );
2191   DEBUG_TRIE_COMPILE_MORE_r(
2192     PerlIO_printf( Perl_debug_log,
2193      "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2194      (int)depth * 2 + 2,"",
2195      (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2196      (IV)next_alloc,
2197      (IV)pos,
2198      ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2199    );
2200
2201   } /* end table compress */
2202  }
2203  DEBUG_TRIE_COMPILE_MORE_r(
2204    PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2205     (int)depth * 2 + 2, "",
2206     (UV)trie->statecount,
2207     (UV)trie->lasttrans)
2208  );
2209  /* resize the trans array to remove unused space */
2210  trie->trans = (reg_trie_trans *)
2211   PerlMemShared_realloc( trie->trans, trie->lasttrans
2212        * sizeof(reg_trie_trans) );
2213
2214  {   /* Modify the program and insert the new TRIE node */
2215   U8 nodetype =(U8)(flags & 0xFF);
2216   char *str=NULL;
2217
2218 #ifdef DEBUGGING
2219   regnode *optimize = NULL;
2220 #ifdef RE_TRACK_PATTERN_OFFSETS
2221
2222   U32 mjd_offset = 0;
2223   U32 mjd_nodelen = 0;
2224 #endif /* RE_TRACK_PATTERN_OFFSETS */
2225 #endif /* DEBUGGING */
2226   /*
2227   This means we convert either the first branch or the first Exact,
2228   depending on whether the thing following (in 'last') is a branch
2229   or not and whether first is the startbranch (i.e. is it a sub part of
2230   the alternation or is it the whole thing.)
2231   Assuming its a sub part we convert the EXACT otherwise we convert
2232   the whole branch sequence, including the first.
2233   */
2234   /* Find the node we are going to overwrite */
2235   if ( first != startbranch || OP( last ) == BRANCH ) {
2236    /* branch sub-chain */
2237    NEXT_OFF( first ) = (U16)(last - first);
2238 #ifdef RE_TRACK_PATTERN_OFFSETS
2239    DEBUG_r({
2240     mjd_offset= Node_Offset((convert));
2241     mjd_nodelen= Node_Length((convert));
2242    });
2243 #endif
2244    /* whole branch chain */
2245   }
2246 #ifdef RE_TRACK_PATTERN_OFFSETS
2247   else {
2248    DEBUG_r({
2249     const  regnode *nop = NEXTOPER( convert );
2250     mjd_offset= Node_Offset((nop));
2251     mjd_nodelen= Node_Length((nop));
2252    });
2253   }
2254   DEBUG_OPTIMISE_r(
2255    PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2256     (int)depth * 2 + 2, "",
2257     (UV)mjd_offset, (UV)mjd_nodelen)
2258   );
2259 #endif
2260   /* But first we check to see if there is a common prefix we can
2261   split out as an EXACT and put in front of the TRIE node.  */
2262   trie->startstate= 1;
2263   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2264    U32 state;
2265    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2266     U32 ofs = 0;
2267     I32 idx = -1;
2268     U32 count = 0;
2269     const U32 base = trie->states[ state ].trans.base;
2270
2271     if ( trie->states[state].wordnum )
2272       count = 1;
2273
2274     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2275      if ( ( base + ofs >= trie->uniquecharcount ) &&
2276       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2277       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2278      {
2279       if ( ++count > 1 ) {
2280        SV **tmp = av_fetch( revcharmap, ofs, 0);
2281        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2282        if ( state == 1 ) break;
2283        if ( count == 2 ) {
2284         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2285         DEBUG_OPTIMISE_r(
2286          PerlIO_printf(Perl_debug_log,
2287           "%*sNew Start State=%"UVuf" Class: [",
2288           (int)depth * 2 + 2, "",
2289           (UV)state));
2290         if (idx >= 0) {
2291          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2292          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2293
2294          TRIE_BITMAP_SET(trie,*ch);
2295          if ( folder )
2296           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2297          DEBUG_OPTIMISE_r(
2298           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2299          );
2300         }
2301        }
2302        TRIE_BITMAP_SET(trie,*ch);
2303        if ( folder )
2304         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2305        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2306       }
2307       idx = ofs;
2308      }
2309     }
2310     if ( count == 1 ) {
2311      SV **tmp = av_fetch( revcharmap, idx, 0);
2312      STRLEN len;
2313      char *ch = SvPV( *tmp, len );
2314      DEBUG_OPTIMISE_r({
2315       SV *sv=sv_newmortal();
2316       PerlIO_printf( Perl_debug_log,
2317        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2318        (int)depth * 2 + 2, "",
2319        (UV)state, (UV)idx,
2320        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2321         PL_colors[0], PL_colors[1],
2322         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2323         PERL_PV_ESCAPE_FIRSTCHAR
2324        )
2325       );
2326      });
2327      if ( state==1 ) {
2328       OP( convert ) = nodetype;
2329       str=STRING(convert);
2330       STR_LEN(convert)=0;
2331      }
2332      STR_LEN(convert) += len;
2333      while (len--)
2334       *str++ = *ch++;
2335     } else {
2336 #ifdef DEBUGGING
2337      if (state>1)
2338       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2339 #endif
2340      break;
2341     }
2342    }
2343    trie->prefixlen = (state-1);
2344    if (str) {
2345     regnode *n = convert+NODE_SZ_STR(convert);
2346     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2347     trie->startstate = state;
2348     trie->minlen -= (state - 1);
2349     trie->maxlen -= (state - 1);
2350 #ifdef DEBUGGING
2351    /* At least the UNICOS C compiler choked on this
2352     * being argument to DEBUG_r(), so let's just have
2353     * it right here. */
2354    if (
2355 #ifdef PERL_EXT_RE_BUILD
2356     1
2357 #else
2358     DEBUG_r_TEST
2359 #endif
2360     ) {
2361     regnode *fix = convert;
2362     U32 word = trie->wordcount;
2363     mjd_nodelen++;
2364     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2365     while( ++fix < n ) {
2366      Set_Node_Offset_Length(fix, 0, 0);
2367     }
2368     while (word--) {
2369      SV ** const tmp = av_fetch( trie_words, word, 0 );
2370      if (tmp) {
2371       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2372        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2373       else
2374        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2375      }
2376     }
2377    }
2378 #endif
2379     if (trie->maxlen) {
2380      convert = n;
2381     } else {
2382      NEXT_OFF(convert) = (U16)(tail - convert);
2383      DEBUG_r(optimize= n);
2384     }
2385    }
2386   }
2387   if (!jumper)
2388    jumper = last;
2389   if ( trie->maxlen ) {
2390    NEXT_OFF( convert ) = (U16)(tail - convert);
2391    ARG_SET( convert, data_slot );
2392    /* Store the offset to the first unabsorbed branch in
2393    jump[0], which is otherwise unused by the jump logic.
2394    We use this when dumping a trie and during optimisation. */
2395    if (trie->jump)
2396     trie->jump[0] = (U16)(nextbranch - convert);
2397
2398    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2399    *   and there is a bitmap
2400    *   and the first "jump target" node we found leaves enough room
2401    * then convert the TRIE node into a TRIEC node, with the bitmap
2402    * embedded inline in the opcode - this is hypothetically faster.
2403    */
2404    if ( !trie->states[trie->startstate].wordnum
2405     && trie->bitmap
2406     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2407    {
2408     OP( convert ) = TRIEC;
2409     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2410     PerlMemShared_free(trie->bitmap);
2411     trie->bitmap= NULL;
2412    } else
2413     OP( convert ) = TRIE;
2414
2415    /* store the type in the flags */
2416    convert->flags = nodetype;
2417    DEBUG_r({
2418    optimize = convert
2419      + NODE_STEP_REGNODE
2420      + regarglen[ OP( convert ) ];
2421    });
2422    /* XXX We really should free up the resource in trie now,
2423     as we won't use them - (which resources?) dmq */
2424   }
2425   /* needed for dumping*/
2426   DEBUG_r(if (optimize) {
2427    regnode *opt = convert;
2428
2429    while ( ++opt < optimize) {
2430     Set_Node_Offset_Length(opt,0,0);
2431    }
2432    /*
2433     Try to clean up some of the debris left after the
2434     optimisation.
2435    */
2436    while( optimize < jumper ) {
2437     mjd_nodelen += Node_Length((optimize));
2438     OP( optimize ) = OPTIMIZED;
2439     Set_Node_Offset_Length(optimize,0,0);
2440     optimize++;
2441    }
2442    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2443   });
2444  } /* end node insert */
2445  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2446
2447  /*  Finish populating the prev field of the wordinfo array.  Walk back
2448  *  from each accept state until we find another accept state, and if
2449  *  so, point the first word's .prev field at the second word. If the
2450  *  second already has a .prev field set, stop now. This will be the
2451  *  case either if we've already processed that word's accept state,
2452  *  or that state had multiple words, and the overspill words were
2453  *  already linked up earlier.
2454  */
2455  {
2456   U16 word;
2457   U32 state;
2458   U16 prev;
2459
2460   for (word=1; word <= trie->wordcount; word++) {
2461    prev = 0;
2462    if (trie->wordinfo[word].prev)
2463     continue;
2464    state = trie->wordinfo[word].accept;
2465    while (state) {
2466     state = prev_states[state];
2467     if (!state)
2468      break;
2469     prev = trie->states[state].wordnum;
2470     if (prev)
2471      break;
2472    }
2473    trie->wordinfo[word].prev = prev;
2474   }
2475   Safefree(prev_states);
2476  }
2477
2478
2479  /* and now dump out the compressed format */
2480  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2481
2482  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2483 #ifdef DEBUGGING
2484  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2485  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2486 #else
2487  SvREFCNT_dec_NN(revcharmap);
2488 #endif
2489  return trie->jump
2490   ? MADE_JUMP_TRIE
2491   : trie->startstate>1
2492    ? MADE_EXACT_TRIE
2493    : MADE_TRIE;
2494 }
2495
2496 STATIC void
2497 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2498 {
2499 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2500
2501    This is basically the Aho-Corasick algorithm. It's from exercise 3.31 and 3.32 in the
2502    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2503    ISBN 0-201-10088-6
2504
2505    We find the fail state for each state in the trie, this state is the longest proper
2506    suffix of the current state's 'word' that is also a proper prefix of another word in our
2507    trie. State 1 represents the word '' and is thus the default fail state. This allows
2508    the DFA not to have to restart after it's tried and failed a word at a given point, it
2509    simply continues as though it had been matching the other word in the first place.
2510    Consider
2511  'abcdgu'=~/abcdefg|cdgu/
2512    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2513    fail, which would bring us to the state representing 'd' in the second word where we would
2514    try 'g' and succeed, proceeding to match 'cdgu'.

        Arguments, as used in this function:
          pRExC_state - compile state; passed to add_data() to reserve one new
                        data slot for the fail-table structure.
          source      - node whose ARG() is the data index of the already built
                        trie (reg_trie_data).
          stclass     - node whose argument is set (ARG_SET) to the new data
                        slot holding the reg_ac_data built here.
          depth       - only used to indent the debugging dump; declared unused
                        when DEBUGGING is not defined.

        What is produced: a reg_ac_data holding the trie's data index, a private
        copy of the trie's state array, a fail[] array with one U32 per state,
        and a refcount of 1.  The trie's own state array is only read here.
2515  */
2516  /* add a fail transition */
2517  const U32 trie_offset = ARG(source);
2518  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
      /* work queue of state numbers; allocated below with one entry per state */
2519  U32 *q;
2520  const U32 ucharcount = trie->uniquecharcount;
2521  const U32 numstates = trie->statecount;
      /* NOTE(review): ubound is not referenced by name in this function's own
       * statements; presumably TRIE_TRANS_STATE expands to code that uses it
       * (and 'trie') -- confirm before renaming or removing either local. */
2522  const U32 ubound = trie->lasttrans + ucharcount;
2523  U32 q_read = 0;
2524  U32 q_write = 0;
2525  U32 charid;
2526  U32 base = trie->states[ 1 ].trans.base;
2527  U32 *fail;
2528  reg_ac_data *aho;
2529  const U32 data_slot = add_data( pRExC_state, 1, "T" );
2530  GET_RE_DEBUG_FLAGS_DECL;
2531
2532  PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2533 #ifndef DEBUGGING
2534  PERL_UNUSED_ARG(depth);
2535 #endif
2536
2537
      /* Publish the new slot on stclass, then fill in the structure stored
       * in that slot. */
2538  ARG_SET( stclass, data_slot );
2539  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2540  RExC_rxi->data->data[ data_slot ] = (void*)aho;
2541  aho->trie=trie_offset;
      /* aho->states is a copy: the wordnum updates made further down land in
       * this copy, not in trie->states */
2542  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2543  Copy( trie->states, aho->states, numstates, reg_trie_state );
2544  Newxz( q, numstates, U32);
2545  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2546  aho->refcount = 1;
2547  fail = aho->fail;
2548  /* initialize fail[0..1] to be 1 so that we always have
2549  a valid final fail state */
2550  fail[ 0 ] = fail[ 1 ] = 1;
2551
      /* Seed the queue with every state reached from state 1 on a single
       * character id; each of those fails straight back to state 1.
       * NOTE(review): the last TRIE_TRANS_STATE argument is 0 here and 1 in
       * the loops below; its exact meaning is defined by the macro, which is
       * outside this chunk. */
2552  for ( charid = 0; charid < ucharcount ; charid++ ) {
2553   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2554   if ( newstate ) {
2555    q[ q_write ] = newstate;
2556    /* set to point at the root */
2557    fail[ q[ q_write++ ] ]=1;
2558   }
2559  }
      /* Drain the queue in FIFO order.  From here on q is indexed modulo
       * numstates for both reads and writes. */
2560  while ( q_read < q_write) {
2561   const U32 cur = q[ q_read++ % numstates ];
2562   base = trie->states[ cur ].trans.base;
2563
2564   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2565    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2566    if (ch_state) {
2567     U32 fail_state = cur;
2568     U32 fail_base;
         /* Walk cur's fail chain until a state is found for which
          * TRIE_TRANS_STATE yields a non-zero target on charid.
          * NOTE(review): termination presumably relies on fail[0..1]
          * being 1 (set above) together with the macro's handling of
          * state 1 -- confirm against the macro definition. */
2569     do {
2570      fail_state = fail[ fail_state ];
2571      fail_base = aho->states[ fail_state ].trans.base;
2572     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2573
2574     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2575     fail[ ch_state ] = fail_state;
         /* a state with no word of its own takes over the word number
          * recorded on its fail state (in the aho->states copy only) */
2576     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2577     {
2578       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2579     }
2580     q[ q_write++ % numstates] = ch_state;
2581    }
2582   }
2583  }
2584  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2585  when we fail in state 1, this allows us to use the
2586  charclass scan to find a valid start char. This is based on the principle
2587  that there's a good chance the string being searched contains lots of stuff
2588  that can't be a start char.
2589  */
2590  fail[ 0 ] = fail[ 1 ] = 0;
      /* Debug dump: a literal "0" stands for index 0, followed by
       * fail[1] .. fail[numstates-1].  q_read is reused as the loop index. */
2591  DEBUG_TRIE_COMPILE_r({
2592   PerlIO_printf(Perl_debug_log,
2593      "%*sStclass Failtable (%"UVuf" states): 0",
2594      (int)(depth * 2), "", (UV)numstates
2595   );
2596   for( q_read=1; q_read<numstates; q_read++ ) {
2597    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2598   }
2599   PerlIO_printf(Perl_debug_log, "\n");
2600  });
      /* the queue was scratch space only; aho and its arrays stay allocated */
2601  Safefree(q);
2602  /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2603 }
2604
2605
2606 /*
2607  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2608  * These need to be revisited when a newer toolchain becomes available.
      *
      * SPARC64_GCC_WORKAROUND is (re)defined to 1 only when both __sparc64__
      * and __GNUC__ are defined and the GCC version is older than 2.96.  Any
      * earlier definition is discarded first by the #undef.  The code that
      * tests this symbol is not part of this section of the file.
2609  */
2610 #if defined(__sparc64__) && defined(__GNUC__)
2611 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2612 #       undef  SPARC64_GCC_WORKAROUND
2613 #       define SPARC64_GCC_WORKAROUND 1
2614 #   endif
2615 #endif
2616
/* DEBUG_PEEP(str, scan, depth): trace one regnode while optimising.
 *
 * The whole body is wrapped in DEBUG_OPTIMISE_r(), so it is subject to
 * whatever gating that macro applies.  When 'scan' is non-NULL it prints,
 * indented by 2*depth columns: the tag 'str', the node number of 'scan',
 * the description produced by regprop(), and in parentheses the node number
 * of regnext(scan) (0 when there is no next node).
 *
 *   str   - must be a string literal; it is pasted into the format string.
 *   scan  - node to describe; evaluated more than once, so pass a plain
 *           pointer expression without side effects.
 *   depth - nesting depth; parenthesized in the expansion so that an
 *           expression such as depth+1 is scaled as a whole.
 *
 * RExC_rx must be usable at the point of use.  The expansion deliberately
 * ends in its own ';' because at least one caller invokes the macro inside
 * braces without a trailing semicolon. */
#define DEBUG_PEEP(str,scan,depth) \
    DEBUG_OPTIMISE_r({if (scan){ \
        SV * const mysv=sv_newmortal(); \
        regnode *Next = regnext(scan); \
        regprop(RExC_rx, mysv, (scan)); \
        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
            (int)(depth)*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
            Next ? (REG_NODE_NUM(Next)) : 0 ); \
    }});
2626
2627
2628 /* The below joins as many adjacent EXACTish nodes as possible into a single
2629  * one.  The regop may be changed if the node(s) contain certain sequences that
2630  * require special handling.  The joining is only done if:
2631  * 1) there is room in the current conglomerated node to entirely contain the
2632  *    next one.
2633  * 2) they are the exact same node type
2634  *
2635  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2636  * these get optimized out
2637  *
2638  * If a node is to match under /i (folded), the number of characters it matches
2639  * can be different than its character length if it contains a multi-character
2640  * fold.  *min_subtract is set to the total delta of the input nodes.
2641  *
2642  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2643  * and contains LATIN SMALL LETTER SHARP S
2644  *
2645  * This is as good a place as any to discuss the design of handling these
2646  * multi-character fold sequences.  It's been wrong in Perl for a very long
2647  * time.  There are three code points in Unicode whose multi-character folds
2648  * were long ago discovered to mess things up.  The previous designs for
2649  * dealing with these involved assigning a special node for them.  This
2650  * approach doesn't work, as evidenced by this example:
2651  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2652  * Both these fold to "sss", but if the pattern is parsed to create a node that
2653  * would match just the \xDF, it won't be able to handle the case where a
2654  * successful match would have to cross the node's boundary.  The new approach
2655  * that hopefully generally solves the problem generates an EXACTFU_SS node
2656  * that is "sss".
2657  *
2658  * It turns out that there are problems with all multi-character folds, and not
2659  * just these three.  Now the code is general, for all such cases, but the
2660  * three still have some special handling.  The approach taken is:
2661  * 1)   This routine examines each EXACTFish node that could contain multi-
2662  *      character fold sequences.  It returns in *min_subtract how much to
2663  *      subtract from the actual length of the string to get a real minimum
2664  *      match length; it is 0 if there are no multi-char folds.  This delta is
2665  *      used by the caller to adjust the min length of the match, and the delta
2666  *      between min and max, so that the optimizer doesn't reject these
2667  *      possibilities based on size constraints.
2668  * 2)   Certain of these sequences require special handling by the trie code,
2669  *      so, if found, this code changes the joined node type to special ops:
2670  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2671  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2672  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2673  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2674  *      there is a possible fold length change.  That means that a regular
2675  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2676  *      with length changes, and so can be processed faster.  regexec.c takes
2677  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2678  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2679  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2680  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2681  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2682  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2683  *      possibilities for the non-UTF8 patterns are quite simple, except for
2684  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2685  *      members of a fold-pair, and arrays are set up for all of them so that
2686  *      the other member of the pair can be found quickly.  Code elsewhere in
2687  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2688  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2689  *      described in the next item.
2690  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2691  *      'ss' or not is not knowable at compile time.  It will match iff the
2692  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2693  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2694  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2695  *      described in item 3).  An assumption that the optimizer part of
2696  *      regexec.c (probably unwittingly) makes is that a character in the
2697  *      pattern corresponds to at most a single character in the target string.
2698  *      (And I do mean character, and not byte here, unlike other parts of the
2699  *      documentation that have never been updated to account for multibyte
2700  *      Unicode.)  This assumption is wrong only in this case, as all other
2701  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2702  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2703  *      reluctant to try to change this assumption, so instead the code punts.
2704  *      This routine examines EXACTF nodes for the sharp s, and returns a
2705  *      boolean indicating whether or not the node is an EXACTF node that
2706  *      contains a sharp s.  When it is true, the caller sets a flag that later
2707  *      causes the optimizer in this file to not set values for the floating
2708  *      and fixed string lengths, and thus avoids the optimizer code in
2709  *      regexec.c that makes the invalid assumption.  Thus, there is no
2710  *      optimization based on string lengths for EXACTF nodes that contain the
2711  *      sharp s.  This only happens for /id rules (which means the pattern
2712  *      isn't in UTF-8).
2713  */
2714
/* JOIN_EXACT(scan, min_subtract, has_exactf_sharp_s, flags):
 * call join_exact() on 'scan' only when its regkind is EXACT; otherwise do
 * nothing.  The result of join_exact() is discarded.
 *
 * The macro picks up 'pRExC_state' and 'depth' from the calling scope, and
 * always passes NULL for join_exact()'s 'val' argument and depth+1 for its
 * depth.  'scan' is evaluated twice.
 *
 * The expansion is wrapped in do { } while (0) so that it behaves as one
 * statement: a following 'else' can no longer attach to the macro's
 * internal 'if'.  Use it as a statement terminated by ';'. */
#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
    do { \
        if (PL_regkind[OP(scan)] == EXACT) \
            join_exact(pRExC_state,(scan),(min_subtract),(has_exactf_sharp_s), (flags),NULL,depth+1); \
    } while (0)
2718
2719 STATIC U32
2720 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2721  /* Merge several consecutive EXACTish nodes into one.
       *
       * scan               - node that absorbs its successors; its NEXT_OFF,
       *                      STR_LEN and string are updated in place, and its
       *                      op may be changed to EXACTFU_SS or
       *                      EXACTFU_TRICKYFOLD by the fold analysis below.
       * min_subtract       - out: reset to 0, then increased for every
       *                      multi-character fold found in the final string.
       * has_exactf_sharp_s - out: reset to FALSE, then set TRUE when a
       *                      LATIN SMALL LETTER SHARP S is seen in an EXACTF
       *                      node.
       * flags, val         - only used when EXPERIMENTAL_INPLACESCAN is
       *                      defined; otherwise declared unused.
       * depth              - debug output indentation only; declared unused
       *                      when DEBUGGING is not defined.
       *
       * Returns 'stopnow', which is only ever set to non-zero inside the
       * EXPERIMENTAL_INPLACESCAN block; without that define the result is 0.
       */
2722  regnode *n = regnext(scan);
      /* stays 1 while string merging is still allowed; see the loop below */
2723  U32 stringok = 1;
      /* node slot directly after the string data consumed so far */
2724  regnode *next = scan + NODE_SZ_STR(scan);
2725  U32 merged = 0;
2726  U32 stopnow = 0;
2727 #ifdef DEBUGGING
      /* last node slot that the clean-up loop at the end may overwrite */
2728  regnode *stop = scan;
2729  GET_RE_DEBUG_FLAGS_DECL;
2730 #else
2731  PERL_UNUSED_ARG(depth);
2732 #endif
2733
2734  PERL_ARGS_ASSERT_JOIN_EXACT;
2735 #ifndef EXPERIMENTAL_INPLACESCAN
2736  PERL_UNUSED_ARG(flags);
2737  PERL_UNUSED_ARG(val);
2738 #endif
2739  DEBUG_PEEP("join",scan,depth);
2740
2741  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2742  * EXACT ones that are mergeable to the current one. */
2743  while (n
2744   && (PL_regkind[OP(n)] == NOTHING
2745    || (stringok && OP(n) == OP(scan)))
2746   && NEXT_OFF(n)
2747   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2748  {
2749
       /* A TAIL, or a node lying beyond the slot directly after what has
        * been consumed so far, ends string merging; NOTHING-kind nodes can
        * still be skipped afterwards. */
2750   if (OP(n) == TAIL || n > next)
2751    stringok = 0;
2752   if (PL_regkind[OP(n)] == NOTHING) {
2753    DEBUG_PEEP("skip:",n,depth);
2754    NEXT_OFF(scan) += NEXT_OFF(n);
2755    next = n + NODE_STEP_REGNODE;
2756 #ifdef DEBUGGING
2757    if (stringok)
2758     stop = n;
2759 #endif
2760    n = regnext(n);
2761   }
2762   else if (stringok) {
2763    const unsigned int oldl = STR_LEN(scan);
        /* fetched before n's contents are overwritten by the Move below */
2764    regnode * const nnext = regnext(n);
2765
2766    /* XXX I (khw) kind of doubt that this works on platforms where
2767    * U8_MAX is above 255 because of lots of other assumptions */
2768    /* Don't join if the sum can't fit into a single node */
2769    if (oldl + STR_LEN(n) > U8_MAX)
2770     break;
2771
2772    DEBUG_PEEP("merg",n,depth);
2773    merged++;
2774
2775    NEXT_OFF(scan) += NEXT_OFF(n);
2776    STR_LEN(scan) += STR_LEN(n);
2777    next = n + NODE_SZ_STR(n);
2778    /* Now we can overwrite *n : */
2779    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2780 #ifdef DEBUGGING
2781    stop = next - 1;
2782 #endif
2783    n = nnext;
2784    if (stopnow) break;
2785   }
2786
2787 #ifdef EXPERIMENTAL_INPLACESCAN
2788   if (flags && !NEXT_OFF(n)) {
2789    DEBUG_PEEP("atch", val, depth);
2790    if (reg_off_by_arg[OP(n)]) {
2791     ARG_SET(n, val - n);
2792    }
2793    else {
2794     NEXT_OFF(n) = val - n;
2795    }
2796    stopnow = 1;
2797   }
2798 #endif
2799  }
2800
      /* the out-parameters are reset unconditionally, whether or not anything
       * was merged above */
2801  *min_subtract = 0;
2802  *has_exactf_sharp_s = FALSE;
2803
2804  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2805  * can now analyze for sequences of problematic code points.  (Prior to
2806  * this final joining, sequences could have been split over boundaries, and
2807  * hence missed).  The sequences only happen in folding, hence for any
2808  * non-EXACT EXACTish node */
2809  if (OP(scan) != EXACT) {
2810   const U8 * const s0 = (U8*) STRING(scan);
2811   const U8 * s = s0;
2812   const U8 * const s_end = s0 + STR_LEN(scan);
2813
2814   /* One pass is made over the node's string looking for all the
2815   * possibilities.  To avoid some tests in the loop, there are two main
2816   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2817   * non-UTF-8 */
2818   if (UTF) {
2819
2820    /* Examine the string for a multi-character fold sequence.  UTF-8
2821    * patterns have all characters pre-folded by the time this code is
2822    * executed */
2823    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2824          length sequence we are looking for is 2 */
2825    {
         /* number of characters in the fold found at s */
2826     int count = 0;
         /* byte length of the multi-char fold starting at s, or 0 */
2827     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2828     if (! len) {    /* Not a multi-char fold: get next char */
2829      s += UTF8SKIP(s);
2830      continue;
2831     }
2832
2833     /* Nodes with 'ss' require special handling, except for EXACTFL
2834     * and EXACTFA for which there is no multi-char fold to this */
2835     if (len == 2 && *s == 's' && *(s+1) == 's'
2836      && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2837     {
2838      count = 2;
2839      OP(scan) = EXACTFU_SS;
2840      s += 2;
2841     }
2842     else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2843       && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2844          COMBINING_DIAERESIS_UTF8
2845          COMBINING_ACUTE_ACCENT_UTF8,
2846         6)
2847        || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2848           COMBINING_DIAERESIS_UTF8
2849           COMBINING_ACUTE_ACCENT_UTF8,
2850          6)))
2851     {
2852      count = 3;
2853
2854      /* These two folds require special handling by trie's, so
2855      * change the node type to indicate this.  If EXACTFA and
2856      * EXACTFL were ever to be handled by trie's, this would
2857      * have to be changed.  If this node has already been
2858      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2859      * (khw) think it doesn't matter in regexec.c for UTF
2860      * patterns, but no need to change it */
2861      if (OP(scan) == EXACTFU) {
2862       OP(scan) = EXACTFU_TRICKYFOLD;
2863      }
2864      s += 6;
2865     }
2866     else { /* Here is a generic multi-char fold. */
2867      const U8* multi_end  = s + len;
2868
2869      /* Count how many characters in it.  In the case of /l and
2870      * /aa, no folds which contain ASCII code points are
2871      * allowed, so check for those, and skip if found.  (In
2872      * EXACTFL, no folds are allowed to any Latin1 code point,
2873      * not just ASCII.  But there aren't any of these
2874      * currently, nor ever likely, so don't take the time to
2875      * test for them.  The code that generates the
2876      * is_MULTI_foo() macros croaks should one actually get put
2877      * into Unicode .) */
2878      if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2879       count = utf8_length(s, multi_end);
2880       s = multi_end;
2881      }
2882      else {
2883       while (s < multi_end) {
2884        if (isASCII(*s)) {
              /* step past the ASCII byte and leave *min_subtract
               * untouched for this candidate fold */
2885         s++;
2886         goto next_iteration;
2887        }
2888        else {
2889         s += UTF8SKIP(s);
2890        }
2891        count++;
2892       }
2893      }
2894     }
2895
2896     /* The delta is how long the sequence is minus 1 (1 is how long
2897     * the character that folds to the sequence is) */
2898     *min_subtract += count - 1;
2899    next_iteration: ;
2900    }
2901   }
2902   else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2903
2904    /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2905    * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2906    * nodes can't have multi-char folds to this range (and there are
2907    * no existing ones in the upper latin1 range).  In the EXACTF
2908    * case we look also for the sharp s, which can be in the final
2909    * position.  Otherwise we can stop looking 1 byte earlier because
2910    * have to find at least two characters for a multi-fold */
2911    const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2912
2913    /* The below is perhaps overboard, but this allows us to save a
2914    * test each time through the loop at the expense of a mask.  This
2915    * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2916    * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2917    * are 64.  This uses an exclusive 'or' to find that bit and then
2918    * inverts it to form a mask, with just a single 0, in the bit
2919    * position where 'S' and 's' differ. */
2920    const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2921    const U8 s_masked = 's' & S_or_s_mask;
2922
2923    while (s < upper) {
2924     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2925     if (! len) {    /* Not a multi-char fold. */
2926      if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2927      {
2928       *has_exactf_sharp_s = TRUE;
2929      }
2930      s++;
2931      continue;
2932     }
2933
2934     if (len == 2
2935      && ((*s & S_or_s_mask) == s_masked)
2936      && ((*(s+1) & S_or_s_mask) == s_masked))
2937     {
2938
2939      /* EXACTF nodes need to know that the minimum length
2940      * changed so that a sharp s in the string can match this
2941      * ss in the pattern, but they remain EXACTF nodes, as they
2942      * won't match this unless the target string is in UTF-8,
2943      * which we don't know until runtime */
2944      if (OP(scan) != EXACTF) {
2945       OP(scan) = EXACTFU_SS;
2946      }
2947     }
2948
         /* in this non-UTF-8 branch the delta is taken from len directly */
2949     *min_subtract += len - 1;
2950     s += len;
2951    }
2952   }
2953  }
2954
2955 #ifdef DEBUGGING
2956  /* Allow dumping but overwriting the collection of skipped
2957  * ops and/or strings with fake optimized ops */
2958  n = scan + NODE_SZ_STR(scan);
2959  while (n <= stop) {
2960   OP(n) = OPTIMIZED;
2961   FLAGS(n) = 0;
2962   NEXT_OFF(n) = 0;
2963   n++;
2964  }
2965 #endif
2966  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2967  return stopnow;
2968 }
2969
2970 /* REx optimizer.  Converts nodes into quicker variants "in place".
2971    Finds fixed substrings.  */
2972
2973 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2974    to the position after last scanned or to NULL. */
2975
/* INIT_AND_WITHP: allocate the caller's 'and_withp' buffer.
 *
 * Asserts that the local 'and_withp' is still NULL, allocates room for one
 * struct regnode_charclass_class with Newx(), and registers the pointer
 * with SAVEFREEPV().  A pointer variable named 'and_withp' must be in scope
 * at the point of use.
 *
 * The three steps are wrapped in do { } while (0) so the macro expands to
 * exactly one statement; without the wrapper only the assert() would be
 * governed by an unbraced 'if'.  Use it as 'INIT_AND_WITHP;'. */
#define INIT_AND_WITHP \
    do { \
        assert(!and_withp); \
        Newx(and_withp,1,struct regnode_charclass_class); \
        SAVEFREEPV(and_withp); \
    } while (0)
2980
2981 /* This is a chain of data about sub patterns we are processing that
2982    need to be handled separately/specially in study_chunk. It's so
2983    we can simulate recursion without losing state.  Frames are linked
        to one another through the 'prev' member.  */
2984 struct scan_frame;
2985 typedef struct scan_frame {
2986  regnode *last;  /* last node to process in this frame */
2987  regnode *next;  /* next node to process when last is reached */
2988  struct scan_frame *prev; /* previous frame in the chain */
2989  I32 stop; /* what stopparen do we use */
2990 } scan_frame;
2991
2992
/* SCAN_COMMIT(s, data, m): shorthand for scan_commit() that supplies the
 * caller's local variable 'is_inf' as the fourth argument.  A variable of
 * that name must therefore be in scope wherever the macro is used. */
2993 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2994
2995 STATIC I32
2996 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2997       I32 *minlenp, I32 *deltap,
2998       regnode *last,
2999       scan_data_t *data,
3000       I32 stopparen,
3001       U8* recursed,
3002       struct regnode_charclass_class *and_withp,
3003       U32 flags, U32 depth)
3004       /* scanp: Start here (read-write). */
3005       /* deltap: Write maxlen-minlen here. */
3006       /* last: Stop before this one. */
3007       /* data: string data about the pattern */
3008       /* stopparen: treat close N as END */
3009       /* recursed: which subroutines have we recursed into */
3010       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3011 {
3012  dVAR;
3013  I32 min = 0;    /* There must be at least this number of characters to match */
3014  I32 pars = 0, code;
3015  regnode *scan = *scanp, *next;
3016  I32 delta = 0;
3017  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3018  int is_inf_internal = 0;  /* The studied chunk is infinite */
3019  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3020  scan_data_t data_fake;
3021  SV *re_trie_maxbuff = NULL;
3022  regnode *first_non_open = scan;
3023  I32 stopmin = I32_MAX;
3024  scan_frame *frame = NULL;
3025  GET_RE_DEBUG_FLAGS_DECL;
3026
3027  PERL_ARGS_ASSERT_STUDY_CHUNK;
3028
3029 #ifdef DEBUGGING
3030  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3031 #endif
3032
3033  if ( depth == 0 ) {
3034   while (first_non_open && OP(first_non_open) == OPEN)
3035    first_non_open=regnext(first_non_open);
3036  }
3037
3038
3039   fake_study_recurse:
3040  while ( scan && OP(scan) != END && scan < last ){
3041   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3042         node length to get a real minimum (because
3043         the folded version may be shorter) */
3044   bool has_exactf_sharp_s = FALSE;
3045   /* Peephole optimizer: */
3046   DEBUG_STUDYDATA("Peep:", data,depth);
3047   DEBUG_PEEP("Peep",scan,depth);
3048
3049   /* Its not clear to khw or hv why this is done here, and not in the
3050   * clauses that deal with EXACT nodes.  khw's guess is that it's
3051   * because of a previous design */
3052   JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3053
3054   /* Follow the next-chain of the current node and optimize
3055   away all the NOTHINGs from it.  */
3056   if (OP(scan) != CURLYX) {
3057    const int max = (reg_off_by_arg[OP(scan)]
3058      ? I32_MAX
3059      /* I32 may be smaller than U16 on CRAYs! */
3060      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3061    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3062    int noff;
3063    regnode *n = scan;
3064
3065    /* Skip NOTHING and LONGJMP. */
3066    while ((n = regnext(n))
3067     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3068      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3069     && off + noff < max)
3070     off += noff;
3071    if (reg_off_by_arg[OP(scan)])
3072     ARG(scan) = off;
3073    else
3074     NEXT_OFF(scan) = off;
3075   }
3076
3077
3078
3079   /* The principal pseudo-switch.  Cannot be a switch, since we
3080   look into several different things.  */
3081   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3082     || OP(scan) == IFTHEN) {
3083    next = regnext(scan);
3084    code = OP(scan);
3085    /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3086
3087    if (OP(next) == code || code == IFTHEN) {
3088     /* NOTE - There is similar code to this block below for handling
3089     TRIE nodes on a re-study.  If you change stuff here check there
3090     too. */
3091     I32 max1 = 0, min1 = I32_MAX, num = 0;
3092     struct regnode_charclass_class accum;
3093     regnode * const startbranch=scan;
3094
3095     if (flags & SCF_DO_SUBSTR)
3096      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3097     if (flags & SCF_DO_STCLASS)
3098      cl_init_zero(pRExC_state, &accum);
3099
3100     while (OP(scan) == code) {
3101      I32 deltanext, minnext, f = 0, fake;
3102      struct regnode_charclass_class this_class;
3103
3104      num++;
3105      data_fake.flags = 0;
3106      if (data) {
3107       data_fake.whilem_c = data->whilem_c;
3108       data_fake.last_closep = data->last_closep;
3109      }
3110      else
3111       data_fake.last_closep = &fake;
3112
3113      data_fake.pos_delta = delta;
3114      next = regnext(scan);
3115      scan = NEXTOPER(scan);
3116      if (code != BRANCH)
3117       scan = NEXTOPER(scan);
3118      if (flags & SCF_DO_STCLASS) {
3119       cl_init(pRExC_state, &this_class);
3120       data_fake.start_class = &this_class;
3121       f = SCF_DO_STCLASS_AND;
3122      }
3123      if (flags & SCF_WHILEM_VISITED_POS)
3124       f |= SCF_WHILEM_VISITED_POS;
3125
3126      /* we suppose the run is continuous, last=next...*/
3127      minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3128           next, &data_fake,
3129           stopparen, recursed, NULL, f,depth+1);
3130      if (min1 > minnext)
3131       min1 = minnext;
3132      if (deltanext == I32_MAX) {
3133       is_inf = is_inf_internal = 1;
3134       max1 = I32_MAX;
3135      } else if (max1 < minnext + deltanext)
3136       max1 = minnext + deltanext;
3137      scan = next;
3138      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3139       pars++;
3140      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3141       if ( stopmin > minnext)
3142        stopmin = min + min1;
3143       flags &= ~SCF_DO_SUBSTR;
3144       if (data)
3145        data->flags |= SCF_SEEN_ACCEPT;
3146      }
3147      if (data) {
3148       if (data_fake.flags & SF_HAS_EVAL)
3149        data->flags |= SF_HAS_EVAL;
3150       data->whilem_c = data_fake.whilem_c;
3151      }
3152      if (flags & SCF_DO_STCLASS)
3153       cl_or(pRExC_state, &accum, &this_class);
3154     }
3155     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3156      min1 = 0;
3157     if (flags & SCF_DO_SUBSTR) {
3158      data->pos_min += min1;
3159      if (data->pos_delta >= I32_MAX - (max1 - min1))
3160       data->pos_delta = I32_MAX;
3161      else
3162       data->pos_delta += max1 - min1;
3163      if (max1 != min1 || is_inf)
3164       data->longest = &(data->longest_float);
3165     }
3166     min += min1;
3167     if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
3168      delta = I32_MAX;
3169     else
3170      delta += max1 - min1;
3171     if (flags & SCF_DO_STCLASS_OR) {
3172      cl_or(pRExC_state, data->start_class, &accum);
3173      if (min1) {
3174       cl_and(data->start_class, and_withp);
3175       flags &= ~SCF_DO_STCLASS;
3176      }
3177     }
3178     else if (flags & SCF_DO_STCLASS_AND) {
3179      if (min1) {
3180       cl_and(data->start_class, &accum);
3181       flags &= ~SCF_DO_STCLASS;
3182      }
3183      else {
3184       /* Switch to OR mode: cache the old value of
3185       * data->start_class */
3186       INIT_AND_WITHP;
3187       StructCopy(data->start_class, and_withp,
3188         struct regnode_charclass_class);
3189       flags &= ~SCF_DO_STCLASS_AND;
3190       StructCopy(&accum, data->start_class,
3191         struct regnode_charclass_class);
3192       flags |= SCF_DO_STCLASS_OR;
3193       SET_SSC_EOS(data->start_class);
3194      }
3195     }
3196
3197     if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3198     /* demq.
3199
3200     Assuming this was/is a branch we are dealing with: 'scan' now
3201     points at the item that follows the branch sequence, whatever
3202     it is. We now start at the beginning of the sequence and look
3203     for subsequences of
3204
3205     BRANCH->EXACT=>x1
3206     BRANCH->EXACT=>x2
3207     tail
3208
3209     which would be constructed from a pattern like /A|LIST|OF|WORDS/
3210
3211     If we can find such a subsequence we need to turn the first
3212     element into a trie and then add the subsequent branch exact
3213     strings to the trie.
3214
3215     We have two cases
3216
3217      1. patterns where the whole set of branches can be converted.
3218
3219      2. patterns where only a subset can be converted.
3220
3221     In case 1 we can replace the whole set with a single regop
3222     for the trie. In case 2 we need to keep the start and end
3223     branches so
3224
3225      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3226      becomes BRANCH TRIE; BRANCH X;
3227
3228     There is an additional case, that being where there is a
3229     common prefix, which gets split out into an EXACT like node
3230     preceding the TRIE node.
3231
3232     If x(1..n)==tail then we can do a simple trie, if not we make
3233     a "jump" trie, such that when we match the appropriate word
3234     we "jump" to the appropriate tail node. Essentially we turn
3235     a nested if into a case structure of sorts.
3236
3237     */
3238
3239      int made=0;
3240      if (!re_trie_maxbuff) {
3241       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3242       if (!SvIOK(re_trie_maxbuff))
3243        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3244      }
3245      if ( SvIV(re_trie_maxbuff)>=0  ) {
3246       regnode *cur;
3247       regnode *first = (regnode *)NULL;
3248       regnode *last = (regnode *)NULL;
3249       regnode *tail = scan;
3250       U8 trietype = 0;
3251       U32 count=0;
3252
3253 #ifdef DEBUGGING
3254       SV * const mysv = sv_newmortal();       /* for dumping */
3255 #endif
3256       /* var tail is used because there may be a TAIL
3257       regop in the way. Ie, the exacts will point to the
3258       thing following the TAIL, but the last branch will
3259       point at the TAIL. So we advance tail. If we
3260       have nested (?:) we may have to move through several
3261       tails.
3262       */
3263
3264       while ( OP( tail ) == TAIL ) {
3265        /* this is the TAIL generated by (?:) */
3266        tail = regnext( tail );
3267       }
3268
3269
3270       DEBUG_TRIE_COMPILE_r({
3271        regprop(RExC_rx, mysv, tail );
3272        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3273         (int)depth * 2 + 2, "",
3274         "Looking for TRIE'able sequences. Tail node is: ",
3275         SvPV_nolen_const( mysv )
3276        );
3277       });
3278
3279       /*
3280
3281        Step through the branches
3282         cur represents each branch,
3283         noper is the first thing to be matched as part of that branch
3284         noper_next is the regnext() of that node.
3285
3286        We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3287        via a "jump trie" but we also support building with NOJUMPTRIE,
3288        which restricts the trie logic to structures like /FOO|BAR/.
3289
3290        If noper is a trieable nodetype then the branch is a possible optimization
3291        target. If we are building under NOJUMPTRIE then we require that noper_next
3292        is the same as scan (our current position in the regex program).
3293
3294        Once we have two or more consecutive such branches we can create a
3295        trie of the EXACT's contents and stitch it in place into the program.
3296
3297        If the sequence represents all of the branches in the alternation we
3298        replace the entire thing with a single TRIE node.
3299
3300        Otherwise when it is a subsequence we need to stitch it in place and
3301        replace only the relevant branches. This means the first branch has
3302        to remain as it is used by the alternation logic, and its next pointer,
3303        and needs to be repointed at the item on the branch chain following
3304        the last branch we have optimized away.
3305
3306        This could be either a BRANCH, in which case the subsequence is internal,
3307        or it could be the item following the branch sequence in which case the
3308        subsequence is at the end (which does not necessarily mean the first node
3309        is the start of the alternation).
3310
3311        TRIE_TYPE(X) is a define which maps the optype to a trietype.
3312
3313         optype          |  trietype
3314         ----------------+-----------
3315         NOTHING         | NOTHING
3316         EXACT           | EXACT
3317         EXACTFU         | EXACTFU
3318         EXACTFU_SS      | EXACTFU
3319         EXACTFU_TRICKYFOLD | EXACTFU
3320         EXACTFA         | 0
3321
3322
3323       */
3324 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3325      ( EXACT == (X) )   ? EXACT :        \
3326      ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3327      0 )
3328
3329       /* dont use tail as the end marker for this traverse */
3330       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3331        regnode * const noper = NEXTOPER( cur );
3332        U8 noper_type = OP( noper );
3333        U8 noper_trietype = TRIE_TYPE( noper_type );
3334 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3335        regnode * const noper_next = regnext( noper );
3336        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3337        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3338 #endif
3339
3340        DEBUG_TRIE_COMPILE_r({
3341         regprop(RExC_rx, mysv, cur);
3342         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3343         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3344
3345         regprop(RExC_rx, mysv, noper);
3346         PerlIO_printf( Perl_debug_log, " -> %s",
3347          SvPV_nolen_const(mysv));
3348
3349         if ( noper_next ) {
3350         regprop(RExC_rx, mysv, noper_next );
3351         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3352          SvPV_nolen_const(mysv));
3353         }
3354         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3355         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3356         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3357         );
3358        });
3359
3360        /* Is noper a trieable nodetype that can be merged with the
3361        * current trie (if there is one)? */
3362        if ( noper_trietype
3363         &&
3364         (
3365           ( noper_trietype == NOTHING)
3366           || ( trietype == NOTHING )
3367           || ( trietype == noper_trietype )
3368         )
3369 #ifdef NOJUMPTRIE
3370         && noper_next == tail
3371 #endif
3372         && count < U16_MAX)
3373        {
3374         /* Handle mergable triable node
3375         * Either we are the first node in a new trieable sequence,
3376         * in which case we do some bookkeeping, otherwise we update
3377         * the end pointer. */
3378         if ( !first ) {
3379          first = cur;
3380          if ( noper_trietype == NOTHING ) {
3381 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3382           regnode * const noper_next = regnext( noper );
3383           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3384           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3385 #endif
3386
3387           if ( noper_next_trietype ) {
3388            trietype = noper_next_trietype;
3389           } else if (noper_next_type)  {
3390            /* a NOTHING regop is 1 regop wide. We need at least two
3391            * for a trie so we can't merge this in */
3392            first = NULL;
3393           }
3394          } else {
3395           trietype = noper_trietype;
3396          }
3397         } else {
3398          if ( trietype == NOTHING )
3399           trietype = noper_trietype;
3400          last = cur;
3401         }
3402         if (first)
3403          count++;
3404        } /* end handle mergable triable node */
3405        else {
3406         /* handle unmergable node -
3407         * noper may either be a triable node which can not be tried
3408         * together with the current trie, or a non triable node */
3409         if ( last ) {
3410          /* If last is set and trietype is not NOTHING then we have found
3411          * at least two triable branch sequences in a row of a similar
3412          * trietype so we can turn them into a trie. If/when we
3413          * allow NOTHING to start a trie sequence this condition will be
3414          * required, and it isn't expensive so we leave it in for now. */
3415          if ( trietype && trietype != NOTHING )
3416           make_trie( pRExC_state,
3417             startbranch, first, cur, tail, count,
3418             trietype, depth+1 );
3419          last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3420         }
3421         if ( noper_trietype
3422 #ifdef NOJUMPTRIE
3423          && noper_next == tail
3424 #endif
3425         ){
3426          /* noper is triable, so we can start a new trie sequence */
3427          count = 1;
3428          first = cur;
3429          trietype = noper_trietype;
3430         } else if (first) {
3431          /* if we already saw a first but the current node is not triable then we have
3432          * to reset the first information. */
3433          count = 0;
3434          first = NULL;
3435          trietype = 0;
3436         }
3437        } /* end handle unmergable node */
3438       } /* loop over branches */
3439       DEBUG_TRIE_COMPILE_r({
3440        regprop(RExC_rx, mysv, cur);
3441        PerlIO_printf( Perl_debug_log,
3442        "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3443        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3444
3445       });
3446       if ( last && trietype ) {
3447        if ( trietype != NOTHING ) {
3448         /* the last branch of the sequence was part of a trie,
3449         * so we have to construct it here outside of the loop
3450         */
3451         made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3452 #ifdef TRIE_STUDY_OPT
3453         if ( ((made == MADE_EXACT_TRIE &&
3454          startbranch == first)
3455          || ( first_non_open == first )) &&
3456          depth==0 ) {
3457          flags |= SCF_TRIE_RESTUDY;
3458          if ( startbranch == first
3459           && scan == tail )
3460          {
3461           RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3462          }
3463         }
3464 #endif
3465        } else {
3466         /* at this point we know whatever we have is a NOTHING sequence/branch
3467         * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3468         */
3469         if ( startbranch == first ) {
3470          regnode *opt;
3471          /* the entire thing is a NOTHING sequence, something like this:
3472          * (?:|) So we can turn it into a plain NOTHING op. */
3473          DEBUG_TRIE_COMPILE_r({
3474           regprop(RExC_rx, mysv, cur);
3475           PerlIO_printf( Perl_debug_log,
3476           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3477           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3478
3479          });
3480          OP(startbranch)= NOTHING;
3481          NEXT_OFF(startbranch)= tail - startbranch;
3482          for ( opt= startbranch + 1; opt < tail ; opt++ )
3483           OP(opt)= OPTIMIZED;
3484         }
3485        }
3486       } /* end if ( last) */
3487      } /* TRIE_MAXBUF is non zero */
3488
3489     } /* do trie */
3490
3491    }
3492    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3493     scan = NEXTOPER(NEXTOPER(scan));
3494    } else   /* single branch is optimized. */
3495     scan = NEXTOPER(scan);
3496    continue;
3497   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3498    scan_frame *newframe = NULL;
3499    I32 paren;
3500    regnode *start;
3501    regnode *end;
3502
3503    if (OP(scan) != SUSPEND) {
3504    /* set the pointer */
3505     if (OP(scan) == GOSUB) {
3506      paren = ARG(scan);
3507      RExC_recurse[ARG2L(scan)] = scan;
3508      start = RExC_open_parens[paren-1];
3509      end   = RExC_close_parens[paren-1];
3510     } else {
3511      paren = 0;
3512      start = RExC_rxi->program + 1;
3513      end   = RExC_opend;
3514     }
3515     if (!recursed) {
3516      Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3517      SAVEFREEPV(recursed);
3518     }
3519     if (!PAREN_TEST(recursed,paren+1)) {
3520      PAREN_SET(recursed,paren+1);
3521      Newx(newframe,1,scan_frame);
3522     } else {
3523      if (flags & SCF_DO_SUBSTR) {
3524       SCAN_COMMIT(pRExC_state,data,minlenp);
3525       data->longest = &(data->longest_float);
3526      }
3527      is_inf = is_inf_internal = 1;
3528      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3529       cl_anything(pRExC_state, data->start_class);
3530      flags &= ~SCF_DO_STCLASS;
3531     }
3532    } else {
3533     Newx(newframe,1,scan_frame);
3534     paren = stopparen;
3535     start = scan+2;
3536     end = regnext(scan);
3537    }
3538    if (newframe) {
3539     assert(start);
3540     assert(end);
3541     SAVEFREEPV(newframe);
3542     newframe->next = regnext(scan);
3543     newframe->last = last;
3544     newframe->stop = stopparen;
3545     newframe->prev = frame;
3546
3547     frame = newframe;
3548     scan =  start;
3549     stopparen = paren;
3550     last = end;
3551
3552     continue;
3553    }
3554   }
3555   else if (OP(scan) == EXACT) {
3556    I32 l = STR_LEN(scan);
3557    UV uc;
3558    if (UTF) {
3559     const U8 * const s = (U8*)STRING(scan);
3560     uc = utf8_to_uvchr_buf(s, s + l, NULL);
3561     l = utf8_length(s, s + l);
3562    } else {
3563     uc = *((U8*)STRING(scan));
3564    }
3565    min += l;
3566    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3567     /* The code below prefers earlier match for fixed
3568     offset, later match for variable offset.  */
3569     if (data->last_end == -1) { /* Update the start info. */
3570      data->last_start_min = data->pos_min;
3571      data->last_start_max = is_inf
3572       ? I32_MAX : data->pos_min + data->pos_delta;
3573     }
3574     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3575     if (UTF)
3576      SvUTF8_on(data->last_found);
3577     {
3578      SV * const sv = data->last_found;
3579      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3580       mg_find(sv, PERL_MAGIC_utf8) : NULL;
3581      if (mg && mg->mg_len >= 0)
3582       mg->mg_len += utf8_length((U8*)STRING(scan),
3583             (U8*)STRING(scan)+STR_LEN(scan));
3584     }
3585     data->last_end = data->pos_min + l;
3586     data->pos_min += l; /* As in the first entry. */
3587     data->flags &= ~SF_BEFORE_EOL;
3588    }
3589    if (flags & SCF_DO_STCLASS_AND) {
3590     /* Check whether it is compatible with what we know already! */
3591     int compat = 1;
3592
3593
3594     /* If compatible, we or it in below.  It is compatible if it is
3595     * in the bitmap and either 1) its bit or its fold is set, or 2)
3596     * it's for a locale.  Even if there isn't unicode semantics
3597     * here, at runtime there may be because of matching against a
3598     * utf8 string, so accept a possible false positive for
3599     * latin1-range folds */
3600     if (uc >= 0x100 ||
3601      (!(data->start_class->flags & ANYOF_LOCALE)
3602      && !ANYOF_BITMAP_TEST(data->start_class, uc)
3603      && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3604       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3605      )
3606     {
3607      compat = 0;
3608     }
3609     ANYOF_CLASS_ZERO(data->start_class);
3610     ANYOF_BITMAP_ZERO(data->start_class);
3611     if (compat)
3612      ANYOF_BITMAP_SET(data->start_class, uc);
3613     else if (uc >= 0x100) {
3614      int i;
3615
3616      /* Some Unicode code points fold to the Latin1 range; as
3617      * XXX temporary code, instead of figuring out if this is
3618      * one, just assume it is and set all the start class bits
3619      * that could be some such above 255 code point's fold
3620      * which will generate false positives.  As the code
3621      * elsewhere that does compute the fold settles down, it
3622      * can be extracted out and re-used here */
3623      for (i = 0; i < 256; i++){
3624       if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3625        ANYOF_BITMAP_SET(data->start_class, i);
3626       }
3627      }
3628     }
3629     CLEAR_SSC_EOS(data->start_class);
3630     if (uc < 0x100)
3631     data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3632    }
3633    else if (flags & SCF_DO_STCLASS_OR) {
3634     /* false positive possible if the class is case-folded */
3635     if (uc < 0x100)
3636      ANYOF_BITMAP_SET(data->start_class, uc);
3637     else
3638      data->start_class->flags |= ANYOF_UNICODE_ALL;
3639     CLEAR_SSC_EOS(data->start_class);
3640     cl_and(data->start_class, and_withp);
3641    }
3642    flags &= ~SCF_DO_STCLASS;
3643   }
3644   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3645    I32 l = STR_LEN(scan);
3646    UV uc = *((U8*)STRING(scan));
3647
3648    /* Search for fixed substrings supports EXACT only. */
3649    if (flags & SCF_DO_SUBSTR) {
3650     assert(data);
3651     SCAN_COMMIT(pRExC_state, data, minlenp);
3652    }
3653    if (UTF) {
3654     const U8 * const s = (U8 *)STRING(scan);
3655     uc = utf8_to_uvchr_buf(s, s + l, NULL);
3656     l = utf8_length(s, s + l);
3657    }
3658    if (has_exactf_sharp_s) {
3659     RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3660    }
3661    min += l - min_subtract;
3662    assert (min >= 0);
3663    delta += min_subtract;
3664    if (flags & SCF_DO_SUBSTR) {
3665     data->pos_min += l - min_subtract;
3666     if (data->pos_min < 0) {
3667      data->pos_min = 0;
3668     }
3669     data->pos_delta += min_subtract;
3670     if (min_subtract) {
3671      data->longest = &(data->longest_float);
3672     }
3673    }
3674    if (flags & SCF_DO_STCLASS_AND) {
3675     /* Check whether it is compatible with what we know already! */
3676     int compat = 1;
3677     if (uc >= 0x100 ||
3678     (!(data->start_class->flags & ANYOF_LOCALE)
3679     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3680     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3681     {
3682      compat = 0;
3683     }
3684     ANYOF_CLASS_ZERO(data->start_class);
3685     ANYOF_BITMAP_ZERO(data->start_class);
3686     if (compat) {
3687      ANYOF_BITMAP_SET(data->start_class, uc);
3688      CLEAR_SSC_EOS(data->start_class);
3689      if (OP(scan) == EXACTFL) {
3690       /* XXX This set is probably no longer necessary, and
3691       * probably wrong as LOCALE now is on in the initial
3692       * state */
3693       data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3694      }
3695      else {
3696
3697       /* Also set the other member of the fold pair.  In case
3698       * that unicode semantics is called for at runtime, use
3699       * the full latin1 fold.  (Can't do this for locale,
3700       * because not known until runtime) */
3701       ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3702
3703       /* All other (EXACTFL handled above) folds except under
3704       * /iaa that include s, S, and sharp_s also may include
3705       * the others */
3706       if (OP(scan) != EXACTFA) {
3707        if (uc == 's' || uc == 'S') {
3708         ANYOF_BITMAP_SET(data->start_class,
3709             LATIN_SMALL_LETTER_SHARP_S);
3710        }
3711        else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3712         ANYOF_BITMAP_SET(data->start_class, 's');
3713         ANYOF_BITMAP_SET(data->start_class, 'S');
3714        }
3715       }
3716      }
3717     }
3718     else if (uc >= 0x100) {
3719      int i;
3720      for (i = 0; i < 256; i++){
3721       if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3722        ANYOF_BITMAP_SET(data->start_class, i);
3723       }
3724      }
3725     }
3726    }
3727    else if (flags & SCF_DO_STCLASS_OR) {
3728     if (data->start_class->flags & ANYOF_LOC_FOLD) {
3729      /* false positive possible if the class is case-folded.
3730      Assume that the locale settings are the same... */
3731      if (uc < 0x100) {
3732       ANYOF_BITMAP_SET(data->start_class, uc);
3733       if (OP(scan) != EXACTFL) {
3734
3735        /* And set the other member of the fold pair, but
3736        * can't do that in locale because not known until
3737        * run-time */
3738        ANYOF_BITMAP_SET(data->start_class,
3739            PL_fold_latin1[uc]);
3740
3741        /* All folds except under /iaa that include s, S,
3742        * and sharp_s also may include the others */
3743        if (OP(scan) != EXACTFA) {
3744         if (uc == 's' || uc == 'S') {
3745          ANYOF_BITMAP_SET(data->start_class,
3746             LATIN_SMALL_LETTER_SHARP_S);
3747         }
3748         else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3749          ANYOF_BITMAP_SET(data->start_class, 's');
3750          ANYOF_BITMAP_SET(data->start_class, 'S');
3751         }
3752        }
3753       }
3754      }
3755      CLEAR_SSC_EOS(data->start_class);
3756     }
3757     cl_and(data->start_class, and_withp);
3758    }
3759    flags &= ~SCF_DO_STCLASS;
3760   }
3761   else if (REGNODE_VARIES(OP(scan))) {
3762    I32 mincount, maxcount, minnext, deltanext, fl = 0;
3763    I32 f = flags, pos_before = 0;
3764    regnode * const oscan = scan;
3765    struct regnode_charclass_class this_class;
3766    struct regnode_charclass_class *oclass = NULL;
3767    I32 next_is_eval = 0;
3768
3769    switch (PL_regkind[OP(scan)]) {
3770    case WHILEM:  /* End of (?:...)* . */
3771     scan = NEXTOPER(scan);
3772     goto finish;
3773    case PLUS:
3774     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3775      next = NEXTOPER(scan);
3776      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3777       mincount = 1;
3778       maxcount = REG_INFTY;
3779       next = regnext(scan);
3780       scan = NEXTOPER(scan);
3781       goto do_curly;
3782      }
3783     }
3784     if (flags & SCF_DO_SUBSTR)
3785      data->pos_min++;
3786     min++;
3787     /* Fall through. */
3788    case STAR:
3789     if (flags & SCF_DO_STCLASS) {
3790      mincount = 0;
3791      maxcount = REG_INFTY;
3792      next = regnext(scan);
3793      scan = NEXTOPER(scan);
3794      goto do_curly;
3795     }
3796     is_inf = is_inf_internal = 1;
3797     scan = regnext(scan);
3798     if (flags & SCF_DO_SUBSTR) {
3799      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3800      data->longest = &(data->longest_float);
3801     }
3802     goto optimize_curly_tail;
3803    case CURLY:
3804     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3805      && (scan->flags == stopparen))
3806     {
3807      mincount = 1;
3808      maxcount = 1;
3809     } else {
3810      mincount = ARG1(scan);
3811      maxcount = ARG2(scan);
3812     }
3813     next = regnext(scan);
3814     if (OP(scan) == CURLYX) {
3815      I32 lp = (data ? *(data->last_closep) : 0);
3816      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3817     }
3818     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3819     next_is_eval = (OP(scan) == EVAL);
3820    do_curly:
3821     if (flags & SCF_DO_SUBSTR) {
3822      if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3823      pos_before = data->pos_min;
3824     }
3825     if (data) {
3826      fl = data->flags;
3827      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3828      if (is_inf)
3829       data->flags |= SF_IS_INF;
3830     }
3831     if (flags & SCF_DO_STCLASS) {
3832      cl_init(pRExC_state, &this_class);
3833      oclass = data->start_class;
3834      data->start_class = &this_class;
3835      f |= SCF_DO_STCLASS_AND;
3836      f &= ~SCF_DO_STCLASS_OR;
3837     }
3838     /* Exclude from super-linear cache processing any {n,m}
3839     regops for which the combination of input pos and regex
3840     pos is not enough information to determine if a match
3841     will be possible.
3842
3843     For example, in the regex /foo(bar\s*){4,8}baz/ with the
3844     regex pos at the \s*, the prospects for a match depend not
3845     only on the input position but also on how many (bar\s*)
3846     repeats into the {4,8} we are. */
3847    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3848      f &= ~SCF_WHILEM_VISITED_POS;
3849
3850     /* This will finish on WHILEM, setting scan, or on NULL: */
3851     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3852          last, data, stopparen, recursed, NULL,
3853          (mincount == 0
3854           ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3855
3856     if (flags & SCF_DO_STCLASS)
3857      data->start_class = oclass;
3858     if (mincount == 0 || minnext == 0) {
3859      if (flags & SCF_DO_STCLASS_OR) {
3860       cl_or(pRExC_state, data->start_class, &this_class);
3861      }
3862      else if (flags & SCF_DO_STCLASS_AND) {
3863       /* Switch to OR mode: cache the old value of
3864       * data->start_class */
3865       INIT_AND_WITHP;
3866       StructCopy(data->start_class, and_withp,
3867         struct regnode_charclass_class);
3868       flags &= ~SCF_DO_STCLASS_AND;
3869       StructCopy(&this_class, data->start_class,
3870         struct regnode_charclass_class);
3871       flags |= SCF_DO_STCLASS_OR;
3872       SET_SSC_EOS(data->start_class);
3873      }
3874     } else {  /* Non-zero len */
3875      if (flags & SCF_DO_STCLASS_OR) {
3876       cl_or(pRExC_state, data->start_class, &this_class);
3877       cl_and(data->start_class, and_withp);
3878      }
3879      else if (flags & SCF_DO_STCLASS_AND)
3880       cl_and(data->start_class, &this_class);
3881      flags &= ~SCF_DO_STCLASS;
3882     }
3883     if (!scan)   /* It was not CURLYX, but CURLY. */
3884      scan = next;
3885     if ( /* ? quantifier ok, except for (?{ ... }) */
3886      (next_is_eval || !(mincount == 0 && maxcount == 1))
3887      && (minnext == 0) && (deltanext == 0)
3888      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3889      && maxcount <= REG_INFTY/3) /* Complement check for big count */
3890     {
3891      /* Fatal warnings may leak the regexp without this: */
3892      SAVEFREESV(RExC_rx_sv);
3893      ckWARNreg(RExC_parse,
3894        "Quantifier unexpected on zero-length expression");
3895      (void)ReREFCNT_inc(RExC_rx_sv);
3896     }
3897
3898     min += minnext * mincount;
3899     is_inf_internal |= deltanext == I32_MAX
3900          || (maxcount == REG_INFTY && minnext + deltanext > 0);
3901     is_inf |= is_inf_internal;
3902     if (is_inf)
3903      delta = I32_MAX;
3904     else
3905      delta += (minnext + deltanext) * maxcount - minnext * mincount;
3906
3907     /* Try powerful optimization CURLYX => CURLYN. */
3908     if (  OP(oscan) == CURLYX && data
3909      && data->flags & SF_IN_PAR
3910      && !(data->flags & SF_HAS_EVAL)
3911      && !deltanext && minnext == 1 ) {
3912      /* Try to optimize to CURLYN.  */
3913      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3914      regnode * const nxt1 = nxt;
3915 #ifdef DEBUGGING
3916      regnode *nxt2;
3917 #endif
3918
3919      /* Skip open. */
3920      nxt = regnext(nxt);
3921      if (!REGNODE_SIMPLE(OP(nxt))
3922       && !(PL_regkind[OP(nxt)] == EXACT
3923        && STR_LEN(nxt) == 1))
3924       goto nogo;
3925 #ifdef DEBUGGING
3926      nxt2 = nxt;
3927 #endif
3928      nxt = regnext(nxt);
3929      if (OP(nxt) != CLOSE)
3930       goto nogo;
3931      if (RExC_open_parens) {
3932       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3933       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3934      }
3935      /* Now we know that nxt2 is the only contents: */
3936      oscan->flags = (U8)ARG(nxt);
3937      OP(oscan) = CURLYN;
3938      OP(nxt1) = NOTHING; /* was OPEN. */
3939
3940 #ifdef DEBUGGING
3941      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3942      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3943      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3944      OP(nxt) = OPTIMIZED; /* was CLOSE. */
3945      OP(nxt + 1) = OPTIMIZED; /* was count. */
3946      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3947 #endif
3948     }
3949    nogo:
3950
3951     /* Try optimization CURLYX => CURLYM. */
3952     if (  OP(oscan) == CURLYX && data
3953      && !(data->flags & SF_HAS_PAR)
3954      && !(data->flags & SF_HAS_EVAL)
3955      && !deltanext /* atom is fixed width */
3956      && minnext != 0 /* CURLYM can't handle zero width */
3957      && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3958     ) {
3959      /* XXXX How to optimize if data == 0? */
3960      /* Optimize to a simpler form.  */
3961      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3962      regnode *nxt2;
3963
3964      OP(oscan) = CURLYM;
3965      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3966        && (OP(nxt2) != WHILEM))
3967       nxt = nxt2;
3968      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3969      /* Need to optimize away parenths. */
3970      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3971       /* Set the parenth number.  */
3972       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3973
3974       oscan->flags = (U8)ARG(nxt);
3975       if (RExC_open_parens) {
3976        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3977        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3978       }
3979       OP(nxt1) = OPTIMIZED; /* was OPEN. */
3980       OP(nxt) = OPTIMIZED; /* was CLOSE. */
3981
3982 #ifdef DEBUGGING
3983       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3984       OP(nxt + 1) = OPTIMIZED; /* was count. */
3985       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3986       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3987 #endif
3988 #if 0
3989       while ( nxt1 && (OP(nxt1) != WHILEM)) {
3990        regnode *nnxt = regnext(nxt1);
3991        if (nnxt == nxt) {
3992         if (reg_off_by_arg[OP(nxt1)])
3993          ARG_SET(nxt1, nxt2 - nxt1);
3994         else if (nxt2 - nxt1 < U16_MAX)
3995          NEXT_OFF(nxt1) = nxt2 - nxt1;
3996         else
3997          OP(nxt) = NOTHING; /* Cannot beautify */
3998        }
3999        nxt1 = nnxt;
4000       }
4001 #endif
4002       /* Optimize again: */
4003       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4004          NULL, stopparen, recursed, NULL, 0,depth+1);
4005      }
4006      else
4007       oscan->flags = 0;
4008     }
4009     else if ((OP(oscan) == CURLYX)
4010       && (flags & SCF_WHILEM_VISITED_POS)
4011       /* See the comment on a similar expression above.
4012        However, this time it's not a subexpression
4013        we care about, but the expression itself. */
4014       && (maxcount == REG_INFTY)
4015       && data && ++data->whilem_c < 16) {
4016      /* This stays as CURLYX, we can put the count/of pair. */
4017      /* Find WHILEM (as in regexec.c) */
4018      regnode *nxt = oscan + NEXT_OFF(oscan);
4019
4020      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4021       nxt += ARG(nxt);
4022      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4023       | (RExC_whilem_seen << 4)); /* On WHILEM */
4024     }
4025     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4026      pars++;
4027     if (flags & SCF_DO_SUBSTR) {
4028      SV *last_str = NULL;
4029      int counted = mincount != 0;
4030
4031      if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4032 #if defined(SPARC64_GCC_WORKAROUND)
4033       I32 b = 0;
4034       STRLEN l = 0;
4035       const char *s = NULL;
4036       I32 old = 0;
4037
4038       if (pos_before >= data->last_start_min)
4039        b = pos_before;
4040       else
4041        b = data->last_start_min;
4042
4043       l = 0;
4044       s = SvPV_const(data->last_found, l);
4045       old = b - data->last_start_min;
4046
4047 #else
4048       I32 b = pos_before >= data->last_start_min
4049        ? pos_before : data->last_start_min;
4050       STRLEN l;
4051       const char * const s = SvPV_const(data->last_found, l);
4052       I32 old = b - data->last_start_min;
4053 #endif
4054
4055       if (UTF)
4056        old = utf8_hop((U8*)s, old) - (U8*)s;
4057       l -= old;
4058       /* Get the added string: */
4059       last_str = newSVpvn_utf8(s  + old, l, UTF);
4060       if (deltanext == 0 && pos_before == b) {
4061        /* What was added is a constant string */
4062        if (mincount > 1) {
4063         SvGROW(last_str, (mincount * l) + 1);
4064         repeatcpy(SvPVX(last_str) + l,
4065           SvPVX_const(last_str), l, mincount - 1);
4066         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4067         /* Add additional parts. */
4068         SvCUR_set(data->last_found,
4069           SvCUR(data->last_found) - l);
4070         sv_catsv(data->last_found, last_str);
4071         {
4072          SV * sv = data->last_found;
4073          MAGIC *mg =
4074           SvUTF8(sv) && SvMAGICAL(sv) ?
4075           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4076          if (mg && mg->mg_len >= 0)
4077           mg->mg_len += CHR_SVLEN(last_str) - l;
4078         }
4079         data->last_end += l * (mincount - 1);
4080        }
4081       } else {
4082        /* start offset must point into the last copy */
4083        data->last_start_min += minnext * (mincount - 1);
4084        data->last_start_max += is_inf ? I32_MAX
4085         : (maxcount - 1) * (minnext + data->pos_delta);
4086       }
4087      }
4088      /* It is counted once already... */
4089      data->pos_min += minnext * (mincount - counted);
4090 #if 0
4091 PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
4092  counted, deltanext, I32_MAX, minnext, maxcount, mincount);
4093 if (deltanext != I32_MAX)
4094 PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
4095 #endif
4096      if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
4097       data->pos_delta = I32_MAX;
4098      else
4099       data->pos_delta += - counted * deltanext +
4100       (minnext + deltanext) * maxcount - minnext * mincount;
4101      if (mincount != maxcount) {
4102       /* Cannot extend fixed substrings found inside
4103        the group.  */
4104       SCAN_COMMIT(pRExC_state,data,minlenp);
4105       if (mincount && last_str) {
4106        SV * const sv = data->last_found;
4107        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4108         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4109
4110        if (mg)
4111         mg->mg_len = -1;
4112        sv_setsv(sv, last_str);
4113        data->last_end = data->pos_min;
4114        data->last_start_min =
4115         data->pos_min - CHR_SVLEN(last_str);
4116        data->last_start_max = is_inf
4117         ? I32_MAX
4118         : data->pos_min + data->pos_delta
4119         - CHR_SVLEN(last_str);
4120       }
4121       data->longest = &(data->longest_float);
4122      }
4123      SvREFCNT_dec(last_str);
4124     }
4125     if (data && (fl & SF_HAS_EVAL))
4126      data->flags |= SF_HAS_EVAL;
4127    optimize_curly_tail:
4128     if (OP(oscan) != CURLYX) {
4129      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4130       && NEXT_OFF(next))
4131       NEXT_OFF(oscan) += NEXT_OFF(next);
4132     }
4133     continue;
4134    default:   /* REF, and CLUMP only? */
4135     if (flags & SCF_DO_SUBSTR) {
4136      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4137      data->longest = &(data->longest_float);
4138     }
4139     is_inf = is_inf_internal = 1;
4140     if (flags & SCF_DO_STCLASS_OR)
4141      cl_anything(pRExC_state, data->start_class);
4142     flags &= ~SCF_DO_STCLASS;
4143     break;
4144    }
4145   }
4146   else if (OP(scan) == LNBREAK) {
4147    if (flags & SCF_DO_STCLASS) {
4148     int value = 0;
4149     CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4150      if (flags & SCF_DO_STCLASS_AND) {
4151      for (value = 0; value < 256; value++)
4152       if (!is_VERTWS_cp(value))
4153        ANYOF_BITMAP_CLEAR(data->start_class, value);
4154     }
4155     else {
4156      for (value = 0; value < 256; value++)
4157       if (is_VERTWS_cp(value))
4158        ANYOF_BITMAP_SET(data->start_class, value);
4159     }
4160     if (flags & SCF_DO_STCLASS_OR)
4161      cl_and(data->start_class, and_withp);
4162     flags &= ~SCF_DO_STCLASS;
4163    }
4164    min++;
4165    delta++;    /* Because of the 2 char string cr-lf */
4166    if (flags & SCF_DO_SUBSTR) {
4167      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4168      data->pos_min += 1;
4169     data->pos_delta += 1;
4170     data->longest = &(data->longest_float);
4171     }
4172   }
4173   else if (REGNODE_SIMPLE(OP(scan))) {
4174    int value = 0;
4175
4176    if (flags & SCF_DO_SUBSTR) {
4177     SCAN_COMMIT(pRExC_state,data,minlenp);
4178     data->pos_min++;
4179    }
4180    min++;
4181    if (flags & SCF_DO_STCLASS) {
4182     int loop_max = 256;
4183     CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4184
4185     /* Some of the logic below assumes that switching
4186     locale on will only add false positives. */
4187     switch (PL_regkind[OP(scan)]) {
4188      U8 classnum;
4189
4190     case SANY:
4191     default:
4192 #ifdef DEBUGGING
4193     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4194 #endif
4195     do_default:
4196      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4197       cl_anything(pRExC_state, data->start_class);
4198      break;
4199     case REG_ANY:
4200      if (OP(scan) == SANY)
4201       goto do_default;
4202      if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4203       value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4204         || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4205       cl_anything(pRExC_state, data->start_class);
4206      }
4207      if (flags & SCF_DO_STCLASS_AND || !value)
4208       ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4209      break;
4210     case ANYOF:
4211      if (flags & SCF_DO_STCLASS_AND)
4212       cl_and(data->start_class,
4213        (struct regnode_charclass_class*)scan);
4214      else
4215       cl_or(pRExC_state, data->start_class,
4216        (struct regnode_charclass_class*)scan);
4217      break;
4218     case POSIXA:
4219      loop_max = 128;
4220      /* FALL THROUGH */
4221     case POSIXL:
4222     case POSIXD:
4223     case POSIXU:
4224      classnum = FLAGS(scan);
4225      if (flags & SCF_DO_STCLASS_AND) {
4226       if (!(data->start_class->flags & ANYOF_LOCALE)) {
4227        ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4228        for (value = 0; value < loop_max; value++) {
4229         if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4230          ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4231         }
4232        }
4233       }
4234      }
4235      else {
4236       if (data->start_class->flags & ANYOF_LOCALE) {
4237        ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4238       }
4239       else {
4240
4241       /* Even if under locale, set the bits for non-locale
4242       * in case it isn't a true locale-node.  This will
4243       * create false positives if it truly is locale */
4244       for (value = 0; value < loop_max; value++) {
4245        if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4246         ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4247        }
4248       }
4249       }
4250      }
4251      break;
4252     case NPOSIXA:
4253      loop_max = 128;
4254      /* FALL THROUGH */
4255     case NPOSIXL:
4256     case NPOSIXU:
4257     case NPOSIXD:
4258      classnum = FLAGS(scan);
4259      if (flags & SCF_DO_STCLASS_AND) {
4260       if (!(data->start_class->flags & ANYOF_LOCALE)) {
4261        ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4262        for (value = 0; value < loop_max; value++) {
4263         if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4264          ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4265         }
4266        }
4267       }
4268      }
4269      else {
4270       if (data->start_class->flags & ANYOF_LOCALE) {
4271        ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4272       }
4273       else {
4274
4275       /* Even if under locale, set the bits for non-locale in
4276       * case it isn't a true locale-node.  This will create
4277       * false positives if it truly is locale */
4278       for (value = 0; value < loop_max; value++) {
4279        if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4280         ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4281        }
4282       }
4283       if (PL_regkind[OP(scan)] == NPOSIXD) {
4284        data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4285       }
4286       }
4287      }
4288      break;
4289     }
4290     if (flags & SCF_DO_STCLASS_OR)
4291      cl_and(data->start_class, and_withp);
4292     flags &= ~SCF_DO_STCLASS;
4293    }
4294   }
4295   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4296    data->flags |= (OP(scan) == MEOL
4297        ? SF_BEFORE_MEOL
4298        : SF_BEFORE_SEOL);
4299    SCAN_COMMIT(pRExC_state, data, minlenp);
4300
4301   }
4302   else if (  PL_regkind[OP(scan)] == BRANCHJ
4303     /* Lookbehind, or need to calculate parens/evals/stclass: */
4304     && (scan->flags || data || (flags & SCF_DO_STCLASS))
4305     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4306    if ( OP(scan) == UNLESSM &&
4307     scan->flags == 0 &&
4308     OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4309     OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4310    ) {
4311     regnode *opt;
4312     regnode *upto= regnext(scan);
4313     DEBUG_PARSE_r({
4314      SV * const mysv_val=sv_newmortal();
4315      DEBUG_STUDYDATA("OPFAIL",data,depth);
4316
4317      /*DEBUG_PARSE_MSG("opfail");*/
4318      regprop(RExC_rx, mysv_val, upto);
4319      PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4320         SvPV_nolen_const(mysv_val),
4321         (IV)REG_NODE_NUM(upto),
4322         (IV)(upto - scan)
4323      );
4324     });
4325     OP(scan) = OPFAIL;
4326     NEXT_OFF(scan) = upto - scan;
4327     for (opt= scan + 1; opt < upto ; opt++)
4328      OP(opt) = OPTIMIZED;
4329     scan= upto;
4330     continue;
4331    }
4332    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4333     || OP(scan) == UNLESSM )
4334    {
4335     /* Negative Lookahead/lookbehind
4336     In this case we can't do fixed string optimisation.
4337     */
4338
4339     I32 deltanext, minnext, fake = 0;
4340     regnode *nscan;
4341     struct regnode_charclass_class intrnl;
4342     int f = 0;
4343
4344     data_fake.flags = 0;
4345     if (data) {
4346      data_fake.whilem_c = data->whilem_c;
4347      data_fake.last_closep = data->last_closep;
4348     }
4349     else
4350      data_fake.last_closep = &fake;
4351     data_fake.pos_delta = delta;
4352     if ( flags & SCF_DO_STCLASS && !scan->flags
4353      && OP(scan) == IFMATCH ) { /* Lookahead */
4354      cl_init(pRExC_state, &intrnl);
4355      data_fake.start_class = &intrnl;
4356      f |= SCF_DO_STCLASS_AND;
4357     }
4358     if (flags & SCF_WHILEM_VISITED_POS)
4359      f |= SCF_WHILEM_VISITED_POS;
4360     next = regnext(scan);
4361     nscan = NEXTOPER(NEXTOPER(scan));
4362     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4363      last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4364     if (scan->flags) {
4365      if (deltanext) {
4366       FAIL("Variable length lookbehind not implemented");
4367      }
4368      else if (minnext > (I32)U8_MAX) {
4369       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4370      }
4371      scan->flags = (U8)minnext;
4372     }
4373     if (data) {
4374      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4375       pars++;
4376      if (data_fake.flags & SF_HAS_EVAL)
4377       data->flags |= SF_HAS_EVAL;
4378      data->whilem_c = data_fake.whilem_c;
4379     }
4380     if (f & SCF_DO_STCLASS_AND) {
4381      if (flags & SCF_DO_STCLASS_OR) {
4382       /* OR before, AND after: ideally we would recurse with
4383       * data_fake to get the AND applied by study of the
4384       * remainder of the pattern, and then derecurse;
4385       * *** HACK *** for now just treat as "no information".
4386       * See [perl #56690].
4387       */
4388       cl_init(pRExC_state, data->start_class);
4389      }  else {
4390       /* AND before and after: combine and continue */
4391       const int was = TEST_SSC_EOS(data->start_class);
4392
4393       cl_and(data->start_class, &intrnl);
4394       if (was)
4395        SET_SSC_EOS(data->start_class);
4396      }
4397     }
4398    }
4399 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4400    else {
4401     /* Positive Lookahead/lookbehind
4402     In this case we can do fixed string optimisation,
4403     but we must be careful about it. Note in the case of
4404     lookbehind the positions will be offset by the minimum
4405     length of the pattern, something we won't know about
4406     until after the recurse.
4407     */
4408     I32 deltanext, fake = 0;
4409     regnode *nscan;
4410     struct regnode_charclass_class intrnl;
4411     int f = 0;
4412     /* We use SAVEFREEPV so that when the full compile
4413      is finished perl will clean up the allocated
4414      minlens when it's all done. This way we don't
4415      have to worry about freeing them when we know
4416      they wont be used, which would be a pain.
4417     */
4418     I32 *minnextp;
4419     Newx( minnextp, 1, I32 );
4420     SAVEFREEPV(minnextp);
4421
4422     if (data) {
4423      StructCopy(data, &data_fake, scan_data_t);
4424      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4425       f |= SCF_DO_SUBSTR;
4426       if (scan->flags)
4427        SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4428       data_fake.last_found=newSVsv(data->last_found);
4429      }
4430     }
4431     else
4432      data_fake.last_closep = &fake;
4433     data_fake.flags = 0;
4434     data_fake.pos_delta = delta;
4435     if (is_inf)
4436      data_fake.flags |= SF_IS_INF;
4437     if ( flags & SCF_DO_STCLASS && !scan->flags
4438      && OP(scan) == IFMATCH ) { /* Lookahead */
4439      cl_init(pRExC_state, &intrnl);
4440      data_fake.start_class = &intrnl;
4441      f |= SCF_DO_STCLASS_AND;
4442     }
4443     if (flags & SCF_WHILEM_VISITED_POS)
4444      f |= SCF_WHILEM_VISITED_POS;
4445     next = regnext(scan);
4446     nscan = NEXTOPER(NEXTOPER(scan));
4447
4448     *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4449      last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4450     if (scan->flags) {
4451      if (deltanext) {
4452       FAIL("Variable length lookbehind not implemented");
4453      }
4454      else if (*minnextp > (I32)U8_MAX) {
4455       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4456      }
4457      scan->flags = (U8)*minnextp;
4458     }
4459
4460     *minnextp += min;
4461
4462     if (f & SCF_DO_STCLASS_AND) {
4463      const int was = TEST_SSC_EOS(data.start_class);
4464
4465      cl_and(data->start_class, &intrnl);
4466      if (was)
4467       SET_SSC_EOS(data->start_class);
4468     }
4469     if (data) {
4470      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4471       pars++;
4472      if (data_fake.flags & SF_HAS_EVAL)
4473       data->flags |= SF_HAS_EVAL;
4474      data->whilem_c = data_fake.whilem_c;
4475      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4476       if (RExC_rx->minlen<*minnextp)
4477        RExC_rx->minlen=*minnextp;
4478       SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4479       SvREFCNT_dec_NN(data_fake.last_found);
4480
4481       if ( data_fake.minlen_fixed != minlenp )
4482       {
4483        data->offset_fixed= data_fake.offset_fixed;
4484        data->minlen_fixed= data_fake.minlen_fixed;
4485        data->lookbehind_fixed+= scan->flags;
4486       }
4487       if ( data_fake.minlen_float != minlenp )
4488       {
4489        data->minlen_float= data_fake.minlen_float;
4490        data->offset_float_min=data_fake.offset_float_min;
4491        data->offset_float_max=data_fake.offset_float_max;
4492        data->lookbehind_float+= scan->flags;
4493       }
4494      }
4495     }
4496    }
4497 #endif
4498   }
4499   else if (OP(scan) == OPEN) {
4500    if (stopparen != (I32)ARG(scan))
4501     pars++;
4502   }
4503   else if (OP(scan) == CLOSE) {
4504    if (stopparen == (I32)ARG(scan)) {
4505     break;
4506    }
4507    if ((I32)ARG(scan) == is_par) {
4508     next = regnext(scan);
4509
4510     if ( next && (OP(next) != WHILEM) && next < last)
4511      is_par = 0;  /* Disable optimization */
4512    }
4513    if (data)
4514     *(data->last_closep) = ARG(scan);
4515   }
4516   else if (OP(scan) == EVAL) {
4517     if (data)
4518      data->flags |= SF_HAS_EVAL;
4519   }
4520   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4521    if (flags & SCF_DO_SUBSTR) {
4522     SCAN_COMMIT(pRExC_state,data,minlenp);
4523     flags &= ~SCF_DO_SUBSTR;
4524    }
4525    if (data && OP(scan)==ACCEPT) {
4526     data->flags |= SCF_SEEN_ACCEPT;
4527     if (stopmin > min)
4528      stopmin = min;
4529    }
4530   }
4531   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4532   {
4533     if (flags & SCF_DO_SUBSTR) {
4534      SCAN_COMMIT(pRExC_state,data,minlenp);
4535      data->longest = &(data->longest_float);
4536     }
4537     is_inf = is_inf_internal = 1;
4538     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4539      cl_anything(pRExC_state, data->start_class);
4540     flags &= ~SCF_DO_STCLASS;
4541   }
4542   else if (OP(scan) == GPOS) {
4543    if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4544     !(delta || is_inf || (data && data->pos_delta)))
4545    {
4546     if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4547      RExC_rx->extflags |= RXf_ANCH_GPOS;
4548     if (RExC_rx->gofs < (U32)min)
4549      RExC_rx->gofs = min;
4550    } else {
4551     RExC_rx->extflags |= RXf_GPOS_FLOAT;
4552     RExC_rx->gofs = 0;
4553    }
4554   }
4555 #ifdef TRIE_STUDY_OPT
4556 #ifdef FULL_TRIE_STUDY
4557   else if (PL_regkind[OP(scan)] == TRIE) {
4558    /* NOTE - There is similar code to this block above for handling
4559    BRANCH nodes on the initial study.  If you change stuff here
4560    check there too. */
4561    regnode *trie_node= scan;
4562    regnode *tail= regnext(scan);
4563    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4564    I32 max1 = 0, min1 = I32_MAX;
4565    struct regnode_charclass_class accum;
4566
4567    if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4568     SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4569    if (flags & SCF_DO_STCLASS)
4570     cl_init_zero(pRExC_state, &accum);
4571
4572    if (!trie->jump) {
4573     min1= trie->minlen;
4574     max1= trie->maxlen;
4575    } else {
4576     const regnode *nextbranch= NULL;
4577     U32 word;
4578
4579     for ( word=1 ; word <= trie->wordcount ; word++)
4580     {
4581      I32 deltanext=0, minnext=0, f = 0, fake;
4582      struct regnode_charclass_class this_class;
4583
4584      data_fake.flags = 0;
4585      if (data) {
4586       data_fake.whilem_c = data->whilem_c;
4587       data_fake.last_closep = data->last_closep;
4588      }
4589      else
4590       data_fake.last_closep = &fake;
4591      data_fake.pos_delta = delta;
4592      if (flags & SCF_DO_STCLASS) {
4593       cl_init(pRExC_state, &this_class);
4594       data_fake.start_class = &this_class;
4595       f = SCF_DO_STCLASS_AND;
4596      }
4597      if (flags & SCF_WHILEM_VISITED_POS)
4598       f |= SCF_WHILEM_VISITED_POS;
4599
4600      if (trie->jump[word]) {
4601       if (!nextbranch)
4602        nextbranch = trie_node + trie->jump[0];
4603       scan= trie_node + trie->jump[word];
4604       /* We go from the jump point to the branch that follows
4605       it. Note this means we need the vestigal unused branches
4606       even though they arent otherwise used.
4607       */
4608       minnext = study_chunk(pRExC_state, &scan, minlenp,
4609        &deltanext, (regnode *)nextbranch, &data_fake,
4610        stopparen, recursed, NULL, f,depth+1);
4611      }
4612      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4613       nextbranch= regnext((regnode*)nextbranch);
4614
4615      if (min1 > (I32)(minnext + trie->minlen))
4616       min1 = minnext + trie->minlen;
4617      if (deltanext == I32_MAX) {
4618       is_inf = is_inf_internal = 1;
4619       max1 = I32_MAX;
4620      } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4621       max1 = minnext + deltanext + trie->maxlen;
4622
4623      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4624       pars++;
4625      if (data_fake.flags & SCF_SEEN_ACCEPT) {
4626       if ( stopmin > min + min1)
4627        stopmin = min + min1;
4628       flags &= ~SCF_DO_SUBSTR;
4629       if (data)
4630        data->flags |= SCF_SEEN_ACCEPT;
4631      }
4632      if (data) {
4633       if (data_fake.flags & SF_HAS_EVAL)
4634        data->flags |= SF_HAS_EVAL;
4635       data->whilem_c = data_fake.whilem_c;
4636      }
4637      if (flags & SCF_DO_STCLASS)
4638       cl_or(pRExC_state, &accum, &this_class);
4639     }
4640    }
4641    if (flags & SCF_DO_SUBSTR) {
4642     data->pos_min += min1;
4643     data->pos_delta += max1 - min1;
4644     if (max1 != min1 || is_inf)
4645      data->longest = &(data->longest_float);
4646    }
4647    min += min1;
4648    delta += max1 - min1;
4649    if (flags & SCF_DO_STCLASS_OR) {
4650     cl_or(pRExC_state, data->start_class, &accum);
4651     if (min1) {
4652      cl_and(data->start_class, and_withp);
4653      flags &= ~SCF_DO_STCLASS;
4654     }
4655    }
4656    else if (flags & SCF_DO_STCLASS_AND) {
4657     if (min1) {
4658      cl_and(data->start_class, &accum);
4659      flags &= ~SCF_DO_STCLASS;
4660     }
4661     else {
4662      /* Switch to OR mode: cache the old value of
4663      * data->start_class */
4664      INIT_AND_WITHP;
4665      StructCopy(data->start_class, and_withp,
4666        struct regnode_charclass_class);
4667      flags &= ~SCF_DO_STCLASS_AND;
4668      StructCopy(&accum, data->start_class,
4669        struct regnode_charclass_class);
4670      flags |= SCF_DO_STCLASS_OR;
4671      SET_SSC_EOS(data->start_class);
4672     }
4673    }
4674    scan= tail;
4675    continue;
4676   }
4677 #else
4678   else if (PL_regkind[OP(scan)] == TRIE) {
4679    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4680    U8*bang=NULL;
4681
4682    min += trie->minlen;
4683    delta += (trie->maxlen - trie->minlen);
4684    flags &= ~SCF_DO_STCLASS; /* xxx */
4685    if (flags & SCF_DO_SUBSTR) {
4686      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4687      data->pos_min += trie->minlen;
4688      data->pos_delta += (trie->maxlen - trie->minlen);
4689     if (trie->maxlen != trie->minlen)
4690      data->longest = &(data->longest_float);
4691     }
4692     if (trie->jump) /* no more substrings -- for now /grr*/
4693      flags &= ~SCF_DO_SUBSTR;
4694   }
4695 #endif /* old or new */
4696 #endif /* TRIE_STUDY_OPT */
4697
4698   /* Else: zero-length, ignore. */
4699   scan = regnext(scan);
4700  }
4701  if (frame) {
4702   last = frame->last;
4703   scan = frame->next;
4704   stopparen = frame->stop;
4705   frame = frame->prev;
4706   goto fake_study_recurse;
4707  }
4708
4709   finish:
4710  assert(!frame);
4711  DEBUG_STUDYDATA("pre-fin:",data,depth);
4712
4713  *scanp = scan;
4714  *deltap = is_inf_internal ? I32_MAX : delta;
4715  if (flags & SCF_DO_SUBSTR && is_inf)
4716   data->pos_delta = I32_MAX - data->pos_min;
4717  if (is_par > (I32)U8_MAX)
4718   is_par = 0;
4719  if (is_par && pars==1 && data) {
4720   data->flags |= SF_IN_PAR;
4721   data->flags &= ~SF_HAS_PAR;
4722  }
4723  else if (pars && data) {
4724   data->flags |= SF_HAS_PAR;
4725   data->flags &= ~SF_IN_PAR;
4726  }
4727  if (flags & SCF_DO_STCLASS_OR)
4728   cl_and(data->start_class, and_withp);
4729  if (flags & SCF_TRIE_RESTUDY)
4730   data->flags |=  SCF_TRIE_RESTUDY;
4731
4732  DEBUG_STUDYDATA("post-fin:",data,depth);
4733
4734  return min < stopmin ? min : stopmin;
4735 }
4736
4737 STATIC U32
4738 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4739 {
4740  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4741
4742  PERL_ARGS_ASSERT_ADD_DATA;
4743
4744  Renewc(RExC_rxi->data,
4745   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4746   char, struct reg_data);
4747  if(count)
4748   Renew(RExC_rxi->data->what, count + n, U8);
4749  else
4750   Newx(RExC_rxi->data->what, n, U8);
4751  RExC_rxi->data->count = count + n;
4752  Copy(s, RExC_rxi->data->what + count, n, U8);
4753  return count;
4754 }
4755
4756 /*XXX: todo make this not included in a non debugging perl */
4757 #ifndef PERL_IN_XSUB_RE
4758 void
4759 Perl_reginitcolors(pTHX)
4760 {
4761  dVAR;
4762  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4763  if (s) {
4764   char *t = savepv(s);
4765   int i = 0;
4766   PL_colors[0] = t;
4767   while (++i < 6) {
4768    t = strchr(t, '\t');
4769    if (t) {
4770     *t = '\0';
4771     PL_colors[i] = ++t;
4772    }
4773    else
4774     PL_colors[i] = t = (char *)"";
4775   }
4776  } else {
4777   int i = 0;
4778   while (i < 6)
4779    PL_colors[i++] = (char *)"";
4780  }
4781  PL_colorset = 1;
4782 }
4783 #endif
4784
4785
/* CHECK_RESTUDY_GOTO_butfirst(dOsomething)
 *
 * With TRIE_STUDY_OPT: if the expanding scope's `data.flags` has
 * SCF_TRIE_RESTUDY set and `restudied` is still zero, bump `restudied`,
 * execute dOsomething, and jump to the `reStudy` label.  The macro thus
 * requires `data`, `restudied` and `reStudy` to be visible where it is
 * expanded.
 *
 * Without TRIE_STUDY_OPT: expands to a no-op.  The fallback must also be
 * function-like; were it object-like, a call such as
 * CHECK_RESTUDY_GOTO_butfirst(x); would leave "(x);" behind and execute x
 * unconditionally instead of never.
 */
#ifdef TRIE_STUDY_OPT
#define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
    STMT_START {                                            \
        if (                                                \
              (data.flags & SCF_TRIE_RESTUDY)               \
              && ! restudied++                              \
        ) {                                                 \
            dOsomething;                                    \
            goto reStudy;                                   \
        }                                                   \
    } STMT_END
#else
#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) ((void)0)
#endif
4800
4801 /*
4802  * pregcomp - compile a regular expression into internal code
4803  *
4804  * Decides which engine's compiler to call based on the hint currently in
4805  * scope
4806  */
4807
4808 #ifndef PERL_IN_XSUB_RE
4809
4810 /* return the currently in-scope regex engine (or the default if none)  */
4811
4812 regexp_engine const *
4813 Perl_current_re_engine(pTHX)
4814 {
4815  dVAR;
4816
4817  if (IN_PERL_COMPILETIME) {
4818   HV * const table = GvHV(PL_hintgv);
4819   SV **ptr;
4820
4821   if (!table)
4822    return &reh_regexp_engine;
4823   ptr = hv_fetchs(table, "regcomp", FALSE);
4824   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4825    return &reh_regexp_engine;
4826   return INT2PTR(regexp_engine*,SvIV(*ptr));
4827  }
4828  else {
4829   SV *ptr;
4830   if (!PL_curcop->cop_hints_hash)
4831    return &reh_regexp_engine;
4832   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4833   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4834    return &reh_regexp_engine;
4835   return INT2PTR(regexp_engine*,SvIV(ptr));
4836  }
4837 }
4838
4839
4840 REGEXP *
4841 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4842 {
4843  dVAR;
4844  regexp_engine const *eng = current_re_engine();
4845  GET_RE_DEBUG_FLAGS_DECL;
4846
4847  PERL_ARGS_ASSERT_PREGCOMP;
4848
4849  /* Dispatch a request to compile a regexp to correct regexp engine. */
4850  DEBUG_COMPILE_r({
4851   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4852       PTR2UV(eng));
4853  });
4854  return CALLREGCOMP_ENG(eng, pattern, flags);
4855 }
4856 #endif
4857
4858 /* public(ish) entry point for the perl core's own regex compiling code.
4859  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4860  * pattern rather than a list of OPs, and uses the internal engine rather
4861  * than the current one */
4862
4863 REGEXP *
4864 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4865 {
4866  SV *pat = pattern; /* defeat constness! */
4867  PERL_ARGS_ASSERT_RE_COMPILE;
4868  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4869 #ifdef PERL_IN_XSUB_RE
4870         &my_reg_engine,
4871 #else
4872         &reh_regexp_engine,
4873 #endif
4874         NULL, NULL, rx_flags, 0);
4875 }
4876
4877
4878 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
4879  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
4880  * point to the realloced string and length.
4881  *
4882  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
4883  * stuff added */
4884
4885 static void
4886 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
4887      char **pat_p, STRLEN *plen_p, int num_code_blocks)
4888 {
4889  U8 *const src = (U8*)*pat_p;
4890  U8 *dst;
4891  int n=0;
4892  STRLEN s = 0, d = 0;
4893  bool do_end = 0;
4894  GET_RE_DEBUG_FLAGS_DECL;
4895
4896  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4897   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4898
4899  Newx(dst, *plen_p * 2 + 1, U8);
4900
4901  while (s < *plen_p) {
4902   const UV uv = NATIVE_TO_ASCII(src[s]);
4903   if (UNI_IS_INVARIANT(uv))
4904    dst[d]   = (U8)UTF_TO_NATIVE(uv);
4905   else {
4906    dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
4907    dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
4908   }
4909   if (n < num_code_blocks) {
4910    if (!do_end && pRExC_state->code_blocks[n].start == s) {
4911     pRExC_state->code_blocks[n].start = d;
4912     assert(dst[d] == '(');
4913     do_end = 1;
4914    }
4915    else if (do_end && pRExC_state->code_blocks[n].end == s) {
4916     pRExC_state->code_blocks[n].end = d;
4917     assert(dst[d] == ')');
4918     do_end = 0;
4919     n++;
4920    }
4921   }
4922   s++;
4923   d++;
4924  }
4925  dst[d] = '\0';
4926  *plen_p = d;
4927  *pat_p = (char*) dst;
4928  SAVEFREEPV(*pat_p);
4929  RExC_orig_utf8 = RExC_utf8 = 1;
4930 }
4931
4932
4933
4934 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
4935  * while recording any code block indices, and handling overloading,
4936  * nested qr// objects etc.  If pat is null, it will allocate a new
4937  * string, or just return the first arg, if there's only one.
4938  *
4939  * Returns the malloced/updated pat.
4940  * patternp and pat_count are the array of SVs to be concatted;
4941  * oplist is the optional list of ops that generated the SVs;
4942  * recompile_p is a pointer to a boolean that will be set if
4943  *   the regex will need to be recompiled.
4944  * delim, if non-null, is an SV that will be inserted between each element
 *
 * Side effects on pRExC_state:
 *   - for each literal code block found via oplist, the start, end, block
 *     and src_regex fields of the next code_blocks[] entry are filled in;
 *   - when an appended arg is a qr// object that itself has code blocks,
 *     code_blocks[] is grown with Renew() and those blocks are copied in
 *     with their offsets shifted to their position within pat;
 *   - when concatenation overloading produces the result, the blocks
 *     recorded so far by this call are subtracted from num_code_blocks.
 *
 * An arg of type SVt_PVAV (an interpolated array) is expanded by a
 * recursive call that uses $" as the delimiter.
 *
 * Croaks if qr overloading returns something that is not a REGEXP.
4945  */
4946
4947 static SV*
4948 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
4949     SV *pat, SV ** const patternp, int pat_count,
4950     OP *oplist, bool *recompile_p, SV *delim)
4951 {
4952  SV **svp;
4953  int n = 0; /* index of the next code_blocks[] slot to fill */
4954  bool use_delim = FALSE; /* TRUE when the next iteration emits delim */
4955  bool alloced = FALSE; /* TRUE if pat was created by this call */
4956
4957  /* if we know we have at least two args, create an empty string,
4958  * then concatenate args to that. For no args, return an empty string */
4959  if (!pat && pat_count != 1) {
4960   pat = newSVpvn("", 0);
4961   SAVEFREESV(pat);
4962   alloced = TRUE;
4963  }
4964
4965  for (svp = patternp; svp < patternp + pat_count; svp++) {
4966   SV *sv;
4967   SV *rx  = NULL; /* the appended arg, if it was appended to pat */
4968   STRLEN orig_patlen = 0; /* length of pat before this arg is added */
4969   bool code = 0; /* TRUE if this arg is a literal code block */
4970   SV *msv = use_delim ? delim : *svp;
4971
4972   /* if we've got a delimiter, we go round the loop twice for each
4973   * svp slot (except the last), using the delimiter the second
4974   * time round */
4975   if (use_delim) {
4976    svp--;
4977    use_delim = FALSE;
4978   }
4979   else if (delim)
4980    use_delim = TRUE;
4981
4982   if (SvTYPE(msv) == SVt_PVAV) {
4983    /* we've encountered an interpolated array within
4984    * the pattern, e.g. /...@a..../. Expand the list of elements,
4985    * then recursively append elements.
4986    * The code in this block is based on S_pushav() */
4987
4988    AV *const av = (AV*)msv;
4989    const I32 maxarg = AvFILL(av) + 1;
4990    SV **array;
4991
4992    if (oplist) {
4993     assert(oplist->op_type == OP_PADAV
4994      || oplist->op_type == OP_RV2AV);
          /* the second ';' below is just an empty statement */
4995     oplist = oplist->op_sibling;;
4996    }
4997
4998    if (SvRMAGICAL(av)) {
4999     U32 i;
5000
          /* magical array: fetch each element through av_fetch()
           * into a temporary array that is freed on scope exit */
5001     Newx(array, maxarg, SV*);
5002     SAVEFREEPV(array);
5003     for (i=0; i < (U32)maxarg; i++) {
           /* note: this svp shadows the outer loop variable */
5004      SV ** const svp = av_fetch(av, i, FALSE);
5005      array[i] = svp ? *svp : &PL_sv_undef;
5006     }
5007    }
5008    else
5009     array = AvARRAY(av);
5010
         /* NOTE(review): the recursive call runs with its own local n
          * starting at 0 and no oplist, and this invocation's n is not
          * advanced by it - confirm code_blocks[] indices stay
          * consistent when array elements are qr// objects that
          * contain code blocks */
5011    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5012         array, maxarg, NULL, recompile_p,
5013         /* $" */
5014         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5015
5016    continue;
5017   }
5018
5019
5020   /* we make the assumption here that each op in the list of
5021   * op_siblings maps to one SV pushed onto the stack,
5022   * except for code blocks, which have both an OP_NULL and
5023   * an OP_CONST.
5024   * This allows us to match up the list of SVs against the
5025   * list of OPs to find the next code block.
5026   *
5027   * Note that       PUSHMARK PADSV PADSV ..
5028   * is optimised to
5029   *                 PADRANGE PADSV  PADSV  ..
5030   * so the alignment still works. */
5031
5032   if (oplist) {
5033    if (oplist->op_type == OP_NULL
5034     && (oplist->op_flags & OPf_SPECIAL))
5035    {
          /* a literal code block starts at the current end of pat;
           * its end is recorded further down, once the text of the
           * block has been appended */
5036     assert(n < pRExC_state->num_code_blocks);
5037     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5038     pRExC_state->code_blocks[n].block = oplist;
5039     pRExC_state->code_blocks[n].src_regex = NULL;
5040     n++;
5041     code = 1;
5042     oplist = oplist->op_sibling; /* skip CONST */
5043     assert(oplist);
5044    }
         /* the second ';' below is just an empty statement */
5045    oplist = oplist->op_sibling;;
5046   }
5047
5048   /* apply magic and QR overloading to arg */
5049
5050   SvGETMAGIC(msv);
5051   if (SvROK(msv) && SvAMAGIC(msv)) {
         /* note: this sv shadows the loop-scope sv declared above */
5052    SV *sv = AMG_CALLunary(msv, regexp_amg);
5053    if (sv) {
5054     if (SvROK(sv))
5055      sv = SvRV(sv);
5056     if (SvTYPE(sv) != SVt_REGEXP)
5057      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5058     msv = sv;
5059    }
5060   }
5061
5062   /* try concatenation overload ... */
5063   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5064     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5065   {
5066    sv_setsv(pat, sv);
5067    /* overloading involved: all bets are off over literal
5068    * code. Pretend we haven't seen it */
5069    pRExC_state->num_code_blocks -= n;
5070    n = 0;
5071   }
5072   else  {
5073    /* ... or failing that, try "" overload */
         /* keep stringifying until the result stops being overloaded,
          * or the overload hands back the same SV / same referent */
5074    while (SvAMAGIC(msv)
5075      && (sv = AMG_CALLunary(msv, string_amg))
5076      && sv != msv
5077      &&  !(   SvROK(msv)
5078       && SvROK(sv)
5079       && SvRV(msv) == SvRV(sv))
5080    ) {
5081     msv = sv;
5082     SvGETMAGIC(msv);
5083    }
         /* a reference to a regexp is replaced by the regexp itself */
5084    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5085     msv = SvRV(msv);
5086
5087    if (pat) {
5088     /* this is a partially unrolled
5089     *     sv_catsv_nomg(pat, msv);
5090     * that allows us to adjust code block indices if
5091     * needed */
5092     STRLEN dlen;
5093     char *dst = SvPV_force_nomg(pat, dlen);
5094     orig_patlen = dlen;
5095     if (SvUTF8(msv) && !SvUTF8(pat)) {
           /* upgrade what we have so far, re-basing the n code
            * blocks recorded so far, then store the upgraded
            * copy back into pat */
5096      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5097      sv_setpvn(pat, dst, dlen);
5098      SvUTF8_on(pat);
5099     }
5100     sv_catsv_nomg(pat, msv);
5101     rx = msv;
5102    }
5103    else
          /* no accumulator: the single arg becomes the result */
5104     pat = msv;
5105
5106    if (code)
5107     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5108   }
5109
5110   /* extract any code blocks within any embedded qr//'s */
5111   if (rx && SvTYPE(rx) == SVt_REGEXP
5112    && RX_ENGINE((REGEXP*)rx)->op_comp)
5113   {
5114
5115    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5116    if (ri->num_code_blocks) {
5117     int i;
5118     /* the presence of an embedded qr// with code means
5119     * we should always recompile: the text of the
5120     * qr// may not have changed, but it may be a
5121     * different closure than last time */
5122     *recompile_p = 1;
5123     Renew(pRExC_state->code_blocks,
5124      pRExC_state->num_code_blocks + ri->num_code_blocks,
5125      struct reg_code_block);
5126     pRExC_state->num_code_blocks += ri->num_code_blocks;
5127
5128     for (i=0; i < ri->num_code_blocks; i++) {
5129      struct reg_code_block *src, *dst;
           /* shift by where the qr// text was appended in pat plus
            * the qr's pre_prefix value */
5130      STRLEN offset =  orig_patlen
5131       + ReANY((REGEXP *)rx)->pre_prefix;
5132      assert(n < pRExC_state->num_code_blocks);
5133      src = &ri->code_blocks[i];
5134      dst = &pRExC_state->code_blocks[n];
5135      dst->start     = src->start + offset;
5136      dst->end     = src->end   + offset;
5137      dst->block     = src->block;
           /* take a reference on the regex the block came from:
            * the block's own src_regex if set, else rx itself */
5138      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5139            src->src_regex
5140             ? src->src_regex
5141             : (REGEXP*)rx);
5142      n++;
5143     }
5144    }
5145   }
5146  }
5147  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5148  if (alloced)
5149   SvSETMAGIC(pat);
5150
5151  return pat;
5152 }
5153
5154
5155
5156 /* see if there are any run-time code blocks in the pattern.
5157  * False positives are allowed */
5158
5159 static bool
5160 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5161      char *pat, STRLEN plen)
5162 {
5163  int n = 0;
5164  STRLEN s;
5165
5166  for (s = 0; s < plen; s++) {
5167   if (n < pRExC_state->num_code_blocks
5168    && s == pRExC_state->code_blocks[n].start)
5169   {
5170    s = pRExC_state->code_blocks[n].end;
5171    n++;
5172    continue;
5173   }
5174   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5175   * positives here */
5176   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5177    (pat[s+2] == '{'
5178     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5179   )
5180    return 1;
5181  }
5182  return 0;
5183 }
5184
5185 /* Handle run-time code blocks. We will already have compiled any direct
5186  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5187  * copy of it, but with any literal code blocks blanked out and
5188  * appropriate chars escaped; then feed it into
5189  *
5190  *    eval "qr'modified_pattern'"
5191  *
5192  * For example,
5193  *
5194  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5195  *
5196  * becomes
5197  *
5198  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5199  *
5200  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5201  * and merge them with any code blocks of the original regexp.
5202  *
5203  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5204  * instead, just save the qr and return FALSE; this tells our caller that
5205  * the original pattern needs upgrading to utf8.
5206  */
5207
5208 static bool
5209 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5210  char *pat, STRLEN plen)
5211 {
5212  SV *qr;
5213
5214  GET_RE_DEBUG_FLAGS_DECL;
5215
5216  if (pRExC_state->runtime_code_qr) {
5217   /* this is the second time we've been called; this should
5218   * only happen if the main pattern got upgraded to utf8
5219   * during compilation; re-use the qr we compiled first time
5220   * round (which should be utf8 too)
5221   */
5222   qr = pRExC_state->runtime_code_qr;
5223   pRExC_state->runtime_code_qr = NULL;
5224   assert(RExC_utf8 && SvUTF8(qr));
5225  }
5226  else {
5227   int n = 0;
5228   STRLEN s;
5229   char *p, *newpat;
5230   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5231   SV *sv, *qr_ref;
5232   dSP;
5233
5234   /* determine how many extra chars we need for ' and \ escaping */
5235   for (s = 0; s < plen; s++) {
5236    if (pat[s] == '\'' || pat[s] == '\\')
5237     newlen++;
5238   }
5239
5240   Newx(newpat, newlen, char);
5241   p = newpat;
5242   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5243
5244   for (s = 0; s < plen; s++) {
5245    if (n < pRExC_state->num_code_blocks
5246     && s == pRExC_state->code_blocks[n].start)
5247    {
5248     /* blank out literal code block */
5249     assert(pat[s] == '(');
5250     while (s <= pRExC_state->code_blocks[n].end) {
5251      *p++ = '_';
5252      s++;
5253     }
5254     s--;
5255     n++;
5256     continue;
5257    }
5258    if (pat[s] == '\'' || pat[s] == '\\')
5259     *p++ = '\\';
5260    *p++ = pat[s];
5261   }
5262   *p++ = '\'';
5263   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5264    *p++ = 'x';
5265   *p++ = '\0';
5266   DEBUG_COMPILE_r({
5267    PerlIO_printf(Perl_debug_log,
5268     "%sre-parsing pattern for runtime code:%s %s\n",
5269     PL_colors[4],PL_colors[5],newpat);
5270   });
5271
5272   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5273   Safefree(newpat);
5274
5275   ENTER;
5276   SAVETMPS;
5277   save_re_context();
5278   PUSHSTACKi(PERLSI_REQUIRE);
5279   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5280   * parsing qr''; normally only q'' does this. It also alters
5281   * hints handling */
5282   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5283   SvREFCNT_dec_NN(sv);
5284   SPAGAIN;
5285   qr_ref = POPs;
5286   PUTBACK;
5287   {
5288    SV * const errsv = ERRSV;
5289    if (SvTRUE_NN(errsv))
5290    {
5291     Safefree(pRExC_state->code_blocks);
5292     /* use croak_sv ? */
5293     Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5294    }
5295   }
5296   assert(SvROK(qr_ref));
5297   qr = SvRV(qr_ref);
5298   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5299   /* the leaving below frees the tmp qr_ref.
5300   * Give qr a life of its own */
5301   SvREFCNT_inc(qr);
5302   POPSTACK;
5303   FREETMPS;
5304   LEAVE;
5305
5306  }
5307
5308  if (!RExC_utf8 && SvUTF8(qr)) {
5309   /* first time through; the pattern got upgraded; save the
5310   * qr for the next time through */
5311   assert(!pRExC_state->runtime_code_qr);
5312   pRExC_state->runtime_code_qr = qr;
5313   return 0;
5314  }
5315
5316
5317  /* extract any code blocks within the returned qr//  */
5318
5319
5320  /* merge the main (r1) and run-time (r2) code blocks into one */
5321  {
5322   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5323   struct reg_code_block *new_block, *dst;
5324   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5325   int i1 = 0, i2 = 0;
5326
5327   if (!r2->num_code_blocks) /* we guessed wrong */
5328   {
5329    SvREFCNT_dec_NN(qr);
5330    return 1;
5331   }
5332
5333   Newx(new_block,
5334    r1->num_code_blocks + r2->num_code_blocks,
5335    struct reg_code_block);
5336   dst = new_block;
5337
5338   while (    i1 < r1->num_code_blocks
5339     || i2 < r2->num_code_blocks)
5340   {
5341    struct reg_code_block *src;
5342    bool is_qr = 0;
5343
5344    if (i1 == r1->num_code_blocks) {
5345     src = &r2->code_blocks[i2++];
5346     is_qr = 1;
5347    }
5348    else if (i2 == r2->num_code_blocks)
5349     src = &r1->code_blocks[i1++];
5350    else if (  r1->code_blocks[i1].start
5351      < r2->code_blocks[i2].start)
5352    {
5353     src = &r1->code_blocks[i1++];
5354     assert(src->end < r2->code_blocks[i2].start);
5355    }
5356    else {
5357     assert(  r1->code_blocks[i1].start
5358      > r2->code_blocks[i2].start);
5359     src = &r2->code_blocks[i2++];
5360     is_qr = 1;
5361     assert(src->end < r1->code_blocks[i1].start);
5362    }
5363
5364    assert(pat[src->start] == '(');
5365    assert(pat[src->end]   == ')');
5366    dst->start     = src->start;
5367    dst->end     = src->end;
5368    dst->block     = src->block;
5369    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5370          : src->src_regex;
5371    dst++;
5372   }
5373   r1->num_code_blocks += r2->num_code_blocks;
5374   Safefree(r1->code_blocks);
5375   r1->code_blocks = new_block;
5376  }
5377
5378  SvREFCNT_dec_NN(qr);
5379  return 1;
5380 }
5381
5382
5383 STATIC bool
5384 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5385 {
5386  /* This is the common code for setting up the floating and fixed length
5387  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
5388  * as to whether succeeded or not */
5389
5390  I32 t,ml;
5391
5392  if (! (longest_length
5393   || (eol /* Can't have SEOL and MULTI */
5394    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5395   )
5396    /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5397   || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5398  {
5399   return FALSE;
5400  }
5401
5402  /* copy the information about the longest from the reg_scan_data
5403   over to the program. */
5404  if (SvUTF8(sv_longest)) {
5405   *rx_utf8 = sv_longest;
5406   *rx_substr = NULL;
5407  } else {
5408   *rx_substr = sv_longest;
5409   *rx_utf8 = NULL;
5410  }
5411  /* end_shift is how many chars that must be matched that
5412   follow this item. We calculate it ahead of time as once the
5413   lookbehind offset is added in we lose the ability to correctly
5414   calculate it.*/
5415  ml = minlen ? *(minlen) : (I32)longest_length;
5416  *rx_end_shift = ml - offset
5417   - longest_length + (SvTAIL(sv_longest) != 0)
5418   + lookbehind;
5419
5420  t = (eol/* Can't have SEOL and MULTI */
5421   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5422  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5423
5424  return TRUE;
5425 }
5426
5427 /*
5428  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5429  * regular expression into internal code.
5430  * The pattern may be passed either as:
5431  *    a list of SVs (patternp plus pat_count)
5432  *    a list of OPs (expr)
5433  * If both are passed, the SV list is used, but the OP list indicates
5434  * which SVs are actually pre-compiled code blocks
5435  *
5436  * The SVs in the list have magic and qr overloading applied to them (and
5437  * the list may be modified in-place with replacement SVs in the latter
5438  * case).
5439  *
5440  * If the pattern hasn't changed from old_re, then old_re will be
5441  * returned.
5442  *
5443  * eng is the current engine. If that engine has an op_comp method, then
5444  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5445  * do the initial concatenation of arguments and pass on to the external
5446  * engine.
5447  *
5448  * If is_bare_re is not null, set it to a boolean indicating whether the
5449  * arg list reduced (after overloading) to a single bare regex which has
5450  * been returned (i.e. /$qr/).
5451  *
5452  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5453  *
5454  * pm_flags contains the PMf_* flags, typically based on those from the
5455  * pm_flags field of the related PMOP. Currently we're only interested in
5456  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5457  *
5458  * We can't allocate space until we know how big the compiled form will be,
5459  * but we can't compile it (and thus know how big it is) until we've got a
5460  * place to put the code.  So we cheat:  we compile it twice, once with code
5461  * generation turned off and size counting turned on, and once "for real".
5462  * This also means that we don't allocate space until we are sure that the
5463  * thing really will compile successfully, and we never have to move the
5464  * code and thus invalidate pointers into it.  (Note that it has to be in
5465  * one piece because free() must be able to free it all.) [NB: not true in perl]
5466  *
5467  * Beware that the optimization-preparation code in here knows about some
5468  * of the structure of the compiled regexp.  [I'll say.]
5469  */
5470
5471 REGEXP *
5472 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5473      OP *expr, const regexp_engine* eng, REGEXP *old_re,
5474      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5475 {
5476  dVAR;
5477  REGEXP *rx;
5478  struct regexp *r;
5479  regexp_internal *ri;
5480  STRLEN plen;
5481  char *exp;
5482  regnode *scan;
5483  I32 flags;
5484  I32 minlen = 0;
5485  U32 rx_flags;
5486  SV *pat;
5487  SV *code_blocksv = NULL;
5488  SV** new_patternp = patternp;
5489
5490  /* these are all flags - maybe they should be turned
5491  * into a single int with different bit masks */
5492  I32 sawlookahead = 0;
5493  I32 sawplus = 0;
5494  I32 sawopen = 0;
5495  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5496  bool recompile = 0;
5497  bool runtime_code = 0;
5498  scan_data_t data;
5499  RExC_state_t RExC_state;
5500  RExC_state_t * const pRExC_state = &RExC_state;
5501 #ifdef TRIE_STUDY_OPT
5502  int restudied = 0;
5503  RExC_state_t copyRExC_state;
5504 #endif
5505  GET_RE_DEBUG_FLAGS_DECL;
5506
5507  PERL_ARGS_ASSERT_RE_OP_COMPILE;
5508
5509  DEBUG_r(if (!PL_colorset) reginitcolors());
5510
5511 #ifndef PERL_IN_XSUB_RE
5512  /* Initialize these here instead of as-needed, as is quick and avoids
5513  * having to test them each time otherwise */
5514  if (! PL_AboveLatin1) {
5515   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5516   PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5517   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5518
5519   PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5520         = _new_invlist_C_array(L1PosixAlnum_invlist);
5521   PL_Posix_ptrs[_CC_ALPHANUMERIC]
5522         = _new_invlist_C_array(PosixAlnum_invlist);
5523
5524   PL_L1Posix_ptrs[_CC_ALPHA]
5525         = _new_invlist_C_array(L1PosixAlpha_invlist);
5526   PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5527
5528   PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5529   PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5530
5531   /* Cased is the same as Alpha in the ASCII range */
5532   PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5533   PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5534
5535   PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5536   PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5537
5538   PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5539   PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5540
5541   PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5542   PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5543
5544   PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5545   PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5546
5547   PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5548   PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5549
5550   PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5551   PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5552
5553   PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5554   PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5555   PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5556   PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5557
5558   PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5559   PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5560
5561   PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5562
5563   PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5564   PL_L1Posix_ptrs[_CC_WORDCHAR]
5565         = _new_invlist_C_array(L1PosixWord_invlist);
5566
5567   PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5568   PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5569
5570   PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5571  }
5572 #endif
5573
5574  pRExC_state->code_blocks = NULL;
5575  pRExC_state->num_code_blocks = 0;
5576
5577  if (is_bare_re)
5578   *is_bare_re = FALSE;
5579
5580  if (expr && (expr->op_type == OP_LIST ||
5581     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5582   /* allocate code_blocks if needed */
5583   OP *o;
5584   int ncode = 0;
5585
5586   for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5587    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5588     ncode++; /* count of DO blocks */
5589   if (ncode) {
5590    pRExC_state->num_code_blocks = ncode;
5591    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5592   }
5593  }
5594
5595  if (!pat_count) {
5596   /* compile-time pattern with just OP_CONSTs and DO blocks */
5597
5598   int n;
5599   OP *o;
5600
5601   /* find how many CONSTs there are */
5602   assert(expr);
5603   n = 0;
5604   if (expr->op_type == OP_CONST)
5605    n = 1;
5606   else
5607    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5608     if (o->op_type == OP_CONST)
5609      n++;
5610    }
5611
5612   /* fake up an SV array */
5613
5614   assert(!new_patternp);
5615   Newx(new_patternp, n, SV*);
5616   SAVEFREEPV(new_patternp);
5617   pat_count = n;
5618
5619   n = 0;
5620   if (expr->op_type == OP_CONST)
5621    new_patternp[n] = cSVOPx_sv(expr);
5622   else
5623    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5624     if (o->op_type == OP_CONST)
5625      new_patternp[n++] = cSVOPo_sv;
5626    }
5627
5628  }
5629
5630  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5631   "Assembling pattern from %d elements%s\n", pat_count,
5632    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5633
5634  /* set expr to the first arg op */
5635
5636  if (pRExC_state->num_code_blocks
5637   && expr->op_type != OP_CONST)
5638  {
5639    expr = cLISTOPx(expr)->op_first;
5640    assert(   expr->op_type == OP_PUSHMARK
5641     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5642     || expr->op_type == OP_PADRANGE);
5643    expr = expr->op_sibling;
5644  }
5645
5646  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5647       expr, &recompile, NULL);
5648
5649  /* handle bare (possibly after overloading) regex: foo =~ $re */
5650  {
5651   SV *re = pat;
5652   if (SvROK(re))
5653    re = SvRV(re);
5654   if (SvTYPE(re) == SVt_REGEXP) {
5655    if (is_bare_re)
5656     *is_bare_re = TRUE;
5657    SvREFCNT_inc(re);
5658    Safefree(pRExC_state->code_blocks);
5659    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5660     "Precompiled pattern%s\n",
5661      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5662
5663    return (REGEXP*)re;
5664   }
5665  }
5666
5667  exp = SvPV_nomg(pat, plen);
5668
5669  if (!eng->op_comp) {
5670   if ((SvUTF8(pat) && IN_BYTES)
5671     || SvGMAGICAL(pat) || SvAMAGIC(pat))
5672   {
5673    /* make a temporary copy; either to convert to bytes,
5674    * or to avoid repeating get-magic / overloaded stringify */
5675    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5676           (IN_BYTES ? 0 : SvUTF8(pat)));
5677   }
5678   Safefree(pRExC_state->code_blocks);
5679   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5680  }
5681
5682  /* ignore the utf8ness if the pattern is 0 length */
5683  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5684  RExC_uni_semantics = 0;
5685  RExC_contains_locale = 0;
5686  pRExC_state->runtime_code_qr = NULL;
5687
5688  DEBUG_COMPILE_r({
5689    SV *dsv= sv_newmortal();
5690    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
5691    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5692       PL_colors[4],PL_colors[5],s);
5693   });
5694
5695   redo_first_pass:
5696  /* we jump here if we upgrade the pattern to utf8 and have to
5697  * recompile */
5698
5699  if ((pm_flags & PMf_USE_RE_EVAL)
5700     /* this second condition covers the non-regex literal case,
5701     * i.e.  $foo =~ '(?{})'. */
5702     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
5703  )
5704   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
5705
5706  /* return old regex if pattern hasn't changed */
5707  /* XXX: note in the below we have to check the flags as well as the pattern.
5708  *
5709  * Things get a touch tricky as we have to compare the utf8 flag independently
5710  * from the compile flags.
5711  */
5712
5713  if (   old_re
5714   && !recompile
5715   && !!RX_UTF8(old_re) == !!RExC_utf8
5716   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
5717   && RX_PRECOMP(old_re)
5718   && RX_PRELEN(old_re) == plen
5719   && memEQ(RX_PRECOMP(old_re), exp, plen)
5720   && !runtime_code /* with runtime code, always recompile */ )
5721  {
5722   Safefree(pRExC_state->code_blocks);
5723   return old_re;
5724  }
5725
5726  rx_flags = orig_rx_flags;
5727
5728  if (initial_charset == REGEX_LOCALE_CHARSET) {
5729   RExC_contains_locale = 1;
5730  }
5731  else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5732
5733   /* Set to use unicode semantics if the pattern is in utf8 and has the
5734   * 'depends' charset specified, as it means unicode when utf8  */
5735   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5736  }
5737
5738  RExC_precomp = exp;
5739  RExC_flags = rx_flags;
5740  RExC_pm_flags = pm_flags;
5741
5742  if (runtime_code) {
5743   if (TAINTING_get && TAINT_get)
5744    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5745
5746   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5747    /* whoops, we have a non-utf8 pattern, whilst run-time code
5748    * got compiled as utf8. Try again with a utf8 pattern */
5749    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5750          pRExC_state->num_code_blocks);
5751    goto redo_first_pass;
5752   }
5753  }
5754  assert(!pRExC_state->runtime_code_qr);
5755
5756  RExC_sawback = 0;
5757
5758  RExC_seen = 0;
5759  RExC_in_lookbehind = 0;
5760  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5761  RExC_extralen = 0;
5762  RExC_override_recoding = 0;
5763  RExC_in_multi_char_class = 0;
5764
5765  /* First pass: determine size, legality. */
5766  RExC_parse = exp;
5767  RExC_start = exp;
5768  RExC_end = exp + plen;
5769  RExC_naughty = 0;
5770  RExC_npar = 1;
5771  RExC_nestroot = 0;
5772  RExC_size = 0L;
5773  RExC_emit = &PL_regdummy;
5774  RExC_whilem_seen = 0;
5775  RExC_open_parens = NULL;
5776  RExC_close_parens = NULL;
5777  RExC_opend = NULL;
5778  RExC_paren_names = NULL;
5779 #ifdef DEBUGGING
5780  RExC_paren_name_list = NULL;
5781 #endif
5782  RExC_recurse = NULL;
5783  RExC_recurse_count = 0;
5784  pRExC_state->code_index = 0;
5785
5786 #if 0 /* REGC() is (currently) a NOP at the first pass.
5787  * Clever compilers notice this and complain. --jhi */
5788  REGC((U8)REG_MAGIC, (char*)RExC_emit);
5789 #endif
5790  DEBUG_PARSE_r(
5791   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5792   RExC_lastnum=0;
5793   RExC_lastparse=NULL;
5794  );
5795  /* reg may croak on us, not giving us a chance to free
5796  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5797  need it to survive as long as the regexp (qr/(?{})/).
5798  We must check that code_blocksv is not already set, because we may
5799  have jumped back to restart the sizing pass. */
5800  if (pRExC_state->code_blocks && !code_blocksv) {
5801   code_blocksv = newSV_type(SVt_PV);
5802   SAVEFREESV(code_blocksv);
5803   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5804   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5805  }
5806  if (reg(pRExC_state, 0, &flags,1) == NULL) {
5807   /* It's possible to write a regexp in ascii that represents Unicode
5808   codepoints outside of the byte range, such as via \x{100}. If we
5809   detect such a sequence we have to convert the entire pattern to utf8
5810   and then recompile, as our sizing calculation will have been based
5811   on 1 byte == 1 character, but we will need to use utf8 to encode
5812   at least some part of the pattern, and therefore must convert the whole
5813   thing.
5814   -- dmq */
5815   if (flags & RESTART_UTF8) {
5816    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5817          pRExC_state->num_code_blocks);
5818    goto redo_first_pass;
5819   }
5820   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
5821  }
5822  if (code_blocksv)
5823   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5824
5825  DEBUG_PARSE_r({
5826   PerlIO_printf(Perl_debug_log,
5827    "Required size %"IVdf" nodes\n"
5828    "Starting second pass (creation)\n",
5829    (IV)RExC_size);
5830   RExC_lastnum=0;
5831   RExC_lastparse=NULL;
5832  });
5833
5834  /* The first pass could have found things that force Unicode semantics */
5835  if ((RExC_utf8 || RExC_uni_semantics)
5836   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5837  {
5838   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5839  }
5840
5841  /* Small enough for pointer-storage convention?
5842  If extralen==0, this means that we will not need long jumps. */
5843  if (RExC_size >= 0x10000L && RExC_extralen)
5844   RExC_size += RExC_extralen;
5845  else
5846   RExC_extralen = 0;
5847  if (RExC_whilem_seen > 15)
5848   RExC_whilem_seen = 15;
5849
5850  /* Allocate space and zero-initialize. Note, the two step process
5851  of zeroing when in debug mode, thus anything assigned has to
5852  happen after that */
5853  rx = (REGEXP*) newSV_type(SVt_REGEXP);
5854  r = ReANY(rx);
5855  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5856   char, regexp_internal);
5857  if ( r == NULL || ri == NULL )
5858   FAIL("Regexp out of space");
5859 #ifdef DEBUGGING
5860  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5861  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5862 #else
5863  /* bulk initialize base fields with 0. */
5864  Zero(ri, sizeof(regexp_internal), char);
5865 #endif
5866
5867  /* non-zero initialization begins here */
5868  RXi_SET( r, ri );
5869  r->engine= eng;
5870  r->extflags = rx_flags;
5871  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
5872
5873  if (pm_flags & PMf_IS_QR) {
5874   ri->code_blocks = pRExC_state->code_blocks;
5875   ri->num_code_blocks = pRExC_state->num_code_blocks;
5876  }
5877  else
5878  {
5879   int n;
5880   for (n = 0; n < pRExC_state->num_code_blocks; n++)
5881    if (pRExC_state->code_blocks[n].src_regex)
5882     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5883   SAVEFREEPV(pRExC_state->code_blocks);
5884  }
5885
5886  {
5887   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5888   bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5889
5890   /* The caret is output if there are any defaults: if not all the STD
5891   * flags are set, or if no character set specifier is needed */
5892   bool has_default =
5893      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5894      || ! has_charset);
5895   bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5896   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5897        >> RXf_PMf_STD_PMMOD_SHIFT);
5898   const char *fptr = STD_PAT_MODS;        /*"msix"*/
5899   char *p;
5900   /* Allocate for the worst case, which is all the std flags are turned
5901   * on.  If more precision is desired, we could do a population count of
5902   * the flags set.  This could be done with a small lookup table, or by
5903   * shifting, masking and adding, or even, when available, assembly
5904   * language for a machine-language population count.
5905   * We never output a minus, as all those are defaults, so are
5906   * covered by the caret */
5907   const STRLEN wraplen = plen + has_p + has_runon
5908    + has_default       /* If needs a caret */
5909
5910     /* If needs a character set specifier */
5911    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5912    + (sizeof(STD_PAT_MODS) - 1)
5913    + (sizeof("(?:)") - 1);
5914
5915   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5916   r->xpv_len_u.xpvlenu_pv = p;
5917   if (RExC_utf8)
5918    SvFLAGS(rx) |= SVf_UTF8;
5919   *p++='('; *p++='?';
5920
5921   /* If a default, cover it using the caret */
5922   if (has_default) {
5923    *p++= DEFAULT_PAT_MOD;
5924   }
5925   if (has_charset) {
5926    STRLEN len;
5927    const char* const name = get_regex_charset_name(r->extflags, &len);
5928    Copy(name, p, len, char);
5929    p += len;
5930   }
5931   if (has_p)
5932    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5933   {
5934    char ch;
5935    while((ch = *fptr++)) {
5936     if(reganch & 1)
5937      *p++ = ch;
5938     reganch >>= 1;
5939    }
5940   }
5941
5942   *p++ = ':';
5943   Copy(RExC_precomp, p, plen, char);
5944   assert ((RX_WRAPPED(rx) - p) < 16);
5945   r->pre_prefix = p - RX_WRAPPED(rx);
5946   p += plen;
5947   if (has_runon)
5948    *p++ = '\n';
5949   *p++ = ')';
5950   *p = 0;
5951   SvCUR_set(rx, p - RX_WRAPPED(rx));
5952  }
5953
5954  r->intflags = 0;
5955  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5956
5957  if (RExC_seen & REG_SEEN_RECURSE) {
5958   Newxz(RExC_open_parens, RExC_npar,regnode *);
5959   SAVEFREEPV(RExC_open_parens);
5960   Newxz(RExC_close_parens,RExC_npar,regnode *);
5961   SAVEFREEPV(RExC_close_parens);
5962  }
5963
5964  /* Useful during FAIL. */
5965 #ifdef RE_TRACK_PATTERN_OFFSETS
5966  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5967  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5968       "%s %"UVuf" bytes for offset annotations.\n",
5969       ri->u.offsets ? "Got" : "Couldn't get",
5970       (UV)((2*RExC_size+1) * sizeof(U32))));
5971 #endif
5972  SetProgLen(ri,RExC_size);
5973  RExC_rx_sv = rx;
5974  RExC_rx = r;
5975  RExC_rxi = ri;
5976  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5977
5978  /* Second pass: emit code. */
5979  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5980  RExC_pm_flags = pm_flags;
5981  RExC_parse = exp;
5982  RExC_end = exp + plen;
5983  RExC_naughty = 0;
5984  RExC_npar = 1;
5985  RExC_emit_start = ri->program;
5986  RExC_emit = ri->program;
5987  RExC_emit_bound = ri->program + RExC_size + 1;
5988  pRExC_state->code_index = 0;
5989
5990  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5991  if (reg(pRExC_state, 0, &flags,1) == NULL) {
5992   ReREFCNT_dec(rx);
5993   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags);
5994  }
5995  /* XXXX To minimize changes to RE engine we always allocate
5996  3-units-long substrs field. */
5997  Newx(r->substrs, 1, struct reg_substr_data);
5998  if (RExC_recurse_count) {
5999   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6000   SAVEFREEPV(RExC_recurse);
6001  }
6002
6003 reStudy:
6004  r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6005  Zero(r->substrs, 1, struct reg_substr_data);
6006
6007 #ifdef TRIE_STUDY_OPT
6008  if (!restudied) {
6009   StructCopy(&zero_scan_data, &data, scan_data_t);
6010   copyRExC_state = RExC_state;
6011  } else {
6012   U32 seen=RExC_seen;
6013   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6014
6015   RExC_state = copyRExC_state;
6016   if (seen & REG_TOP_LEVEL_BRANCHES)
6017    RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6018   else
6019    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6020   StructCopy(&zero_scan_data, &data, scan_data_t);
6021  }
6022 #else
6023  StructCopy(&zero_scan_data, &data, scan_data_t);
6024 #endif
6025
6026  /* Dig out information for optimizations. */
6027  r->extflags = RExC_flags; /* was pm_op */
6028  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6029
6030  if (UTF)
6031   SvUTF8_on(rx); /* Unicode in it? */
6032  ri->regstclass = NULL;
6033  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6034   r->intflags |= PREGf_NAUGHTY;
6035  scan = ri->program + 1;  /* First BRANCH. */
6036
6037  /* testing for BRANCH here tells us whether there is "must appear"
6038  data in the pattern. If there is then we can use it for optimisations */
6039  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6040   I32 fake;
6041   STRLEN longest_float_length, longest_fixed_length;
6042   struct regnode_charclass_class ch_class; /* pointed to by data */
6043   int stclass_flag;
6044   I32 last_close = 0; /* pointed to by data */
6045   regnode *first= scan;
6046   regnode *first_next= regnext(first);
6047   /*
6048   * Skip introductions and multiplicators >= 1
6049   * so that we can extract the 'meat' of the pattern that must
6050   * match in the large if() sequence following.
6051   * NOTE that EXACT is NOT covered here, as it is normally
6052   * picked up by the optimiser separately.
6053   *
6054   * This is unfortunate as the optimiser isnt handling lookahead
6055   * properly currently.
6056   *
6057   */
6058   while ((OP(first) == OPEN && (sawopen = 1)) ||
6059    /* An OR of *one* alternative - should not happen now. */
6060    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6061    /* for now we can't handle lookbehind IFMATCH*/
6062    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6063    (OP(first) == PLUS) ||
6064    (OP(first) == MINMOD) ||
6065    /* An {n,m} with n>0 */
6066    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6067    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6068   {
6069     /*
6070     * the only op that could be a regnode is PLUS, all the rest
6071     * will be regnode_1 or regnode_2.
6072     *
6073     */
6074     if (OP(first) == PLUS)
6075      sawplus = 1;
6076     else
6077      first += regarglen[OP(first)];
6078
6079     first = NEXTOPER(first);
6080     first_next= regnext(first);
6081   }
6082
6083   /* Starting-point info. */
6084  again:
6085   DEBUG_PEEP("first:",first,0);
6086   /* Ignore EXACT as we deal with it later. */
6087   if (PL_regkind[OP(first)] == EXACT) {
6088    if (OP(first) == EXACT)
6089     NOOP; /* Empty, get anchored substr later. */
6090    else
6091     ri->regstclass = first;
6092   }
6093 #ifdef TRIE_STCLASS
6094   else if (PL_regkind[OP(first)] == TRIE &&
6095     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6096   {
6097    regnode *trie_op;
6098    /* this can happen only on restudy */
6099    if ( OP(first) == TRIE ) {
6100     struct regnode_1 *trieop = (struct regnode_1 *)
6101      PerlMemShared_calloc(1, sizeof(struct regnode_1));
6102     StructCopy(first,trieop,struct regnode_1);
6103     trie_op=(regnode *)trieop;
6104    } else {
6105     struct regnode_charclass *trieop = (struct regnode_charclass *)
6106      PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6107     StructCopy(first,trieop,struct regnode_charclass);
6108     trie_op=(regnode *)trieop;
6109    }
6110    OP(trie_op)+=2;
6111    make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6112    ri->regstclass = trie_op;
6113   }
6114 #endif
6115   else if (REGNODE_SIMPLE(OP(first)))
6116    ri->regstclass = first;
6117   else if (PL_regkind[OP(first)] == BOUND ||
6118     PL_regkind[OP(first)] == NBOUND)
6119    ri->regstclass = first;
6120   else if (PL_regkind[OP(first)] == BOL) {
6121    r->extflags |= (OP(first) == MBOL
6122       ? RXf_ANCH_MBOL
6123       : (OP(first) == SBOL
6124        ? RXf_ANCH_SBOL
6125        : RXf_ANCH_BOL));
6126    first = NEXTOPER(first);
6127    goto again;
6128   }
6129   else if (OP(first) == GPOS) {
6130    r->extflags |= RXf_ANCH_GPOS;
6131    first = NEXTOPER(first);
6132    goto again;
6133   }
6134   else if ((!sawopen || !RExC_sawback) &&
6135    (OP(first) == STAR &&
6136    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6137    !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6138   {
6139    /* turn .* into ^.* with an implied $*=1 */
6140    const int type =
6141     (OP(NEXTOPER(first)) == REG_ANY)
6142      ? RXf_ANCH_MBOL
6143      : RXf_ANCH_SBOL;
6144    r->extflags |= type;
6145    r->intflags |= PREGf_IMPLICIT;
6146    first = NEXTOPER(first);
6147    goto again;
6148   }
6149   if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6150    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6151    /* x+ must match at the 1st pos of run of x's */
6152    r->intflags |= PREGf_SKIP;
6153
6154   /* Scan is after the zeroth branch, first is atomic matcher. */
6155 #ifdef TRIE_STUDY_OPT
6156   DEBUG_PARSE_r(
6157    if (!restudied)
6158     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6159        (IV)(first - scan + 1))
6160   );
6161 #else
6162   DEBUG_PARSE_r(
6163    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6164     (IV)(first - scan + 1))
6165   );
6166 #endif
6167
6168
6169   /*
6170   * If there's something expensive in the r.e., find the
6171   * longest literal string that must appear and make it the
6172   * regmust.  Resolve ties in favor of later strings, since
6173   * the regstart check works with the beginning of the r.e.
6174   * and avoiding duplication strengthens checking.  Not a
6175   * strong reason, but sufficient in the absence of others.
6176   * [Now we resolve ties in favor of the earlier string if
6177   * it happens that c_offset_min has been invalidated, since the
6178   * earlier string may buy us something the later one won't.]
6179   */
6180
6181   data.longest_fixed = newSVpvs("");
6182   data.longest_float = newSVpvs("");
6183   data.last_found = newSVpvs("");
6184   data.longest = &(data.longest_fixed);
6185   ENTER_with_name("study_chunk");
6186   SAVEFREESV(data.longest_fixed);
6187   SAVEFREESV(data.longest_float);
6188   SAVEFREESV(data.last_found);
6189   first = scan;
6190   if (!ri->regstclass) {
6191    cl_init(pRExC_state, &ch_class);
6192    data.start_class = &ch_class;
6193    stclass_flag = SCF_DO_STCLASS_AND;
6194   } else    /* XXXX Check for BOUND? */
6195    stclass_flag = 0;
6196   data.last_closep = &last_close;
6197
6198   minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6199    &data, -1, NULL, NULL,
6200    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6201
6202
6203   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6204
6205
6206   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6207    && data.last_start_min == 0 && data.last_end > 0
6208    && !RExC_seen_zerolen
6209    && !(RExC_seen & REG_SEEN_VERBARG)
6210    && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6211    r->extflags |= RXf_CHECK_ALL;
6212   scan_commit(pRExC_state, &data,&minlen,0);
6213
6214   longest_float_length = CHR_SVLEN(data.longest_float);
6215
6216   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6217     && data.offset_fixed == data.offset_float_min
6218     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6219    && S_setup_longest (aTHX_ pRExC_state,
6220          data.longest_float,
6221          &(r->float_utf8),
6222          &(r->float_substr),
6223          &(r->float_end_shift),
6224          data.lookbehind_float,
6225          data.offset_float_min,
6226          data.minlen_float,
6227          longest_float_length,
6228          cBOOL(data.flags & SF_FL_BEFORE_EOL),
6229          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6230   {
6231    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6232    r->float_max_offset = data.offset_float_max;
6233    if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6234     r->float_max_offset -= data.lookbehind_float;
6235    SvREFCNT_inc_simple_void_NN(data.longest_float);
6236   }
6237   else {
6238    r->float_substr = r->float_utf8 = NULL;
6239    longest_float_length = 0;
6240   }
6241
6242   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6243
6244   if (S_setup_longest (aTHX_ pRExC_state,
6245         data.longest_fixed,
6246         &(r->anchored_utf8),
6247         &(r->anchored_substr),
6248         &(r->anchored_end_shift),
6249         data.lookbehind_fixed,
6250         data.offset_fixed,
6251         data.minlen_fixed,
6252         longest_fixed_length,
6253         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6254         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6255   {
6256    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6257    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6258   }
6259   else {
6260    r->anchored_substr = r->anchored_utf8 = NULL;
6261    longest_fixed_length = 0;
6262   }
6263   LEAVE_with_name("study_chunk");
6264
6265   if (ri->regstclass
6266    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6267    ri->regstclass = NULL;
6268
6269   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6270    && stclass_flag
6271    && ! TEST_SSC_EOS(data.start_class)
6272    && !cl_is_anything(data.start_class))
6273   {
6274    const U32 n = add_data(pRExC_state, 1, "f");
6275    OP(data.start_class) = ANYOF_SYNTHETIC;
6276
6277    Newx(RExC_rxi->data->data[n], 1,
6278     struct regnode_charclass_class);
6279    StructCopy(data.start_class,
6280      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6281      struct regnode_charclass_class);
6282    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6283    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6284    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6285      regprop(r, sv, (regnode*)data.start_class);
6286      PerlIO_printf(Perl_debug_log,
6287          "synthetic stclass \"%s\".\n",
6288          SvPVX_const(sv));});
6289   }
6290
6291   /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6292   if (longest_fixed_length > longest_float_length) {
6293    r->check_end_shift = r->anchored_end_shift;
6294    r->check_substr = r->anchored_substr;
6295    r->check_utf8 = r->anchored_utf8;
6296    r->check_offset_min = r->check_offset_max = r->anchored_offset;
6297    if (r->extflags & RXf_ANCH_SINGLE)
6298     r->extflags |= RXf_NOSCAN;
6299   }
6300   else {
6301    r->check_end_shift = r->float_end_shift;
6302    r->check_substr = r->float_substr;
6303    r->check_utf8 = r->float_utf8;
6304    r->check_offset_min = r->float_min_offset;
6305    r->check_offset_max = r->float_max_offset;
6306   }
6307   /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6308   This should be changed ASAP!  */
6309   if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6310    r->extflags |= RXf_USE_INTUIT;
6311    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6312     r->extflags |= RXf_INTUIT_TAIL;
6313   }
6314   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6315   if ( (STRLEN)minlen < longest_float_length )
6316    minlen= longest_float_length;
6317   if ( (STRLEN)minlen < longest_fixed_length )
6318    minlen= longest_fixed_length;
6319   */
6320  }
6321  else {
6322   /* Several toplevels. Best we can is to set minlen. */
6323   I32 fake;
6324   struct regnode_charclass_class ch_class;
6325   I32 last_close = 0;
6326
6327   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6328
6329   scan = ri->program + 1;
6330   cl_init(pRExC_state, &ch_class);
6331   data.start_class = &ch_class;
6332   data.last_closep = &last_close;
6333
6334
6335   minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6336    &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6337
6338   CHECK_RESTUDY_GOTO_butfirst(NOOP);
6339
6340   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6341     = r->float_substr = r->float_utf8 = NULL;
6342
6343   if (! TEST_SSC_EOS(data.start_class)
6344    && !cl_is_anything(data.start_class))
6345   {
6346    const U32 n = add_data(pRExC_state, 1, "f");
6347    OP(data.start_class) = ANYOF_SYNTHETIC;
6348
6349    Newx(RExC_rxi->data->data[n], 1,
6350     struct regnode_charclass_class);
6351    StructCopy(data.start_class,
6352      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6353      struct regnode_charclass_class);
6354    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6355    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6356    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6357      regprop(r, sv, (regnode*)data.start_class);
6358      PerlIO_printf(Perl_debug_log,
6359          "synthetic stclass \"%s\".\n",
6360          SvPVX_const(sv));});
6361   }
6362  }
6363
6364  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6365  the "real" pattern. */
6366  DEBUG_OPTIMISE_r({
6367   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6368      (IV)minlen, (IV)r->minlen);
6369  });
6370  r->minlenret = minlen;
6371  if (r->minlen < minlen)
6372   r->minlen = minlen;
6373
6374  if (RExC_seen & REG_SEEN_GPOS)
6375   r->extflags |= RXf_GPOS_SEEN;
6376  if (RExC_seen & REG_SEEN_LOOKBEHIND)
6377   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6378  if (pRExC_state->num_code_blocks)
6379   r->extflags |= RXf_EVAL_SEEN;
6380  if (RExC_seen & REG_SEEN_CANY)
6381   r->extflags |= RXf_CANY_SEEN;
6382  if (RExC_seen & REG_SEEN_VERBARG)
6383  {
6384   r->intflags |= PREGf_VERBARG_SEEN;
6385   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6386  }
6387  if (RExC_seen & REG_SEEN_CUTGROUP)
6388   r->intflags |= PREGf_CUTGROUP_SEEN;
6389  if (pm_flags & PMf_USE_RE_EVAL)
6390   r->intflags |= PREGf_USE_RE_EVAL;
6391  if (RExC_paren_names)
6392   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6393  else
6394   RXp_PAREN_NAMES(r) = NULL;
6395
6396  {
6397   regnode *first = ri->program + 1;
6398   U8 fop = OP(first);
6399   regnode *next = NEXTOPER(first);
6400   U8 nop = OP(next);
6401
6402   if (PL_regkind[fop] == NOTHING && nop == END)
6403    r->extflags |= RXf_NULL;
6404   else if (PL_regkind[fop] == BOL && nop == END)
6405    r->extflags |= RXf_START_ONLY;
6406   else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6407    r->extflags |= RXf_WHITE;
6408   else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6409    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6410
6411  }
6412 #ifdef DEBUGGING
6413  if (RExC_paren_names) {
6414   ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6415   ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6416  } else
6417 #endif
6418   ri->name_list_idx = 0;
6419
6420  if (RExC_recurse_count) {
6421   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6422    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6423    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6424   }
6425  }
6426  Newxz(r->offs, RExC_npar, regexp_paren_pair);
6427  /* assume we don't need to swap parens around before we match */
6428
6429  DEBUG_DUMP_r({
6430   PerlIO_printf(Perl_debug_log,"Final program:\n");
6431   regdump(r);
6432  });
6433 #ifdef RE_TRACK_PATTERN_OFFSETS
6434  DEBUG_OFFSETS_r(if (ri->u.offsets) {
6435   const U32 len = ri->u.offsets[0];
6436   U32 i;
6437   GET_RE_DEBUG_FLAGS_DECL;
6438   PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6439   for (i = 1; i <= len; i++) {
6440    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6441     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6442     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6443    }
6444   PerlIO_printf(Perl_debug_log, "\n");
6445  });
6446 #endif
6447
6448 #ifdef USE_ITHREADS
6449  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6450  * by setting the regexp SV to readonly-only instead. If the
6451  * pattern's been recompiled, the USEDness should remain. */
6452  if (old_re && SvREADONLY(old_re))
6453   SvREADONLY_on(rx);
6454 #endif
6455  return rx;
6456 }
6457
6458
6459 SV*
6460 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6461      const U32 flags)
6462 {
6463  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6464
6465  PERL_UNUSED_ARG(value);
6466
6467  if (flags & RXapif_FETCH) {
6468   return reg_named_buff_fetch(rx, key, flags);
6469  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6470   Perl_croak_no_modify();
6471   return NULL;
6472  } else if (flags & RXapif_EXISTS) {
6473   return reg_named_buff_exists(rx, key, flags)
6474    ? &PL_sv_yes
6475    : &PL_sv_no;
6476  } else if (flags & RXapif_REGNAMES) {
6477   return reg_named_buff_all(rx, flags);
6478  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6479   return reg_named_buff_scalar(rx, flags);
6480  } else {
6481   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6482   return NULL;
6483  }
6484 }
6485
6486 SV*
6487 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6488       const U32 flags)
6489 {
6490  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6491  PERL_UNUSED_ARG(lastkey);
6492
6493  if (flags & RXapif_FIRSTKEY)
6494   return reg_named_buff_firstkey(rx, flags);
6495  else if (flags & RXapif_NEXTKEY)
6496   return reg_named_buff_nextkey(rx, flags);
6497  else {
6498   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6499   return NULL;
6500  }
6501 }
6502
6503 SV*
6504 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6505       const U32 flags)
6506 {
6507  AV *retarray = NULL;
6508  SV *ret;
6509  struct regexp *const rx = ReANY(r);
6510
6511  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6512
6513  if (flags & RXapif_ALL)
6514   retarray=newAV();
6515
6516  if (rx && RXp_PAREN_NAMES(rx)) {
6517   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6518   if (he_str) {
6519    IV i;
6520    SV* sv_dat=HeVAL(he_str);
6521    I32 *nums=(I32*)SvPVX(sv_dat);
6522    for ( i=0; i<SvIVX(sv_dat); i++ ) {
6523     if ((I32)(rx->nparens) >= nums[i]
6524      && rx->offs[nums[i]].start != -1
6525      && rx->offs[nums[i]].end != -1)
6526     {
6527      ret = newSVpvs("");
6528      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6529      if (!retarray)
6530       return ret;
6531     } else {
6532      if (retarray)
6533       ret = newSVsv(&PL_sv_undef);
6534     }
6535     if (retarray)
6536      av_push(retarray, ret);
6537    }
6538    if (retarray)
6539     return newRV_noinc(MUTABLE_SV(retarray));
6540   }
6541  }
6542  return NULL;
6543 }
6544
6545 bool
6546 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6547       const U32 flags)
6548 {
6549  struct regexp *const rx = ReANY(r);
6550
6551  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6552
6553  if (rx && RXp_PAREN_NAMES(rx)) {
6554   if (flags & RXapif_ALL) {
6555    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6556   } else {
6557    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6558    if (sv) {
6559     SvREFCNT_dec_NN(sv);
6560     return TRUE;
6561    } else {
6562     return FALSE;
6563    }
6564   }
6565  } else {
6566   return FALSE;
6567  }
6568 }
6569
6570 SV*
6571 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6572 {
6573  struct regexp *const rx = ReANY(r);
6574
6575  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6576
6577  if ( rx && RXp_PAREN_NAMES(rx) ) {
6578   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6579
6580   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6581  } else {
6582   return FALSE;
6583  }
6584 }
6585
6586 SV*
6587 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6588 {
6589  struct regexp *const rx = ReANY(r);
6590  GET_RE_DEBUG_FLAGS_DECL;
6591
6592  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6593
6594  if (rx && RXp_PAREN_NAMES(rx)) {
6595   HV *hv = RXp_PAREN_NAMES(rx);
6596   HE *temphe;
6597   while ( (temphe = hv_iternext_flags(hv,0)) ) {
6598    IV i;
6599    IV parno = 0;
6600    SV* sv_dat = HeVAL(temphe);
6601    I32 *nums = (I32*)SvPVX(sv_dat);
6602    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6603     if ((I32)(rx->lastparen) >= nums[i] &&
6604      rx->offs[nums[i]].start != -1 &&
6605      rx->offs[nums[i]].end != -1)
6606     {
6607      parno = nums[i];
6608      break;
6609     }
6610    }
6611    if (parno || flags & RXapif_ALL) {
6612     return newSVhek(HeKEY_hek(temphe));
6613    }
6614   }
6615  }
6616  return NULL;
6617 }
6618
6619 SV*
6620 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6621 {
6622  SV *ret;
6623  AV *av;
6624  I32 length;
6625  struct regexp *const rx = ReANY(r);
6626
6627  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6628
6629  if (rx && RXp_PAREN_NAMES(rx)) {
6630   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6631    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6632   } else if (flags & RXapif_ONE) {
6633    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6634    av = MUTABLE_AV(SvRV(ret));
6635    length = av_len(av);
6636    SvREFCNT_dec_NN(ret);
6637    return newSViv(length + 1);
6638   } else {
6639    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6640    return NULL;
6641   }
6642  }
6643  return &PL_sv_undef;
6644 }
6645
6646 SV*
6647 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6648 {
6649  struct regexp *const rx = ReANY(r);
6650  AV *av = newAV();
6651
6652  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6653
6654  if (rx && RXp_PAREN_NAMES(rx)) {
6655   HV *hv= RXp_PAREN_NAMES(rx);
6656   HE *temphe;
6657   (void)hv_iterinit(hv);
6658   while ( (temphe = hv_iternext_flags(hv,0)) ) {
6659    IV i;
6660    IV parno = 0;
6661    SV* sv_dat = HeVAL(temphe);
6662    I32 *nums = (I32*)SvPVX(sv_dat);
6663    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6664     if ((I32)(rx->lastparen) >= nums[i] &&
6665      rx->offs[nums[i]].start != -1 &&
6666      rx->offs[nums[i]].end != -1)
6667     {
6668      parno = nums[i];
6669      break;
6670     }
6671    }
6672    if (parno || flags & RXapif_ALL) {
6673     av_push(av, newSVhek(HeKEY_hek(temphe)));
6674    }
6675   }
6676  }
6677
6678  return newRV_noinc(MUTABLE_SV(av));
6679 }
6680
6681 void
6682 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6683        SV * const sv)
6684 {
6685  struct regexp *const rx = ReANY(r);
6686  char *s = NULL;
6687  I32 i = 0;
6688  I32 s1, t1;
6689  I32 n = paren;
6690
6691  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6692
6693  if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6694   || n == RX_BUFF_IDX_CARET_FULLMATCH
6695   || n == RX_BUFF_IDX_CARET_POSTMATCH
6696   )
6697   && !(rx->extflags & RXf_PMf_KEEPCOPY)
6698  )
6699   goto ret_undef;
6700
6701  if (!rx->subbeg)
6702   goto ret_undef;
6703
6704  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6705   /* no need to distinguish between them any more */
6706   n = RX_BUFF_IDX_FULLMATCH;
6707
6708  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6709   && rx->offs[0].start != -1)
6710  {
6711   /* $`, ${^PREMATCH} */
6712   i = rx->offs[0].start;
6713   s = rx->subbeg;
6714  }
6715  else
6716  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6717   && rx->offs[0].end != -1)
6718  {
6719   /* $', ${^POSTMATCH} */
6720   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6721   i = rx->sublen + rx->suboffset - rx->offs[0].end;
6722  }
6723  else
6724  if ( 0 <= n && n <= (I32)rx->nparens &&
6725   (s1 = rx->offs[n].start) != -1 &&
6726   (t1 = rx->offs[n].end) != -1)
6727  {
6728   /* $&, ${^MATCH},  $1 ... */
6729   i = t1 - s1;
6730   s = rx->subbeg + s1 - rx->suboffset;
6731  } else {
6732   goto ret_undef;
6733  }
6734
6735  assert(s >= rx->subbeg);
6736  assert(rx->sublen >= (s - rx->subbeg) + i );
6737  if (i >= 0) {
6738 #if NO_TAINT_SUPPORT
6739   sv_setpvn(sv, s, i);
6740 #else
6741   const int oldtainted = TAINT_get;
6742   TAINT_NOT;
6743   sv_setpvn(sv, s, i);
6744   TAINT_set(oldtainted);
6745 #endif
6746   if ( (rx->extflags & RXf_CANY_SEEN)
6747    ? (RXp_MATCH_UTF8(rx)
6748       && (!i || is_utf8_string((U8*)s, i)))
6749    : (RXp_MATCH_UTF8(rx)) )
6750   {
6751    SvUTF8_on(sv);
6752   }
6753   else
6754    SvUTF8_off(sv);
6755   if (TAINTING_get) {
6756    if (RXp_MATCH_TAINTED(rx)) {
6757     if (SvTYPE(sv) >= SVt_PVMG) {
6758      MAGIC* const mg = SvMAGIC(sv);
6759      MAGIC* mgt;
6760      TAINT;
6761      SvMAGIC_set(sv, mg->mg_moremagic);
6762      SvTAINT(sv);
6763      if ((mgt = SvMAGIC(sv))) {
6764       mg->mg_moremagic = mgt;
6765       SvMAGIC_set(sv, mg);
6766      }
6767     } else {
6768      TAINT;
6769      SvTAINT(sv);
6770     }
6771    } else
6772     SvTAINTED_off(sv);
6773   }
6774  } else {
6775  ret_undef:
6776   sv_setsv(sv,&PL_sv_undef);
6777   return;
6778  }
6779 }
6780
6781 void
6782 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6783               SV const * const value)
6784 {
6785  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6786
6787  PERL_UNUSED_ARG(rx);
6788  PERL_UNUSED_ARG(paren);
6789  PERL_UNUSED_ARG(value);
6790
6791  if (!PL_localizing)
6792   Perl_croak_no_modify();
6793 }
6794
6795 I32
6796 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6797        const I32 paren)
6798 {
6799  struct regexp *const rx = ReANY(r);
6800  I32 i;
6801  I32 s1, t1;
6802
6803  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6804
6805  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6806  switch (paren) {
6807  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6808   if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6809    goto warn_undef;
6810   /*FALLTHROUGH*/
6811
6812  case RX_BUFF_IDX_PREMATCH:       /* $` */
6813   if (rx->offs[0].start != -1) {
6814       i = rx->offs[0].start;
6815       if (i > 0) {
6816         s1 = 0;
6817         t1 = i;
6818         goto getlen;
6819       }
6820    }
6821   return 0;
6822
6823  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6824   if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6825    goto warn_undef;
6826  case RX_BUFF_IDX_POSTMATCH:       /* $' */
6827    if (rx->offs[0].end != -1) {
6828       i = rx->sublen - rx->offs[0].end;
6829       if (i > 0) {
6830         s1 = rx->offs[0].end;
6831         t1 = rx->sublen;
6832         goto getlen;
6833       }
6834    }
6835   return 0;
6836
6837  case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6838   if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6839    goto warn_undef;
6840   /*FALLTHROUGH*/
6841
6842  /* $& / ${^MATCH}, $1, $2, ... */
6843  default:
6844    if (paren <= (I32)rx->nparens &&
6845    (s1 = rx->offs[paren].start) != -1 &&
6846    (t1 = rx->offs[paren].end) != -1)
6847    {
6848    i = t1 - s1;
6849    goto getlen;
6850   } else {
6851   warn_undef:
6852    if (ckWARN(WARN_UNINITIALIZED))
6853     report_uninit((const SV *)sv);
6854    return 0;
6855   }
6856  }
6857   getlen:
6858  if (i > 0 && RXp_MATCH_UTF8(rx)) {
6859   const char * const s = rx->subbeg - rx->suboffset + s1;
6860   const U8 *ep;
6861   STRLEN el;
6862
6863   i = t1 - s1;
6864   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6865       i = el;
6866  }
6867  return i;
6868 }
6869
6870 SV*
6871 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6872 {
6873  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6874   PERL_UNUSED_ARG(rx);
6875   if (0)
6876    return NULL;
6877   else
6878    return newSVpvs("Regexp");
6879 }
6880
6881 /* Scans the name of a named buffer from the pattern.
6882  * If flags is REG_RSN_RETURN_NULL returns null.
6883  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6884  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6885  * to the parsed name as looked up in the RExC_paren_names hash.
6886  * If there is an error throws a vFAIL().. type exception.
6887  */
6888
6889 #define REG_RSN_RETURN_NULL    0
6890 #define REG_RSN_RETURN_NAME    1
6891 #define REG_RSN_RETURN_DATA    2
6892
6893 STATIC SV*
6894 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6895 {
6896  char *name_start = RExC_parse;
6897
6898  PERL_ARGS_ASSERT_REG_SCAN_NAME;
6899
6900  if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6901   /* skip IDFIRST by using do...while */
6902   if (UTF)
6903    do {
6904     RExC_parse += UTF8SKIP(RExC_parse);
6905    } while (isWORDCHAR_utf8((U8*)RExC_parse));
6906   else
6907    do {
6908     RExC_parse++;
6909    } while (isWORDCHAR(*RExC_parse));
6910  } else {
6911   RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6912   vFAIL("Group name must start with a non-digit word character");
6913  }
6914  if ( flags ) {
6915   SV* sv_name
6916    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6917        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6918   if ( flags == REG_RSN_RETURN_NAME)
6919    return sv_name;
6920   else if (flags==REG_RSN_RETURN_DATA) {
6921    HE *he_str = NULL;
6922    SV *sv_dat = NULL;
6923    if ( ! sv_name )      /* should not happen*/
6924     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6925    if (RExC_paren_names)
6926     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6927    if ( he_str )
6928     sv_dat = HeVAL(he_str);
6929    if ( ! sv_dat )
6930     vFAIL("Reference to nonexistent named group");
6931    return sv_dat;
6932   }
6933   else {
6934    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6935      (unsigned long) flags);
6936   }
6937   assert(0); /* NOT REACHED */
6938  }
6939  return NULL;
6940 }
6941
/* Writes the leading columns of one parse-trace line to Perl_debug_log:
 *  - up to 10 bytes of the pattern still to be parsed, followed by "..." if
 *    more than 10 remain and by "<" otherwise; or 16 blanks if the parse
 *    position is the same as for the previous message
 *  - '|' and the current node number (RExC_size + 1 under SIZE_ONLY), or
 *    blanks if that number is the same as for the previous message
 *  - '|', 2 * depth blanks, and <funcname>
 * and then remembers the parse position and node number for the next call.
 * A variable named 'depth' must be in scope where this is expanded.
 *
 * The body is handed to DEBUG_PARSE_r() as a single argument, so each local
 * is declared in a statement of its own, with no comma outside of
 * parentheses. */
#define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({                       \
    const int avail = (int)(RExC_end - RExC_parse);                         \
    const int truncated = (avail > 10);                                     \
    const int shown = truncated ? 10 : avail;                               \
    const int pad = 10 - shown;                                             \
    const int node_num = (SIZE_ONLY) ? (RExC_size + 1)                      \
                                     : (REG_NODE_NUM(RExC_emit));           \
    if (RExC_lastparse == RExC_parse)                                       \
        PerlIO_printf(Perl_debug_log,"%16s","");                            \
    else                                                                    \
        PerlIO_printf(Perl_debug_log," >%.*s%-*s",                          \
            shown, RExC_parse,                                              \
            pad + 4,                                                        \
            truncated ? "..." : "<"                                         \
        );                                                                  \
    if (RExC_lastnum == node_num)                                           \
        PerlIO_printf(Perl_debug_log,"|%4s","");                            \
    else                                                                    \
        PerlIO_printf(Perl_debug_log,"|%4d",node_num);                      \
    PerlIO_printf(Perl_debug_log,"|%*s%-4s",                                \
        (int)((depth*2)), "",                                               \
        (funcname)                                                          \
    );                                                                      \
    RExC_lastnum=node_num;                                                  \
    RExC_lastparse=RExC_parse;                                              \
})

/* A complete trace line: the DEBUG_PARSE_MSG() columns and a newline */
#define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({                           \
    DEBUG_PARSE_MSG((funcname));                                            \
    PerlIO_printf(Perl_debug_log,"%4s","\n");                               \
})

/* Like DEBUG_PARSE(), but <fmt> (a string literal, as "\n" is pasted onto
 * it) and the single argument <args> are printed before the newline */
#define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({              \
    DEBUG_PARSE_MSG((funcname));                                            \
    PerlIO_printf(Perl_debug_log,fmt "\n",args);                            \
})
6987
6988 /* This section of code defines the inversion list object and its methods.  The
6989  * interfaces are highly subject to change, so as much as possible is static to
6990  * this file.  An inversion list is here implemented as a malloc'd C UV array
6991  * with some added info that is placed as UVs at the beginning in a header
6992  * portion.  An inversion list for Unicode is an array of code points, sorted
6993  * by ordinal number.  The zeroth element is the first code point in the list.
6994  * The 1th element is the first element beyond that not in the list.  In other
6995  * words, the first range is
6996  *  invlist[0]..(invlist[1]-1)
6997  * The other ranges follow.  Thus every element whose index is divisible by two
6998  * marks the beginning of a range that is in the list, and every element not
6999  * divisible by two marks the beginning of a range not in the list.  A single
7000  * element inversion list that contains the single code point N generally
7001  * consists of two elements
7002  *  invlist[0] == N
7003  *  invlist[1] == N+1
7004  * (The exception is when N is the highest representable value on the
7005  * machine, in which case the list containing just it would be a single
7006  * element, itself.  By extension, if the last range in the list extends to
7007  * infinity, then the first element of that range will be in the inversion list
7008  * at a position that is divisible by two, and is the final element in the
7009  * list.)
7010  * Taking the complement (inverting) an inversion list is quite simple: if the
7011  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7012  * This implementation reserves an element at the beginning of each inversion
7013  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
7014  * actual beginning of the list is either that element if 0, or the next one if
7015  * 1.
7016  *
7017  * More about inversion lists can be found in "Unicode Demystified"
7018  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7019  * More will be coming when functionality is added later.
7020  *
7021  * The inversion list data structure is currently implemented as an SV pointing
7022  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7023  * array of UV whose memory management is automatically handled by the existing
7024  * facilities for SV's.
7025  *
7026  * Some of the methods should always be private to the implementation, and some
7027  * should eventually be made public */
7028
/* HEADER_LENGTH and the other header definitions are in
 * F<inline_invlist.c> */

/* Bytes occupied by the header plus <x> elements of the array */
#define TO_INTERNAL_SIZE(x) (sizeof(UV) * (HEADER_LENGTH + (x)))

/* The reverse: how many array elements fit, besides the header, in <x>
 * bytes */
#define FROM_INTERNAL_SIZE(x) (((x) / sizeof(UV)) - HEADER_LENGTH)

/* Number of elements _new_invlist() makes room for when it is passed a
 * negative size */
#define INVLIST_INITIAL_LEN 10
7034
7035 PERL_STATIC_INLINE UV*
7036 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7037 {
7038  /* Returns a pointer to the first element in the inversion list's array.
7039  * This is called upon initialization of an inversion list.  Where the
7040  * array begins depends on whether the list has the code point U+0000
7041  * in it or not.  The other parameter tells it whether the code that
7042  * follows this call is about to put a 0 in the inversion list or not.
7043  * The first element is either the element with 0, if 0, or the next one,
7044  * if 1 */
7045
7046  UV* zero = get_invlist_zero_addr(invlist);
7047
7048  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7049
7050  /* Must be empty */
7051  assert(! *_get_invlist_len_addr(invlist));
7052
7053  /* 1^1 = 0; 1^0 = 1 */
7054  *zero = 1 ^ will_have_0;
7055  return zero + *zero;
7056 }
7057
7058 PERL_STATIC_INLINE UV*
7059 S_invlist_array(pTHX_ SV* const invlist)
7060 {
7061  /* Returns the pointer to the inversion list's array.  Every time the
7062  * length changes, this needs to be called in case malloc or realloc moved
7063  * it */
7064
7065  PERL_ARGS_ASSERT_INVLIST_ARRAY;
7066
7067  /* Must not be empty.  If these fail, you probably didn't check for <len>
7068  * being non-zero before trying to get the array */
7069  assert(*_get_invlist_len_addr(invlist));
7070  assert(*get_invlist_zero_addr(invlist) == 0
7071   || *get_invlist_zero_addr(invlist) == 1);
7072
7073  /* The array begins either at the element reserved for zero if the
7074  * list contains 0 (that element will be set to 0), or otherwise the next
7075  * element (in which case the reserved element will be set to 1). */
7076  return (UV *) (get_invlist_zero_addr(invlist)
7077     + *get_invlist_zero_addr(invlist));
7078 }
7079
7080 PERL_STATIC_INLINE void
7081 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7082 {
7083  /* Sets the current number of elements stored in the inversion list */
7084
7085  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7086
7087  *_get_invlist_len_addr(invlist) = len;
7088
7089  assert(len <= SvLEN(invlist));
7090
7091  SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7092  /* If the list contains U+0000, that element is part of the header,
7093  * and should not be counted as part of the array.  It will contain
7094  * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7095  * subtract:
7096  * SvCUR_set(invlist,
7097  *    TO_INTERNAL_SIZE(len
7098  *       - (*get_invlist_zero_addr(inv_list) ^ 1)));
7099  * But, this is only valid if len is not 0.  The consequences of not doing
7100  * this is that the memory allocation code may think that 1 more UV is
7101  * being used than actually is, and so might do an unnecessary grow.  That
7102  * seems worth not bothering to make this the precise amount.
7103  *
7104  * Note that when inverting, SvCUR shouldn't change */
7105 }
7106
7107 PERL_STATIC_INLINE IV*
7108 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7109 {
7110  /* Return the address of the UV that is reserved to hold the cached index
7111  * */
7112
7113  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7114
7115  return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7116 }
7117
7118 PERL_STATIC_INLINE IV
7119 S_invlist_previous_index(pTHX_ SV* const invlist)
7120 {
7121  /* Returns cached index of previous search */
7122
7123  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7124
7125  return *get_invlist_previous_index_addr(invlist);
7126 }
7127
7128 PERL_STATIC_INLINE void
7129 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7130 {
7131  /* Caches <index> for later retrieval */
7132
7133  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7134
7135  assert(index == 0 || index < (int) _invlist_len(invlist));
7136
7137  *get_invlist_previous_index_addr(invlist) = index;
7138 }
7139
7140 PERL_STATIC_INLINE UV
7141 S_invlist_max(pTHX_ SV* const invlist)
7142 {
7143  /* Returns the maximum number of elements storable in the inversion list's
7144  * array, without having to realloc() */
7145
7146  PERL_ARGS_ASSERT_INVLIST_MAX;
7147
7148  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7149   ? _invlist_len(invlist)
7150   : FROM_INTERNAL_SIZE(SvLEN(invlist));
7151 }
7152
7153 PERL_STATIC_INLINE UV*
7154 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7155 {
7156  /* Return the address of the UV that is reserved to hold 0 if the inversion
7157  * list contains 0.  This has to be the last element of the heading, as the
7158  * list proper starts with either it if 0, or the next element if not.
7159  * (But we force it to contain either 0 or 1) */
7160
7161  PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7162
7163  return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7164 }
7165
7166 #ifndef PERL_IN_XSUB_RE
7167 SV*
7168 Perl__new_invlist(pTHX_ IV initial_size)
7169 {
7170
7171  /* Return a pointer to a newly constructed inversion list, with enough
7172  * space to store 'initial_size' elements.  If that number is negative, a
7173  * system default is used instead */
7174
7175  SV* new_list;
7176
7177  if (initial_size < 0) {
7178   initial_size = INVLIST_INITIAL_LEN;
7179  }
7180
7181  /* Allocate the initial space */
7182  new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7183  invlist_set_len(new_list, 0);
7184
7185  /* Force iterinit() to be used to get iteration to work */
7186  *get_invlist_iter_addr(new_list) = UV_MAX;
7187
7188  /* This should force a segfault if a method doesn't initialize this
7189  * properly */
7190  *get_invlist_zero_addr(new_list) = UV_MAX;
7191
7192  *get_invlist_previous_index_addr(new_list) = 0;
7193  *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7194 #if HEADER_LENGTH != 5
7195 #   error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7196 #endif
7197
7198  return new_list;
7199 }
7200 #endif
7201
7202 STATIC SV*
7203 S__new_invlist_C_array(pTHX_ UV* list)
7204 {
7205  /* Return a pointer to a newly constructed inversion list, initialized to
7206  * point to <list>, which has to be in the exact correct inversion list
7207  * form, including internal fields.  Thus this is a dangerous routine that
7208  * should not be used in the wrong hands */
7209
7210  SV* invlist = newSV_type(SVt_PV);
7211
7212  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7213
7214  SvPV_set(invlist, (char *) list);
7215  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7216        shouldn't touch it */
7217  SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7218
7219  if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7220   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7221  }
7222
7223  /* Initialize the iteration pointer.
7224  * XXX This could be done at compile time in charclass_invlists.h, but I
7225  * (khw) am not confident that the suffixes for specifying the C constant
7226  * UV_MAX are portable, e.g.  'ull' on a 32 bit machine that is configured
7227  * to use 64 bits; might need a Configure probe */
7228  invlist_iterfinish(invlist);
7229
7230  return invlist;
7231 }
7232
7233 STATIC void
7234 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7235 {
7236  /* Grow the maximum size of an inversion list */
7237
7238  PERL_ARGS_ASSERT_INVLIST_EXTEND;
7239
7240  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7241 }
7242
7243 PERL_STATIC_INLINE void
7244 S_invlist_trim(pTHX_ SV* const invlist)
7245 {
7246  PERL_ARGS_ASSERT_INVLIST_TRIM;
7247
7248  /* Change the length of the inversion list to how many entries it currently
7249  * has */
7250
7251  SvPV_shrink_to_cur((SV *) invlist);
7252 }
7253
7254 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7255
7256 STATIC void
7257 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7258 {
7259    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7260  * the end of the inversion list.  The range must be above any existing
7261  * ones. */
7262
7263  UV* array;
7264  UV max = invlist_max(invlist);
7265  UV len = _invlist_len(invlist);
7266
7267  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7268
7269  if (len == 0) { /* Empty lists must be initialized */
7270   array = _invlist_array_init(invlist, start == 0);
7271  }
7272  else {
7273   /* Here, the existing list is non-empty. The current max entry in the
7274   * list is generally the first value not in the set, except when the
7275   * set extends to the end of permissible values, in which case it is
7276   * the first entry in that final set, and so this call is an attempt to
7277   * append out-of-order */
7278
7279   UV final_element = len - 1;
7280   array = invlist_array(invlist);
7281   if (array[final_element] > start
7282    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7283   {
7284    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7285      array[final_element], start,
7286      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7287   }
7288
7289   /* Here, it is a legal append.  If the new range begins with the first
7290   * value not in the set, it is extending the set, so the new first
7291   * value not in the set is one greater than the newly extended range.
7292   * */
7293   if (array[final_element] == start) {
7294    if (end != UV_MAX) {
7295     array[final_element] = end + 1;
7296    }
7297    else {
7298     /* But if the end is the maximum representable on the machine,
7299     * just let the range that this would extend to have no end */
7300     invlist_set_len(invlist, len - 1);
7301    }
7302    return;
7303   }
7304  }
7305
7306  /* Here the new range doesn't extend any existing set.  Add it */
7307
7308  len += 2; /* Includes an element each for the start and end of range */
7309
7310  /* If overflows the existing space, extend, which may cause the array to be
7311  * moved */
7312  if (max < len) {
7313   invlist_extend(invlist, len);
7314   invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7315           failure in invlist_array() */
7316   array = invlist_array(invlist);
7317  }
7318  else {
7319   invlist_set_len(invlist, len);
7320  }
7321
7322  /* The next item on the list starts the range, the one after that is
7323  * one past the new range.  */
7324  array[len - 2] = start;
7325  if (end != UV_MAX) {
7326   array[len - 1] = end + 1;
7327  }
7328  else {
7329   /* But if the end is the maximum representable on the machine, just let
7330   * the range have no end */
7331   invlist_set_len(invlist, len - 1);
7332  }
7333 }
7334
7335 #ifndef PERL_IN_XSUB_RE
7336
7337 IV
7338 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7339 {
7340  /* Searches the inversion list for the entry that contains the input code
7341  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7342  * return value is the index into the list's array of the range that
7343  * contains <cp> */
7344
7345  IV low = 0;
7346  IV mid;
7347  IV high = _invlist_len(invlist);
7348  const IV highest_element = high - 1;
7349  const UV* array;
7350
7351  PERL_ARGS_ASSERT__INVLIST_SEARCH;
7352
7353  /* If list is empty, return failure. */
7354  if (high == 0) {
7355   return -1;
7356  }
7357
7358  /* (We can't get the array unless we know the list is non-empty) */
7359  array = invlist_array(invlist);
7360
7361  mid = invlist_previous_index(invlist);
7362  assert(mid >=0 && mid <= highest_element);
7363
7364  /* <mid> contains the cache of the result of the previous call to this
7365  * function (0 the first time).  See if this call is for the same result,
7366  * or if it is for mid-1.  This is under the theory that calls to this
7367  * function will often be for related code points that are near each other.
7368  * And benchmarks show that caching gives better results.  We also test
7369  * here if the code point is within the bounds of the list.  These tests
7370  * replace others that would have had to be made anyway to make sure that
7371  * the array bounds were not exceeded, and these give us extra information
7372  * at the same time */
7373  if (cp >= array[mid]) {
7374   if (cp >= array[highest_element]) {
7375    return highest_element;
7376   }
7377
7378   /* Here, array[mid] <= cp < array[highest_element].  This means that
7379   * the final element is not the answer, so can exclude it; it also
7380   * means that <mid> is not the final element, so can refer to 'mid + 1'
7381   * safely */
7382   if (cp < array[mid + 1]) {
7383    return mid;
7384   }
7385   high--;
7386   low = mid + 1;
7387  }
7388  else { /* cp < aray[mid] */
7389   if (cp < array[0]) { /* Fail if outside the array */
7390    return -1;
7391   }
7392   high = mid;
7393   if (cp >= array[mid - 1]) {
7394    goto found_entry;
7395   }
7396  }
7397
7398  /* Binary search.  What we are looking for is <i> such that
7399  * array[i] <= cp < array[i+1]
7400  * The loop below converges on the i+1.  Note that there may not be an
7401  * (i+1)th element in the array, and things work nonetheless */
7402  while (low < high) {
7403   mid = (low + high) / 2;
7404   assert(mid <= highest_element);
7405   if (array[mid] <= cp) { /* cp >= array[mid] */
7406    low = mid + 1;
7407
7408    /* We could do this extra test to exit the loop early.
7409    if (cp < array[low]) {
7410     return mid;
7411    }
7412    */
7413   }
7414   else { /* cp < array[mid] */
7415    high = mid;
7416   }
7417  }
7418
7419   found_entry:
7420  high--;
7421  invlist_set_previous_index(invlist, high);
7422  return high;
7423 }
7424
7425 void
7426 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7427 {
7428  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7429  * but is used when the swash has an inversion list.  This makes this much
7430  * faster, as it uses a binary search instead of a linear one.  This is
7431  * intimately tied to that function, and perhaps should be in utf8.c,
7432  * except it is intimately tied to inversion lists as well.  It assumes
7433  * that <swatch> is all 0's on input */
7434
7435  UV current = start;
7436  const IV len = _invlist_len(invlist);
7437  IV i;
7438  const UV * array;
7439
7440  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7441
7442  if (len == 0) { /* Empty inversion list */
7443   return;
7444  }
7445
7446  array = invlist_array(invlist);
7447
7448  /* Find which element it is */
7449  i = _invlist_search(invlist, start);
7450
7451  /* We populate from <start> to <end> */
7452  while (current < end) {
7453   UV upper;
7454
7455   /* The inversion list gives the results for every possible code point
7456   * after the first one in the list.  Only those ranges whose index is
7457   * even are ones that the inversion list matches.  For the odd ones,
7458   * and if the initial code point is not in the list, we have to skip
7459   * forward to the next element */
7460   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7461    i++;
7462    if (i >= len) { /* Finished if beyond the end of the array */
7463     return;
7464    }
7465    current = array[i];
7466    if (current >= end) {   /* Finished if beyond the end of what we
7467          are populating */
7468     if (LIKELY(end < UV_MAX)) {
7469      return;
7470     }
7471
7472     /* We get here when the upper bound is the maximum
7473     * representable on the machine, and we are looking for just
7474     * that code point.  Have to special case it */
7475     i = len;
7476     goto join_end_of_list;
7477    }
7478   }
7479   assert(current >= start);
7480
7481   /* The current range ends one below the next one, except don't go past
7482   * <end> */
7483   i++;
7484   upper = (i < len && array[i] < end) ? array[i] : end;
7485
7486   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7487   * for each code point in it */
7488   for (; current < upper; current++) {
7489    const STRLEN offset = (STRLEN)(current - start);
7490    swatch[offset >> 3] |= 1 << (offset & 7);
7491   }
7492
7493  join_end_of_list:
7494
7495   /* Quit if at the end of the list */
7496   if (i >= len) {
7497
7498    /* But first, have to deal with the highest possible code point on
7499    * the platform.  The previous code assumes that <end> is one
7500    * beyond where we want to populate, but that is impossible at the
7501    * platform's infinity, so have to handle it specially */
7502    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7503    {
7504     const STRLEN offset = (STRLEN)(end - start);
7505     swatch[offset >> 3] |= 1 << (offset & 7);
7506    }
7507    return;
7508   }
7509
7510   /* Advance to the next range, which will be for code points not in the
7511   * inversion list */
7512   current = array[i];
7513  }
7514
7515  return;
7516 }
7517
7518 void
7519 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7520 {
7521  /* Take the union of two inversion lists and point <output> to it.  *output
7522  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7523  * the reference count to that list will be decremented.  The first list,
7524  * <a>, may be NULL, in which case a copy of the second list is returned.
7525  * If <complement_b> is TRUE, the union is taken of the complement
7526  * (inversion) of <b> instead of b itself.
7527  *
7528  * The basis for this comes from "Unicode Demystified" Chapter 13 by
7529  * Richard Gillam, published by Addison-Wesley, and explained at some
7530  * length there.  The preface says to incorporate its examples into your
7531  * code at your own risk.
7532  *
7533  * The algorithm is like a merge sort.
7534  *
7535  * XXX A potential performance improvement is to keep track as we go along
7536  * if only one of the inputs contributes to the result, meaning the other
7537  * is a subset of that one.  In that case, we can skip the final copy and
7538  * return the larger of the input lists, but then outside code might need
7539  * to keep track of whether to free the input list or not */
7540
7541  UV* array_a;    /* a's array */
7542  UV* array_b;
7543  UV len_a;     /* length of a's array */
7544  UV len_b;
7545
7546  SV* u;   /* the resulting union */
7547  UV* array_u;
7548  UV len_u;
7549
7550  UV i_a = 0;      /* current index into a's array */
7551  UV i_b = 0;
7552  UV i_u = 0;
7553
7554  /* running count, as explained in the algorithm source book; items are
7555  * stopped accumulating and are output when the count changes to/from 0.
7556  * The count is incremented when we start a range that's in the set, and
7557  * decremented when we start a range that's not in the set.  So its range
7558  * is 0 to 2.  Only when the count is zero is something not in the set.
7559  */
7560  UV count = 0;
7561
7562  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7563  assert(a != b);
7564
7565  /* If either one is empty, the union is the other one */
7566  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7567   if (*output == a) {
7568    if (a != NULL) {
7569     SvREFCNT_dec_NN(a);
7570    }
7571   }
7572   if (*output != b) {
7573    *output = invlist_clone(b);
7574    if (complement_b) {
7575     _invlist_invert(*output);
7576    }
7577   } /* else *output already = b; */
7578   return;
7579  }
7580  else if ((len_b = _invlist_len(b)) == 0) {
7581   if (*output == b) {
7582    SvREFCNT_dec_NN(b);
7583   }
7584
7585   /* The complement of an empty list is a list that has everything in it,
7586   * so the union with <a> includes everything too */
7587   if (complement_b) {
7588    if (a == *output) {
7589     SvREFCNT_dec_NN(a);
7590    }
7591    *output = _new_invlist(1);
7592    _append_range_to_invlist(*output, 0, UV_MAX);
7593   }
7594   else if (*output != a) {
7595    *output = invlist_clone(a);
7596   }
7597   /* else *output already = a; */
7598   return;
7599  }
7600
7601  /* Here both lists exist and are non-empty */
7602  array_a = invlist_array(a);
7603  array_b = invlist_array(b);
7604
7605  /* If are to take the union of 'a' with the complement of b, set it
7606  * up so are looking at b's complement. */
7607  if (complement_b) {
7608
7609   /* To complement, we invert: if the first element is 0, remove it.  To
7610   * do this, we just pretend the array starts one later, and clear the
7611   * flag as we don't have to do anything else later */
7612   if (array_b[0] == 0) {
7613    array_b++;
7614    len_b--;
7615    complement_b = FALSE;
7616   }
7617   else {
7618
7619    /* But if the first element is not zero, we unshift a 0 before the
7620    * array.  The data structure reserves a space for that 0 (which
7621    * should be a '1' right now), so physical shifting is unneeded,
7622    * but temporarily change that element to 0.  Before exiting the
7623    * routine, we must restore the element to '1' */
7624    array_b--;
7625    len_b++;
7626    array_b[0] = 0;
7627   }
7628  }
7629
7630  /* Size the union for the worst case: that the sets are completely
7631  * disjoint */
7632  u = _new_invlist(len_a + len_b);
7633
7634  /* Will contain U+0000 if either component does */
7635  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7636          || (len_b > 0 && array_b[0] == 0));
7637
7638  /* Go through each list item by item, stopping when exhausted one of
7639  * them */
7640  while (i_a < len_a && i_b < len_b) {
7641   UV cp;     /* The element to potentially add to the union's array */
7642   bool cp_in_set;   /* is it in the the input list's set or not */
7643
7644   /* We need to take one or the other of the two inputs for the union.
7645   * Since we are merging two sorted lists, we take the smaller of the
7646   * next items.  In case of a tie, we take the one that is in its set
7647   * first.  If we took one not in the set first, it would decrement the
7648   * count, possibly to 0 which would cause it to be output as ending the
7649   * range, and the next time through we would take the same number, and
7650   * output it again as beginning the next range.  By doing it the
7651   * opposite way, there is no possibility that the count will be
7652   * momentarily decremented to 0, and thus the two adjoining ranges will
7653   * be seamlessly merged.  (In a tie and both are in the set or both not
7654   * in the set, it doesn't matter which we take first.) */
7655   if (array_a[i_a] < array_b[i_b]
7656    || (array_a[i_a] == array_b[i_b]
7657     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7658   {
7659    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7660    cp= array_a[i_a++];
7661   }
7662   else {
7663    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7664    cp = array_b[i_b++];
7665   }
7666
7667   /* Here, have chosen which of the two inputs to look at.  Only output
7668   * if the running count changes to/from 0, which marks the
7669   * beginning/end of a range in that's in the set */
7670   if (cp_in_set) {
7671    if (count == 0) {
7672     array_u[i_u++] = cp;
7673    }
7674    count++;
7675   }
7676   else {
7677    count--;
7678    if (count == 0) {
7679     array_u[i_u++] = cp;
7680    }
7681   }
7682  }
7683
7684  /* Here, we are finished going through at least one of the lists, which
7685  * means there is something remaining in at most one.  We check if the list
7686  * that hasn't been exhausted is positioned such that we are in the middle
7687  * of a range in its set or not.  (i_a and i_b point to the element beyond
7688  * the one we care about.) If in the set, we decrement 'count'; if 0, there
7689  * is potentially more to output.
7690  * There are four cases:
7691  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
7692  *    in the union is entirely from the non-exhausted set.
7693  * 2) Both were in their sets, count is 2.  Nothing further should
7694  *    be output, as everything that remains will be in the exhausted
7695  *    list's set, hence in the union; decrementing to 1 but not 0 insures
7696  *    that
7697  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7698  *    Nothing further should be output because the union includes
7699  *    everything from the exhausted set.  Not decrementing ensures that.
7700  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7701  *    decrementing to 0 insures that we look at the remainder of the
7702  *    non-exhausted set */
7703  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7704   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7705  {
7706   count--;
7707  }
7708
7709  /* The final length is what we've output so far, plus what else is about to
7710  * be output.  (If 'count' is non-zero, then the input list we exhausted
7711  * has everything remaining up to the machine's limit in its set, and hence
7712  * in the union, so there will be no further output. */
7713  len_u = i_u;
7714  if (count == 0) {
7715   /* At most one of the subexpressions will be non-zero */
7716   len_u += (len_a - i_a) + (len_b - i_b);
7717  }
7718
7719  /* Set result to final length, which can change the pointer to array_u, so
7720  * re-find it */
7721  if (len_u != _invlist_len(u)) {
7722   invlist_set_len(u, len_u);
7723   invlist_trim(u);
7724   array_u = invlist_array(u);
7725  }
7726
7727  /* When 'count' is 0, the list that was exhausted (if one was shorter than
7728  * the other) ended with everything above it not in its set.  That means
7729  * that the remaining part of the union is precisely the same as the
7730  * non-exhausted list, so can just copy it unchanged.  (If both list were
7731  * exhausted at the same time, then the operations below will be both 0.)
7732  */
7733  if (count == 0) {
7734   IV copy_count; /* At most one will have a non-zero copy count */
7735   if ((copy_count = len_a - i_a) > 0) {
7736    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7737   }
7738   else if ((copy_count = len_b - i_b) > 0) {
7739    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7740   }
7741  }
7742
7743  /* If we've changed b, restore it */
7744  if (complement_b) {
7745   array_b[0] = 1;
7746  }
7747
7748  /*  We may be removing a reference to one of the inputs */
7749  if (a == *output || b == *output) {
7750   assert(! invlist_is_iterating(*output));
7751   SvREFCNT_dec_NN(*output);
7752  }
7753
7754  *output = u;
7755  return;
7756 }
7757
7758 void
7759 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7760 {
7761  /* Take the intersection of two inversion lists and point <i> to it.  *i
7762  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7763  * the reference count to that list will be decremented.
7764  * If <complement_b> is TRUE, the result will be the intersection of <a>
7765  * and the complement (or inversion) of <b> instead of <b> directly.
7766  *
7767  * The basis for this comes from "Unicode Demystified" Chapter 13 by
7768  * Richard Gillam, published by Addison-Wesley, and explained at some
7769  * length there.  The preface says to incorporate its examples into your
7770  * code at your own risk.  In fact, it had bugs
7771  *
7772  * The algorithm is like a merge sort, and is essentially the same as the
7773  * union above
7774  */
7775
7776  UV* array_a;  /* a's array */
7777  UV* array_b;
7778  UV len_a; /* length of a's array */
7779  UV len_b;
7780
7781  SV* r;       /* the resulting intersection */
7782  UV* array_r;
7783  UV len_r;
7784
7785  UV i_a = 0;      /* current index into a's array */
7786  UV i_b = 0;
7787  UV i_r = 0;
7788
7789  /* running count, as explained in the algorithm source book; items are
7790  * stopped accumulating and are output when the count changes to/from 2.
7791  * The count is incremented when we start a range that's in the set, and
7792  * decremented when we start a range that's not in the set.  So its range
7793  * is 0 to 2.  Only when the count is 2 is something in the intersection.
7794  */
7795  UV count = 0;
7796
7797  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7798  assert(a != b);
7799
7800  /* Special case if either one is empty */
7801  len_a = _invlist_len(a);
7802  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7803
7804   if (len_a != 0 && complement_b) {
7805
7806    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7807    * be empty.  Here, also we are using 'b's complement, which hence
7808    * must be every possible code point.  Thus the intersection is
7809    * simply 'a'. */
7810    if (*i != a) {
7811     *i = invlist_clone(a);
7812
7813     if (*i == b) {
7814      SvREFCNT_dec_NN(b);
7815     }
7816    }
7817    /* else *i is already 'a' */
7818    return;
7819   }
7820
7821   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7822   * intersection must be empty */
7823   if (*i == a) {
7824    SvREFCNT_dec_NN(a);
7825   }
7826   else if (*i == b) {
7827    SvREFCNT_dec_NN(b);
7828   }
7829   *i = _new_invlist(0);
7830   return;
7831  }
7832
7833  /* Here both lists exist and are non-empty */
7834  array_a = invlist_array(a);
7835  array_b = invlist_array(b);
7836
7837  /* If are to take the intersection of 'a' with the complement of b, set it
7838  * up so are looking at b's complement. */
7839  if (complement_b) {
7840
7841   /* To complement, we invert: if the first element is 0, remove it.  To
7842   * do this, we just pretend the array starts one later, and clear the
7843   * flag as we don't have to do anything else later */
7844   if (array_b[0] == 0) {
7845    array_b++;
7846    len_b--;
7847    complement_b = FALSE;
7848   }
7849   else {
7850
7851    /* But if the first element is not zero, we unshift a 0 before the
7852    * array.  The data structure reserves a space for that 0 (which
7853    * should be a '1' right now), so physical shifting is unneeded,
7854    * but temporarily change that element to 0.  Before exiting the
7855    * routine, we must restore the element to '1' */
7856    array_b--;
7857    len_b++;
7858    array_b[0] = 0;
7859   }
7860  }
7861
7862  /* Size the intersection for the worst case: that the intersection ends up
7863  * fragmenting everything to be completely disjoint */
7864  r= _new_invlist(len_a + len_b);
7865
7866  /* Will contain U+0000 iff both components do */
7867  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7868          && len_b > 0 && array_b[0] == 0);
7869
7870  /* Go through each list item by item, stopping when exhausted one of
7871  * them */
7872  while (i_a < len_a && i_b < len_b) {
7873   UV cp;     /* The element to potentially add to the intersection's
7874      array */
7875   bool cp_in_set; /* Is it in the input list's set or not */
7876
7877   /* We need to take one or the other of the two inputs for the
7878   * intersection.  Since we are merging two sorted lists, we take the
7879   * smaller of the next items.  In case of a tie, we take the one that
7880   * is not in its set first (a difference from the union algorithm).  If
7881   * we took one in the set first, it would increment the count, possibly
7882   * to 2 which would cause it to be output as starting a range in the
7883   * intersection, and the next time through we would take that same
7884   * number, and output it again as ending the set.  By doing it the
7885   * opposite of this, there is no possibility that the count will be
7886   * momentarily incremented to 2.  (In a tie and both are in the set or
7887   * both not in the set, it doesn't matter which we take first.) */
7888   if (array_a[i_a] < array_b[i_b]
7889    || (array_a[i_a] == array_b[i_b]
7890     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7891   {
7892    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7893    cp= array_a[i_a++];
7894   }
7895   else {
7896    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7897    cp= array_b[i_b++];
7898   }
7899
7900   /* Here, have chosen which of the two inputs to look at.  Only output
7901   * if the running count changes to/from 2, which marks the
7902   * beginning/end of a range that's in the intersection */
7903   if (cp_in_set) {
7904    count++;
7905    if (count == 2) {
7906     array_r[i_r++] = cp;
7907    }
7908   }
7909   else {
7910    if (count == 2) {
7911     array_r[i_r++] = cp;
7912    }
7913    count--;
7914   }
7915  }
7916
7917  /* Here, we are finished going through at least one of the lists, which
7918  * means there is something remaining in at most one.  We check if the list
7919  * that has been exhausted is positioned such that we are in the middle
7920  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7921  * the ones we care about.)  There are four cases:
7922  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
7923  *    nothing left in the intersection.
7924  * 2) Both were in their sets, count is 2 and perhaps is incremented to
7925  *    above 2.  What should be output is exactly that which is in the
7926  *    non-exhausted set, as everything it has is also in the intersection
7927  *    set, and everything it doesn't have can't be in the intersection
7928  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7929  *    gets incremented to 2.  Like the previous case, the intersection is
7930  *    everything that remains in the non-exhausted set.
7931  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7932  *    remains 1.  And the intersection has nothing more. */
7933  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7934   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7935  {
7936   count++;
7937  }
7938
7939  /* The final length is what we've output so far plus what else is in the
7940  * intersection.  At most one of the subexpressions below will be non-zero */
7941  len_r = i_r;
7942  if (count >= 2) {
7943   len_r += (len_a - i_a) + (len_b - i_b);
7944  }
7945
7946  /* Set result to final length, which can change the pointer to array_r, so
7947  * re-find it */
7948  if (len_r != _invlist_len(r)) {
7949   invlist_set_len(r, len_r);
7950   invlist_trim(r);
7951   array_r = invlist_array(r);
7952  }
7953
7954  /* Finish outputting any remaining */
7955  if (count >= 2) { /* At most one will have a non-zero copy count */
7956   IV copy_count;
7957   if ((copy_count = len_a - i_a) > 0) {
7958    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7959   }
7960   else if ((copy_count = len_b - i_b) > 0) {
7961    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7962   }
7963  }
7964
7965  /* If we've changed b, restore it */
7966  if (complement_b) {
7967   array_b[0] = 1;
7968  }
7969
7970  /*  We may be removing a reference to one of the inputs */
7971  if (a == *i || b == *i) {
7972   assert(! invlist_is_iterating(*i));
7973   SvREFCNT_dec_NN(*i);
7974  }
7975
7976  *i = r;
7977  return;
7978 }
7979
7980 SV*
7981 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7982 {
7983  /* Add the range from 'start' to 'end' inclusive to the inversion list's
7984  * set.  A pointer to the inversion list is returned.  This may actually be
7985  * a new list, in which case the passed in one has been destroyed.  The
7986  * passed in inversion list can be NULL, in which case a new one is created
7987  * with just the one range in it */
7988
7989  SV* range_invlist;
7990  UV len;
7991
7992  if (invlist == NULL) {
7993   invlist = _new_invlist(2);
7994   len = 0;
7995  }
7996  else {
7997   len = _invlist_len(invlist);
7998  }
7999
8000  /* If comes after the final entry actually in the list, can just append it
8001  * to the end, */
8002  if (len == 0
8003   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8004    && start >= invlist_array(invlist)[len - 1]))
8005  {
8006   _append_range_to_invlist(invlist, start, end);
8007   return invlist;
8008  }
8009
8010  /* Here, can't just append things, create and return a new inversion list
8011  * which is the union of this range and the existing inversion list */
8012  range_invlist = _new_invlist(2);
8013  _append_range_to_invlist(range_invlist, start, end);
8014
8015  _invlist_union(invlist, range_invlist, &invlist);
8016
8017  /* The temporary can be freed */
8018  SvREFCNT_dec_NN(range_invlist);
8019
8020  return invlist;
8021 }
8022
8023 #endif
8024
8025 PERL_STATIC_INLINE SV*
8026 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8027  return _add_range_to_invlist(invlist, cp, cp);
8028 }
8029
8030 #ifndef PERL_IN_XSUB_RE
8031 void
8032 Perl__invlist_invert(pTHX_ SV* const invlist)
8033 {
8034  /* Complement the input inversion list.  This adds a 0 if the list didn't
8035  * have a zero; removes it otherwise.  As described above, the data
8036  * structure is set up so that this is very efficient */
8037
8038  UV* len_pos = _get_invlist_len_addr(invlist);
8039
8040  PERL_ARGS_ASSERT__INVLIST_INVERT;
8041
8042  assert(! invlist_is_iterating(invlist));
8043
8044  /* The inverse of matching nothing is matching everything */
8045  if (*len_pos == 0) {
8046   _append_range_to_invlist(invlist, 0, UV_MAX);
8047   return;
8048  }
8049
8050  /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
8051  * zero element was a 0, so it is being removed, so the length decrements
8052  * by 1; and vice-versa.  SvCUR is unaffected */
8053  if (*get_invlist_zero_addr(invlist) ^= 1) {
8054   (*len_pos)--;
8055  }
8056  else {
8057   (*len_pos)++;
8058  }
8059 }
8060
8061 void
8062 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8063 {
8064  /* Complement the input inversion list (which must be a Unicode property,
8065  * all of which don't match above the Unicode maximum code point.)  And
8066  * Perl has chosen to not have the inversion match above that either.  This
8067  * adds a 0x110000 if the list didn't end with it, and removes it if it did
8068  */
8069
8070  UV len;
8071  UV* array;
8072
8073  PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8074
8075  _invlist_invert(invlist);
8076
8077  len = _invlist_len(invlist);
8078
8079  if (len != 0) { /* If empty do nothing */
8080   array = invlist_array(invlist);
8081   if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8082    /* Add 0x110000.  First, grow if necessary */
8083    len++;
8084    if (invlist_max(invlist) < len) {
8085     invlist_extend(invlist, len);
8086     array = invlist_array(invlist);
8087    }
8088    invlist_set_len(invlist, len);
8089    array[len - 1] = PERL_UNICODE_MAX + 1;
8090   }
8091   else {  /* Remove the 0x110000 */
8092    invlist_set_len(invlist, len - 1);
8093   }
8094  }
8095
8096  return;
8097 }
8098 #endif
8099
8100 PERL_STATIC_INLINE SV*
8101 S_invlist_clone(pTHX_ SV* const invlist)
8102 {
8103
8104  /* Return a new inversion list that is a copy of the input one, which is
8105  * unchanged */
8106
8107  /* Need to allocate extra space to accommodate Perl's addition of a
8108  * trailing NUL to SvPV's, since it thinks they are always strings */
8109  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8110  STRLEN length = SvCUR(invlist);
8111
8112  PERL_ARGS_ASSERT_INVLIST_CLONE;
8113
8114  SvCUR_set(new_invlist, length); /* This isn't done automatically */
8115  Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8116
8117  return new_invlist;
8118 }
8119
8120 PERL_STATIC_INLINE UV*
8121 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8122 {
8123  /* Return the address of the UV that contains the current iteration
8124  * position */
8125
8126  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8127
8128  return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8129 }
8130
8131 PERL_STATIC_INLINE UV*
8132 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8133 {
8134  /* Return the address of the UV that contains the version id. */
8135
8136  PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8137
8138  return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8139 }
8140
8141 PERL_STATIC_INLINE void
8142 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8143 {
8144  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8145
8146  *get_invlist_iter_addr(invlist) = 0;
8147 }
8148
8149 PERL_STATIC_INLINE void
8150 S_invlist_iterfinish(pTHX_ SV* invlist)
8151 {
8152  /* Terminate iterator for invlist.  This is to catch development errors.
8153  * Any iteration that is interrupted before completed should call this
8154  * function.  Functions that add code points anywhere else but to the end
8155  * of an inversion list assert that they are not in the middle of an
8156  * iteration.  If they were, the addition would make the iteration
8157  * problematical: if the iteration hadn't reached the place where things
8158  * were being added, it would be ok */
8159
8160  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8161
8162  *get_invlist_iter_addr(invlist) = UV_MAX;
8163 }
8164
8165 STATIC bool
8166 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8167 {
8168  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8169  * This call sets in <*start> and <*end>, the next range in <invlist>.
8170  * Returns <TRUE> if successful and the next call will return the next
8171  * range; <FALSE> if was already at the end of the list.  If the latter,
8172  * <*start> and <*end> are unchanged, and the next call to this function
8173  * will start over at the beginning of the list */
8174
8175  UV* pos = get_invlist_iter_addr(invlist);
8176  UV len = _invlist_len(invlist);
8177  UV *array;
8178
8179  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8180
8181  if (*pos >= len) {
8182   *pos = UV_MAX; /* Force iterinit() to be required next time */
8183   return FALSE;
8184  }
8185
8186  array = invlist_array(invlist);
8187
8188  *start = array[(*pos)++];
8189
8190  if (*pos >= len) {
8191   *end = UV_MAX;
8192  }
8193  else {
8194   *end = array[(*pos)++] - 1;
8195  }
8196
8197  return TRUE;
8198 }
8199
8200 PERL_STATIC_INLINE bool
8201 S_invlist_is_iterating(pTHX_ SV* const invlist)
8202 {
8203  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8204
8205  return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8206 }
8207
8208 PERL_STATIC_INLINE UV
8209 S_invlist_highest(pTHX_ SV* const invlist)
8210 {
8211  /* Returns the highest code point that matches an inversion list.  This API
8212  * has an ambiguity, as it returns 0 under either the highest is actually
8213  * 0, or if the list is empty.  If this distinction matters to you, check
8214  * for emptiness before calling this function */
8215
8216  UV len = _invlist_len(invlist);
8217  UV *array;
8218
8219  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8220
8221  if (len == 0) {
8222   return 0;
8223  }
8224
8225  array = invlist_array(invlist);
8226
8227  /* The last element in the array in the inversion list always starts a
8228  * range that goes to infinity.  That range may be for code points that are
8229  * matched in the inversion list, or it may be for ones that aren't
8230  * matched.  In the latter case, the highest code point in the set is one
8231  * less than the beginning of this range; otherwise it is the final element
8232  * of this range: infinity */
8233  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8234   ? UV_MAX
8235   : array[len - 1] - 1;
8236 }
8237
8238 #ifndef PERL_IN_XSUB_RE
8239 SV *
8240 Perl__invlist_contents(pTHX_ SV* const invlist)
8241 {
8242  /* Get the contents of an inversion list into a string SV so that they can
8243  * be printed out.  It uses the format traditionally done for debug tracing
8244  */
8245
8246  UV start, end;
8247  SV* output = newSVpvs("\n");
8248
8249  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8250
8251  assert(! invlist_is_iterating(invlist));
8252
8253  invlist_iterinit(invlist);
8254  while (invlist_iternext(invlist, &start, &end)) {
8255   if (end == UV_MAX) {
8256    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8257   }
8258   else if (end != start) {
8259    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8260      start,       end);
8261   }
8262   else {
8263    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8264   }
8265  }
8266
8267  return output;
8268 }
8269 #endif
8270
8271 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8272 void
8273 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8274 {
8275  /* Dumps out the ranges in an inversion list.  The string 'header'
8276  * if present is output on a line before the first range */
8277
8278  UV start, end;
8279
8280  PERL_ARGS_ASSERT__INVLIST_DUMP;
8281
8282  if (header && strlen(header)) {
8283   PerlIO_printf(Perl_debug_log, "%s\n", header);
8284  }
8285  if (invlist_is_iterating(invlist)) {
8286   PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8287   return;
8288  }
8289
8290  invlist_iterinit(invlist);
8291  while (invlist_iternext(invlist, &start, &end)) {
8292   if (end == UV_MAX) {
8293    PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8294   }
8295   else if (end != start) {
8296    PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8297             start,         end);
8298   }
8299   else {
8300    PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8301   }
8302  }
8303 }
8304 #endif
8305
8306 #if 0
8307 bool
8308 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8309 {
8310  /* Return a boolean as to if the two passed in inversion lists are
8311  * identical.  The final argument, if TRUE, says to take the complement of
8312  * the second inversion list before doing the comparison */
8313
8314  UV* array_a = invlist_array(a);
8315  UV* array_b = invlist_array(b);
8316  UV len_a = _invlist_len(a);
8317  UV len_b = _invlist_len(b);
8318
8319  UV i = 0;      /* current index into the arrays */
8320  bool retval = TRUE;     /* Assume are identical until proven otherwise */
8321
8322  PERL_ARGS_ASSERT__INVLISTEQ;
8323
8324  /* If are to compare 'a' with the complement of b, set it
8325  * up so are looking at b's complement. */
8326  if (complement_b) {
8327
8328   /* The complement of nothing is everything, so <a> would have to have
8329   * just one element, starting at zero (ending at infinity) */
8330   if (len_b == 0) {
8331    return (len_a == 1 && array_a[0] == 0);
8332   }
8333   else if (array_b[0] == 0) {
8334
8335    /* Otherwise, to complement, we invert.  Here, the first element is
8336    * 0, just remove it.  To do this, we just pretend the array starts
8337    * one later, and clear the flag as we don't have to do anything
8338    * else later */
8339
8340    array_b++;
8341    len_b--;
8342    complement_b = FALSE;
8343   }
8344   else {
8345
8346    /* But if the first element is not zero, we unshift a 0 before the
8347    * array.  The data structure reserves a space for that 0 (which
8348    * should be a '1' right now), so physical shifting is unneeded,
8349    * but temporarily change that element to 0.  Before exiting the
8350    * routine, we must restore the element to '1' */
8351    array_b--;
8352    len_b++;
8353    array_b[0] = 0;
8354   }
8355  }
8356
8357  /* Make sure that the lengths are the same, as well as the final element
8358  * before looping through the remainder.  (Thus we test the length, final,
8359  * and first elements right off the bat) */
8360  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8361   retval = FALSE;
8362  }
8363  else for (i = 0; i < len_a - 1; i++) {
8364   if (array_a[i] != array_b[i]) {
8365    retval = FALSE;
8366    break;
8367   }
8368  }
8369
8370  if (complement_b) {
8371   array_b[0] = 1;
8372  }
8373  return retval;
8374 }
8375 #endif
8376
/* The inversion list implementation ends here: remove its helper macros so
 * that the names are not visible to the remainder of the file.
 * NOTE(review): INVLIST_VERSION_ID_OFFSET, which
 * S_get_invlist_version_id_addr() uses, is not in this list -- confirm
 * whether it should be #undef'd as well. */
#undef HEADER_LENGTH
#undef INVLIST_INITIAL_LENGTH
#undef TO_INTERNAL_SIZE
#undef FROM_INTERNAL_SIZE
#undef INVLIST_LEN_OFFSET
#undef INVLIST_ZERO_OFFSET
#undef INVLIST_ITER_OFFSET
#undef INVLIST_VERSION_ID
#undef INVLIST_PREVIOUS_INDEX_OFFSET
8386
8387 /* End of inversion list object */
8388
STATIC void
S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
{
 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
  * constructs, and updates RExC_flags with them.  On input, RExC_parse
  * should point to the first flag; it is updated on output to point to the
  * final ')' or ':'.  There needs to be at least one flag, or this will
  * abort
  *
  * Flags seen before a '-' are accumulated in 'posflags' and those after
  * it in 'negflags'; neither is applied to RExC_flags until the closing
  * ')' or ':' is reached.  The character set modifier is tracked
  * separately in 'cs' and is stored at the same point.
  *
  * NOTE(review): if the terminating NUL is reached before any ')' or ':',
  * the loop simply ends and the function returns without applying the
  * accumulated flags; presumably the caller reports the unterminated
  * sequence -- confirm. */

 /* for (?g), (?gc), and (?o) warnings; warning
 about (?c) will warn about (?g) -- japhy    */

#define WASTED_O  0x01
#define WASTED_G  0x02
#define WASTED_C  0x04
#define WASTED_GC (0x02|0x04)
 I32 wastedflags = 0x00;    /* Which "useless" warnings were already given */
 U32 posflags = 0, negflags = 0;
 U32 *flagsp = &posflags;   /* Switched to &negflags once a '-' is seen */
 char has_charset_modifier = '\0';  /* The charset modifier seen, if any */
 regex_charset cs;
 bool has_use_defaults = FALSE;     /* Did the flags begin with '^'? */
 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */

 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;

 /* '^' as an initial flag sets certain defaults */
 if (UCHARAT(RExC_parse) == '^') {
  RExC_parse++;
  has_use_defaults = TRUE;
  STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
  set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
          ? REGEX_UNICODE_CHARSET
          : REGEX_DEPENDS_CHARSET);
 }

 /* Start from the character set currently in effect, treating 'depends'
  * as 'unicode' when the pattern is UTF-8 or already has Unicode
  * semantics */
 cs = get_regex_charset(RExC_flags);
 if (cs == REGEX_DEPENDS_CHARSET
  && (RExC_utf8 || RExC_uni_semantics))
 {
  cs = REGEX_UNICODE_CHARSET;
 }

 while (*RExC_parse) {
  /* && strchr("iogcmsx", *RExC_parse) */
  /* (?g), (?gc) and (?o) are useless here
  and must be globally applied -- japhy */
  switch (*RExC_parse) {

   /* Code for the imsx flags */
   CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);

   /* The character set modifiers.  Each of them is rejected after a
    * '-', and rejected when another character set modifier has
    * already been given (except for the doubled ASCII-restrict one) */
   case LOCALE_PAT_MOD:
    if (has_charset_modifier) {
     goto excess_modifier;
    }
    else if (flagsp == &negflags) {
     goto neg_modifier;
    }
    cs = REGEX_LOCALE_CHARSET;
    has_charset_modifier = LOCALE_PAT_MOD;
    RExC_contains_locale = 1;
    break;
   case UNICODE_PAT_MOD:
    if (has_charset_modifier) {
     goto excess_modifier;
    }
    else if (flagsp == &negflags) {
     goto neg_modifier;
    }
    cs = REGEX_UNICODE_CHARSET;
    has_charset_modifier = UNICODE_PAT_MOD;
    break;
   case ASCII_RESTRICT_PAT_MOD:
    if (flagsp == &negflags) {
     goto neg_modifier;
    }
    if (has_charset_modifier) {

     /* Only a second consecutive ASCII-restrict modifier is
      * acceptable here; a third one, or one following a
      * different character set modifier, is an error */
     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
      goto excess_modifier;
     }
     /* Doubled modifier implies more restricted */
     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
    }
    else {
     cs = REGEX_ASCII_RESTRICTED_CHARSET;
    }
    has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
    break;
   case DEPENDS_PAT_MOD:
    /* Not allowed after an initial '^' */
    if (has_use_defaults) {
     goto fail_modifiers;
    }
    else if (flagsp == &negflags) {
     goto neg_modifier;
    }
    else if (has_charset_modifier) {
     goto excess_modifier;
    }

    /* The dual charset means unicode semantics if the
     * pattern (or target, not known until runtime) are
     * utf8, or something in the pattern indicates unicode
     * semantics */
    cs = (RExC_utf8 || RExC_uni_semantics)
     ? REGEX_UNICODE_CHARSET
     : REGEX_DEPENDS_CHARSET;
    has_charset_modifier = DEPENDS_PAT_MOD;
    break;

   /* Error exit for a character set modifier that conflicts with an
    * earlier one.  RExC_parse is advanced past the offending character
    * before failing, so that character is *(RExC_parse - 1).  The
    * vFAIL macros presumably do not return (see NOTREACHED) */
   excess_modifier:
    RExC_parse++;
    if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
    }
    else if (has_charset_modifier == *(RExC_parse - 1)) {
     vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
    }
    else {
     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
    }
    /*NOTREACHED*/

   /* Error exit for a character set modifier given after the '-' */
   neg_modifier:
    RExC_parse++;
    vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
    /*NOTREACHED*/
   case ONCE_PAT_MOD: /* 'o' */
   case GLOBAL_PAT_MOD: /* 'g' */
    /* These set nothing here; at most a warning is given, only
     * under SIZE_ONLY, and only once per flag (until reset by a
     * '-') */
    if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
     const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
     if (! (wastedflags & wflagbit) ) {
      wastedflags |= wflagbit;
      vWARN5(
       RExC_parse + 1,
       "Useless (%s%c) - %suse /%c modifier",
       flagsp == &negflags ? "?-" : "?",
       *RExC_parse,
       flagsp == &negflags ? "don't " : "",
       *RExC_parse
      );
     }
    }
    break;

   case CONTINUE_PAT_MOD: /* 'c' */
    if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
     if (! (wastedflags & WASTED_C) ) {

      /* The message mentions /gc, so mark 'g' as warned
       * about too */
      wastedflags |= WASTED_GC;
      vWARN3(
       RExC_parse + 1,
       "Useless (%sc) - %suse /gc modifier",
       flagsp == &negflags ? "?-" : "?",
       flagsp == &negflags ? "don't " : ""
      );
     }
    }
    break;
   case KEEPCOPY_PAT_MOD: /* 'p' */
    /* Can be turned on, but asking to turn it off only warns */
    if (flagsp == &negflags) {
     if (SIZE_ONLY)
      ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
    } else {
     *flagsp |= RXf_PMf_KEEPCOPY;
    }
    break;
   case '-':
    /* A flag is a default iff it is following a minus, so
     * if there is a minus, it means will be trying to
     * re-specify a default which is an error.  A second
     * '-' is an error as well */
    if (has_use_defaults || flagsp == &negflags) {
     goto fail_modifiers;
    }
    flagsp = &negflags;
    wastedflags = 0;  /* reset so (?g-c) warns twice */
    break;
   case ':':
   case ')':
    /* End of the flags: apply what was accumulated.  RExC_parse
     * is left pointing at this terminating character */
    RExC_flags |= posflags;
    RExC_flags &= ~negflags;
    set_regex_charset(&RExC_flags, cs);
    return;
    /*NOTREACHED*/
   default:
   fail_modifiers:
    /* Include the offending character in what is reported.
     * NOTE(review): the "%.*s" precision argument below is a
     * pointer difference rather than an int -- confirm that is
     * acceptable to whatever vFAIL3 passes it to */
    RExC_parse++;
    vFAIL3("Sequence (%.*s...) not recognized",
     RExC_parse-seqstart, seqstart);
    /*NOTREACHED*/
  }

  ++RExC_parse;
 }
}
8581
8582 /*
8583  - reg - regular expression, i.e. main body or parenthesized thing
8584  *
8585  * Caller must absorb opening parenthesis.
8586  *
8587  * Combining parenthesis handling with the base level of regular expression
8588  * is a trifle forced, but the need to tie the tails of the branches to what
8589  * follows makes it hard to avoid.
8590  */
8591 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) /* expands to a use of 'depth', so the caller must have it in scope */
8592 #ifdef DEBUGGING /* only under DEBUGGING does REGTAIL_STUDY call regtail_study() */
8593 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8594 #else /* otherwise REGTAIL_STUDY expands exactly like REGTAIL */
8595 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8596 #endif
8597
8598 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8599    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8600    needs to be restarted.
8601    Otherwise would only return NULL if regbranch() returns NULL, which
8602    cannot happen.  */
8603 STATIC regnode *
8604 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8605  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
8606  * 2 is like 1, but indicates that nextchar() has been called to advance
8607  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
8608  * this flag alerts us to the need to check for that */
8609 {
8610  dVAR;
8611  regnode *ret;  /* Will be the head of the group. */
8612  regnode *br;
8613  regnode *lastbr;
8614  regnode *ender = NULL;
8615  I32 parno = 0;
8616  I32 flags;
8617  U32 oregflags = RExC_flags;
8618  bool have_branch = 0;
8619  bool is_open = 0;
8620  I32 freeze_paren = 0;
8621  I32 after_freeze = 0;
8622
8623  char * parse_start = RExC_parse; /* MJD */
8624  char * const oregcomp_parse = RExC_parse;
8625
8626  GET_RE_DEBUG_FLAGS_DECL;
8627
8628  PERL_ARGS_ASSERT_REG;
8629  DEBUG_PARSE("reg ");
8630
8631  *flagp = 0;    /* Tentatively. */
8632
8633
8634  /* Make an OPEN node, if parenthesized. */
8635  if (paren) {
8636
8637   /* Under /x, space and comments can be gobbled up between the '(' and
8638   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
8639   * intervening space, as the sequence is a token, and a token should be
8640   * indivisible */
8641   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
8642
8643   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8644    char *start_verb = RExC_parse;
8645    STRLEN verb_len = 0;
8646    char *start_arg = NULL;
8647    unsigned char op = 0;
8648    int argok = 1;
8649    int internal_argval = 0; /* internal_argval is only useful if !argok */
8650
8651    if (has_intervening_patws && SIZE_ONLY) {
8652     ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
8653    }
8654    while ( *RExC_parse && *RExC_parse != ')' ) {
8655     if ( *RExC_parse == ':' ) {
8656      start_arg = RExC_parse + 1;
8657      break;
8658     }
8659     RExC_parse++;
8660    }
8661    ++start_verb;
8662    verb_len = RExC_parse - start_verb;
8663    if ( start_arg ) {
8664     RExC_parse++;
8665     while ( *RExC_parse && *RExC_parse != ')' )
8666      RExC_parse++;
8667     if ( *RExC_parse != ')' )
8668      vFAIL("Unterminated verb pattern argument");
8669     if ( RExC_parse == start_arg )
8670      start_arg = NULL;
8671    } else {
8672     if ( *RExC_parse != ')' )
8673      vFAIL("Unterminated verb pattern");
8674    }
8675
8676    switch ( *start_verb ) {
8677    case 'A':  /* (*ACCEPT) */
8678     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8679      op = ACCEPT;
8680      internal_argval = RExC_nestroot;
8681     }
8682     break;
8683    case 'C':  /* (*COMMIT) */
8684     if ( memEQs(start_verb,verb_len,"COMMIT") )
8685      op = COMMIT;
8686     break;
8687    case 'F':  /* (*FAIL) */
8688     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8689      op = OPFAIL;
8690      argok = 0;
8691     }
8692     break;
8693    case ':':  /* (*:NAME) */
8694    case 'M':  /* (*MARK:NAME) */
8695     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8696      op = MARKPOINT;
8697      argok = -1;
8698     }
8699     break;
8700    case 'P':  /* (*PRUNE) */
8701     if ( memEQs(start_verb,verb_len,"PRUNE") )
8702      op = PRUNE;
8703     break;
8704    case 'S':   /* (*SKIP) */
8705     if ( memEQs(start_verb,verb_len,"SKIP") )
8706      op = SKIP;
8707     break;
8708    case 'T':  /* (*THEN) */
8709     /* [19:06] <TimToady> :: is then */
8710     if ( memEQs(start_verb,verb_len,"THEN") ) {
8711      op = CUTGROUP;
8712      RExC_seen |= REG_SEEN_CUTGROUP;
8713     }
8714     break;
8715    }
8716    if ( ! op ) {
8717     RExC_parse++;
8718     vFAIL3("Unknown verb pattern '%.*s'",
8719      verb_len, start_verb);
8720    }
8721    if ( argok ) {
8722     if ( start_arg && internal_argval ) {
8723      vFAIL3("Verb pattern '%.*s' may not have an argument",
8724       verb_len, start_verb);
8725     } else if ( argok < 0 && !start_arg ) {
8726      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8727       verb_len, start_verb);
8728     } else {
8729      ret = reganode(pRExC_state, op, internal_argval);
8730      if ( ! internal_argval && ! SIZE_ONLY ) {
8731       if (start_arg) {
8732        SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8733        ARG(ret) = add_data( pRExC_state, 1, "S" );
8734        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8735        ret->flags = 0;
8736       } else {
8737        ret->flags = 1;
8738       }
8739      }
8740     }
8741     if (!internal_argval)
8742      RExC_seen |= REG_SEEN_VERBARG;
8743    } else if ( start_arg ) {
8744     vFAIL3("Verb pattern '%.*s' may not have an argument",
8745       verb_len, start_verb);
8746    } else {
8747     ret = reg_node(pRExC_state, op);
8748    }
8749    nextchar(pRExC_state);
8750    return ret;
8751   } else
8752   if (*RExC_parse == '?') { /* (?...) */
8753    bool is_logical = 0;
8754    const char * const seqstart = RExC_parse;
8755    if (has_intervening_patws && SIZE_ONLY) {
8756     ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
8757    }
8758
8759    RExC_parse++;
8760    paren = *RExC_parse++;
8761    ret = NULL;   /* For look-ahead/behind. */
8762    switch (paren) {
8763
8764    case 'P': /* (?P...) variants for those used to PCRE/Python */
8765     paren = *RExC_parse++;
8766     if ( paren == '<')         /* (?P<...>) named capture */
8767      goto named_capture;
8768     else if (paren == '>') {   /* (?P>name) named recursion */
8769      goto named_recursion;
8770     }
8771     else if (paren == '=') {   /* (?P=...)  named backref */
8772      /* this pretty much dupes the code for \k<NAME> in regatom(), if
8773      you change this make sure you change that */
8774      char* name_start = RExC_parse;
8775      U32 num = 0;
8776      SV *sv_dat = reg_scan_name(pRExC_state,
8777       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8778      if (RExC_parse == name_start || *RExC_parse != ')')
8779       vFAIL2("Sequence %.3s... not terminated",parse_start);
8780
8781      if (!SIZE_ONLY) {
8782       num = add_data( pRExC_state, 1, "S" );
8783       RExC_rxi->data->data[num]=(void*)sv_dat;
8784       SvREFCNT_inc_simple_void(sv_dat);
8785      }
8786      RExC_sawback = 1;
8787      ret = reganode(pRExC_state,
8788         ((! FOLD)
8789          ? NREF
8790          : (ASCII_FOLD_RESTRICTED)
8791          ? NREFFA
8792          : (AT_LEAST_UNI_SEMANTICS)
8793           ? NREFFU
8794           : (LOC)
8795           ? NREFFL
8796           : NREFF),
8797          num);
8798      *flagp |= HASWIDTH;
8799
8800      Set_Node_Offset(ret, parse_start+1);
8801      Set_Node_Cur_Length(ret); /* MJD */
8802
8803      nextchar(pRExC_state);
8804      return ret;
8805     }
8806     RExC_parse++;
8807     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8808     /*NOTREACHED*/
8809    case '<':           /* (?<...) */
8810     if (*RExC_parse == '!')
8811      paren = ',';
8812     else if (*RExC_parse != '=')
8813    named_capture:
8814     {               /* (?<...>) */
8815      char *name_start;
8816      SV *svname;
8817      paren= '>';
8818    case '\'':          /* (?'...') */
8819       name_start= RExC_parse;
8820       svname = reg_scan_name(pRExC_state,
8821        SIZE_ONLY ?  /* reverse test from the others */
8822        REG_RSN_RETURN_NAME :
8823        REG_RSN_RETURN_NULL);
8824      if (RExC_parse == name_start) {
8825       RExC_parse++;
8826       vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8827       /*NOTREACHED*/
8828      }
8829      if (*RExC_parse != paren)
8830       vFAIL2("Sequence (?%c... not terminated",
8831        paren=='>' ? '<' : paren);
8832      if (SIZE_ONLY) {
8833       HE *he_str;
8834       SV *sv_dat = NULL;
8835       if (!svname) /* shouldn't happen */
8836        Perl_croak(aTHX_
8837         "panic: reg_scan_name returned NULL");
8838       if (!RExC_paren_names) {
8839        RExC_paren_names= newHV();
8840        sv_2mortal(MUTABLE_SV(RExC_paren_names));
8841 #ifdef DEBUGGING
8842        RExC_paren_name_list= newAV();
8843        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8844 #endif
8845       }
8846       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8847       if ( he_str )
8848        sv_dat = HeVAL(he_str);
8849       if ( ! sv_dat ) {
8850        /* croak baby croak */
8851        Perl_croak(aTHX_
8852         "panic: paren_name hash element allocation failed");
8853       } else if ( SvPOK(sv_dat) ) {
8854        /* (?|...) can mean we have dupes so scan to check
8855        its already been stored. Maybe a flag indicating
8856        we are inside such a construct would be useful,
8857        but the arrays are likely to be quite small, so
8858        for now we punt -- dmq */
8859        IV count = SvIV(sv_dat);
8860        I32 *pv = (I32*)SvPVX(sv_dat);
8861        IV i;
8862        for ( i = 0 ; i < count ; i++ ) {
8863         if ( pv[i] == RExC_npar ) {
8864          count = 0;
8865          break;
8866         }
8867        }
8868        if ( count ) {
8869         pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8870         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8871         pv[count] = RExC_npar;
8872         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8873        }
8874       } else {
8875        (void)SvUPGRADE(sv_dat,SVt_PVNV);
8876        sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8877        SvIOK_on(sv_dat);
8878        SvIV_set(sv_dat, 1);
8879       }
8880 #ifdef DEBUGGING
8881       /* Yes this does cause a memory leak in debugging Perls */
8882       if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8883        SvREFCNT_dec_NN(svname);
8884 #endif
8885
8886       /*sv_dump(sv_dat);*/
8887      }
8888      nextchar(pRExC_state);
8889      paren = 1;
8890      goto capturing_parens;
8891     }
8892     RExC_seen |= REG_SEEN_LOOKBEHIND;
8893     RExC_in_lookbehind++;
8894     RExC_parse++;
8895    case '=':           /* (?=...) */
8896     RExC_seen_zerolen++;
8897     break;
8898    case '!':           /* (?!...) */
8899     RExC_seen_zerolen++;
8900     if (*RExC_parse == ')') {
8901      ret=reg_node(pRExC_state, OPFAIL);
8902      nextchar(pRExC_state);
8903      return ret;
8904     }
8905     break;
8906    case '|':           /* (?|...) */
8907     /* branch reset, behave like a (?:...) except that
8908     buffers in alternations share the same numbers */
8909     paren = ':';
8910     after_freeze = freeze_paren = RExC_npar;
8911     break;
8912    case ':':           /* (?:...) */
8913    case '>':           /* (?>...) */
8914     break;
8915    case '$':           /* (?$...) */
8916    case '@':           /* (?@...) */
8917     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8918     break;
8919    case '#':           /* (?#...) */
8920     /* XXX As soon as we disallow separating the '?' and '*' (by
8921     * spaces or (?#...) comment), it is believed that this case
8922     * will be unreachable and can be removed.  See
8923     * [perl #117327] */
8924     while (*RExC_parse && *RExC_parse != ')')
8925      RExC_parse++;
8926     if (*RExC_parse != ')')
8927      FAIL("Sequence (?#... not terminated");
8928     nextchar(pRExC_state);
8929     *flagp = TRYAGAIN;
8930     return NULL;
8931    case '0' :           /* (?0) */
8932    case 'R' :           /* (?R) */
8933     if (*RExC_parse != ')')
8934      FAIL("Sequence (?R) not terminated");
8935     ret = reg_node(pRExC_state, GOSTART);
8936     *flagp |= POSTPONED;
8937     nextchar(pRExC_state);
8938     return ret;
8939     /*notreached*/
8940    { /* named and numeric backreferences */
8941     I32 num;
8942    case '&':            /* (?&NAME) */
8943     parse_start = RExC_parse - 1;
8944    named_recursion:
8945     {
8946       SV *sv_dat = reg_scan_name(pRExC_state,
8947        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8948       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8949     }
8950     goto gen_recurse_regop;
8951     assert(0); /* NOT REACHED */
8952    case '+':
8953     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8954      RExC_parse++;
8955      vFAIL("Illegal pattern");
8956     }
8957     goto parse_recursion;
8958     /* NOT REACHED*/
8959    case '-': /* (?-1) */
8960     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8961      RExC_parse--; /* rewind to let it be handled later */
8962      goto parse_flags;
8963     }
8964     /*FALLTHROUGH */
8965    case '1': case '2': case '3': case '4': /* (?1) */
8966    case '5': case '6': case '7': case '8': case '9':
8967     RExC_parse--;
8968    parse_recursion:
8969     num = atoi(RExC_parse);
8970     parse_start = RExC_parse - 1; /* MJD */
8971     if (*RExC_parse == '-')
8972      RExC_parse++;
8973     while (isDIGIT(*RExC_parse))
8974       RExC_parse++;
8975     if (*RExC_parse!=')')
8976      vFAIL("Expecting close bracket");
8977
8978    gen_recurse_regop:
8979     if ( paren == '-' ) {
8980      /*
8981      Diagram of capture buffer numbering.
8982      Top line is the normal capture buffer numbers
8983      Bottom line is the negative indexing as from
8984      the X (the (?-2))
8985
8986      +   1 2    3 4 5 X          6 7
8987      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8988      -   5 4    3 2 1 X          x x
8989
8990      */
8991      num = RExC_npar + num;
8992      if (num < 1)  {
8993       RExC_parse++;
8994       vFAIL("Reference to nonexistent group");
8995      }
8996     } else if ( paren == '+' ) {
8997      num = RExC_npar + num - 1;
8998     }
8999
9000     ret = reganode(pRExC_state, GOSUB, num);
9001     if (!SIZE_ONLY) {
9002      if (num > (I32)RExC_rx->nparens) {
9003       RExC_parse++;
9004       vFAIL("Reference to nonexistent group");
9005      }
9006      ARG2L_SET( ret, RExC_recurse_count++);
9007      RExC_emit++;
9008      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9009       "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9010     } else {
9011      RExC_size++;
9012      }
9013      RExC_seen |= REG_SEEN_RECURSE;
9014     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9015     Set_Node_Offset(ret, parse_start); /* MJD */
9016
9017     *flagp |= POSTPONED;
9018     nextchar(pRExC_state);
9019     return ret;
9020    } /* named and numeric backreferences */
9021    assert(0); /* NOT REACHED */
9022
9023    case '?':           /* (??...) */
9024     is_logical = 1;
9025     if (*RExC_parse != '{') {
9026      RExC_parse++;
9027      vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9028      /*NOTREACHED*/
9029     }
9030     *flagp |= POSTPONED;
9031     paren = *RExC_parse++;
9032     /* FALL THROUGH */
9033    case '{':           /* (?{...}) */
9034    {
9035     U32 n = 0;
9036     struct reg_code_block *cb;
9037
9038     RExC_seen_zerolen++;
9039
9040     if (   !pRExC_state->num_code_blocks
9041      || pRExC_state->code_index >= pRExC_state->num_code_blocks
9042      || pRExC_state->code_blocks[pRExC_state->code_index].start
9043       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9044        - RExC_start)
9045     ) {
9046      if (RExC_pm_flags & PMf_USE_RE_EVAL)
9047       FAIL("panic: Sequence (?{...}): no code block found\n");
9048      FAIL("Eval-group not allowed at runtime, use re 'eval'");
9049     }
9050     /* this is a pre-compiled code block (?{...}) */
9051     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9052     RExC_parse = RExC_start + cb->end;
9053     if (!SIZE_ONLY) {
9054      OP *o = cb->block;
9055      if (cb->src_regex) {
9056       n = add_data(pRExC_state, 2, "rl");
9057       RExC_rxi->data->data[n] =
9058        (void*)SvREFCNT_inc((SV*)cb->src_regex);
9059       RExC_rxi->data->data[n+1] = (void*)o;
9060      }
9061      else {
9062       n = add_data(pRExC_state, 1,
9063        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9064       RExC_rxi->data->data[n] = (void*)o;
9065      }
9066     }
9067     pRExC_state->code_index++;
9068     nextchar(pRExC_state);
9069
9070     if (is_logical) {
9071      regnode *eval;
9072      ret = reg_node(pRExC_state, LOGICAL);
9073      eval = reganode(pRExC_state, EVAL, n);
9074      if (!SIZE_ONLY) {
9075       ret->flags = 2;
9076       /* for later propagation into (??{}) return value */
9077       eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9078      }
9079      REGTAIL(pRExC_state, ret, eval);
9080      /* deal with the length of this later - MJD */
9081      return ret;
9082     }
9083     ret = reganode(pRExC_state, EVAL, n);
9084     Set_Node_Length(ret, RExC_parse - parse_start + 1);
9085     Set_Node_Offset(ret, parse_start);
9086     return ret;
9087    }
9088    case '(':           /* (?(?{...})...) and (?(?=...)...) */
9089    {
9090     int is_define= 0;
9091     if (RExC_parse[0] == '?') {        /* (?(?...)) */
9092      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9093       || RExC_parse[1] == '<'
9094       || RExC_parse[1] == '{') { /* Lookahead or eval. */
9095       I32 flag;
9096       regnode *tail;
9097
9098       ret = reg_node(pRExC_state, LOGICAL);
9099       if (!SIZE_ONLY)
9100        ret->flags = 1;
9101
9102       tail = reg(pRExC_state, 1, &flag, depth+1);
9103       if (flag & RESTART_UTF8) {
9104        *flagp = RESTART_UTF8;
9105        return NULL;
9106       }
9107       REGTAIL(pRExC_state, ret, tail);
9108       goto insert_if;
9109      }
9110     }
9111     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9112       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9113     {
9114      char ch = RExC_parse[0] == '<' ? '>' : '\'';
9115      char *name_start= RExC_parse++;
9116      U32 num = 0;
9117      SV *sv_dat=reg_scan_name(pRExC_state,
9118       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9119      if (RExC_parse == name_start || *RExC_parse != ch)
9120       vFAIL2("Sequence (?(%c... not terminated",
9121        (ch == '>' ? '<' : ch));
9122      RExC_parse++;
9123      if (!SIZE_ONLY) {
9124       num = add_data( pRExC_state, 1, "S" );
9125       RExC_rxi->data->data[num]=(void*)sv_dat;
9126       SvREFCNT_inc_simple_void(sv_dat);
9127      }
9128      ret = reganode(pRExC_state,NGROUPP,num);
9129      goto insert_if_check_paren;
9130     }
9131     else if (RExC_parse[0] == 'D' &&
9132       RExC_parse[1] == 'E' &&
9133       RExC_parse[2] == 'F' &&
9134       RExC_parse[3] == 'I' &&
9135       RExC_parse[4] == 'N' &&
9136       RExC_parse[5] == 'E')
9137     {
9138      ret = reganode(pRExC_state,DEFINEP,0);
9139      RExC_parse +=6 ;
9140      is_define = 1;
9141      goto insert_if_check_paren;
9142     }
9143     else if (RExC_parse[0] == 'R') {
9144      RExC_parse++;
9145      parno = 0;
9146      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9147       parno = atoi(RExC_parse++);
9148       while (isDIGIT(*RExC_parse))
9149        RExC_parse++;
9150      } else if (RExC_parse[0] == '&') {
9151       SV *sv_dat;
9152       RExC_parse++;
9153       sv_dat = reg_scan_name(pRExC_state,
9154         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9155        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9156      }
9157      ret = reganode(pRExC_state,INSUBP,parno);
9158      goto insert_if_check_paren;
9159     }
9160     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9161      /* (?(1)...) */
9162      char c;
9163      parno = atoi(RExC_parse++);
9164
9165      while (isDIGIT(*RExC_parse))
9166       RExC_parse++;
9167      ret = reganode(pRExC_state, GROUPP, parno);
9168
9169     insert_if_check_paren:
9170      if ((c = *nextchar(pRExC_state)) != ')')
9171       vFAIL("Switch condition not recognized");
9172     insert_if:
9173      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9174      br = regbranch(pRExC_state, &flags, 1,depth+1);
9175      if (br == NULL) {
9176       if (flags & RESTART_UTF8) {
9177        *flagp = RESTART_UTF8;
9178        return NULL;
9179       }
9180       FAIL2("panic: regbranch returned NULL, flags=%#X",
9181        flags);
9182      } else
9183       REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9184      c = *nextchar(pRExC_state);
9185      if (flags&HASWIDTH)
9186       *flagp |= HASWIDTH;
9187      if (c == '|') {
9188       if (is_define)
9189        vFAIL("(?(DEFINE)....) does not allow branches");
9190       lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9191       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9192        if (flags & RESTART_UTF8) {
9193         *flagp = RESTART_UTF8;
9194         return NULL;
9195        }
9196        FAIL2("panic: regbranch returned NULL, flags=%#X",
9197         flags);
9198       }
9199       REGTAIL(pRExC_state, ret, lastbr);
9200       if (flags&HASWIDTH)
9201        *flagp |= HASWIDTH;
9202       c = *nextchar(pRExC_state);
9203      }
9204      else
9205       lastbr = NULL;
9206      if (c != ')')
9207       vFAIL("Switch (?(condition)... contains too many branches");
9208      ender = reg_node(pRExC_state, TAIL);
9209      REGTAIL(pRExC_state, br, ender);
9210      if (lastbr) {
9211       REGTAIL(pRExC_state, lastbr, ender);
9212       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9213      }
9214      else
9215       REGTAIL(pRExC_state, ret, ender);
9216      RExC_size++; /* XXX WHY do we need this?!!
9217          For large programs it seems to be required
9218          but I can't figure out why. -- dmq*/
9219      return ret;
9220     }
9221     else {
9222      vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9223     }
9224    }
9225    case '[':           /* (?[ ... ]) */
9226     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9227           oregcomp_parse);
9228    case 0:
9229     RExC_parse--; /* for vFAIL to print correctly */
9230     vFAIL("Sequence (? incomplete");
9231     break;
9232    default: /* e.g., (?i) */
9233     --RExC_parse;
9234    parse_flags:
9235     parse_lparen_question_flags(pRExC_state);
9236     if (UCHARAT(RExC_parse) != ':') {
9237      nextchar(pRExC_state);
9238      *flagp = TRYAGAIN;
9239      return NULL;
9240     }
9241     paren = ':';
9242     nextchar(pRExC_state);
9243     ret = NULL;
9244     goto parse_rest;
9245    } /* end switch */
9246   }
9247   else {                  /* (...) */
9248   capturing_parens:
9249    parno = RExC_npar;
9250    RExC_npar++;
9251
9252    ret = reganode(pRExC_state, OPEN, parno);
9253    if (!SIZE_ONLY ){
9254     if (!RExC_nestroot)
9255      RExC_nestroot = parno;
9256     if (RExC_seen & REG_SEEN_RECURSE
9257      && !RExC_open_parens[parno-1])
9258     {
9259      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9260       "Setting open paren #%"IVdf" to %d\n",
9261       (IV)parno, REG_NODE_NUM(ret)));
9262      RExC_open_parens[parno-1]= ret;
9263     }
9264    }
9265    Set_Node_Length(ret, 1); /* MJD */
9266    Set_Node_Offset(ret, RExC_parse); /* MJD */
9267    is_open = 1;
9268   }
9269  }
9270  else                        /* ! paren */
9271   ret = NULL;
9272
9273    parse_rest:
9274  /* Pick up the branches, linking them together. */
9275  parse_start = RExC_parse;   /* MJD */
9276  br = regbranch(pRExC_state, &flags, 1,depth+1);
9277
9278  /*     branch_len = (paren != 0); */
9279
9280  if (br == NULL) {
9281   if (flags & RESTART_UTF8) {
9282    *flagp = RESTART_UTF8;
9283    return NULL;
9284   }
9285   FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9286  }
9287  if (*RExC_parse == '|') {
9288   if (!SIZE_ONLY && RExC_extralen) {
9289    reginsert(pRExC_state, BRANCHJ, br, depth+1);
9290   }
9291   else {                  /* MJD */
9292    reginsert(pRExC_state, BRANCH, br, depth+1);
9293    Set_Node_Length(br, paren != 0);
9294    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9295   }
9296   have_branch = 1;
9297   if (SIZE_ONLY)
9298    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
9299  }
9300  else if (paren == ':') {
9301   *flagp |= flags&SIMPLE;
9302  }
9303  if (is_open) {    /* Starts with OPEN. */
9304   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9305  }
9306  else if (paren != '?')  /* Not Conditional */
9307   ret = br;
9308  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9309  lastbr = br;
9310  while (*RExC_parse == '|') {
9311   if (!SIZE_ONLY && RExC_extralen) {
9312    ender = reganode(pRExC_state, LONGJMP,0);
9313    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9314   }
9315   if (SIZE_ONLY)
9316    RExC_extralen += 2;  /* Account for LONGJMP. */
9317   nextchar(pRExC_state);
9318   if (freeze_paren) {
9319    if (RExC_npar > after_freeze)
9320     after_freeze = RExC_npar;
9321    RExC_npar = freeze_paren;
9322   }
9323   br = regbranch(pRExC_state, &flags, 0, depth+1);
9324
9325   if (br == NULL) {
9326    if (flags & RESTART_UTF8) {
9327     *flagp = RESTART_UTF8;
9328     return NULL;
9329    }
9330    FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9331   }
9332   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9333   lastbr = br;
9334   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9335  }
9336
9337  if (have_branch || paren != ':') {
9338   /* Make a closing node, and hook it on the end. */
9339   switch (paren) {
9340   case ':':
9341    ender = reg_node(pRExC_state, TAIL);
9342    break;
9343   case 1: case 2:
9344    ender = reganode(pRExC_state, CLOSE, parno);
9345    if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9346     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9347       "Setting close paren #%"IVdf" to %d\n",
9348       (IV)parno, REG_NODE_NUM(ender)));
9349     RExC_close_parens[parno-1]= ender;
9350     if (RExC_nestroot == parno)
9351      RExC_nestroot = 0;
9352    }
9353    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9354    Set_Node_Length(ender,1); /* MJD */
9355    break;
9356   case '<':
9357   case ',':
9358   case '=':
9359   case '!':
9360    *flagp &= ~HASWIDTH;
9361    /* FALL THROUGH */
9362   case '>':
9363    ender = reg_node(pRExC_state, SUCCEED);
9364    break;
9365   case 0:
9366    ender = reg_node(pRExC_state, END);
9367    if (!SIZE_ONLY) {
9368     assert(!RExC_opend); /* there can only be one! */
9369     RExC_opend = ender;
9370    }
9371    break;
9372   }
9373   DEBUG_PARSE_r(if (!SIZE_ONLY) {
9374    SV * const mysv_val1=sv_newmortal();
9375    SV * const mysv_val2=sv_newmortal();
9376    DEBUG_PARSE_MSG("lsbr");
9377    regprop(RExC_rx, mysv_val1, lastbr);
9378    regprop(RExC_rx, mysv_val2, ender);
9379    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9380       SvPV_nolen_const(mysv_val1),
9381       (IV)REG_NODE_NUM(lastbr),
9382       SvPV_nolen_const(mysv_val2),
9383       (IV)REG_NODE_NUM(ender),
9384       (IV)(ender - lastbr)
9385    );
9386   });
9387   REGTAIL(pRExC_state, lastbr, ender);
9388
9389   if (have_branch && !SIZE_ONLY) {
9390    char is_nothing= 1;
9391    if (depth==1)
9392     RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9393
9394    /* Hook the tails of the branches to the closing node. */
9395    for (br = ret; br; br = regnext(br)) {
9396     const U8 op = PL_regkind[OP(br)];
9397     if (op == BRANCH) {
9398      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9399      if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9400       is_nothing= 0;
9401     }
9402     else if (op == BRANCHJ) {
9403      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9404      /* for now we always disable this optimisation * /
9405      if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9406      */
9407       is_nothing= 0;
9408     }
9409    }
9410    if (is_nothing) {
9411     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9412     DEBUG_PARSE_r(if (!SIZE_ONLY) {
9413      SV * const mysv_val1=sv_newmortal();
9414      SV * const mysv_val2=sv_newmortal();
9415      DEBUG_PARSE_MSG("NADA");
9416      regprop(RExC_rx, mysv_val1, ret);
9417      regprop(RExC_rx, mysv_val2, ender);
9418      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9419         SvPV_nolen_const(mysv_val1),
9420         (IV)REG_NODE_NUM(ret),
9421         SvPV_nolen_const(mysv_val2),
9422         (IV)REG_NODE_NUM(ender),
9423         (IV)(ender - ret)
9424      );
9425     });
9426     OP(br)= NOTHING;
9427     if (OP(ender) == TAIL) {
9428      NEXT_OFF(br)= 0;
9429      RExC_emit= br + 1;
9430     } else {
9431      regnode *opt;
9432      for ( opt= br + 1; opt < ender ; opt++ )
9433       OP(opt)= OPTIMIZED;
9434      NEXT_OFF(br)= ender - br;
9435     }
9436    }
9437   }
9438  }
9439
9440  {
9441   const char *p;
9442   static const char parens[] = "=!<,>";
9443
9444   if (paren && (p = strchr(parens, paren))) {
9445    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9446    int flag = (p - parens) > 1;
9447
9448    if (paren == '>')
9449     node = SUSPEND, flag = 0;
9450    reginsert(pRExC_state, node,ret, depth+1);
9451    Set_Node_Cur_Length(ret);
9452    Set_Node_Offset(ret, parse_start + 1);
9453    ret->flags = flag;
9454    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9455   }
9456  }
9457
9458  /* Check for proper termination. */
9459  if (paren) {
9460   /* restore original flags, but keep (?p) */
9461   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9462   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9463    RExC_parse = oregcomp_parse;
9464    vFAIL("Unmatched (");
9465   }
9466  }
9467  else if (!paren && RExC_parse < RExC_end) {
9468   if (*RExC_parse == ')') {
9469    RExC_parse++;
9470    vFAIL("Unmatched )");
9471   }
9472   else
9473    FAIL("Junk on end of regexp"); /* "Can't happen". */
9474   assert(0); /* NOTREACHED */
9475  }
9476
9477  if (RExC_in_lookbehind) {
9478   RExC_in_lookbehind--;
9479  }
9480  if (after_freeze > RExC_npar)
9481   RExC_npar = after_freeze;
9482  return(ret);
9483 }
9484
9485 /*
9486  - regbranch - parse a single alternative (what lies between two '|')
9487  *
9488  * Concatenation lives here: each piece is chained onto the one before it.
9489  *
9490  * When the sizing scan has to be restarted, *flagp is set to RESTART_UTF8
9491  * and NULL is returned.
9492  */
9493 STATIC regnode *
9494 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9495 {
9496  dVAR;
9497  regnode *ret = NULL;      /* Head of this alternative. */
9498  regnode *tail = NULL;     /* Most recent piece on the chain. */
9499  regnode *piece;           /* Piece just returned by regpiece(). */
9500  I32 flags = 0;
9501  I32 npieces = 0;          /* Pieces chained so far. */
9502  GET_RE_DEBUG_FLAGS_DECL;
9503
9504  PERL_ARGS_ASSERT_REGBRANCH;
9505
9506  DEBUG_PARSE("brnc");
9507
9508  /* Every alternative but the first opens with its own BRANCH(J) node. */
9509  if (!first) {
9510   if (SIZE_ONLY || !RExC_extralen) {
9511    ret = reg_node(pRExC_state, BRANCH);
9512    Set_Node_Length(ret, 1);
9513   }
9514   else
9515    ret = reganode(pRExC_state, BRANCHJ,0);
9516   if (SIZE_ONLY)
9517    RExC_extralen += 1;   /* BRANCHJ */
9518  }
9519
9520  *flagp = WORST;   /* Tentatively. */
9521
9522  RExC_parse--;
9523  nextchar(pRExC_state);
9524  for (;;) {
9525   if (RExC_parse >= RExC_end || *RExC_parse == '|' || *RExC_parse == ')')
9526    break;
9527   flags &= ~TRYAGAIN;
9528   piece = regpiece(pRExC_state, &flags,depth+1);
9529   if (!piece) {
9530    if (flags & TRYAGAIN)
9531     continue;       /* Nothing was returned for this piece; go on to the next. */
9532    if (flags & RESTART_UTF8) {
9533     *flagp = RESTART_UTF8;
9534     return NULL;
9535    }
9536    FAIL2("panic: regpiece returned NULL, flags=%#X", flags);
9537   }
9538   else if (!ret)
9539    ret = piece;
9540   *flagp |= flags & (HASWIDTH | POSTPONED);
9541   if (tail) {
9542    RExC_naughty++;
9543    REGTAIL(pRExC_state, tail, piece);
9544   }
9545   else                 /* First piece. */
9546    *flagp |= flags & SPSTART;
9547   tail = piece;
9548   npieces++;
9549  }
9550  if (!tail) {           /* Loop ran zero times. */
9551   regnode * const nothing_node = reg_node(pRExC_state, NOTHING);
9552   if (!ret)
9553    ret = nothing_node;
9554  }
9555  if (npieces == 1)
9556   *flagp |= flags & SIMPLE;
9557
9558  return ret;
9559 }
9560
9561 /*
9562  - regpiece - something followed by possible [*+?]
9563  *
9564  * Note that the branching code sequences used for ? and the general cases
9565  * of * and + are somewhat optimized:  they use the same NOTHING node as
9566  * both the endmarker for their branch list and the body of the last branch.
9567  * It might seem that this node could be dispensed with entirely, but the
9568  * endmarker role is not redundant.
9569  *
9570  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9571  * TRYAGAIN.
9572  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9573  * restarted.
9574  */
9575 STATIC regnode *
9576 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9577 {
9578  dVAR;
9579  regnode *ret;
9580  char op;
9581  char *next;
9582  I32 flags;
9583  const char * const origparse = RExC_parse;
9584  I32 min;
9585  I32 max = REG_INFTY;
9586 #ifdef RE_TRACK_PATTERN_OFFSETS
9587  char *parse_start;
9588 #endif
9589  const char *maxpos = NULL;
9590
9591  /* Save the original in case we change the emitted regop to a FAIL. */
9592  regnode * const orig_emit = RExC_emit;
9593
9594  GET_RE_DEBUG_FLAGS_DECL;
9595
9596  PERL_ARGS_ASSERT_REGPIECE;
9597
9598  DEBUG_PARSE("piec");
9599
9600  ret = regatom(pRExC_state, &flags,depth+1);
9601  if (ret == NULL) {
9602   if (flags & (TRYAGAIN|RESTART_UTF8))
9603    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9604   else
9605    FAIL2("panic: regatom returned NULL, flags=%#X", flags);
9606   return(NULL);
9607  }
9608
9609  op = *RExC_parse;
9610
9611  if (op == '{' && regcurly(RExC_parse, FALSE)) {
9612   maxpos = NULL;
9613 #ifdef RE_TRACK_PATTERN_OFFSETS
9614   parse_start = RExC_parse; /* MJD */
9615 #endif
9616   next = RExC_parse + 1;
9617   while (isDIGIT(*next) || *next == ',') {
9618    if (*next == ',') {
9619     if (maxpos)
9620      break;
9621     else
9622      maxpos = next;
9623    }
9624    next++;
9625   }
9626   if (*next == '}') {  /* got one */
9627    if (!maxpos)
9628     maxpos = next;
9629    RExC_parse++;
9630    min = atoi(RExC_parse);
9631    if (*maxpos == ',')
9632     maxpos++;
9633    else
9634     maxpos = RExC_parse;
9635    max = atoi(maxpos);
9636    if (!max && *maxpos != '0')
9637     max = REG_INFTY;  /* meaning "infinity" */
9638    else if (max >= REG_INFTY)
9639     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9640    RExC_parse = next;
9641    nextchar(pRExC_state);
9642    if (max < min) {    /* If can't match, warn and optimize to fail
9643         unconditionally */
9644     if (SIZE_ONLY) {
9645      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9646
9647      /* We can't back off the size because we have to reserve
9648      * enough space for all the things we are about to throw
9649      * away, but we can shrink it by the ammount we are about
9650      * to re-use here */
9651      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9652     }
9653     else {
9654      RExC_emit = orig_emit;
9655     }
9656     ret = reg_node(pRExC_state, OPFAIL);
9657     return ret;
9658    }
9659
9660   do_curly:
9661    if ((flags&SIMPLE)) {
9662     RExC_naughty += 2 + RExC_naughty / 2;
9663     reginsert(pRExC_state, CURLY, ret, depth+1);
9664     Set_Node_Offset(ret, parse_start+1); /* MJD */
9665     Set_Node_Cur_Length(ret);
9666    }
9667    else {
9668     regnode * const w = reg_node(pRExC_state, WHILEM);
9669
9670     w->flags = 0;
9671     REGTAIL(pRExC_state, ret, w);
9672     if (!SIZE_ONLY && RExC_extralen) {
9673      reginsert(pRExC_state, LONGJMP,ret, depth+1);
9674      reginsert(pRExC_state, NOTHING,ret, depth+1);
9675      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9676     }
9677     reginsert(pRExC_state, CURLYX,ret, depth+1);
9678         /* MJD hk */
9679     Set_Node_Offset(ret, parse_start+1);
9680     Set_Node_Length(ret,
9681         op == '{' ? (RExC_parse - parse_start) : 1);
9682
9683     if (!SIZE_ONLY && RExC_extralen)
9684      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9685     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9686     if (SIZE_ONLY)
9687      RExC_whilem_seen++, RExC_extralen += 3;
9688     RExC_naughty += 4 + RExC_naughty; /* compound interest */
9689    }
9690    ret->flags = 0;
9691
9692    if (min > 0)
9693     *flagp = WORST;
9694    if (max > 0)
9695     *flagp |= HASWIDTH;
9696    if (!SIZE_ONLY) {
9697     ARG1_SET(ret, (U16)min);
9698     ARG2_SET(ret, (U16)max);
9699    }
9700
9701    goto nest_check;
9702   }
9703  }
9704
9705  if (!ISMULT1(op)) {
9706   *flagp = flags;
9707   return(ret);
9708  }
9709
9710 #if 0    /* Now runtime fix should be reliable. */
9711
9712  /* if this is reinstated, don't forget to put this back into perldiag:
9713
9714    =item Regexp *+ operand could be empty at {#} in regex m/%s/
9715
9716   (F) The part of the regexp subject to either the * or + quantifier
9717   could match an empty string. The {#} shows in the regular
9718   expression about where the problem was discovered.
9719
9720  */
9721
9722  if (!(flags&HASWIDTH) && op != '?')
9723  vFAIL("Regexp *+ operand could be empty");
9724 #endif
9725
9726 #ifdef RE_TRACK_PATTERN_OFFSETS
9727  parse_start = RExC_parse;
9728 #endif
9729  nextchar(pRExC_state);
9730
9731  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9732
9733  if (op == '*' && (flags&SIMPLE)) {
9734   reginsert(pRExC_state, STAR, ret, depth+1);
9735   ret->flags = 0;
9736   RExC_naughty += 4;
9737  }
9738  else if (op == '*') {
9739   min = 0;
9740   goto do_curly;
9741  }
9742  else if (op == '+' && (flags&SIMPLE)) {
9743   reginsert(pRExC_state, PLUS, ret, depth+1);
9744   ret->flags = 0;
9745   RExC_naughty += 3;
9746  }
9747  else if (op == '+') {
9748   min = 1;
9749   goto do_curly;
9750  }
9751  else if (op == '?') {
9752   min = 0; max = 1;
9753   goto do_curly;
9754  }
9755   nest_check:
9756  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9757   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9758   ckWARN3reg(RExC_parse,
9759     "%.*s matches null string many times",
9760     (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9761     origparse);
9762   (void)ReREFCNT_inc(RExC_rx_sv);
9763  }
9764
9765  if (RExC_parse < RExC_end && *RExC_parse == '?') {
9766   nextchar(pRExC_state);
9767   reginsert(pRExC_state, MINMOD, ret, depth+1);
9768   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9769  }
9770 #ifndef REG_ALLOW_MINMOD_SUSPEND
9771  else
9772 #endif
9773  if (RExC_parse < RExC_end && *RExC_parse == '+') {
9774   regnode *ender;
9775   nextchar(pRExC_state);
9776   ender = reg_node(pRExC_state, SUCCEED);
9777   REGTAIL(pRExC_state, ret, ender);
9778   reginsert(pRExC_state, SUSPEND, ret, depth+1);
9779   ret->flags = 0;
9780   ender = reg_node(pRExC_state, TAIL);
9781   REGTAIL(pRExC_state, ret, ender);
9782   /*ret= ender;*/
9783  }
9784
9785  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9786   RExC_parse++;
9787   vFAIL("Nested quantifiers");
9788  }
9789
9790  return(ret);
9791 }
9792
9793 STATIC bool
9794 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9795   const bool strict   /* Apply stricter parsing rules? */
9796  )
9797 {
9798
9799  /* This is expected to be called by a parser routine that has recognized '\N'
9800    and needs to handle the rest. RExC_parse is expected to point at the first
9801    char following the N at the time of the call.  On successful return,
9802    RExC_parse has been updated to point to just after the sequence identified
9803    by this routine, and <*flagp> has been updated.
9804
9805    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9806    character class.
9807
9808    \N may begin either a named sequence, or if outside a character class, mean
9809    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9810    attempted to decide which, and in the case of a named sequence, converted it
9811    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9812    where c1... are the characters in the sequence.  For single-quoted regexes,
9813    the tokenizer passes the \N sequence through unchanged; this code will not
9814    attempt to determine this nor expand those, instead raising a syntax error.
9815    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9816    or there is no '}', it signals that this \N occurrence means to match a
9817    non-newline.
9818
9819    Only the \N{U+...} form should occur in a character class, for the same
9820    reason that '.' inside a character class means to just match a period: it
9821    just doesn't make sense.
9822
9823    The function raises an error (via vFAIL), and doesn't return for various
9824    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9825    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9826    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9827    only possible if node_p is non-NULL.
9828
9829
9830    If <valuep> is non-null, it means the caller can accept an input sequence
9831    consisting of a just a single code point; <*valuep> is set to that value
9832    if the input is such.
9833
9834    If <node_p> is non-null it signifies that the caller can accept any other
9835    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9836    is set as follows:
9837  1) \N means not-a-NL: points to a newly created REG_ANY node;
9838  2) \N{}:              points to a new NOTHING node;
9839  3) otherwise:         points to a new EXACT node containing the resolved
9840       string.
9841    Note that FALSE is returned for single code point sequences if <valuep> is
9842    null.
9843  */
9844
9845  char * endbrace;    /* '}' following the name */
9846  char* p;
9847  char *endchar; /* Points to '.' or '}' ending cur char in the input
9848       stream */
9849  bool has_multiple_chars; /* true if the input stream contains a sequence of
9850         more than one character */
9851
9852  GET_RE_DEBUG_FLAGS_DECL;
9853
9854  PERL_ARGS_ASSERT_GROK_BSLASH_N;
9855
9856  GET_RE_DEBUG_FLAGS;
9857
9858  assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9859
9860  /* The [^\n] meaning of \N ignores spaces and comments under the /x
9861  * modifier.  The other meaning does not */
9862  p = (RExC_flags & RXf_PMf_EXTENDED)
9863   ? regwhite( pRExC_state, RExC_parse )
9864   : RExC_parse;
9865
9866  /* Disambiguate between \N meaning a named character versus \N meaning
9867  * [^\n].  The former is assumed when it can't be the latter. */
9868  if (*p != '{' || regcurly(p, FALSE)) {
9869   RExC_parse = p;
9870   if (! node_p) {
9871    /* no bare \N in a charclass */
9872    if (in_char_class) {
9873     vFAIL("\\N in a character class must be a named character: \\N{...}");
9874    }
9875    return FALSE;
9876   }
9877   nextchar(pRExC_state);
9878   *node_p = reg_node(pRExC_state, REG_ANY);
9879   *flagp |= HASWIDTH|SIMPLE;
9880   RExC_naughty++;
9881   RExC_parse--;
9882   Set_Node_Length(*node_p, 1); /* MJD */
9883   return TRUE;
9884  }
9885
9886  /* Here, we have decided it should be a named character or sequence */
9887
9888  /* The test above made sure that the next real character is a '{', but
9889  * under the /x modifier, it could be separated by space (or a comment and
9890  * \n) and this is not allowed (for consistency with \x{...} and the
9891  * tokenizer handling of \N{NAME}). */
9892  if (*RExC_parse != '{') {
9893   vFAIL("Missing braces on \\N{}");
9894  }
9895
9896  RExC_parse++; /* Skip past the '{' */
9897
9898  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9899   || ! (endbrace == RExC_parse  /* nothing between the {} */
9900    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9901     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9902  {
9903   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9904   vFAIL("\\N{NAME} must be resolved by the lexer");
9905  }
9906
9907  if (endbrace == RExC_parse) {   /* empty: \N{} */
9908   bool ret = TRUE;
9909   if (node_p) {
9910    *node_p = reg_node(pRExC_state,NOTHING);
9911   }
9912   else if (in_char_class) {
9913    if (SIZE_ONLY && in_char_class) {
9914     if (strict) {
9915      RExC_parse++;   /* Position after the "}" */
9916      vFAIL("Zero length \\N{}");
9917     }
9918     else {
9919      ckWARNreg(RExC_parse,
9920        "Ignoring zero length \\N{} in character class");
9921     }
9922    }
9923    ret = FALSE;
9924   }
9925   else {
9926    return FALSE;
9927   }
9928   nextchar(pRExC_state);
9929   return ret;
9930  }
9931
9932  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9933  RExC_parse += 2; /* Skip past the 'U+' */
9934
9935  endchar = RExC_parse + strcspn(RExC_parse, ".}");
9936
9937  /* Code points are separated by dots.  If none, there is only one code
9938  * point, and is terminated by the brace */
9939  has_multiple_chars = (endchar < endbrace);
9940
9941  if (valuep && (! has_multiple_chars || in_char_class)) {
9942   /* We only pay attention to the first char of
9943   multichar strings being returned in char classes. I kinda wonder
9944   if this makes sense as it does change the behaviour
9945   from earlier versions, OTOH that behaviour was broken
9946   as well. XXX Solution is to recharacterize as
9947   [rest-of-class]|multi1|multi2... */
9948
9949   STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9950   I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9951    | PERL_SCAN_DISALLOW_PREFIX
9952    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9953
9954   *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9955
9956   /* The tokenizer should have guaranteed validity, but it's possible to
9957   * bypass it by using single quoting, so check */
9958   if (length_of_hex == 0
9959    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9960   {
9961    RExC_parse += length_of_hex; /* Includes all the valid */
9962    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9963        ? UTF8SKIP(RExC_parse)
9964        : 1;
9965    /* Guard against malformed utf8 */
9966    if (RExC_parse >= endchar) {
9967     RExC_parse = endchar;
9968    }
9969    vFAIL("Invalid hexadecimal number in \\N{U+...}");
9970   }
9971
9972   if (in_char_class && has_multiple_chars) {
9973    if (strict) {
9974     RExC_parse = endbrace;
9975     vFAIL("\\N{} in character class restricted to one character");
9976    }
9977    else {
9978     ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9979    }
9980   }
9981
9982   RExC_parse = endbrace + 1;
9983  }
9984  else if (! node_p || ! has_multiple_chars) {
9985
9986   /* Here, the input is legal, but not according to the caller's
9987   * options.  We fail without advancing the parse, so that the
9988   * caller can try again */
9989   RExC_parse = p;
9990   return FALSE;
9991  }
9992  else {
9993
9994   /* What is done here is to convert this to a sub-pattern of the form
9995   * (?:\x{char1}\x{char2}...)
9996   * and then call reg recursively.  That way, it retains its atomicness,
9997   * while not having to worry about special handling that some code
9998   * points may have.  toke.c has converted the original Unicode values
9999   * to native, so that we can just pass on the hex values unchanged.  We
10000   * do have to set a flag to keep recoding from happening in the
10001   * recursion */
10002
10003   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10004   STRLEN len;
10005   char *orig_end = RExC_end;
10006   I32 flags;
10007
10008   while (RExC_parse < endbrace) {
10009
10010    /* Convert to notation the rest of the code understands */
10011    sv_catpv(substitute_parse, "\\x{");
10012    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10013    sv_catpv(substitute_parse, "}");
10014
10015    /* Point to the beginning of the next character in the sequence. */
10016    RExC_parse = endchar + 1;
10017    endchar = RExC_parse + strcspn(RExC_parse, ".}");
10018   }
10019   sv_catpv(substitute_parse, ")");
10020
10021   RExC_parse = SvPV(substitute_parse, len);
10022
10023   /* Don't allow empty number */
10024   if (len < 8) {
10025    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10026   }
10027   RExC_end = RExC_parse + len;
10028
10029   /* The values are Unicode, and therefore not subject to recoding */
10030   RExC_override_recoding = 1;
10031
10032   if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10033    if (flags & RESTART_UTF8) {
10034     *flagp = RESTART_UTF8;
10035     return FALSE;
10036    }
10037    FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X",
10038     flags);
10039   }
10040   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10041
10042   RExC_parse = endbrace;
10043   RExC_end = orig_end;
10044   RExC_override_recoding = 0;
10045
10046   nextchar(pRExC_state);
10047  }
10048
10049  return TRUE;
10050 }
10051
10052
10053 /*
10054  * reg_recode
10055  *
10056  * It returns the code point in utf8 for the value in *encp.
10057  *    value: a code value in the source encoding
10058  *    encp:  a pointer to an Encode object
10059  *
10060  * If the result from Encode is not a single character,
10061  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10062  */
10063 STATIC UV
10064 S_reg_recode(pTHX_ const char value, SV **encp)
10065 {
10066  STRLEN numlen = 1;
10067  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10068  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10069  const STRLEN newlen = SvCUR(sv);
10070  UV uv = UNICODE_REPLACEMENT;
10071
10072  PERL_ARGS_ASSERT_REG_RECODE;
10073
10074  if (newlen)
10075   uv = SvUTF8(sv)
10076    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10077    : *(U8*)s;
10078
10079  if (!newlen || numlen != newlen) {
10080   uv = UNICODE_REPLACEMENT;
10081   *encp = NULL;
10082  }
10083  return uv;
10084 }
10085
10086 PERL_STATIC_INLINE U8
10087 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10088 {
10089  U8 op;
10090
10091  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10092
10093  if (! FOLD) {
10094   return EXACT;
10095  }
10096
10097  op = get_regex_charset(RExC_flags);
10098  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10099   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10100     been, so there is no hole */
10101  }
10102
10103  return op + EXACTF;
10104 }
10105
10106 PERL_STATIC_INLINE void
10107 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10108 {
10109  /* This knows the details about sizing an EXACTish node, setting flags for
10110  * it (by setting <*flagp>, and potentially populating it with a single
10111  * character.
10112  *
10113  * If <len> (the length in bytes) is non-zero, this function assumes that
10114  * the node has already been populated, and just does the sizing.  In this
10115  * case <code_point> should be the final code point that has already been
10116  * placed into the node.  This value will be ignored except that under some
10117  * circumstances <*flagp> is set based on it.
10118  *
10119  * If <len> is zero, the function assumes that the node is to contain only
10120  * the single character given by <code_point> and calculates what <len>
10121  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10122  * additionally will populate the node's STRING with <code_point>, if <len>
10123  * is 0.  In both cases <*flagp> is appropriately set
10124  *
10125  * It knows that under FOLD, the Latin Sharp S and UTF characters above
10126  * 255, must be folded (the former only when the rules indicate it can
10127  * match 'ss') */
10128
10129  bool len_passed_in = cBOOL(len != 0);
10130  U8 character[UTF8_MAXBYTES_CASE+1];
10131
10132  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10133
10134  if (! len_passed_in) {
10135   if (UTF) {
10136    if (FOLD && (! LOC || code_point > 255)) {
10137     _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
10138         character,
10139         &len,
10140         FOLD_FLAGS_FULL | ((LOC)
10141              ? FOLD_FLAGS_LOCALE
10142              : (ASCII_FOLD_RESTRICTED)
10143              ? FOLD_FLAGS_NOMIX_ASCII
10144              : 0));
10145    }
10146    else {
10147     uvchr_to_utf8( character, code_point);
10148     len = UTF8SKIP(character);
10149    }
10150   }
10151   else if (! FOLD
10152     || code_point != LATIN_SMALL_LETTER_SHARP_S
10153     || ASCII_FOLD_RESTRICTED
10154     || ! AT_LEAST_UNI_SEMANTICS)
10155   {
10156    *character = (U8) code_point;
10157    len = 1;
10158   }
10159   else {
10160    *character = 's';
10161    *(character + 1) = 's';
10162    len = 2;
10163   }
10164  }
10165
10166  if (SIZE_ONLY) {
10167   RExC_size += STR_SZ(len);
10168  }
10169  else {
10170   RExC_emit += STR_SZ(len);
10171   STR_LEN(node) = len;
10172   if (! len_passed_in) {
10173    Copy((char *) character, STRING(node), len, char);
10174   }
10175  }
10176
10177  *flagp |= HASWIDTH;
10178
10179  /* A single character node is SIMPLE, except for the special-cased SHARP S
10180  * under /di. */
10181  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10182   && (code_point != LATIN_SMALL_LETTER_SHARP_S
10183    || ! FOLD || ! DEPENDS_SEMANTICS))
10184  {
10185   *flagp |= SIMPLE;
10186  }
10187 }
10188
10189 /*
10190  - regatom - the lowest level
10191
10192    Try to identify anything special at the start of the pattern. If there
10193    is, then handle it as required. This may involve generating a single regop,
10194    such as for an assertion; or it may involve recursing, such as to
10195    handle a () structure.
10196
10197    If the string doesn't start with something special then we gobble up
10198    as much literal text as we can.
10199
10200    Once we have been able to handle whatever type of thing started the
10201    sequence, we return.
10202
10203    Note: we have to be careful with escapes, as they can be both literal
10204    and special, and in the case of \10 and friends, context determines which.
10205
10206    A summary of the code structure is:
10207
10208    switch (first_byte) {
10209   cases for each special:
10210    handle this special;
10211    break;
10212   case '\\':
10213    switch (2nd byte) {
10214     cases for each unambiguous special:
10215      handle this special;
10216      break;
10217     cases for each ambiguous special/literal:
10218      disambiguate;
10219      if (special)  handle here
10220      else goto defchar;
10221     default: // unambiguously literal:
10222      goto defchar;
10223    }
10224   default:  // is a literal char
10225    // FALL THROUGH
10226   defchar:
10227    create EXACTish node for literal;
10228    while (more input and node isn't full) {
10229     switch (input_byte) {
10230     cases for each special;
10231      make sure parse pointer is set so that the next call to
10232       regatom will see this special first
10233      goto loopdone; // EXACTish node terminated by prev. char
10234     default:
10235      append char to EXACTISH node;
10236     }
10237     get next input byte;
10238    }
10239   loopdone:
10240    }
10241    return the generated node;
10242
10243    Specifically there are two separate switches for handling
10244    escape sequences, with the one for handling literal escapes requiring
10245    a dummy entry for all of the special escapes that are actually handled
10246    by the other.
10247
10248    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10249    TRYAGAIN.
10250    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10251    restarted.
10252    Otherwise does not return NULL.
10253 */
10254
10255 STATIC regnode *
10256 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10257 {
10258  dVAR;
10259  regnode *ret = NULL;
10260  I32 flags = 0;
10261  char *parse_start = RExC_parse;
10262  U8 op;
10263  int invert = 0;
10264
10265  GET_RE_DEBUG_FLAGS_DECL;
10266
10267  *flagp = WORST;  /* Tentatively. */
10268
10269  DEBUG_PARSE("atom");
10270
10271  PERL_ARGS_ASSERT_REGATOM;
10272
10273 tryagain:
10274  switch ((U8)*RExC_parse) {
10275  case '^':
10276   RExC_seen_zerolen++;
10277   nextchar(pRExC_state);
10278   if (RExC_flags & RXf_PMf_MULTILINE)
10279    ret = reg_node(pRExC_state, MBOL);
10280   else if (RExC_flags & RXf_PMf_SINGLELINE)
10281    ret = reg_node(pRExC_state, SBOL);
10282   else
10283    ret = reg_node(pRExC_state, BOL);
10284   Set_Node_Length(ret, 1); /* MJD */
10285   break;
10286  case '$':
10287   nextchar(pRExC_state);
10288   if (*RExC_parse)
10289    RExC_seen_zerolen++;
10290   if (RExC_flags & RXf_PMf_MULTILINE)
10291    ret = reg_node(pRExC_state, MEOL);
10292   else if (RExC_flags & RXf_PMf_SINGLELINE)
10293    ret = reg_node(pRExC_state, SEOL);
10294   else
10295    ret = reg_node(pRExC_state, EOL);
10296   Set_Node_Length(ret, 1); /* MJD */
10297   break;
10298  case '.':
10299   nextchar(pRExC_state);
10300   if (RExC_flags & RXf_PMf_SINGLELINE)
10301    ret = reg_node(pRExC_state, SANY);
10302   else
10303    ret = reg_node(pRExC_state, REG_ANY);
10304   *flagp |= HASWIDTH|SIMPLE;
10305   RExC_naughty++;
10306   Set_Node_Length(ret, 1); /* MJD */
10307   break;
10308  case '[':
10309  {
10310   char * const oregcomp_parse = ++RExC_parse;
10311   ret = regclass(pRExC_state, flagp,depth+1,
10312      FALSE, /* means parse the whole char class */
10313      TRUE, /* allow multi-char folds */
10314      FALSE, /* don't silence non-portable warnings. */
10315      NULL);
10316   if (*RExC_parse != ']') {
10317    RExC_parse = oregcomp_parse;
10318    vFAIL("Unmatched [");
10319   }
10320   if (ret == NULL) {
10321    if (*flagp & RESTART_UTF8)
10322     return NULL;
10323    FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10324     *flagp);
10325   }
10326   nextchar(pRExC_state);
10327   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10328   break;
10329  }
10330  case '(':
10331   nextchar(pRExC_state);
10332   ret = reg(pRExC_state, 2, &flags,depth+1);
10333   if (ret == NULL) {
10334     if (flags & TRYAGAIN) {
10335      if (RExC_parse == RExC_end) {
10336       /* Make parent create an empty node if needed. */
10337       *flagp |= TRYAGAIN;
10338       return(NULL);
10339      }
10340      goto tryagain;
10341     }
10342     if (flags & RESTART_UTF8) {
10343      *flagp = RESTART_UTF8;
10344      return NULL;
10345     }
10346     FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags);
10347   }
10348   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10349   break;
10350  case '|':
10351  case ')':
10352   if (flags & TRYAGAIN) {
10353    *flagp |= TRYAGAIN;
10354    return NULL;
10355   }
10356   vFAIL("Internal urp");
10357         /* Supposed to be caught earlier. */
10358   break;
10359  case '{':
10360   if (!regcurly(RExC_parse, FALSE)) {
10361    RExC_parse++;
10362    goto defchar;
10363   }
10364   /* FALL THROUGH */
10365  case '?':
10366  case '+':
10367  case '*':
10368   RExC_parse++;
10369   vFAIL("Quantifier follows nothing");
10370   break;
10371  case '\\':
10372   /* Special Escapes
10373
10374   This switch handles escape sequences that resolve to some kind
10375   of special regop and not to literal text. Escape sequences that
10376   resolve to literal text are handled below in the switch marked
10377   "Literal Escapes".
10378
10379   Every entry in this switch *must* have a corresponding entry
10380   in the literal escape switch. However, the opposite is not
10381   required, as the default for this switch is to jump to the
10382   literal text handling code.
10383   */
10384   switch ((U8)*++RExC_parse) {
10385    U8 arg;
10386   /* Special Escapes */
10387   case 'A':
10388    RExC_seen_zerolen++;
10389    ret = reg_node(pRExC_state, SBOL);
10390    *flagp |= SIMPLE;
10391    goto finish_meta_pat;
10392   case 'G':
10393    ret = reg_node(pRExC_state, GPOS);
10394    RExC_seen |= REG_SEEN_GPOS;
10395    *flagp |= SIMPLE;
10396    goto finish_meta_pat;
10397   case 'K':
10398    RExC_seen_zerolen++;
10399    ret = reg_node(pRExC_state, KEEPS);
10400    *flagp |= SIMPLE;
10401    /* XXX:dmq : disabling in-place substitution seems to
10402    * be necessary here to avoid cases of memory corruption, as
10403    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10404    */
10405    RExC_seen |= REG_SEEN_LOOKBEHIND;
10406    goto finish_meta_pat;
10407   case 'Z':
10408    ret = reg_node(pRExC_state, SEOL);
10409    *flagp |= SIMPLE;
10410    RExC_seen_zerolen++;  /* Do not optimize RE away */
10411    goto finish_meta_pat;
10412   case 'z':
10413    ret = reg_node(pRExC_state, EOS);
10414    *flagp |= SIMPLE;
10415    RExC_seen_zerolen++;  /* Do not optimize RE away */
10416    goto finish_meta_pat;
10417   case 'C':
10418    ret = reg_node(pRExC_state, CANY);
10419    RExC_seen |= REG_SEEN_CANY;
10420    *flagp |= HASWIDTH|SIMPLE;
10421    goto finish_meta_pat;
10422   case 'X':
10423    ret = reg_node(pRExC_state, CLUMP);
10424    *flagp |= HASWIDTH;
10425    goto finish_meta_pat;
10426
10427   case 'W':
10428    invert = 1;
10429    /* FALLTHROUGH */
10430   case 'w':
10431    arg = ANYOF_WORDCHAR;
10432    goto join_posix;
10433
10434   case 'b':
10435    RExC_seen_zerolen++;
10436    RExC_seen |= REG_SEEN_LOOKBEHIND;
10437    op = BOUND + get_regex_charset(RExC_flags);
10438    if (op > BOUNDA) {  /* /aa is same as /a */
10439     op = BOUNDA;
10440    }
10441    ret = reg_node(pRExC_state, op);
10442    FLAGS(ret) = get_regex_charset(RExC_flags);
10443    *flagp |= SIMPLE;
10444    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10445     ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10446    }
10447    goto finish_meta_pat;
10448   case 'B':
10449    RExC_seen_zerolen++;
10450    RExC_seen |= REG_SEEN_LOOKBEHIND;
10451    op = NBOUND + get_regex_charset(RExC_flags);
10452    if (op > NBOUNDA) { /* /aa is same as /a */
10453     op = NBOUNDA;
10454    }
10455    ret = reg_node(pRExC_state, op);
10456    FLAGS(ret) = get_regex_charset(RExC_flags);
10457    *flagp |= SIMPLE;
10458    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10459     ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10460    }
10461    goto finish_meta_pat;
10462
10463   case 'D':
10464    invert = 1;
10465    /* FALLTHROUGH */
10466   case 'd':
10467    arg = ANYOF_DIGIT;
10468    goto join_posix;
10469
10470   case 'R':
10471    ret = reg_node(pRExC_state, LNBREAK);
10472    *flagp |= HASWIDTH|SIMPLE;
10473    goto finish_meta_pat;
10474
10475   case 'H':
10476    invert = 1;
10477    /* FALLTHROUGH */
10478   case 'h':
10479    arg = ANYOF_BLANK;
10480    op = POSIXU;
10481    goto join_posix_op_known;
10482
10483   case 'V':
10484    invert = 1;
10485    /* FALLTHROUGH */
10486   case 'v':
10487    arg = ANYOF_VERTWS;
10488    op = POSIXU;
10489    goto join_posix_op_known;
10490
10491   case 'S':
10492    invert = 1;
10493    /* FALLTHROUGH */
10494   case 's':
10495    arg = ANYOF_SPACE;
10496
10497   join_posix:
10498
10499    op = POSIXD + get_regex_charset(RExC_flags);
10500    if (op > POSIXA) {  /* /aa is same as /a */
10501     op = POSIXA;
10502    }
10503
10504   join_posix_op_known:
10505
10506    if (invert) {
10507     op += NPOSIXD - POSIXD;
10508    }
10509
10510    ret = reg_node(pRExC_state, op);
10511    if (! SIZE_ONLY) {
10512     FLAGS(ret) = namedclass_to_classnum(arg);
10513    }
10514
10515    *flagp |= HASWIDTH|SIMPLE;
10516    /* FALL THROUGH */
10517
10518   finish_meta_pat:
10519    nextchar(pRExC_state);
10520    Set_Node_Length(ret, 2); /* MJD */
10521    break;
10522   case 'p':
10523   case 'P':
10524    {
10525 #ifdef DEBUGGING
10526     char* parse_start = RExC_parse - 2;
10527 #endif
10528
10529     RExC_parse--;
10530
10531     ret = regclass(pRExC_state, flagp,depth+1,
10532        TRUE, /* means just parse this element */
10533        FALSE, /* don't allow multi-char folds */
10534        FALSE, /* don't silence non-portable warnings.
10535           It would be a bug if these returned
10536           non-portables */
10537        NULL);
10538     /* regclass() can only return RESTART_UTF8 if multi-char folds
10539     are allowed.  */
10540     if (!ret)
10541      FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10542       *flagp);
10543
10544     RExC_parse--;
10545
10546     Set_Node_Offset(ret, parse_start + 2);
10547     Set_Node_Cur_Length(ret);
10548     nextchar(pRExC_state);
10549    }
10550    break;
10551   case 'N':
10552    /* Handle \N and \N{NAME} with multiple code points here and not
10553    * below because it can be multicharacter. join_exact() will join
10554    * them up later on.  Also this makes sure that things like
10555    * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10556    * The options to the grok function call causes it to fail if the
10557    * sequence is just a single code point.  We then go treat it as
10558    * just another character in the current EXACT node, and hence it
10559    * gets uniform treatment with all the other characters.  The
10560    * special treatment for quantifiers is not needed for such single
10561    * character sequences */
10562    ++RExC_parse;
10563    if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10564         FALSE /* not strict */ )) {
10565     if (*flagp & RESTART_UTF8)
10566      return NULL;
10567     RExC_parse--;
10568     goto defchar;
10569    }
10570    break;
10571   case 'k':    /* Handle \k<NAME> and \k'NAME' */
10572   parse_named_seq:
10573   {
10574    char ch= RExC_parse[1];
10575    if (ch != '<' && ch != '\'' && ch != '{') {
10576     RExC_parse++;
10577     vFAIL2("Sequence %.2s... not terminated",parse_start);
10578    } else {
10579     /* this pretty much dupes the code for (?P=...) in reg(), if
10580     you change this make sure you change that */
10581     char* name_start = (RExC_parse += 2);
10582     U32 num = 0;
10583     SV *sv_dat = reg_scan_name(pRExC_state,
10584      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10585     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10586     if (RExC_parse == name_start || *RExC_parse != ch)
10587      vFAIL2("Sequence %.3s... not terminated",parse_start);
10588
10589     if (!SIZE_ONLY) {
10590      num = add_data( pRExC_state, 1, "S" );
10591      RExC_rxi->data->data[num]=(void*)sv_dat;
10592      SvREFCNT_inc_simple_void(sv_dat);
10593     }
10594
10595     RExC_sawback = 1;
10596     ret = reganode(pRExC_state,
10597        ((! FOLD)
10598         ? NREF
10599         : (ASCII_FOLD_RESTRICTED)
10600         ? NREFFA
10601         : (AT_LEAST_UNI_SEMANTICS)
10602          ? NREFFU
10603          : (LOC)
10604          ? NREFFL
10605          : NREFF),
10606         num);
10607     *flagp |= HASWIDTH;
10608
10609     /* override incorrect value set in reganode MJD */
10610     Set_Node_Offset(ret, parse_start+1);
10611     Set_Node_Cur_Length(ret); /* MJD */
10612     nextchar(pRExC_state);
10613
10614    }
10615    break;
10616   }
10617   case 'g':
10618   case '1': case '2': case '3': case '4':
10619   case '5': case '6': case '7': case '8': case '9':
10620    {
10621     I32 num;
10622     bool isg = *RExC_parse == 'g';
10623     bool isrel = 0;
10624     bool hasbrace = 0;
10625     if (isg) {
10626      RExC_parse++;
10627      if (*RExC_parse == '{') {
10628       RExC_parse++;
10629       hasbrace = 1;
10630      }
10631      if (*RExC_parse == '-') {
10632       RExC_parse++;
10633       isrel = 1;
10634      }
10635      if (hasbrace && !isDIGIT(*RExC_parse)) {
10636       if (isrel) RExC_parse--;
10637       RExC_parse -= 2;
10638       goto parse_named_seq;
10639     }   }
10640     num = atoi(RExC_parse);
10641     if (isg && num == 0)
10642      vFAIL("Reference to invalid group 0");
10643     if (isrel) {
10644      num = RExC_npar - num;
10645      if (num < 1)
10646       vFAIL("Reference to nonexistent or unclosed group");
10647     }
10648     if (!isg && num > 9 && num >= RExC_npar)
10649      /* Probably a character specified in octal, e.g. \35 */
10650      goto defchar;
10651     else {
10652      char * const parse_start = RExC_parse - 1; /* MJD */
10653      while (isDIGIT(*RExC_parse))
10654       RExC_parse++;
10655      if (parse_start == RExC_parse - 1)
10656       vFAIL("Unterminated \\g... pattern");
10657      if (hasbrace) {
10658       if (*RExC_parse != '}')
10659        vFAIL("Unterminated \\g{...} pattern");
10660       RExC_parse++;
10661      }
10662      if (!SIZE_ONLY) {
10663       if (num > (I32)RExC_rx->nparens)
10664        vFAIL("Reference to nonexistent group");
10665      }
10666      RExC_sawback = 1;
10667      ret = reganode(pRExC_state,
10668         ((! FOLD)
10669          ? REF
10670          : (ASCII_FOLD_RESTRICTED)
10671          ? REFFA
10672          : (AT_LEAST_UNI_SEMANTICS)
10673           ? REFFU
10674           : (LOC)
10675           ? REFFL
10676           : REFF),
10677          num);
10678      *flagp |= HASWIDTH;
10679
10680      /* override incorrect value set in reganode MJD */
10681      Set_Node_Offset(ret, parse_start+1);
10682      Set_Node_Cur_Length(ret); /* MJD */
10683      RExC_parse--;
10684      nextchar(pRExC_state);
10685     }
10686    }
10687    break;
10688   case '\0':
10689    if (RExC_parse >= RExC_end)
10690     FAIL("Trailing \\");
10691    /* FALL THROUGH */
10692   default:
10693    /* Do not generate "unrecognized" warnings here, we fall
10694    back into the quick-grab loop below */
10695    parse_start--;
10696    goto defchar;
10697   }
10698   break;
10699
10700  case '#':
10701   if (RExC_flags & RXf_PMf_EXTENDED) {
10702    if ( reg_skipcomment( pRExC_state ) )
10703     goto tryagain;
10704   }
10705   /* FALL THROUGH */
10706
10707  default:
10708
10709    parse_start = RExC_parse - 1;
10710
10711    RExC_parse++;
10712
10713   defchar: {
10714    STRLEN len = 0;
10715    UV ender;
10716    char *p;
10717    char *s;
10718 #define MAX_NODE_STRING_SIZE 127
10719    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10720    char *s0;
10721    U8 upper_parse = MAX_NODE_STRING_SIZE;
10722    STRLEN foldlen;
10723    U8 node_type;
10724    bool next_is_quantifier;
10725    char * oldp = NULL;
10726
10727    /* If a folding node contains only code points that don't
10728    * participate in folds, it can be changed into an EXACT node,
10729    * which allows the optimizer more things to look for */
10730    bool maybe_exact;
10731
10732    ender = 0;
10733    node_type = compute_EXACTish(pRExC_state);
10734    ret = reg_node(pRExC_state, node_type);
10735
10736    /* In pass1, folded, we use a temporary buffer instead of the
10737    * actual node, as the node doesn't exist yet */
10738    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10739
10740    s0 = s;
10741
10742   reparse:
10743
10744    /* We do the EXACTFish to EXACT node only if folding, and not if in
10745    * locale, as whether a character folds or not isn't known until
10746    * runtime */
10747    maybe_exact = FOLD && ! LOC;
10748
10749    /* XXX The node can hold up to 255 bytes, yet this only goes to
10750    * 127.  I (khw) do not know why.  Keeping it somewhat less than
10751    * 255 allows us to not have to worry about overflow due to
10752    * converting to utf8 and fold expansion, but that value is
10753    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10754    * split up by this limit into a single one using the real max of
10755    * 255.  Even at 127, this breaks under rare circumstances.  If
10756    * folding, we do not want to split a node at a character that is a
10757    * non-final in a multi-char fold, as an input string could just
10758    * happen to want to match across the node boundary.  The join
10759    * would solve that problem if the join actually happens.  But a
10760    * series of more than two nodes in a row each of 127 would cause
10761    * the first join to succeed to get to 254, but then there wouldn't
10762    * be room for the next one, which could at be one of those split
10763    * multi-char folds.  I don't know of any fool-proof solution.  One
10764    * could back off to end with only a code point that isn't such a
10765    * non-final, but it is possible for there not to be any in the
10766    * entire node. */
10767    for (p = RExC_parse - 1;
10768     len < upper_parse && p < RExC_end;
10769     len++)
10770    {
10771     oldp = p;
10772
10773     if (RExC_flags & RXf_PMf_EXTENDED)
10774      p = regwhite( pRExC_state, p );
10775     switch ((U8)*p) {
10776     case '^':
10777     case '$':
10778     case '.':
10779     case '[':
10780     case '(':
10781     case ')':
10782     case '|':
10783      goto loopdone;
10784     case '\\':
10785      /* Literal Escapes Switch
10786
10787      This switch is meant to handle escape sequences that
10788      resolve to a literal character.
10789
10790      Every escape sequence that represents something
10791      else, like an assertion or a char class, is handled
10792      in the switch marked 'Special Escapes' above in this
10793      routine, but also has an entry here as anything that
10794      isn't explicitly mentioned here will be treated as
10795      an unescaped equivalent literal.
10796      */
10797
10798      switch ((U8)*++p) {
10799      /* These are all the special escapes. */
10800      case 'A':             /* Start assertion */
10801      case 'b': case 'B':   /* Word-boundary assertion*/
10802      case 'C':             /* Single char !DANGEROUS! */
10803      case 'd': case 'D':   /* digit class */
10804      case 'g': case 'G':   /* generic-backref, pos assertion */
10805      case 'h': case 'H':   /* HORIZWS */
10806      case 'k': case 'K':   /* named backref, keep marker */
10807      case 'p': case 'P':   /* Unicode property */
10808        case 'R':   /* LNBREAK */
10809      case 's': case 'S':   /* space class */
10810      case 'v': case 'V':   /* VERTWS */
10811      case 'w': case 'W':   /* word class */
10812      case 'X':             /* eXtended Unicode "combining character sequence" */
10813      case 'z': case 'Z':   /* End of line/string assertion */
10814       --p;
10815       goto loopdone;
10816
10817      /* Anything after here is an escape that resolves to a
10818      literal. (Except digits, which may or may not)
10819      */
10820      case 'n':
10821       ender = '\n';
10822       p++;
10823       break;
10824      case 'N': /* Handle a single-code point named character. */
10825       /* The options cause it to fail if a multiple code
10826       * point sequence.  Handle those in the switch() above
10827       * */
10828       RExC_parse = p + 1;
10829       if (! grok_bslash_N(pRExC_state, NULL, &ender,
10830            flagp, depth, FALSE,
10831            FALSE /* not strict */ ))
10832       {
10833        if (*flagp & RESTART_UTF8)
10834         FAIL("panic: grok_bslash_N set RESTART_UTF8");
10835        RExC_parse = p = oldp;
10836        goto loopdone;
10837       }
10838       p = RExC_parse;
10839       if (ender > 0xff) {
10840        REQUIRE_UTF8;
10841       }
10842       break;
10843      case 'r':
10844       ender = '\r';
10845       p++;
10846       break;
10847      case 't':
10848       ender = '\t';
10849       p++;
10850       break;
10851      case 'f':
10852       ender = '\f';
10853       p++;
10854       break;
10855      case 'e':
10856       ender = ASCII_TO_NATIVE('\033');
10857       p++;
10858       break;
10859      case 'a':
10860       ender = ASCII_TO_NATIVE('\007');
10861       p++;
10862       break;
10863      case 'o':
10864       {
10865        UV result;
10866        const char* error_msg;
10867
10868        bool valid = grok_bslash_o(&p,
10869              &result,
10870              &error_msg,
10871              TRUE, /* out warnings */
10872              FALSE, /* not strict */
10873              TRUE, /* Output warnings
10874                 for non-
10875                 portables */
10876              UTF);
10877        if (! valid) {
10878         RExC_parse = p; /* going to die anyway; point
10879             to exact spot of failure */
10880         vFAIL(error_msg);
10881        }
10882        ender = result;
10883        if (PL_encoding && ender < 0x100) {
10884         goto recode_encoding;
10885        }
10886        if (ender > 0xff) {
10887         REQUIRE_UTF8;
10888        }
10889        break;
10890       }
10891      case 'x':
10892       {
10893        UV result = UV_MAX; /* initialize to erroneous
10894             value */
10895        const char* error_msg;
10896
10897        bool valid = grok_bslash_x(&p,
10898              &result,
10899              &error_msg,
10900              TRUE, /* out warnings */
10901              FALSE, /* not strict */
10902              TRUE, /* Output warnings
10903                 for non-
10904                 portables */
10905              UTF);
10906        if (! valid) {
10907         RExC_parse = p; /* going to die anyway; point
10908             to exact spot of failure */
10909         vFAIL(error_msg);
10910        }
10911        ender = result;
10912
10913        if (PL_encoding && ender < 0x100) {
10914         goto recode_encoding;
10915        }
10916        if (ender > 0xff) {
10917         REQUIRE_UTF8;
10918        }
10919        break;
10920       }
10921      case 'c':
10922       p++;
10923       ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10924       break;
10925      case '0': case '1': case '2': case '3':case '4':
10926      case '5': case '6': case '7':
10927       if (*p == '0' ||
10928        (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10929       {
10930        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10931        STRLEN numlen = 3;
10932        ender = grok_oct(p, &numlen, &flags, NULL);
10933        if (ender > 0xff) {
10934         REQUIRE_UTF8;
10935        }
10936        p += numlen;
10937        if (SIZE_ONLY   /* like \08, \178 */
10938         && numlen < 3
10939         && p < RExC_end
10940         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10941        {
10942         reg_warn_non_literal_string(
10943           p + 1,
10944           form_short_octal_warning(p, numlen));
10945        }
10946       }
10947       else {  /* Not to be treated as an octal constant, go
10948         find backref */
10949        --p;
10950        goto loopdone;
10951       }
10952       if (PL_encoding && ender < 0x100)
10953        goto recode_encoding;
10954       break;
10955      recode_encoding:
10956       if (! RExC_override_recoding) {
10957        SV* enc = PL_encoding;
10958        ender = reg_recode((const char)(U8)ender, &enc);
10959        if (!enc && SIZE_ONLY)
10960         ckWARNreg(p, "Invalid escape in the specified encoding");
10961        REQUIRE_UTF8;
10962       }
10963       break;
10964      case '\0':
10965       if (p >= RExC_end)
10966        FAIL("Trailing \\");
10967       /* FALL THROUGH */
10968      default:
10969       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10970        /* Include any { following the alpha to emphasize
10971        * that it could be part of an escape at some point
10972        * in the future */
10973        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10974        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10975       }
10976       goto normal_default;
10977      } /* End of switch on '\' */
10978      break;
10979     default:    /* A literal character */
10980
10981      if (! SIZE_ONLY
10982       && RExC_flags & RXf_PMf_EXTENDED
10983       && ckWARN(WARN_DEPRECATED)
10984       && is_PATWS_non_low(p, UTF))
10985      {
10986       vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
10987         "Escape literal pattern white space under /x");
10988      }
10989
10990     normal_default:
10991      if (UTF8_IS_START(*p) && UTF) {
10992       STRLEN numlen;
10993       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10994            &numlen, UTF8_ALLOW_DEFAULT);
10995       p += numlen;
10996      }
10997      else
10998       ender = (U8) *p++;
10999      break;
11000     } /* End of switch on the literal */
11001
11002     /* Here, have looked at the literal character and <ender>
11003     * contains its ordinal, <p> points to the character after it
11004     */
11005
11006     if ( RExC_flags & RXf_PMf_EXTENDED)
11007      p = regwhite( pRExC_state, p );
11008
11009     /* If the next thing is a quantifier, it applies to this
11010     * character only, which means that this character has to be in
11011     * its own node and can't just be appended to the string in an
11012     * existing node, so if there are already other characters in
11013     * the node, close the node with just them, and set up to do
11014     * this character again next time through, when it will be the
11015     * only thing in its new node */
11016     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11017     {
11018      p = oldp;
11019      goto loopdone;
11020     }
11021
11022     if (FOLD) {
11023      if (UTF
11024        /* See comments for join_exact() as to why we fold
11025        * this non-UTF at compile time */
11026       || (node_type == EXACTFU
11027        && ender == LATIN_SMALL_LETTER_SHARP_S))
11028      {
11029
11030
11031       /* Prime the casefolded buffer.  Locale rules, which
11032       * apply only to code points < 256, aren't known until
11033       * execution, so for them, just output the original
11034       * character using utf8.  If we start to fold non-UTF
11035       * patterns, be sure to update join_exact() */
11036       if (LOC && ender < 256) {
11037        if (UNI_IS_INVARIANT(ender)) {
11038         *s = (U8) ender;
11039         foldlen = 1;
11040        } else {
11041         *s = UTF8_TWO_BYTE_HI(ender);
11042         *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11043         foldlen = 2;
11044        }
11045       }
11046       else {
11047        UV folded = _to_uni_fold_flags(
11048           ender,
11049           (U8 *) s,
11050           &foldlen,
11051           FOLD_FLAGS_FULL
11052           | ((LOC) ?  FOLD_FLAGS_LOCALE
11053              : (ASCII_FOLD_RESTRICTED)
11054              ? FOLD_FLAGS_NOMIX_ASCII
11055              : 0)
11056            );
11057
11058        /* If this node only contains non-folding code
11059        * points so far, see if this new one is also
11060        * non-folding */
11061        if (maybe_exact) {
11062         if (folded != ender) {
11063          maybe_exact = FALSE;
11064         }
11065         else {
11066          /* Here the fold is the original; we have
11067          * to check further to see if anything
11068          * folds to it */
11069          if (! PL_utf8_foldable) {
11070           SV* swash = swash_init("utf8",
11071               "_Perl_Any_Folds",
11072               &PL_sv_undef, 1, 0);
11073           PL_utf8_foldable =
11074              _get_swash_invlist(swash);
11075           SvREFCNT_dec_NN(swash);
11076          }
11077          if (_invlist_contains_cp(PL_utf8_foldable,
11078                ender))
11079          {
11080           maybe_exact = FALSE;
11081          }
11082         }
11083        }
11084        ender = folded;
11085       }
11086       s += foldlen;
11087
11088       /* The loop increments <len> each time, as all but this
11089       * path (and the one just below for UTF) through it add
11090       * a single byte to the EXACTish node.  But this one
11091       * has changed len to be the correct final value, so
11092       * subtract one to cancel out the increment that
11093       * follows */
11094       len += foldlen - 1;
11095      }
11096      else {
11097       *(s++) = (char) ender;
11098       maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
11099      }
11100     }
11101     else if (UTF) {
11102      const STRLEN unilen = reguni(pRExC_state, ender, s);
11103      if (unilen > 0) {
11104      s   += unilen;
11105      len += unilen;
11106      }
11107
11108      /* See comment just above for - 1 */
11109      len--;
11110     }
11111     else {
11112      REGC((char)ender, s++);
11113     }
11114
11115     if (next_is_quantifier) {
11116
11117      /* Here, the next input is a quantifier, and to get here,
11118      * the current character is the only one in the node.
11119      * Also, here <len> doesn't include the final byte for this
11120      * character */
11121      len++;
11122      goto loopdone;
11123     }
11124
11125    } /* End of loop through literal characters */
11126
11127    /* Here we have either exhausted the input or ran out of room in
11128    * the node.  (If we encountered a character that can't be in the
11129    * node, transfer is made directly to <loopdone>, and so we
11130    * wouldn't have fallen off the end of the loop.)  In the latter
11131    * case, we artificially have to split the node into two, because
11132    * we just don't have enough space to hold everything.  This
11133    * creates a problem if the final character participates in a
11134    * multi-character fold in the non-final position, as a match that
11135    * should have occurred won't, due to the way nodes are matched,
11136    * and our artificial boundary.  So back off until we find a non-
11137    * problematic character -- one that isn't at the beginning or
11138    * middle of such a fold.  (Either it doesn't participate in any
11139    * folds, or appears only in the final position of all the folds it
11140    * does participate in.)  A better solution with far fewer false
11141    * positives, and that would fill the nodes more completely, would
11142    * be to actually have available all the multi-character folds to
11143    * test against, and to back-off only far enough to be sure that
11144    * this node isn't ending with a partial one.  <upper_parse> is set
11145    * further below (if we need to reparse the node) to include just
11146    * up through that final non-problematic character that this code
11147    * identifies, so when it is set to less than the full node, we can
11148    * skip the rest of this */
11149    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11150
11151     const STRLEN full_len = len;
11152
11153     assert(len >= MAX_NODE_STRING_SIZE);
11154
11155     /* Here, <s> points to the final byte of the final character.
11156     * Look backwards through the string until find a non-
11157     * problematic character */
11158
11159     if (! UTF) {
11160
11161      /* These two have no multi-char folds to non-UTF characters
11162      */
11163      if (ASCII_FOLD_RESTRICTED || LOC) {
11164       goto loopdone;
11165      }
11166
11167      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11168      len = s - s0 + 1;
11169     }
11170     else {
11171      if (!  PL_NonL1NonFinalFold) {
11172       PL_NonL1NonFinalFold = _new_invlist_C_array(
11173           NonL1_Perl_Non_Final_Folds_invlist);
11174      }
11175
11176      /* Point to the first byte of the final character */
11177      s = (char *) utf8_hop((U8 *) s, -1);
11178
11179      while (s >= s0) {   /* Search backwards until find
11180           non-problematic char */
11181       if (UTF8_IS_INVARIANT(*s)) {
11182
11183        /* There are no ascii characters that participate
11184        * in multi-char folds under /aa.  In EBCDIC, the
11185        * non-ascii invariants are all control characters,
11186        * so don't ever participate in any folds. */
11187        if (ASCII_FOLD_RESTRICTED
11188         || ! IS_NON_FINAL_FOLD(*s))
11189        {
11190         break;
11191        }
11192       }
11193       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11194
11195        /* No Latin1 characters participate in multi-char
11196        * folds under /l */
11197        if (LOC
11198         || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11199                 *s, *(s+1))))
11200        {
11201         break;
11202        }
11203       }
11204       else if (! _invlist_contains_cp(
11205           PL_NonL1NonFinalFold,
11206           valid_utf8_to_uvchr((U8 *) s, NULL)))
11207       {
11208        break;
11209       }
11210
11211       /* Here, the current character is problematic in that
11212       * it does occur in the non-final position of some
11213       * fold, so try the character before it, but have to
11214       * special case the very first byte in the string, so
11215       * we don't read outside the string */
11216       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11217      } /* End of loop backwards through the string */
11218
11219      /* If there were only problematic characters in the string,
11220      * <s> will point to before s0, in which case the length
11221      * should be 0, otherwise include the length of the
11222      * non-problematic character just found */
11223      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11224     }
11225
11226     /* Here, have found the final character, if any, that is
11227     * non-problematic as far as ending the node without splitting
11228     * it across a potential multi-char fold.  <len> contains the
11229     * number of bytes in the node up-to and including that
11230     * character, or is 0 if there is no such character, meaning
11231     * the whole node contains only problematic characters.  In
11232     * this case, give up and just take the node as-is.  We can't
11233     * do any better */
11234     if (len == 0) {
11235      len = full_len;
11236     } else {
11237
11238      /* Here, the node does contain some characters that aren't
11239      * problematic.  If one such is the final character in the
11240      * node, we are done */
11241      if (len == full_len) {
11242       goto loopdone;
11243      }
11244      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11245
11246       /* If the final character is problematic, but the
11247       * penultimate is not, back-off that last character to
11248       * later start a new node with it */
11249       p = oldp;
11250       goto loopdone;
11251      }
11252
11253      /* Here, the final non-problematic character is earlier
11254      * in the input than the penultimate character.  What we do
11255      * is reparse from the beginning, going up only as far as
11256      * this final ok one, thus guaranteeing that the node ends
11257      * in an acceptable character.  The reason we reparse is
11258      * that we know how far in the character is, but we don't
11259      * know how to correlate its position with the input parse.
11260      * An alternate implementation would be to build that
11261      * correlation as we go along during the original parse,
11262      * but that would entail extra work for every node, whereas
11263      * this code gets executed only when the string is too
11264      * large for the node, and the final two characters are
11265      * problematic, an infrequent occurrence.  Yet another
11266      * possible strategy would be to save the tail of the
11267      * string, and the next time regatom is called, initialize
11268      * with that.  The problem with this is that unless you
11269      * back off one more character, you won't be guaranteed
11270      * regatom will get called again, unless regbranch,
11271      * regpiece ... are also changed.  If you do back off that
11272      * extra character, so that there is input guaranteed to
11273      * force calling regatom, you can't handle the case where
11274      * just the first character in the node is acceptable.  I
11275      * (khw) decided to try this method which doesn't have that
11276      * pitfall; if performance issues are found, we can do a
11277      * combination of the current approach plus that one */
11278      upper_parse = len;
11279      len = 0;
11280      s = s0;
11281      goto reparse;
11282     }
11283    }   /* End of verifying node ends with an appropriate char */
11284
11285   loopdone:   /* Jumped to when encounters something that shouldn't be in
11286      the node */
11287
11288    /* If 'maybe_exact' is still set here, means there are no
11289    * code points in the node that participate in folds */
11290    if (FOLD && maybe_exact) {
11291     OP(ret) = EXACT;
11292    }
11293
11294    /* I (khw) don't know if you can get here with zero length, but the
11295    * old code handled this situation by creating a zero-length EXACT
11296    * node.  Might as well be NOTHING instead */
11297    if (len == 0) {
11298     OP(ret) = NOTHING;
11299    }
11300    else{
11301     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11302    }
11303
11304    RExC_parse = p - 1;
11305    Set_Node_Cur_Length(ret); /* MJD */
11306    nextchar(pRExC_state);
11307    {
11308     /* len is STRLEN which is unsigned, need to copy to signed */
11309     IV iv = len;
11310     if (iv < 0)
11311      vFAIL("Internal disaster");
11312    }
11313
11314   } /* End of label 'defchar:' */
11315   break;
11316  } /* End of giant switch on input character */
11317
11318  return(ret);
11319 }
11320
11321 STATIC char *
11322 S_regwhite( RExC_state_t *pRExC_state, char *p )
11323 {
11324  const char *e = RExC_end;
11325
11326  PERL_ARGS_ASSERT_REGWHITE;
11327
11328  while (p < e) {
11329   if (isSPACE(*p))
11330    ++p;
11331   else if (*p == '#') {
11332    bool ended = 0;
11333    do {
11334     if (*p++ == '\n') {
11335      ended = 1;
11336      break;
11337     }
11338    } while (p < e);
11339    if (!ended)
11340     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11341   }
11342   else
11343    break;
11344  }
11345  return p;
11346 }
11347
11348 STATIC char *
11349 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11350 {
11351  /* Returns the next non-pattern-white space, non-comment character (the
11352  * latter only if 'recognize_comment is true) in the string p, which is
11353  * ended by RExC_end.  If there is no line break ending a comment,
11354  * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11355  const char *e = RExC_end;
11356
11357  PERL_ARGS_ASSERT_REGPATWS;
11358
11359  while (p < e) {
11360   STRLEN len;
11361   if ((len = is_PATWS_safe(p, e, UTF))) {
11362    p += len;
11363   }
11364   else if (recognize_comment && *p == '#') {
11365    bool ended = 0;
11366    do {
11367     p++;
11368     if (is_LNBREAK_safe(p, e, UTF)) {
11369      ended = 1;
11370      break;
11371     }
11372    } while (p < e);
11373    if (!ended)
11374     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11375   }
11376   else
11377    break;
11378  }
11379  return p;
11380 }
11381
11382 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11383    Character classes ([:foo:]) can also be negated ([:^foo:]).
11384    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11385    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11386    but trigger failures because they are currently unimplemented. */
11387
11388 #define POSIXCC_DONE(c)   ((c) == ':')
11389 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11390 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11391
11392 PERL_STATIC_INLINE I32
11393 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11394 {
11395  dVAR;
11396  I32 namedclass = OOB_NAMEDCLASS;
11397
11398  PERL_ARGS_ASSERT_REGPPOSIXCC;
11399
11400  if (value == '[' && RExC_parse + 1 < RExC_end &&
11401   /* I smell either [: or [= or [. -- POSIX has been here, right? */
11402   POSIXCC(UCHARAT(RExC_parse)))
11403  {
11404   const char c = UCHARAT(RExC_parse);
11405   char* const s = RExC_parse++;
11406
11407   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11408    RExC_parse++;
11409   if (RExC_parse == RExC_end) {
11410    if (strict) {
11411
11412     /* Try to give a better location for the error (than the end of
11413     * the string) by looking for the matching ']' */
11414     RExC_parse = s;
11415     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11416      RExC_parse++;
11417     }
11418     vFAIL2("Unmatched '%c' in POSIX class", c);
11419    }
11420    /* Grandfather lone [:, [=, [. */
11421    RExC_parse = s;
11422   }
11423   else {
11424    const char* const t = RExC_parse++; /* skip over the c */
11425    assert(*t == c);
11426
11427    if (UCHARAT(RExC_parse) == ']') {
11428     const char *posixcc = s + 1;
11429     RExC_parse++; /* skip over the ending ] */
11430
11431     if (*s == ':') {
11432      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11433      const I32 skip = t - posixcc;
11434
11435      /* Initially switch on the length of the name.  */
11436      switch (skip) {
11437      case 4:
11438       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11439               this is the Perl \w
11440               */
11441        namedclass = ANYOF_WORDCHAR;
11442       break;
11443      case 5:
11444       /* Names all of length 5.  */
11445       /* alnum alpha ascii blank cntrl digit graph lower
11446       print punct space upper  */
11447       /* Offset 4 gives the best switch position.  */
11448       switch (posixcc[4]) {
11449       case 'a':
11450        if (memEQ(posixcc, "alph", 4)) /* alpha */
11451         namedclass = ANYOF_ALPHA;
11452        break;
11453       case 'e':
11454        if (memEQ(posixcc, "spac", 4)) /* space */
11455         namedclass = ANYOF_PSXSPC;
11456        break;
11457       case 'h':
11458        if (memEQ(posixcc, "grap", 4)) /* graph */
11459         namedclass = ANYOF_GRAPH;
11460        break;
11461       case 'i':
11462        if (memEQ(posixcc, "asci", 4)) /* ascii */
11463         namedclass = ANYOF_ASCII;
11464        break;
11465       case 'k':
11466        if (memEQ(posixcc, "blan", 4)) /* blank */
11467         namedclass = ANYOF_BLANK;
11468        break;
11469       case 'l':
11470        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11471         namedclass = ANYOF_CNTRL;
11472        break;
11473       case 'm':
11474        if (memEQ(posixcc, "alnu", 4)) /* alnum */
11475         namedclass = ANYOF_ALPHANUMERIC;
11476        break;
11477       case 'r':
11478        if (memEQ(posixcc, "lowe", 4)) /* lower */
11479         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11480        else if (memEQ(posixcc, "uppe", 4)) /* upper */
11481         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11482        break;
11483       case 't':
11484        if (memEQ(posixcc, "digi", 4)) /* digit */
11485         namedclass = ANYOF_DIGIT;
11486        else if (memEQ(posixcc, "prin", 4)) /* print */
11487         namedclass = ANYOF_PRINT;
11488        else if (memEQ(posixcc, "punc", 4)) /* punct */
11489         namedclass = ANYOF_PUNCT;
11490        break;
11491       }
11492       break;
11493      case 6:
11494       if (memEQ(posixcc, "xdigit", 6))
11495        namedclass = ANYOF_XDIGIT;
11496       break;
11497      }
11498
11499      if (namedclass == OOB_NAMEDCLASS)
11500       Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11501          t - s - 1, s + 1);
11502
11503      /* The #defines are structured so each complement is +1 to
11504      * the normal one */
11505      if (complement) {
11506       namedclass++;
11507      }
11508      assert (posixcc[skip] == ':');
11509      assert (posixcc[skip+1] == ']');
11510     } else if (!SIZE_ONLY) {
11511      /* [[=foo=]] and [[.foo.]] are still future. */
11512
11513      /* adjust RExC_parse so the warning shows after
11514      the class closes */
11515      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11516       RExC_parse++;
11517      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11518     }
11519    } else {
11520     /* Maternal grandfather:
11521     * "[:" ending in ":" but not in ":]" */
11522     if (strict) {
11523      vFAIL("Unmatched '[' in POSIX class");
11524     }
11525
11526     /* Grandfather lone [:, [=, [. */
11527     RExC_parse = s;
11528    }
11529   }
11530  }
11531
11532  return namedclass;
11533 }
11534
11535 STATIC bool
11536 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11537 {
11538  /* This applies some heuristics at the current parse position (which should
11539  * be at a '[') to see if what follows might be intended to be a [:posix:]
11540  * class.  It returns true if it really is a posix class, of course, but it
11541  * also can return true if it thinks that what was intended was a posix
11542  * class that didn't quite make it.
11543  *
11544  * It will return true for
11545  *      [:alphanumerics:
11546  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11547  *                         ')' indicating the end of the (?[
11548  *      [:any garbage including %^&$ punctuation:]
11549  *
11550  * This is designed to be called only from S_handle_regex_sets; it could be
11551  * easily adapted to be called from the spot at the beginning of regclass()
11552  * that checks to see in a normal bracketed class if the surrounding []
11553  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11554  * change long-standing behavior, so I (khw) didn't do that */
11555  char* p = RExC_parse + 1;
11556  char first_char = *p;
11557
11558  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11559
11560  assert(*(p - 1) == '[');
11561
11562  if (! POSIXCC(first_char)) {
11563   return FALSE;
11564  }
11565
11566  p++;
11567  while (p < RExC_end && isWORDCHAR(*p)) p++;
11568
11569  if (p >= RExC_end) {
11570   return FALSE;
11571  }
11572
11573  if (p - RExC_parse > 2    /* Got at least 1 word character */
11574   && (*p == first_char
11575    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11576  {
11577   return TRUE;
11578  }
11579
11580  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11581
11582  return (p
11583    && p - RExC_parse > 2 /* [:] evaluates to colon;
11584          [::] is a bad posix class. */
11585    && first_char == *(p - 1));
11586 }
11587
11588 STATIC regnode *
11589 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11590     char * const oregcomp_parse)
11591 {
11592  /* Handle the (?[...]) construct to do set operations */
11593
11594  U8 curchar;
11595  UV start, end; /* End points of code point ranges */
11596  SV* result_string;
11597  char *save_end, *save_parse;
11598  SV* final;
11599  STRLEN len;
11600  regnode* node;
11601  AV* stack;
11602  const bool save_fold = FOLD;
11603
11604  GET_RE_DEBUG_FLAGS_DECL;
11605
11606  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11607
11608  if (LOC) {
11609   vFAIL("(?[...]) not valid in locale");
11610  }
11611  RExC_uni_semantics = 1;
11612
11613  /* This will return only an ANYOF regnode, or (unlikely) something smaller
11614  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11615  * call regclass to handle '[]' so as to not have to reinvent its parsing
11616  * rules here (throwing away the size it computes each time).  And, we exit
11617  * upon an unescaped ']' that isn't one ending a regclass.  To do both
11618  * these things, we need to realize that something preceded by a backslash
11619  * is escaped, so we have to keep track of backslashes */
11620  if (SIZE_ONLY) {
11621
11622   Perl_ck_warner_d(aTHX_
11623    packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11624    "The regex_sets feature is experimental" REPORT_LOCATION,
11625    (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11626
11627   while (RExC_parse < RExC_end) {
11628    SV* current = NULL;
11629    RExC_parse = regpatws(pRExC_state, RExC_parse,
11630         TRUE); /* means recognize comments */
11631    switch (*RExC_parse) {
11632     default:
11633      break;
11634     case '\\':
11635      /* Skip the next byte (which could cause us to end up in
11636      * the middle of a UTF-8 character, but since none of those
11637      * are confusable with anything we currently handle in this
11638      * switch (invariants all), it's safe.  We'll just hit the
11639      * default: case next time and keep on incrementing until
11640      * we find one of the invariants we do handle. */
11641      RExC_parse++;
11642      break;
11643     case '[':
11644     {
11645      /* If this looks like it is a [:posix:] class, leave the
11646      * parse pointer at the '[' to fool regclass() into
11647      * thinking it is part of a '[[:posix:]]'.  That function
11648      * will use strict checking to force a syntax error if it
11649      * doesn't work out to a legitimate class */
11650      bool is_posix_class
11651          = could_it_be_a_POSIX_class(pRExC_state);
11652      if (! is_posix_class) {
11653       RExC_parse++;
11654      }
11655
11656      /* regclass() can only return RESTART_UTF8 if multi-char
11657      folds are allowed.  */
11658      if (!regclass(pRExC_state, flagp,depth+1,
11659         is_posix_class, /* parse the whole char
11660              class only if not a
11661              posix class */
11662         FALSE, /* don't allow multi-char folds */
11663         TRUE, /* silence non-portable warnings. */
11664         &current))
11665       FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11666        *flagp);
11667
11668      /* function call leaves parse pointing to the ']', except
11669      * if we faked it */
11670      if (is_posix_class) {
11671       RExC_parse--;
11672      }
11673
11674      SvREFCNT_dec(current);   /* In case it returned something */
11675      break;
11676     }
11677
11678     case ']':
11679      RExC_parse++;
11680      if (RExC_parse < RExC_end
11681       && *RExC_parse == ')')
11682      {
11683       node = reganode(pRExC_state, ANYOF, 0);
11684       RExC_size += ANYOF_SKIP;
11685       nextchar(pRExC_state);
11686       Set_Node_Length(node,
11687         RExC_parse - oregcomp_parse + 1); /* MJD */
11688       return node;
11689      }
11690      goto no_close;
11691    }
11692    RExC_parse++;
11693   }
11694
11695   no_close:
11696   FAIL("Syntax error in (?[...])");
11697  }
11698
11699  /* Pass 2 only after this.  Everything in this construct is a
11700  * metacharacter.  Operands begin with either a '\' (for an escape
11701  * sequence), or a '[' for a bracketed character class.  Any other
11702  * character should be an operator, or parenthesis for grouping.  Both
11703  * types of operands are handled by calling regclass() to parse them.  It
11704  * is called with a parameter to indicate to return the computed inversion
11705  * list.  The parsing here is implemented via a stack.  Each entry on the
11706  * stack is a single character representing one of the operators, or the
11707  * '('; or else a pointer to an operand inversion list. */
11708
11709 #define IS_OPERAND(a)  (! SvIOK(a))
11710
11711  /* The stack starts empty.  It is a syntax error if the first thing parsed
11712  * is a binary operator; everything else is pushed on the stack.  When an
11713  * operand is parsed, the top of the stack is examined.  If it is a binary
11714  * operator, the item before it should be an operand, and both are replaced
11715  * by the result of doing that operation on the new operand and the one on
11716  * the stack.   Thus a sequence of binary operands is reduced to a single
11717  * one before the next one is parsed.
11718  *
11719  * A unary operator may immediately follow a binary in the input, for
11720  * example
11721  *      [a] + ! [b]
11722  * When an operand is parsed and the top of the stack is a unary operator,
11723  * the operation is performed, and then the stack is rechecked to see if
11724  * this new operand is part of a binary operation; if so, it is handled as
11725  * above.
11726  *
11727  * A '(' is simply pushed on the stack; it is valid only if the stack is
11728  * empty, or the top element of the stack is an operator or another '('
11729  * (for which the parenthesized expression will become an operand).  By the
11730  * time the corresponding ')' is parsed everything in between should have
11731  * been parsed and evaluated to a single operand (or else is a syntax
11732  * error), and is handled as a regular operand */
11733
11734  sv_2mortal((SV *)(stack = newAV()));
11735
11736  while (RExC_parse < RExC_end) {
11737   I32 top_index = av_tindex(stack);
11738   SV** top_ptr;
11739   SV* current = NULL;
11740
11741   /* Skip white space */
11742   RExC_parse = regpatws(pRExC_state, RExC_parse,
11743         TRUE); /* means recognize comments */
11744   if (RExC_parse >= RExC_end) {
11745    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11746   }
11747   if ((curchar = UCHARAT(RExC_parse)) == ']') {
11748    break;
11749   }
11750
11751   switch (curchar) {
11752
11753    case '?':
11754     if (av_tindex(stack) >= 0   /* This makes sure that we can
11755            safely subtract 1 from
11756            RExC_parse in the next clause.
11757            If we have something on the
11758            stack, we have parsed something
11759            */
11760      && UCHARAT(RExC_parse - 1) == '('
11761      && RExC_parse < RExC_end)
11762     {
11763      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11764      * This happens when we have some thing like
11765      *
11766      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11767      *   ...
11768      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
11769      *
11770      * Here we would be handling the interpolated
11771      * '$thai_or_lao'.  We handle this by a recursive call to
11772      * ourselves which returns the inversion list the
11773      * interpolated expression evaluates to.  We use the flags
11774      * from the interpolated pattern. */
11775      U32 save_flags = RExC_flags;
11776      const char * const save_parse = ++RExC_parse;
11777
11778      parse_lparen_question_flags(pRExC_state);
11779
11780      if (RExC_parse == save_parse  /* Makes sure there was at
11781              least one flag (or this
11782              embedding wasn't compiled)
11783             */
11784       || RExC_parse >= RExC_end - 4
11785       || UCHARAT(RExC_parse) != ':'
11786       || UCHARAT(++RExC_parse) != '('
11787       || UCHARAT(++RExC_parse) != '?'
11788       || UCHARAT(++RExC_parse) != '[')
11789      {
11790
11791       /* In combination with the above, this moves the
11792       * pointer to the point just after the first erroneous
11793       * character (or if there are no flags, to where they
11794       * should have been) */
11795       if (RExC_parse >= RExC_end - 4) {
11796        RExC_parse = RExC_end;
11797       }
11798       else if (RExC_parse != save_parse) {
11799        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11800       }
11801       vFAIL("Expecting '(?flags:(?[...'");
11802      }
11803      RExC_parse++;
11804      (void) handle_regex_sets(pRExC_state, &current, flagp,
11805              depth+1, oregcomp_parse);
11806
11807      /* Here, 'current' contains the embedded expression's
11808      * inversion list, and RExC_parse points to the trailing
11809      * ']'; the next character should be the ')' which will be
11810      * paired with the '(' that has been put on the stack, so
11811      * the whole embedded expression reduces to '(operand)' */
11812      RExC_parse++;
11813
11814      RExC_flags = save_flags;
11815      goto handle_operand;
11816     }
11817     /* FALL THROUGH */
11818
11819    default:
11820     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11821     vFAIL("Unexpected character");
11822
11823    case '\\':
11824     /* regclass() can only return RESTART_UTF8 if multi-char
11825     folds are allowed.  */
11826     if (!regclass(pRExC_state, flagp,depth+1,
11827        TRUE, /* means parse just the next thing */
11828        FALSE, /* don't allow multi-char folds */
11829        FALSE, /* don't silence non-portable warnings.  */
11830        &current))
11831      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11832       *flagp);
11833     /* regclass() will return with parsing just the \ sequence,
11834     * leaving the parse pointer at the next thing to parse */
11835     RExC_parse--;
11836     goto handle_operand;
11837
11838    case '[':   /* Is a bracketed character class */
11839    {
11840     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11841
11842     if (! is_posix_class) {
11843      RExC_parse++;
11844     }
11845
11846     /* regclass() can only return RESTART_UTF8 if multi-char
11847     folds are allowed.  */
11848     if(!regclass(pRExC_state, flagp,depth+1,
11849        is_posix_class, /* parse the whole char class
11850             only if not a posix class */
11851        FALSE, /* don't allow multi-char folds */
11852        FALSE, /* don't silence non-portable warnings.  */
11853        &current))
11854      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11855       *flagp);
11856     /* function call leaves parse pointing to the ']', except if we
11857     * faked it */
11858     if (is_posix_class) {
11859      RExC_parse--;
11860     }
11861
11862     goto handle_operand;
11863    }
11864
11865    case '&':
11866    case '|':
11867    case '+':
11868    case '-':
11869    case '^':
11870     if (top_index < 0
11871      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11872      || ! IS_OPERAND(*top_ptr))
11873     {
11874      RExC_parse++;
11875      vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11876     }
11877     av_push(stack, newSVuv(curchar));
11878     break;
11879
11880    case '!':
11881     av_push(stack, newSVuv(curchar));
11882     break;
11883
11884    case '(':
11885     if (top_index >= 0) {
11886      top_ptr = av_fetch(stack, top_index, FALSE);
11887      assert(top_ptr);
11888      if (IS_OPERAND(*top_ptr)) {
11889       RExC_parse++;
11890       vFAIL("Unexpected '(' with no preceding operator");
11891      }
11892     }
11893     av_push(stack, newSVuv(curchar));
11894     break;
11895
11896    case ')':
11897    {
11898     SV* lparen;
11899     if (top_index < 1
11900      || ! (current = av_pop(stack))
11901      || ! IS_OPERAND(current)
11902      || ! (lparen = av_pop(stack))
11903      || IS_OPERAND(lparen)
11904      || SvUV(lparen) != '(')
11905     {
11906      SvREFCNT_dec(current);
11907      RExC_parse++;
11908      vFAIL("Unexpected ')'");
11909     }
11910     top_index -= 2;
11911     SvREFCNT_dec_NN(lparen);
11912
11913     /* FALL THROUGH */
11914    }
11915
11916    handle_operand:
11917
11918     /* Here, we have an operand to process, in 'current' */
11919
11920     if (top_index < 0) {    /* Just push if stack is empty */
11921      av_push(stack, current);
11922     }
11923     else {
11924      SV* top = av_pop(stack);
11925      SV *prev = NULL;
11926      char current_operator;
11927
11928      if (IS_OPERAND(top)) {
11929       SvREFCNT_dec_NN(top);
11930       SvREFCNT_dec_NN(current);
11931       vFAIL("Operand with no preceding operator");
11932      }
11933      current_operator = (char) SvUV(top);
11934      switch (current_operator) {
11935       case '(':   /* Push the '(' back on followed by the new
11936          operand */
11937        av_push(stack, top);
11938        av_push(stack, current);
11939        SvREFCNT_inc(top);  /* Counters the '_dec' done
11940             just after the 'break', so
11941             it doesn't get wrongly freed
11942             */
11943        break;
11944
11945       case '!':
11946        _invlist_invert(current);
11947
11948        /* Unlike binary operators, the top of the stack,
11949        * now that this unary one has been popped off, may
11950        * legally be an operator, and we now have operand
11951        * for it. */
11952        top_index--;
11953        SvREFCNT_dec_NN(top);
11954        goto handle_operand;
11955
11956       case '&':
11957        prev = av_pop(stack);
11958        _invlist_intersection(prev,
11959             current,
11960             &current);
11961        av_push(stack, current);
11962        break;
11963
11964       case '|':
11965       case '+':
11966        prev = av_pop(stack);
11967        _invlist_union(prev, current, &current);
11968        av_push(stack, current);
11969        break;
11970
11971       case '-':
11972        prev = av_pop(stack);;
11973        _invlist_subtract(prev, current, &current);
11974        av_push(stack, current);
11975        break;
11976
11977       case '^':   /* The union minus the intersection */
11978       {
11979        SV* i = NULL;
11980        SV* u = NULL;
11981        SV* element;
11982
11983        prev = av_pop(stack);
11984        _invlist_union(prev, current, &u);
11985        _invlist_intersection(prev, current, &i);
11986        /* _invlist_subtract will overwrite current
11987         without freeing what it already contains */
11988        element = current;
11989        _invlist_subtract(u, i, &current);
11990        av_push(stack, current);
11991        SvREFCNT_dec_NN(i);
11992        SvREFCNT_dec_NN(u);
11993        SvREFCNT_dec_NN(element);
11994        break;
11995       }
11996
11997       default:
11998        Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
11999     }
12000     SvREFCNT_dec_NN(top);
12001     SvREFCNT_dec(prev);
12002    }
12003   }
12004
12005   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12006  }
12007
12008  if (av_tindex(stack) < 0   /* Was empty */
12009   || ((final = av_pop(stack)) == NULL)
12010   || ! IS_OPERAND(final)
12011   || av_tindex(stack) >= 0)  /* More left on stack */
12012  {
12013   vFAIL("Incomplete expression within '(?[ ])'");
12014  }
12015
12016  /* Here, 'final' is the resultant inversion list from evaluating the
12017  * expression.  Return it if so requested */
12018  if (return_invlist) {
12019   *return_invlist = final;
12020   return END;
12021  }
12022
12023  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
12024  * expecting a string of ranges and individual code points */
12025  invlist_iterinit(final);
12026  result_string = newSVpvs("");
12027  while (invlist_iternext(final, &start, &end)) {
12028   if (start == end) {
12029    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12030   }
12031   else {
12032    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12033              start,          end);
12034   }
12035  }
12036
12037  save_parse = RExC_parse;
12038  RExC_parse = SvPV(result_string, len);
12039  save_end = RExC_end;
12040  RExC_end = RExC_parse + len;
12041
12042  /* We turn off folding around the call, as the class we have constructed
12043  * already has all folding taken into consideration, and we don't want
12044  * regclass() to add to that */
12045  RExC_flags &= ~RXf_PMf_FOLD;
12046  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12047  */
12048  node = regclass(pRExC_state, flagp,depth+1,
12049      FALSE, /* means parse the whole char class */
12050      FALSE, /* don't allow multi-char folds */
12051      TRUE, /* silence non-portable warnings.  The above may very
12052        well have generated non-portable code points, but
12053        they're valid on this machine */
12054      NULL);
12055  if (!node)
12056   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12057      PTR2UV(flagp));
12058  if (save_fold) {
12059   RExC_flags |= RXf_PMf_FOLD;
12060  }
12061  RExC_parse = save_parse + 1;
12062  RExC_end = save_end;
12063  SvREFCNT_dec_NN(final);
12064  SvREFCNT_dec_NN(result_string);
12065
12066  nextchar(pRExC_state);
12067  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12068  return node;
12069 }
12070 #undef IS_OPERAND
12071
12072 /* The names of properties whose definitions are not known at compile time are
12073  * stored in this SV, after a constant heading.  So if the length has been
12074  * changed since initialization, then there is a run-time definition. */
12075 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12076
12077 STATIC regnode *
12078 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12079     const bool stop_at_1,  /* Just parse the next thing, don't
12080           look for a full character class */
12081     bool allow_multi_folds,
12082     const bool silence_non_portable,   /* Don't output warnings
12083              about too large
12084              characters */
12085     SV** ret_invlist)  /* Return an inversion list, not a node */
12086 {
12087  /* parse a bracketed class specification.  Most of these will produce an
12088  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12089  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
12090  * under /i with multi-character folds: it will be rewritten following the
12091  * paradigm of this example, where the <multi-fold>s are characters which
12092  * fold to multiple character sequences:
12093  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12094  * gets effectively rewritten as:
12095  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12096  * reg() gets called (recursively) on the rewritten version, and this
12097  * function will return what it constructs.  (Actually the <multi-fold>s
12098  * aren't physically removed from the [abcdefghi], it's just that they are
12099  * ignored in the recursion by means of a flag:
12100  * <RExC_in_multi_char_class>.)
12101  *
12102  * ANYOF nodes contain a bit map for the first 256 characters, with the
12103  * corresponding bit set if that character is in the list.  For characters
12104  * above 255, a range list or swash is used.  There are extra bits for \w,
12105  * etc. in locale ANYOFs, as what these match is not determinable at
12106  * compile time
12107  *
12108  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12109  * to be restarted.  This can only happen if ret_invlist is non-NULL.
12110  */
12111
12112  dVAR;
12113  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12114  IV range = 0;
12115  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12116  regnode *ret;
12117  STRLEN numlen;
12118  IV namedclass = OOB_NAMEDCLASS;
12119  char *rangebegin = NULL;
12120  bool need_class = 0;
12121  SV *listsv = NULL;
12122  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12123          than just initialized.  */
12124  SV* properties = NULL;    /* Code points that match \p{} \P{} */
12125  SV* posixes = NULL;     /* Code points that match classes like, [:word:],
12126        extended beyond the Latin1 range */
12127  UV element_count = 0;   /* Number of distinct elements in the class.
12128        Optimizations may be possible if this is tiny */
12129  AV * multi_char_matches = NULL; /* Code points that fold to more than one
12130          character; used under /i */
12131  UV n;
12132  char * stop_ptr = RExC_end;    /* where to stop parsing */
12133  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12134             space? */
12135  const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12136
12137  /* Unicode properties are stored in a swash; this holds the current one
12138  * being parsed.  If this swash is the only above-latin1 component of the
12139  * character class, an optimization is to pass it directly on to the
12140  * execution engine.  Otherwise, it is set to NULL to indicate that there
12141  * are other things in the class that have to be dealt with at execution
12142  * time */
12143  SV* swash = NULL;  /* Code points that match \p{} \P{} */
12144
12145  /* Set if a component of this character class is user-defined; just passed
12146  * on to the engine */
12147  bool has_user_defined_property = FALSE;
12148
12149  /* inversion list of code points this node matches only when the target
12150  * string is in UTF-8.  (Because is under /d) */
12151  SV* depends_list = NULL;
12152
12153  /* inversion list of code points this node matches.  For much of the
12154  * function, it includes only those that match regardless of the utf8ness
12155  * of the target string */
12156  SV* cp_list = NULL;
12157
12158 #ifdef EBCDIC
12159  /* In a range, counts how many 0-2 of the ends of it came from literals,
12160  * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12161  UV literal_endpoint = 0;
12162 #endif
12163  bool invert = FALSE;    /* Is this class to be complemented */
12164
12165  /* Is there any thing like \W or [:^digit:] that matches above the legal
12166  * Unicode range? */
12167  bool runtime_posix_matches_above_Unicode = FALSE;
12168
12169  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12170   case we need to change the emitted regop to an EXACT. */
12171  const char * orig_parse = RExC_parse;
12172  const I32 orig_size = RExC_size;
12173  GET_RE_DEBUG_FLAGS_DECL;
12174
12175  PERL_ARGS_ASSERT_REGCLASS;
12176 #ifndef DEBUGGING
12177  PERL_UNUSED_ARG(depth);
12178 #endif
12179
12180  DEBUG_PARSE("clas");
12181
12182  /* Assume we are going to generate an ANYOF node. */
12183  ret = reganode(pRExC_state, ANYOF, 0);
12184
12185  if (SIZE_ONLY) {
12186   RExC_size += ANYOF_SKIP;
12187   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12188  }
12189  else {
12190   ANYOF_FLAGS(ret) = 0;
12191
12192   RExC_emit += ANYOF_SKIP;
12193   if (LOC) {
12194    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12195   }
12196   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12197   initial_listsv_len = SvCUR(listsv);
12198   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12199  }
12200
12201  if (skip_white) {
12202   RExC_parse = regpatws(pRExC_state, RExC_parse,
12203        FALSE /* means don't recognize comments */);
12204  }
12205
12206  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12207   RExC_parse++;
12208   invert = TRUE;
12209   allow_multi_folds = FALSE;
12210   RExC_naughty++;
12211   if (skip_white) {
12212    RExC_parse = regpatws(pRExC_state, RExC_parse,
12213         FALSE /* means don't recognize comments */);
12214   }
12215  }
12216
12217  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12218  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12219   const char *s = RExC_parse;
12220   const char  c = *s++;
12221
12222   while (isWORDCHAR(*s))
12223    s++;
12224   if (*s && c == *s && s[1] == ']') {
12225    SAVEFREESV(RExC_rx_sv);
12226    ckWARN3reg(s+2,
12227      "POSIX syntax [%c %c] belongs inside character classes",
12228      c, c);
12229    (void)ReREFCNT_inc(RExC_rx_sv);
12230   }
12231  }
12232
12233  /* If the caller wants us to just parse a single element, accomplish this
12234  * by faking the loop ending condition */
12235  if (stop_at_1 && RExC_end > RExC_parse) {
12236   stop_ptr = RExC_parse + 1;
12237  }
12238
12239  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12240  if (UCHARAT(RExC_parse) == ']')
12241   goto charclassloop;
12242
12243 parseit:
12244  while (1) {
12245   if  (RExC_parse >= stop_ptr) {
12246    break;
12247   }
12248
12249   if (skip_white) {
12250    RExC_parse = regpatws(pRExC_state, RExC_parse,
12251         FALSE /* means don't recognize comments */);
12252   }
12253
12254   if  (UCHARAT(RExC_parse) == ']') {
12255    break;
12256   }
12257
12258  charclassloop:
12259
12260   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12261   save_value = value;
12262   save_prevvalue = prevvalue;
12263
12264   if (!range) {
12265    rangebegin = RExC_parse;
12266    element_count++;
12267   }
12268   if (UTF) {
12269    value = utf8n_to_uvchr((U8*)RExC_parse,
12270         RExC_end - RExC_parse,
12271         &numlen, UTF8_ALLOW_DEFAULT);
12272    RExC_parse += numlen;
12273   }
12274   else
12275    value = UCHARAT(RExC_parse++);
12276
12277   if (value == '['
12278    && RExC_parse < RExC_end
12279    && POSIXCC(UCHARAT(RExC_parse)))
12280   {
12281    namedclass = regpposixcc(pRExC_state, value, strict);
12282   }
12283   else if (value == '\\') {
12284    if (UTF) {
12285     value = utf8n_to_uvchr((U8*)RExC_parse,
12286         RExC_end - RExC_parse,
12287         &numlen, UTF8_ALLOW_DEFAULT);
12288     RExC_parse += numlen;
12289    }
12290    else
12291     value = UCHARAT(RExC_parse++);
12292
12293    /* Some compilers cannot handle switching on 64-bit integer
12294    * values, therefore value cannot be an UV.  Yes, this will
12295    * be a problem later if we want switch on Unicode.
12296    * A similar issue a little bit later when switching on
12297    * namedclass. --jhi */
12298
12299    /* If the \ is escaping white space when white space is being
12300    * skipped, it means that that white space is wanted literally, and
12301    * is already in 'value'.  Otherwise, need to translate the escape
12302    * into what it signifies. */
12303    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12304
12305    case 'w': namedclass = ANYOF_WORDCHAR; break;
12306    case 'W': namedclass = ANYOF_NWORDCHAR; break;
12307    case 's': namedclass = ANYOF_SPACE; break;
12308    case 'S': namedclass = ANYOF_NSPACE; break;
12309    case 'd': namedclass = ANYOF_DIGIT; break;
12310    case 'D': namedclass = ANYOF_NDIGIT; break;
12311    case 'v': namedclass = ANYOF_VERTWS; break;
12312    case 'V': namedclass = ANYOF_NVERTWS; break;
12313    case 'h': namedclass = ANYOF_HORIZWS; break;
12314    case 'H': namedclass = ANYOF_NHORIZWS; break;
12315    case 'N':  /* Handle \N{NAME} in class */
12316     {
12317      /* We only pay attention to the first char of
12318      multichar strings being returned. I kinda wonder
12319      if this makes sense as it does change the behaviour
12320      from earlier versions, OTOH that behaviour was broken
12321      as well. */
12322      if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12323          TRUE, /* => charclass */
12324          strict))
12325      {
12326       if (*flagp & RESTART_UTF8)
12327        FAIL("panic: grok_bslash_N set RESTART_UTF8");
12328       goto parseit;
12329      }
12330     }
12331     break;
12332    case 'p':
12333    case 'P':
12334     {
12335     char *e;
12336
12337     /* We will handle any undefined properties ourselves */
12338     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12339
12340     if (RExC_parse >= RExC_end)
12341      vFAIL2("Empty \\%c{}", (U8)value);
12342     if (*RExC_parse == '{') {
12343      const U8 c = (U8)value;
12344      e = strchr(RExC_parse++, '}');
12345      if (!e)
12346       vFAIL2("Missing right brace on \\%c{}", c);
12347      while (isSPACE(UCHARAT(RExC_parse)))
12348       RExC_parse++;
12349      if (e == RExC_parse)
12350       vFAIL2("Empty \\%c{}", c);
12351      n = e - RExC_parse;
12352      while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12353       n--;
12354     }
12355     else {
12356      e = RExC_parse;
12357      n = 1;
12358     }
12359     if (!SIZE_ONLY) {
12360      SV* invlist;
12361      char* name;
12362
12363      if (UCHARAT(RExC_parse) == '^') {
12364       RExC_parse++;
12365       n--;
12366       /* toggle.  (The rhs xor gets the single bit that
12367       * differs between P and p; the other xor inverts just
12368       * that bit) */
12369       value ^= 'P' ^ 'p';
12370
12371       while (isSPACE(UCHARAT(RExC_parse))) {
12372        RExC_parse++;
12373        n--;
12374       }
12375      }
12376      /* Try to get the definition of the property into
12377      * <invlist>.  If /i is in effect, the effective property
12378      * will have its name be <__NAME_i>.  The design is
12379      * discussed in commit
12380      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12381      Newx(name, n + sizeof("_i__\n"), char);
12382
12383      sprintf(name, "%s%.*s%s\n",
12384          (FOLD) ? "__" : "",
12385          (int)n,
12386          RExC_parse,
12387          (FOLD) ? "_i" : ""
12388      );
12389
12390      /* Look up the property name, and get its swash and
12391      * inversion list, if the property is found  */
12392      if (swash) {
12393       SvREFCNT_dec_NN(swash);
12394      }
12395      swash = _core_swash_init("utf8", name, &PL_sv_undef,
12396            1, /* binary */
12397            0, /* not tr/// */
12398            NULL, /* No inversion list */
12399            &swash_init_flags
12400            );
12401      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12402       if (swash) {
12403        SvREFCNT_dec_NN(swash);
12404        swash = NULL;
12405       }
12406
12407       /* Here didn't find it.  It could be a user-defined
12408       * property that will be available at run-time.  If we
12409       * accept only compile-time properties, it is an error;
12410       * otherwise add it to the list for run-time look up */
12411       if (ret_invlist) {
12412        RExC_parse = e + 1;
12413        vFAIL3("Property '%.*s' is unknown", (int) n, name);
12414       }
12415       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12416           (value == 'p' ? '+' : '!'),
12417           name);
12418       has_user_defined_property = TRUE;
12419
12420       /* We don't know yet, so have to assume that the
12421       * property could match something in the Latin1 range,
12422       * hence something that isn't utf8.  Note that this
12423       * would cause things in <depends_list> to match
12424       * inappropriately, except that any \p{}, including
12425       * this one forces Unicode semantics, which means there
12426       * is no <depends_list> */
12427       ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12428      }
12429      else {
12430
12431       /* Here, did get the swash and its inversion list.  If
12432       * the swash is from a user-defined property, then this
12433       * whole character class should be regarded as such */
12434       has_user_defined_property =
12435          (swash_init_flags
12436          & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12437
12438       /* Invert if asking for the complement */
12439       if (value == 'P') {
12440        _invlist_union_complement_2nd(properties,
12441               invlist,
12442               &properties);
12443
12444        /* The swash can't be used as-is, because we've
12445        * inverted things; delay removing it to here after
12446        * have copied its invlist above */
12447        SvREFCNT_dec_NN(swash);
12448        swash = NULL;
12449       }
12450       else {
12451        _invlist_union(properties, invlist, &properties);
12452       }
12453      }
12454      Safefree(name);
12455     }
12456     RExC_parse = e + 1;
12457     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12458             named */
12459
12460     /* \p means they want Unicode semantics */
12461     RExC_uni_semantics = 1;
12462     }
12463     break;
12464    case 'n': value = '\n';   break;
12465    case 'r': value = '\r';   break;
12466    case 't': value = '\t';   break;
12467    case 'f': value = '\f';   break;
12468    case 'b': value = '\b';   break;
12469    case 'e': value = ASCII_TO_NATIVE('\033');break;
12470    case 'a': value = ASCII_TO_NATIVE('\007');break;
12471    case 'o':
12472     RExC_parse--; /* function expects to be pointed at the 'o' */
12473     {
12474      const char* error_msg;
12475      bool valid = grok_bslash_o(&RExC_parse,
12476            &value,
12477            &error_msg,
12478            SIZE_ONLY,   /* warnings in pass
12479                1 only */
12480            strict,
12481            silence_non_portable,
12482            UTF);
12483      if (! valid) {
12484       vFAIL(error_msg);
12485      }
12486     }
12487     if (PL_encoding && value < 0x100) {
12488      goto recode_encoding;
12489     }
12490     break;
12491    case 'x':
12492     RExC_parse--; /* function expects to be pointed at the 'x' */
12493     {
12494      const char* error_msg;
12495      bool valid = grok_bslash_x(&RExC_parse,
12496            &value,
12497            &error_msg,
12498            TRUE, /* Output warnings */
12499            strict,
12500            silence_non_portable,
12501            UTF);
12502      if (! valid) {
12503       vFAIL(error_msg);
12504      }
12505     }
12506     if (PL_encoding && value < 0x100)
12507      goto recode_encoding;
12508     break;
12509    case 'c':
12510     value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12511     break;
12512    case '0': case '1': case '2': case '3': case '4':
12513    case '5': case '6': case '7':
12514     {
12515      /* Take 1-3 octal digits */
12516      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12517      numlen = (strict) ? 4 : 3;
12518      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12519      RExC_parse += numlen;
12520      if (numlen != 3) {
12521       if (strict) {
12522        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12523        vFAIL("Need exactly 3 octal digits");
12524       }
12525       else if (! SIZE_ONLY /* like \08, \178 */
12526         && numlen < 3
12527         && RExC_parse < RExC_end
12528         && isDIGIT(*RExC_parse)
12529         && ckWARN(WARN_REGEXP))
12530       {
12531        SAVEFREESV(RExC_rx_sv);
12532        reg_warn_non_literal_string(
12533         RExC_parse + 1,
12534         form_short_octal_warning(RExC_parse, numlen));
12535        (void)ReREFCNT_inc(RExC_rx_sv);
12536       }
12537      }
12538      if (PL_encoding && value < 0x100)
12539       goto recode_encoding;
12540      break;
12541     }
12542    recode_encoding:
12543     if (! RExC_override_recoding) {
12544      SV* enc = PL_encoding;
12545      value = reg_recode((const char)(U8)value, &enc);
12546      if (!enc) {
12547       if (strict) {
12548        vFAIL("Invalid escape in the specified encoding");
12549       }
12550       else if (SIZE_ONLY) {
12551        ckWARNreg(RExC_parse,
12552         "Invalid escape in the specified encoding");
12553       }
12554      }
12555      break;
12556     }
12557    default:
12558     /* Allow \_ to not give an error */
12559     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12560      if (strict) {
12561       vFAIL2("Unrecognized escape \\%c in character class",
12562        (int)value);
12563      }
12564      else {
12565       SAVEFREESV(RExC_rx_sv);
12566       ckWARN2reg(RExC_parse,
12567        "Unrecognized escape \\%c in character class passed through",
12568        (int)value);
12569       (void)ReREFCNT_inc(RExC_rx_sv);
12570      }
12571     }
12572     break;
12573    }   /* End of switch on char following backslash */
12574   } /* end of handling backslash escape sequences */
12575 #ifdef EBCDIC
12576   else
12577    literal_endpoint++;
12578 #endif
12579
12580   /* Here, we have the current token in 'value' */
12581
12582   /* What matches in a locale is not known until runtime.  This includes
12583   * what the Posix classes (like \w, [:space:]) match.  Room must be
12584   * reserved (one time per class) to store such classes, either if Perl
12585   * is compiled so that locale nodes always should have this space, or
12586   * if there is such class info to be stored.  The space will contain a
12587   * bit for each named class that is to be matched against.  This isn't
12588   * needed for \p{} and pseudo-classes, as they are not affected by
12589   * locale, and hence are dealt with separately */
12590   if (LOC
12591    && ! need_class
12592    && (ANYOF_LOCALE == ANYOF_CLASS
12593     || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12594   {
12595    need_class = 1;
12596    if (SIZE_ONLY) {
12597     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12598    }
12599    else {
12600     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12601     ANYOF_CLASS_ZERO(ret);
12602    }
12603    ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12604   }
12605
12606   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12607
12608    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12609    * literal, as is the character that began the false range, i.e.
12610    * the 'a' in the examples */
12611    if (range) {
12612     if (!SIZE_ONLY) {
12613      const int w = (RExC_parse >= rangebegin)
12614         ? RExC_parse - rangebegin
12615         : 0;
12616      if (strict) {
12617       vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12618      }
12619      else {
12620       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12621       ckWARN4reg(RExC_parse,
12622         "False [] range \"%*.*s\"",
12623         w, w, rangebegin);
12624       (void)ReREFCNT_inc(RExC_rx_sv);
12625       cp_list = add_cp_to_invlist(cp_list, '-');
12626       cp_list = add_cp_to_invlist(cp_list, prevvalue);
12627      }
12628     }
12629
12630     range = 0; /* this was not a true range */
12631     element_count += 2; /* So counts for three values */
12632    }
12633
12634    if (! SIZE_ONLY) {
12635     U8 classnum = namedclass_to_classnum(namedclass);
12636     if (namedclass >= ANYOF_MAX) {  /* If a special class */
12637      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12638
12639       /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12640       * /l make a difference in what these match.  There
12641       * would be problems if these characters had folds
12642       * other than themselves, as cp_list is subject to
12643       * folding. */
12644       if (classnum != _CC_VERTSPACE) {
12645        assert(   namedclass == ANYOF_HORIZWS
12646         || namedclass == ANYOF_NHORIZWS);
12647
12648        /* It turns out that \h is just a synonym for
12649        * XPosixBlank */
12650        classnum = _CC_BLANK;
12651       }
12652
12653       _invlist_union_maybe_complement_2nd(
12654         cp_list,
12655         PL_XPosix_ptrs[classnum],
12656         cBOOL(namedclass % 2), /* Complement if odd
12657               (NHORIZWS, NVERTWS)
12658               */
12659         &cp_list);
12660      }
12661     }
12662     else if (classnum == _CC_ASCII) {
12663 #ifdef HAS_ISASCII
12664      if (LOC) {
12665       ANYOF_CLASS_SET(ret, namedclass);
12666      }
12667      else
12668 #endif  /* Not isascii(); just use the hard-coded definition for it */
12669       _invlist_union_maybe_complement_2nd(
12670         posixes,
12671         PL_ASCII,
12672         cBOOL(namedclass % 2), /* Complement if odd
12673               (NASCII) */
12674         &posixes);
12675     }
12676     else {  /* Garden variety class */
12677
12678      /* The ascii range inversion list */
12679      SV* ascii_source = PL_Posix_ptrs[classnum];
12680
12681      /* The full Latin1 range inversion list */
12682      SV* l1_source = PL_L1Posix_ptrs[classnum];
12683
12684      /* This code is structured into two major clauses.  The
12685      * first is for classes whose complete definitions may not
12686      * already be known.  If not, the Latin1 definition
12687      * (guaranteed to already be known) is used plus code is
12688      * generated to load the rest at run-time (only if needed).
12689      * If the complete definition is known, it drops down to
12690      * the second clause, where the complete definition is
12691      * known */
12692
12693      if (classnum < _FIRST_NON_SWASH_CC) {
12694
12695       /* Here, the class has a swash, which may or may not
12696       * already be loaded */
12697
12698       /* The name of the property to use to match the full
12699       * eXtended Unicode range swash for this character
12700       * class */
12701       const char *Xname = swash_property_names[classnum];
12702
12703       /* If returning the inversion list, we can't defer
12704       * getting this until runtime */
12705       if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12706        PL_utf8_swash_ptrs[classnum] =
12707         _core_swash_init("utf8", Xname, &PL_sv_undef,
12708            1, /* binary */
12709            0, /* not tr/// */
12710            NULL, /* No inversion list */
12711            NULL  /* No flags */
12712            );
12713        assert(PL_utf8_swash_ptrs[classnum]);
12714       }
12715       if ( !  PL_utf8_swash_ptrs[classnum]) {
12716        if (namedclass % 2 == 0) { /* A non-complemented
12717               class */
12718         /* If not /a matching, there are code points we
12719         * don't know at compile time.  Arrange for the
12720         * unknown matches to be loaded at run-time, if
12721         * needed */
12722         if (! AT_LEAST_ASCII_RESTRICTED) {
12723          Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12724                 Xname);
12725         }
12726         if (LOC) {  /* Under locale, set run-time
12727            lookup */
12728          ANYOF_CLASS_SET(ret, namedclass);
12729         }
12730         else {
12731          /* Add the current class's code points to
12732          * the running total */
12733          _invlist_union(posixes,
12734             (AT_LEAST_ASCII_RESTRICTED)
12735               ? ascii_source
12736               : l1_source,
12737             &posixes);
12738         }
12739        }
12740        else {  /* A complemented class */
12741         if (AT_LEAST_ASCII_RESTRICTED) {
12742          /* Under /a should match everything above
12743          * ASCII, plus the complement of the set's
12744          * ASCII matches */
12745          _invlist_union_complement_2nd(posixes,
12746                 ascii_source,
12747                 &posixes);
12748         }
12749         else {
12750          /* Arrange for the unknown matches to be
12751          * loaded at run-time, if needed */
12752          Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12753                 Xname);
12754          runtime_posix_matches_above_Unicode = TRUE;
12755          if (LOC) {
12756           ANYOF_CLASS_SET(ret, namedclass);
12757          }
12758          else {
12759
12760           /* We want to match everything in
12761           * Latin1, except those things that
12762           * l1_source matches */
12763           SV* scratch_list = NULL;
12764           _invlist_subtract(PL_Latin1, l1_source,
12765               &scratch_list);
12766
12767           /* Add the list from this class to the
12768           * running total */
12769           if (! posixes) {
12770            posixes = scratch_list;
12771           }
12772           else {
12773            _invlist_union(posixes,
12774               scratch_list,
12775               &posixes);
12776            SvREFCNT_dec_NN(scratch_list);
12777           }
12778           if (DEPENDS_SEMANTICS) {
12779            ANYOF_FLAGS(ret)
12780             |= ANYOF_NON_UTF8_LATIN1_ALL;
12781           }
12782          }
12783         }
12784        }
12785        goto namedclass_done;
12786       }
12787
12788       /* Here, there is a swash loaded for the class.  If no
12789       * inversion list for it yet, get it */
12790       if (! PL_XPosix_ptrs[classnum]) {
12791        PL_XPosix_ptrs[classnum]
12792        = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12793       }
12794      }
12795
12796      /* Here there is an inversion list already loaded for the
12797      * entire class */
12798
12799      if (namedclass % 2 == 0) {  /* A non-complemented class,
12800             like ANYOF_PUNCT */
12801       if (! LOC) {
12802        /* For non-locale, just add it to any existing list
12803        * */
12804        _invlist_union(posixes,
12805           (AT_LEAST_ASCII_RESTRICTED)
12806            ? ascii_source
12807            : PL_XPosix_ptrs[classnum],
12808           &posixes);
12809       }
12810       else {  /* Locale */
12811        SV* scratch_list = NULL;
12812
12813        /* For above Latin1 code points, we use the full
12814        * Unicode range */
12815        _invlist_intersection(PL_AboveLatin1,
12816             PL_XPosix_ptrs[classnum],
12817             &scratch_list);
12818        /* And set the output to it, adding instead if
12819        * there already is an output.  Checking if
12820        * 'posixes' is NULL first saves an extra clone.
12821        * Its reference count will be decremented at the
12822        * next union, etc, or if this is the only
12823        * instance, at the end of the routine */
12824        if (! posixes) {
12825         posixes = scratch_list;
12826        }
12827        else {
12828         _invlist_union(posixes, scratch_list, &posixes);
12829         SvREFCNT_dec_NN(scratch_list);
12830        }
12831
12832 #ifndef HAS_ISBLANK
12833        if (namedclass != ANYOF_BLANK) {
12834 #endif
12835         /* Set this class in the node for runtime
12836         * matching */
12837         ANYOF_CLASS_SET(ret, namedclass);
12838 #ifndef HAS_ISBLANK
12839        }
12840        else {
12841         /* No isblank(), use the hard-coded ASCII-range
12842         * blanks, adding them to the running total. */
12843
12844         _invlist_union(posixes, ascii_source, &posixes);
12845        }
12846 #endif
12847       }
12848      }
12849      else {  /* A complemented class, like ANYOF_NPUNCT */
12850       if (! LOC) {
12851        _invlist_union_complement_2nd(
12852             posixes,
12853             (AT_LEAST_ASCII_RESTRICTED)
12854              ? ascii_source
12855              : PL_XPosix_ptrs[classnum],
12856             &posixes);
12857        /* Under /d, everything in the upper half of the
12858        * Latin1 range matches this complement */
12859        if (DEPENDS_SEMANTICS) {
12860         ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12861        }
12862       }
12863       else {  /* Locale */
12864        SV* scratch_list = NULL;
12865        _invlist_subtract(PL_AboveLatin1,
12866            PL_XPosix_ptrs[classnum],
12867            &scratch_list);
12868        if (! posixes) {
12869         posixes = scratch_list;
12870        }
12871        else {
12872         _invlist_union(posixes, scratch_list, &posixes);
12873         SvREFCNT_dec_NN(scratch_list);
12874        }
12875 #ifndef HAS_ISBLANK
12876        if (namedclass != ANYOF_NBLANK) {
12877 #endif
12878         ANYOF_CLASS_SET(ret, namedclass);
12879 #ifndef HAS_ISBLANK
12880        }
12881        else {
12882         /* Get the list of all code points in Latin1
12883         * that are not ASCII blanks, and add them to
12884         * the running total */
12885         _invlist_subtract(PL_Latin1, ascii_source,
12886             &scratch_list);
12887         _invlist_union(posixes, scratch_list, &posixes);
12888         SvREFCNT_dec_NN(scratch_list);
12889        }
12890 #endif
12891       }
12892      }
12893     }
12894    namedclass_done:
12895     continue;   /* Go get next character */
12896    }
12897   } /* end of namedclass \blah */
12898
12899   /* Here, we have a single value.  If 'range' is set, it is the ending
12900   * of a range--check its validity.  Later, we will handle each
12901   * individual code point in the range.  If 'range' isn't set, this
12902   * could be the beginning of a range, so check for that by looking
12903   * ahead to see if the next real character to be processed is the range
12904   * indicator--the minus sign */
12905
12906   if (skip_white) {
12907    RExC_parse = regpatws(pRExC_state, RExC_parse,
12908         FALSE /* means don't recognize comments */);
12909   }
12910
12911   if (range) {
12912    if (prevvalue > value) /* b-a */ {
12913     const int w = RExC_parse - rangebegin;
12914     Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12915     range = 0; /* not a valid range */
12916    }
12917   }
12918   else {
12919    prevvalue = value; /* save the beginning of the potential range */
12920    if (! stop_at_1     /* Can't be a range if parsing just one thing */
12921     && *RExC_parse == '-')
12922    {
12923     char* next_char_ptr = RExC_parse + 1;
12924     if (skip_white) {   /* Get the next real char after the '-' */
12925      next_char_ptr = regpatws(pRExC_state,
12926            RExC_parse + 1,
12927            FALSE); /* means don't recognize
12928               comments */
12929     }
12930
12931     /* If the '-' is at the end of the class (just before the ']',
12932     * it is a literal minus; otherwise it is a range */
12933     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12934      RExC_parse = next_char_ptr;
12935
12936      /* a bad range like \w-, [:word:]- ? */
12937      if (namedclass > OOB_NAMEDCLASS) {
12938       if (strict || ckWARN(WARN_REGEXP)) {
12939        const int w =
12940         RExC_parse >= rangebegin ?
12941         RExC_parse - rangebegin : 0;
12942        if (strict) {
12943         vFAIL4("False [] range \"%*.*s\"",
12944          w, w, rangebegin);
12945        }
12946        else {
12947         vWARN4(RExC_parse,
12948          "False [] range \"%*.*s\"",
12949          w, w, rangebegin);
12950        }
12951       }
12952       if (!SIZE_ONLY) {
12953        cp_list = add_cp_to_invlist(cp_list, '-');
12954       }
12955       element_count++;
12956      } else
12957       range = 1; /* yeah, it's a range! */
12958      continue; /* but do it the next time */
12959     }
12960    }
12961   }
12962
12963   /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12964   * if not */
12965
12966   /* non-Latin1 code point implies unicode semantics.  Must be set in
12967   * pass1 so is there for the whole of pass 2 */
12968   if (value > 255) {
12969    RExC_uni_semantics = 1;
12970   }
12971
12972   /* Ready to process either the single value, or the completed range.
12973   * For single-valued non-inverted ranges, we consider the possibility
12974   * of multi-char folds.  (We made a conscious decision to not do this
12975   * for the other cases because it can often lead to non-intuitive
12976   * results.  For example, you have the peculiar case that:
12977   *  "s s" =~ /^[^\xDF]+$/i => Y
12978   *  "ss"  =~ /^[^\xDF]+$/i => N
12979   *
12980   * See [perl #89750] */
12981   if (FOLD && allow_multi_folds && value == prevvalue) {
12982    if (value == LATIN_SMALL_LETTER_SHARP_S
12983     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12984               value)))
12985    {
12986     /* Here <value> is indeed a multi-char fold.  Get what it is */
12987
12988     U8 foldbuf[UTF8_MAXBYTES_CASE];
12989     STRLEN foldlen;
12990
12991     UV folded = _to_uni_fold_flags(
12992         value,
12993         foldbuf,
12994         &foldlen,
12995         FOLD_FLAGS_FULL
12996         | ((LOC) ?  FOLD_FLAGS_LOCALE
12997            : (ASCII_FOLD_RESTRICTED)
12998            ? FOLD_FLAGS_NOMIX_ASCII
12999            : 0)
13000         );
13001
13002     /* Here, <folded> should be the first character of the
13003     * multi-char fold of <value>, with <foldbuf> containing the
13004     * whole thing.  But, if this fold is not allowed (because of
13005     * the flags), <folded> will be the same as <value>, and should
13006     * be processed like any other character, so skip the special
13007     * handling */
13008     if (folded != value) {
13009
13010      /* Skip if we are recursed, currently parsing the class
13011      * again.  Otherwise add this character to the list of
13012      * multi-char folds. */
13013      if (! RExC_in_multi_char_class) {
13014       AV** this_array_ptr;
13015       AV* this_array;
13016       STRLEN cp_count = utf8_length(foldbuf,
13017              foldbuf + foldlen);
13018       SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13019
13020       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13021
13022
13023       if (! multi_char_matches) {
13024        multi_char_matches = newAV();
13025       }
13026
13027       /* <multi_char_matches> is actually an array of arrays.
13028       * There will be one or two top-level elements: [2],
13029       * and/or [3].  The [2] element is an array, each
13030       * element thereof is a character which folds to two
13031       * characters; likewise for [3].  (Unicode guarantees a
13032       * maximum of 3 characters in any fold.)  When we
13033       * rewrite the character class below, we will do so
13034       * such that the longest folds are written first, so
13035       * that it prefers the longest matching strings first.
13036       * This is done even if it turns out that any
13037       * quantifier is non-greedy, out of programmer
13038       * laziness.  Tom Christiansen has agreed that this is
13039       * ok.  This makes the test for the ligature 'ffi' come
13040       * before the test for 'ff' */
13041       if (av_exists(multi_char_matches, cp_count)) {
13042        this_array_ptr = (AV**) av_fetch(multi_char_matches,
13043                cp_count, FALSE);
13044        this_array = *this_array_ptr;
13045       }
13046       else {
13047        this_array = newAV();
13048        av_store(multi_char_matches, cp_count,
13049          (SV*) this_array);
13050       }
13051       av_push(this_array, multi_fold);
13052      }
13053
13054      /* This element should not be processed further in this
13055      * class */
13056      element_count--;
13057      value = save_value;
13058      prevvalue = save_prevvalue;
13059      continue;
13060     }
13061    }
13062   }
13063
13064   /* Deal with this element of the class */
13065   if (! SIZE_ONLY) {
13066 #ifndef EBCDIC
13067    cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13068 #else
13069    SV* this_range = _new_invlist(1);
13070    _append_range_to_invlist(this_range, prevvalue, value);
13071
13072    /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13073    * If this range was specified using something like 'i-j', we want
13074    * to include only the 'i' and the 'j', and not anything in
13075    * between, so exclude non-ASCII, non-alphabetics from it.
13076    * However, if the range was specified with something like
13077    * [\x89-\x91] or [\x89-j], all code points within it should be
13078    * included.  literal_endpoint==2 means both ends of the range used
13079    * a literal character, not \x{foo} */
13080    if (literal_endpoint == 2
13081     && (prevvalue >= 'a' && value <= 'z')
13082      || (prevvalue >= 'A' && value <= 'Z'))
13083    {
13084     _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13085          &this_range);
13086    }
13087    _invlist_union(cp_list, this_range, &cp_list);
13088    literal_endpoint = 0;
13089 #endif
13090   }
13091
13092   range = 0; /* this range (if it was one) is done now */
13093  } /* End of loop through all the text within the brackets */
13094
13095  /* If anything in the class expands to more than one character, we have to
13096  * deal with them by building up a substitute parse string, and recursively
13097  * calling reg() on it, instead of proceeding */
13098  if (multi_char_matches) {
13099   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13100   I32 cp_count;
13101   STRLEN len;
13102   char *save_end = RExC_end;
13103   char *save_parse = RExC_parse;
13104   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
13105          a "|" */
13106   I32 reg_flags;
13107
13108   assert(! invert);
13109 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
13110   because too confusing */
13111   if (invert) {
13112    sv_catpv(substitute_parse, "(?:");
13113   }
13114 #endif
13115
13116   /* Look at the longest folds first */
13117   for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13118
13119    if (av_exists(multi_char_matches, cp_count)) {
13120     AV** this_array_ptr;
13121     SV* this_sequence;
13122
13123     this_array_ptr = (AV**) av_fetch(multi_char_matches,
13124             cp_count, FALSE);
13125     while ((this_sequence = av_pop(*this_array_ptr)) !=
13126                 &PL_sv_undef)
13127     {
13128      if (! first_time) {
13129       sv_catpv(substitute_parse, "|");
13130      }
13131      first_time = FALSE;
13132
13133      sv_catpv(substitute_parse, SvPVX(this_sequence));
13134     }
13135    }
13136   }
13137
13138   /* If the character class contains anything else besides these
13139   * multi-character folds, have to include it in recursive parsing */
13140   if (element_count) {
13141    sv_catpv(substitute_parse, "|[");
13142    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13143    sv_catpv(substitute_parse, "]");
13144   }
13145
13146   sv_catpv(substitute_parse, ")");
13147 #if 0
13148   if (invert) {
13149    /* This is a way to get the parse to skip forward a whole named
13150    * sequence instead of matching the 2nd character when it fails the
13151    * first */
13152    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13153   }
13154 #endif
13155
13156   RExC_parse = SvPV(substitute_parse, len);
13157   RExC_end = RExC_parse + len;
13158   RExC_in_multi_char_class = 1;
13159   RExC_emit = (regnode *)orig_emit;
13160
13161   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13162
13163   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13164
13165   RExC_parse = save_parse;
13166   RExC_end = save_end;
13167   RExC_in_multi_char_class = 0;
13168   SvREFCNT_dec_NN(multi_char_matches);
13169   return ret;
13170  }
13171
13172  /* If the character class contains only a single element, it may be
13173  * optimizable into another node type which is smaller and runs faster.
13174  * Check if this is the case for this class */
13175  if (element_count == 1 && ! ret_invlist) {
13176   U8 op = END;
13177   U8 arg = 0;
13178
13179   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13180            [:digit:] or \p{foo} */
13181
13182    /* All named classes are mapped into POSIXish nodes, with its FLAG
13183    * argument giving which class it is */
13184    switch ((I32)namedclass) {
13185     case ANYOF_UNIPROP:
13186      break;
13187
13188     /* These don't depend on the charset modifiers.  They always
13189     * match under /u rules */
13190     case ANYOF_NHORIZWS:
13191     case ANYOF_HORIZWS:
13192      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13193      /* FALLTHROUGH */
13194
13195     case ANYOF_NVERTWS:
13196     case ANYOF_VERTWS:
13197      op = POSIXU;
13198      goto join_posix;
13199
13200     /* The actual POSIXish node for all the rest depends on the
13201     * charset modifier.  The ones in the first set depend only on
13202     * ASCII or, if available on this platform, locale */
13203     case ANYOF_ASCII:
13204     case ANYOF_NASCII:
13205 #ifdef HAS_ISASCII
13206      op = (LOC) ? POSIXL : POSIXA;
13207 #else
13208      op = POSIXA;
13209 #endif
13210      goto join_posix;
13211
13212     case ANYOF_NCASED:
13213     case ANYOF_LOWER:
13214     case ANYOF_NLOWER:
13215     case ANYOF_UPPER:
13216     case ANYOF_NUPPER:
13217      /* under /a could be alpha */
13218      if (FOLD) {
13219       if (ASCII_RESTRICTED) {
13220        namedclass = ANYOF_ALPHA + (namedclass % 2);
13221       }
13222       else if (! LOC) {
13223        break;
13224       }
13225      }
13226      /* FALLTHROUGH */
13227
13228     /* The rest have more possibilities depending on the charset.
13229     * We take advantage of the enum ordering of the charset
13230     * modifiers to get the exact node type, */
13231     default:
13232      op = POSIXD + get_regex_charset(RExC_flags);
13233      if (op > POSIXA) { /* /aa is same as /a */
13234       op = POSIXA;
13235      }
13236 #ifndef HAS_ISBLANK
13237      if (op == POSIXL
13238       && (namedclass == ANYOF_BLANK
13239        || namedclass == ANYOF_NBLANK))
13240      {
13241       op = POSIXA;
13242      }
13243 #endif
13244
13245     join_posix:
13246      /* The odd numbered ones are the complements of the
13247      * next-lower even number one */
13248      if (namedclass % 2 == 1) {
13249       invert = ! invert;
13250       namedclass--;
13251      }
13252      arg = namedclass_to_classnum(namedclass);
13253      break;
13254    }
13255   }
13256   else if (value == prevvalue) {
13257
13258    /* Here, the class consists of just a single code point */
13259
13260    if (invert) {
13261     if (! LOC && value == '\n') {
13262      op = REG_ANY; /* Optimize [^\n] */
13263      *flagp |= HASWIDTH|SIMPLE;
13264      RExC_naughty++;
13265     }
13266    }
13267    else if (value < 256 || UTF) {
13268
13269     /* Optimize a single value into an EXACTish node, but not if it
13270     * would require converting the pattern to UTF-8. */
13271     op = compute_EXACTish(pRExC_state);
13272    }
13273   } /* Otherwise is a range */
13274   else if (! LOC) {   /* locale could vary these */
13275    if (prevvalue == '0') {
13276     if (value == '9') {
13277      arg = _CC_DIGIT;
13278      op = POSIXA;
13279     }
13280    }
13281   }
13282
13283   /* Here, we have changed <op> away from its initial value iff we found
13284   * an optimization */
13285   if (op != END) {
13286
13287    /* Throw away this ANYOF regnode, and emit the calculated one,
13288    * which should correspond to the beginning, not current, state of
13289    * the parse */
13290    const char * cur_parse = RExC_parse;
13291    RExC_parse = (char *)orig_parse;
13292    if ( SIZE_ONLY) {
13293     if (! LOC) {
13294
13295      /* To get locale nodes to not use the full ANYOF size would
13296      * require moving the code above that writes the portions
13297      * of it that aren't in other nodes to after this point.
13298      * e.g.  ANYOF_CLASS_SET */
13299      RExC_size = orig_size;
13300     }
13301    }
13302    else {
13303     RExC_emit = (regnode *)orig_emit;
13304     if (PL_regkind[op] == POSIXD) {
13305      if (invert) {
13306       op += NPOSIXD - POSIXD;
13307      }
13308     }
13309    }
13310
13311    ret = reg_node(pRExC_state, op);
13312
13313    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13314     if (! SIZE_ONLY) {
13315      FLAGS(ret) = arg;
13316     }
13317     *flagp |= HASWIDTH|SIMPLE;
13318    }
13319    else if (PL_regkind[op] == EXACT) {
13320     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13321    }
13322
13323    RExC_parse = (char *) cur_parse;
13324
13325    SvREFCNT_dec(posixes);
13326    SvREFCNT_dec(cp_list);
13327    return ret;
13328   }
13329  }
13330
13331  if (SIZE_ONLY)
13332   return ret;
13333  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13334
13335  /* If folding, we calculate all characters that could fold to or from the
13336  * ones already on the list */
13337  if (FOLD && cp_list) {
13338   UV start, end; /* End points of code point ranges */
13339
13340   SV* fold_intersection = NULL;
13341
13342   /* If the highest code point is within Latin1, we can use the
13343   * compiled-in Alphas list, and not have to go out to disk.  This
13344   * yields two false positives, the masculine and feminine ordinal
13345   * indicators, which are weeded out below using the
13346   * IS_IN_SOME_FOLD_L1() macro */
13347   if (invlist_highest(cp_list) < 256) {
13348    _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13349               &fold_intersection);
13350   }
13351   else {
13352
13353    /* Here, there are non-Latin1 code points, so we will have to go
13354    * fetch the list of all the characters that participate in folds
13355    */
13356    if (! PL_utf8_foldable) {
13357     SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13358          &PL_sv_undef, 1, 0);
13359     PL_utf8_foldable = _get_swash_invlist(swash);
13360     SvREFCNT_dec_NN(swash);
13361    }
13362
13363    /* This is a hash that for a particular fold gives all characters
13364    * that are involved in it */
13365    if (! PL_utf8_foldclosures) {
13366
13367     /* If we were unable to find any folds, then we likely won't be
13368     * able to find the closures.  So just create an empty list.
13369     * Folding will effectively be restricted to the non-Unicode
13370     * rules hard-coded into Perl.  (This case happens legitimately
13371     * during compilation of Perl itself before the Unicode tables
13372     * are generated) */
13373     if (_invlist_len(PL_utf8_foldable) == 0) {
13374      PL_utf8_foldclosures = newHV();
13375     }
13376     else {
13377      /* If the folds haven't been read in, call a fold function
13378      * to force that */
13379      if (! PL_utf8_tofold) {
13380       U8 dummy[UTF8_MAXBYTES+1];
13381
13382       /* This string is just a short named one above \xff */
13383       to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13384       assert(PL_utf8_tofold); /* Verify that worked */
13385      }
13386      PL_utf8_foldclosures =
13387          _swash_inversion_hash(PL_utf8_tofold);
13388     }
13389    }
13390
13391    /* Only the characters in this class that participate in folds need
13392    * be checked.  Get the intersection of this class and all the
13393    * possible characters that are foldable.  This can quickly narrow
13394    * down a large class */
13395    _invlist_intersection(PL_utf8_foldable, cp_list,
13396         &fold_intersection);
13397   }
13398
13399   /* Now look at the foldable characters in this class individually */
13400   invlist_iterinit(fold_intersection);
13401   while (invlist_iternext(fold_intersection, &start, &end)) {
13402    UV j;
13403
13404    /* Locale folding for Latin1 characters is deferred until runtime */
13405    if (LOC && start < 256) {
13406     start = 256;
13407    }
13408
13409    /* Look at every character in the range */
13410    for (j = start; j <= end; j++) {
13411
13412     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13413     STRLEN foldlen;
13414     SV** listp;
13415
13416     if (j < 256) {
13417
13418      /* We have the latin1 folding rules hard-coded here so that
13419      * an innocent-looking character class, like /[ks]/i won't
13420      * have to go out to disk to find the possible matches.
13421      * XXX It would be better to generate these via regen, in
13422      * case a new version of the Unicode standard adds new
13423      * mappings, though that is not really likely, and may be
13424      * caught by the default: case of the switch below. */
13425
13426      if (IS_IN_SOME_FOLD_L1(j)) {
13427
13428       /* ASCII is always matched; non-ASCII is matched only
13429       * under Unicode rules */
13430       if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13431        cp_list =
13432         add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13433       }
13434       else {
13435        depends_list =
13436        add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13437       }
13438      }
13439
13440      if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13441       && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13442      {
13443       /* Certain Latin1 characters have matches outside
13444       * Latin1.  To get here, <j> is one of those
13445       * characters.   None of these matches is valid for
13446       * ASCII characters under /aa, which is why the 'if'
13447       * just above excludes those.  These matches only
13448       * happen when the target string is utf8.  The code
13449       * below adds the single fold closures for <j> to the
13450       * inversion list. */
13451       switch (j) {
13452        case 'k':
13453        case 'K':
13454         cp_list =
13455          add_cp_to_invlist(cp_list, KELVIN_SIGN);
13456         break;
13457        case 's':
13458        case 'S':
13459         cp_list = add_cp_to_invlist(cp_list,
13460              LATIN_SMALL_LETTER_LONG_S);
13461         break;
13462        case MICRO_SIGN:
13463         cp_list = add_cp_to_invlist(cp_list,
13464              GREEK_CAPITAL_LETTER_MU);
13465         cp_list = add_cp_to_invlist(cp_list,
13466              GREEK_SMALL_LETTER_MU);
13467         break;
13468        case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13469        case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13470         cp_list =
13471          add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13472         break;
13473        case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13474         cp_list = add_cp_to_invlist(cp_list,
13475           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13476         break;
13477        case LATIN_SMALL_LETTER_SHARP_S:
13478         cp_list = add_cp_to_invlist(cp_list,
13479             LATIN_CAPITAL_LETTER_SHARP_S);
13480         break;
13481        case 'F': case 'f':
13482        case 'I': case 'i':
13483        case 'L': case 'l':
13484        case 'T': case 't':
13485        case 'A': case 'a':
13486        case 'H': case 'h':
13487        case 'J': case 'j':
13488        case 'N': case 'n':
13489        case 'W': case 'w':
13490        case 'Y': case 'y':
13491         /* These all are targets of multi-character
13492         * folds from code points that require UTF8 to
13493         * express, so they can't match unless the
13494         * target string is in UTF-8, so no action here
13495         * is necessary, as regexec.c properly handles
13496         * the general case for UTF-8 matching and
13497         * multi-char folds */
13498         break;
13499        default:
13500         /* Use deprecated warning to increase the
13501         * chances of this being output */
13502         ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13503         break;
13504       }
13505      }
13506      continue;
13507     }
13508
13509     /* Here is an above Latin1 character.  We don't have the rules
13510     * hard-coded for it.  First, get its fold.  This is the simple
13511     * fold, as the multi-character folds have been handled earlier
13512     * and separated out */
13513     _to_uni_fold_flags(j, foldbuf, &foldlen,
13514            ((LOC)
13515            ? FOLD_FLAGS_LOCALE
13516            : (ASCII_FOLD_RESTRICTED)
13517             ? FOLD_FLAGS_NOMIX_ASCII
13518             : 0));
13519
13520     /* Single character fold of above Latin1.  Add everything in
13521     * its fold closure to the list that this node should match.
13522     * The fold closures data structure is a hash with the keys
13523     * being the UTF-8 of every character that is folded to, like
13524     * 'k', and the values each an array of all code points that
13525     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13526     * Multi-character folds are not included */
13527     if ((listp = hv_fetch(PL_utf8_foldclosures,
13528          (char *) foldbuf, foldlen, FALSE)))
13529     {
13530      AV* list = (AV*) *listp;
13531      IV k;
13532      for (k = 0; k <= av_len(list); k++) {
13533       SV** c_p = av_fetch(list, k, FALSE);
13534       UV c;
13535       if (c_p == NULL) {
13536        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13537       }
13538       c = SvUV(*c_p);
13539
13540       /* /aa doesn't allow folds between ASCII and non-; /l
13541       * doesn't allow them between above and below 256 */
13542       if ((ASCII_FOLD_RESTRICTED
13543         && (isASCII(c) != isASCII(j)))
13544        || (LOC && ((c < 256) != (j < 256))))
13545       {
13546        continue;
13547       }
13548
13549       /* Folds involving non-ascii Latin1 characters
13550       * under /d are added to a separate list */
13551       if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13552       {
13553        cp_list = add_cp_to_invlist(cp_list, c);
13554       }
13555       else {
13556       depends_list = add_cp_to_invlist(depends_list, c);
13557       }
13558      }
13559     }
13560    }
13561   }
13562   SvREFCNT_dec_NN(fold_intersection);
13563  }
13564
13565  /* And combine the result (if any) with any inversion list from posix
13566  * classes.  The lists are kept separate up to now because we don't want to
13567  * fold the classes (folding of those is automatically handled by the swash
13568  * fetching code) */
13569  if (posixes) {
13570   if (! DEPENDS_SEMANTICS) {
13571    if (cp_list) {
13572     _invlist_union(cp_list, posixes, &cp_list);
13573     SvREFCNT_dec_NN(posixes);
13574    }
13575    else {
13576     cp_list = posixes;
13577    }
13578   }
13579   else {
13580    /* Under /d, we put into a separate list the Latin1 things that
13581    * match only when the target string is utf8 */
13582    SV* nonascii_but_latin1_properties = NULL;
13583    _invlist_intersection(posixes, PL_Latin1,
13584         &nonascii_but_latin1_properties);
13585    _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13586        &nonascii_but_latin1_properties);
13587    _invlist_subtract(posixes, nonascii_but_latin1_properties,
13588        &posixes);
13589    if (cp_list) {
13590     _invlist_union(cp_list, posixes, &cp_list);
13591     SvREFCNT_dec_NN(posixes);
13592    }
13593    else {
13594     cp_list = posixes;
13595    }
13596
13597    if (depends_list) {
13598     _invlist_union(depends_list, nonascii_but_latin1_properties,
13599        &depends_list);
13600     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13601    }
13602    else {
13603     depends_list = nonascii_but_latin1_properties;
13604    }
13605   }
13606  }
13607
13608  /* And combine the result (if any) with any inversion list from properties.
13609  * The lists are kept separate up to now so that we can distinguish the two
13610  * in regards to matching above-Unicode.  A run-time warning is generated
13611  * if a Unicode property is matched against a non-Unicode code point. But,
13612  * we allow user-defined properties to match anything, without any warning,
13613  * and we also suppress the warning if there is a portion of the character
13614  * class that isn't a Unicode property, and which matches above Unicode, \W
13615  * or [\x{110000}] for example.
13616  * (Note that in this case, unlike the Posix one above, there is no
13617  * <depends_list>, because having a Unicode property forces Unicode
13618  * semantics */
13619  if (properties) {
13620   bool warn_super = ! has_user_defined_property;
13621   if (cp_list) {
13622
13623    /* If it matters to the final outcome, see if a non-property
13624    * component of the class matches above Unicode.  If so, the
13625    * warning gets suppressed.  This is true even if just a single
13626    * such code point is specified, as though not strictly correct if
13627    * another such code point is matched against, the fact that they
13628    * are using above-Unicode code points indicates they should know
13629    * the issues involved */
13630    if (warn_super) {
13631     bool non_prop_matches_above_Unicode =
13632        runtime_posix_matches_above_Unicode
13633        | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13634     if (invert) {
13635      non_prop_matches_above_Unicode =
13636            !  non_prop_matches_above_Unicode;
13637     }
13638     warn_super = ! non_prop_matches_above_Unicode;
13639    }
13640
13641    _invlist_union(properties, cp_list, &cp_list);
13642    SvREFCNT_dec_NN(properties);
13643   }
13644   else {
13645    cp_list = properties;
13646   }
13647
13648   if (warn_super) {
13649    OP(ret) = ANYOF_WARN_SUPER;
13650   }
13651  }
13652
13653  /* Here, we have calculated what code points should be in the character
13654  * class.
13655  *
13656  * Now we can see about various optimizations.  Fold calculation (which we
13657  * did above) needs to take place before inversion.  Otherwise /[^k]/i
13658  * would invert to include K, which under /i would match k, which it
13659  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13660  * folded until runtime */
13661
13662  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13663  * at compile time.  Besides not inverting folded locale now, we can't
13664  * invert if there are things such as \w, which aren't known until runtime
13665  * */
13666  if (invert
13667   && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13668   && ! depends_list
13669   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13670  {
13671   _invlist_invert(cp_list);
13672
13673   /* Any swash can't be used as-is, because we've inverted things */
13674   if (swash) {
13675    SvREFCNT_dec_NN(swash);
13676    swash = NULL;
13677   }
13678
13679   /* Clear the invert flag since have just done it here */
13680   invert = FALSE;
13681  }
13682
13683  if (ret_invlist) {
13684   *ret_invlist = cp_list;
13685   SvREFCNT_dec(swash);
13686
13687   /* Discard the generated node */
13688   if (SIZE_ONLY) {
13689    RExC_size = orig_size;
13690   }
13691   else {
13692    RExC_emit = orig_emit;
13693   }
13694   return orig_emit;
13695  }
13696
13697  /* If we didn't do folding, it's because some information isn't available
13698  * until runtime; set the run-time fold flag for these.  (We don't have to
13699  * worry about properties folding, as that is taken care of by the swash
13700  * fetching) */
13701  if (FOLD && LOC)
13702  {
13703  ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13704  }
13705
13706  /* Some character classes are equivalent to other nodes.  Such nodes take
13707  * up less room and generally fewer operations to execute than ANYOF nodes.
13708  * Above, we checked for and optimized into some such equivalents for
13709  * certain common classes that are easy to test.  Getting to this point in
13710  * the code means that the class didn't get optimized there.  Since this
13711  * code is only executed in Pass 2, it is too late to save space--it has
13712  * been allocated in Pass 1, and currently isn't given back.  But turning
13713  * things into an EXACTish node can allow the optimizer to join it to any
13714  * adjacent such nodes.  And if the class is equivalent to things like /./,
13715  * expensive run-time swashes can be avoided.  Now that we have more
13716  * complete information, we can find things necessarily missed by the
13717  * earlier code.  I (khw) am not sure how much to look for here.  It would
13718  * be easy, but perhaps too slow, to check any candidates against all the
13719  * node types they could possibly match using _invlistEQ(). */
13720
13721  if (cp_list
13722   && ! invert
13723   && ! depends_list
13724   && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13725   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13726  {
13727   UV start, end;
13728   U8 op = END;  /* The optimzation node-type */
13729   const char * cur_parse= RExC_parse;
13730
13731   invlist_iterinit(cp_list);
13732   if (! invlist_iternext(cp_list, &start, &end)) {
13733
13734    /* Here, the list is empty.  This happens, for example, when a
13735    * Unicode property is the only thing in the character class, and
13736    * it doesn't match anything.  (perluniprops.pod notes such
13737    * properties) */
13738    op = OPFAIL;
13739    *flagp |= HASWIDTH|SIMPLE;
13740   }
13741   else if (start == end) {    /* The range is a single code point */
13742    if (! invlist_iternext(cp_list, &start, &end)
13743
13744      /* Don't do this optimization if it would require changing
13745      * the pattern to UTF-8 */
13746     && (start < 256 || UTF))
13747    {
13748     /* Here, the list contains a single code point.  Can optimize
13749     * into an EXACT node */
13750
13751     value = start;
13752
13753     if (! FOLD) {
13754      op = EXACT;
13755     }
13756     else if (LOC) {
13757
13758      /* A locale node under folding with one code point can be
13759      * an EXACTFL, as its fold won't be calculated until
13760      * runtime */
13761      op = EXACTFL;
13762     }
13763     else {
13764
13765      /* Here, we are generally folding, but there is only one
13766      * code point to match.  If we have to, we use an EXACT
13767      * node, but it would be better for joining with adjacent
13768      * nodes in the optimization pass if we used the same
13769      * EXACTFish node that any such are likely to be.  We can
13770      * do this iff the code point doesn't participate in any
13771      * folds.  For example, an EXACTF of a colon is the same as
13772      * an EXACT one, since nothing folds to or from a colon. */
13773      if (value < 256) {
13774       if (IS_IN_SOME_FOLD_L1(value)) {
13775        op = EXACT;
13776       }
13777      }
13778      else {
13779       if (! PL_utf8_foldable) {
13780        SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13781             &PL_sv_undef, 1, 0);
13782        PL_utf8_foldable = _get_swash_invlist(swash);
13783        SvREFCNT_dec_NN(swash);
13784       }
13785       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13786        op = EXACT;
13787       }
13788      }
13789
13790      /* If we haven't found the node type, above, it means we
13791      * can use the prevailing one */
13792      if (op == END) {
13793       op = compute_EXACTish(pRExC_state);
13794      }
13795     }
13796    }
13797   }
13798   else if (start == 0) {
13799    if (end == UV_MAX) {
13800     op = SANY;
13801     *flagp |= HASWIDTH|SIMPLE;
13802     RExC_naughty++;
13803    }
13804    else if (end == '\n' - 1
13805      && invlist_iternext(cp_list, &start, &end)
13806      && start == '\n' + 1 && end == UV_MAX)
13807    {
13808     op = REG_ANY;
13809     *flagp |= HASWIDTH|SIMPLE;
13810     RExC_naughty++;
13811    }
13812   }
13813   invlist_iterfinish(cp_list);
13814
13815   if (op != END) {
13816    RExC_parse = (char *)orig_parse;
13817    RExC_emit = (regnode *)orig_emit;
13818
13819    ret = reg_node(pRExC_state, op);
13820
13821    RExC_parse = (char *)cur_parse;
13822
13823    if (PL_regkind[op] == EXACT) {
13824     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13825    }
13826
13827    SvREFCNT_dec_NN(cp_list);
13828    return ret;
13829   }
13830  }
13831
13832  /* Here, <cp_list> contains all the code points we can determine at
13833  * compile time that match under all conditions.  Go through it, and
13834  * for things that belong in the bitmap, put them there, and delete from
13835  * <cp_list>.  While we are at it, see if everything above 255 is in the
13836  * list, and if so, set a flag to speed up execution */
13837  ANYOF_BITMAP_ZERO(ret);
13838  if (cp_list) {
13839
13840   /* This gets set if we actually need to modify things */
13841   bool change_invlist = FALSE;
13842
13843   UV start, end;
13844
13845   /* Start looking through <cp_list> */
13846   invlist_iterinit(cp_list);
13847   while (invlist_iternext(cp_list, &start, &end)) {
13848    UV high;
13849    int i;
13850
13851    if (end == UV_MAX && start <= 256) {
13852     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13853    }
13854
13855    /* Quit if are above what we should change */
13856    if (start > 255) {
13857     break;
13858    }
13859
13860    change_invlist = TRUE;
13861
13862    /* Set all the bits in the range, up to the max that we are doing */
13863    high = (end < 255) ? end : 255;
13864    for (i = start; i <= (int) high; i++) {
13865     if (! ANYOF_BITMAP_TEST(ret, i)) {
13866      ANYOF_BITMAP_SET(ret, i);
13867      prevvalue = value;
13868      value = i;
13869     }
13870    }
13871   }
13872   invlist_iterfinish(cp_list);
13873
13874   /* Done with loop; remove any code points that are in the bitmap from
13875   * <cp_list> */
13876   if (change_invlist) {
13877    _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13878   }
13879
13880   /* If have completely emptied it, remove it completely */
13881   if (_invlist_len(cp_list) == 0) {
13882    SvREFCNT_dec_NN(cp_list);
13883    cp_list = NULL;
13884   }
13885  }
13886
13887  if (invert) {
13888   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13889  }
13890
13891  /* Here, the bitmap has been populated with all the Latin1 code points that
13892  * always match.  Can now add to the overall list those that match only
13893  * when the target string is UTF-8 (<depends_list>). */
13894  if (depends_list) {
13895   if (cp_list) {
13896    _invlist_union(cp_list, depends_list, &cp_list);
13897    SvREFCNT_dec_NN(depends_list);
13898   }
13899   else {
13900    cp_list = depends_list;
13901   }
13902  }
13903
13904  /* If there is a swash and more than one element, we can't use the swash in
13905  * the optimization below. */
13906  if (swash && element_count > 1) {
13907   SvREFCNT_dec_NN(swash);
13908   swash = NULL;
13909  }
13910
13911  if (! cp_list
13912   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13913  {
13914   ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13915  }
13916  else {
13917   /* av[0] stores the character class description in its textual form:
13918   *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13919   *       appropriate swash, and is also useful for dumping the regnode.
13920   * av[1] if NULL, is a placeholder to later contain the swash computed
13921   *       from av[0].  But if no further computation need be done, the
13922   *       swash is stored there now.
13923   * av[2] stores the cp_list inversion list for use in addition or
13924   *       instead of av[0]; used only if av[1] is NULL
13925   * av[3] is set if any component of the class is from a user-defined
13926   *       property; used only if av[1] is NULL */
13927   AV * const av = newAV();
13928   SV *rv;
13929
13930   av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13931       ? SvREFCNT_inc(listsv) : &PL_sv_undef);
13932   if (swash) {
13933    av_store(av, 1, swash);
13934    SvREFCNT_dec_NN(cp_list);
13935   }
13936   else {
13937    av_store(av, 1, NULL);
13938    if (cp_list) {
13939     av_store(av, 2, cp_list);
13940     av_store(av, 3, newSVuv(has_user_defined_property));
13941    }
13942   }
13943
13944   rv = newRV_noinc(MUTABLE_SV(av));
13945   n = add_data(pRExC_state, 1, "s");
13946   RExC_rxi->data->data[n] = (void*)rv;
13947   ARG_SET(ret, n);
13948  }
13949
13950  *flagp |= HASWIDTH|SIMPLE;
13951  return ret;
13952 }
13953 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13954
13955
13956 /* reg_skipcomment()
13957
   Absorbs an /x style # comment from the input stream.
   Returns true if there is more text remaining in the stream.
   Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
   terminates the pattern without including a newline.

   Note that it is the caller's responsibility to ensure that we are
   actually in /x mode.
13965
13966 */
13967
13968 STATIC bool
13969 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13970 {
13971  bool ended = 0;
13972
13973  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13974
13975  while (RExC_parse < RExC_end)
13976   if (*RExC_parse++ == '\n') {
13977    ended = 1;
13978    break;
13979   }
13980  if (!ended) {
13981   /* we ran off the end of the pattern without ending
13982   the comment, so we have to add an \n when wrapping */
13983   RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13984   return 0;
13985  } else
13986   return 1;
13987 }
13988
13989 /* nextchar()
13990
13991    Advances the parse position, and optionally absorbs
13992    "whitespace" from the inputstream.
13993
13994    Without /x "whitespace" means (?#...) style comments only,
13995    with /x this means (?#...) and # comments and whitespace proper.
13996
13997    Returns the RExC_parse point from BEFORE the scan occurs.
13998
13999    This is the /x friendly way of saying RExC_parse++.
14000 */
14001
14002 STATIC char*
14003 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14004 {
14005  char* const retval = RExC_parse++;
14006
14007  PERL_ARGS_ASSERT_NEXTCHAR;
14008
14009  for (;;) {
14010   if (RExC_end - RExC_parse >= 3
14011    && *RExC_parse == '('
14012    && RExC_parse[1] == '?'
14013    && RExC_parse[2] == '#')
14014   {
14015    while (*RExC_parse != ')') {
14016     if (RExC_parse == RExC_end)
14017      FAIL("Sequence (?#... not terminated");
14018     RExC_parse++;
14019    }
14020    RExC_parse++;
14021    continue;
14022   }
14023   if (RExC_flags & RXf_PMf_EXTENDED) {
14024    if (isSPACE(*RExC_parse)) {
14025     RExC_parse++;
14026     continue;
14027    }
14028    else if (*RExC_parse == '#') {
14029     if ( reg_skipcomment( pRExC_state ) )
14030      continue;
14031    }
14032   }
14033   return retval;
14034  }
14035 }
14036
14037 /*
14038 - reg_node - emit a node
14039 */
14040 STATIC regnode *   /* Location. */
14041 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14042 {
14043  dVAR;
14044  regnode *ptr;
14045  regnode * const ret = RExC_emit;
14046  GET_RE_DEBUG_FLAGS_DECL;
14047
14048  PERL_ARGS_ASSERT_REG_NODE;
14049
14050  if (SIZE_ONLY) {
14051   SIZE_ALIGN(RExC_size);
14052   RExC_size += 1;
14053   return(ret);
14054  }
14055  if (RExC_emit >= RExC_emit_bound)
14056   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14057     op, RExC_emit, RExC_emit_bound);
14058
14059  NODE_ALIGN_FILL(ret);
14060  ptr = ret;
14061  FILL_ADVANCE_NODE(ptr, op);
14062  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
14063 #ifdef RE_TRACK_PATTERN_OFFSETS
14064  if (RExC_offsets) {         /* MJD */
14065   MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14066    "reg_node", __LINE__,
14067    PL_reg_name[op],
14068    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14069     ? "Overwriting end of array!\n" : "OK",
14070    (UV)(RExC_emit - RExC_emit_start),
14071    (UV)(RExC_parse - RExC_start),
14072    (UV)RExC_offsets[0]));
14073   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14074  }
14075 #endif
14076  RExC_emit = ptr;
14077  return(ret);
14078 }
14079
14080 /*
14081 - reganode - emit a node with an argument
14082 */
14083 STATIC regnode *   /* Location. */
14084 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14085 {
14086  dVAR;
14087  regnode *ptr;
14088  regnode * const ret = RExC_emit;
14089  GET_RE_DEBUG_FLAGS_DECL;
14090
14091  PERL_ARGS_ASSERT_REGANODE;
14092
14093  if (SIZE_ONLY) {
14094   SIZE_ALIGN(RExC_size);
14095   RExC_size += 2;
14096   /*
14097   We can't do this:
14098
14099   assert(2==regarglen[op]+1);
14100
14101   Anything larger than this has to allocate the extra amount.
14102   If we changed this to be:
14103
14104   RExC_size += (1 + regarglen[op]);
14105
14106   then it wouldn't matter. Its not clear what side effect
14107   might come from that so its not done so far.
14108   -- dmq
14109   */
14110   return(ret);
14111  }
14112  if (RExC_emit >= RExC_emit_bound)
14113   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14114     op, RExC_emit, RExC_emit_bound);
14115
14116  NODE_ALIGN_FILL(ret);
14117  ptr = ret;
14118  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14119  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
14120 #ifdef RE_TRACK_PATTERN_OFFSETS
14121  if (RExC_offsets) {         /* MJD */
14122   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14123    "reganode",
14124    __LINE__,
14125    PL_reg_name[op],
14126    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14127    "Overwriting end of array!\n" : "OK",
14128    (UV)(RExC_emit - RExC_emit_start),
14129    (UV)(RExC_parse - RExC_start),
14130    (UV)RExC_offsets[0]));
14131   Set_Cur_Node_Offset;
14132  }
14133 #endif
14134  RExC_emit = ptr;
14135  return(ret);
14136 }
14137
14138 /*
14139 - reguni - emit (if appropriate) a Unicode character
14140 */
14141 STATIC STRLEN
14142 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14143 {
14144  dVAR;
14145
14146  PERL_ARGS_ASSERT_REGUNI;
14147
14148  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14149 }
14150
14151 /*
14152 - reginsert - insert an operator in front of already-emitted operand
14153 *
14154 * Means relocating the operand.
14155 */
14156 STATIC void
14157 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14158 {
14159  dVAR;
14160  regnode *src;
14161  regnode *dst;
14162  regnode *place;
14163  const int offset = regarglen[(U8)op];
14164  const int size = NODE_STEP_REGNODE + offset;
14165  GET_RE_DEBUG_FLAGS_DECL;
14166
14167  PERL_ARGS_ASSERT_REGINSERT;
14168  PERL_UNUSED_ARG(depth);
14169 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14170  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14171  if (SIZE_ONLY) {
14172   RExC_size += size;
14173   return;
14174  }
14175
14176  src = RExC_emit;
14177  RExC_emit += size;
14178  dst = RExC_emit;
14179  if (RExC_open_parens) {
14180   int paren;
14181   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14182   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14183    if ( RExC_open_parens[paren] >= opnd ) {
14184     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14185     RExC_open_parens[paren] += size;
14186    } else {
14187     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14188    }
14189    if ( RExC_close_parens[paren] >= opnd ) {
14190     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14191     RExC_close_parens[paren] += size;
14192    } else {
14193     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14194    }
14195   }
14196  }
14197
14198  while (src > opnd) {
14199   StructCopy(--src, --dst, regnode);
14200 #ifdef RE_TRACK_PATTERN_OFFSETS
14201   if (RExC_offsets) {     /* MJD 20010112 */
14202    MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14203     "reg_insert",
14204     __LINE__,
14205     PL_reg_name[op],
14206     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14207      ? "Overwriting end of array!\n" : "OK",
14208     (UV)(src - RExC_emit_start),
14209     (UV)(dst - RExC_emit_start),
14210     (UV)RExC_offsets[0]));
14211    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14212    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14213   }
14214 #endif
14215  }
14216
14217
14218  place = opnd;  /* Op node, where operand used to be. */
14219 #ifdef RE_TRACK_PATTERN_OFFSETS
14220  if (RExC_offsets) {         /* MJD */
14221   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14222    "reginsert",
14223    __LINE__,
14224    PL_reg_name[op],
14225    (UV)(place - RExC_emit_start) > RExC_offsets[0]
14226    ? "Overwriting end of array!\n" : "OK",
14227    (UV)(place - RExC_emit_start),
14228    (UV)(RExC_parse - RExC_start),
14229    (UV)RExC_offsets[0]));
14230   Set_Node_Offset(place, RExC_parse);
14231   Set_Node_Length(place, 1);
14232  }
14233 #endif
14234  src = NEXTOPER(place);
14235  FILL_ADVANCE_NODE(place, op);
14236  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
14237  Zero(src, offset, regnode);
14238 }
14239
14240 /*
14241 - regtail - set the next-pointer at the end of a node chain of p to val.
14242 - SEE ALSO: regtail_study
14243 */
14244 /* TODO: All three parms should be const */
14245 STATIC void
14246 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14247 {
14248  dVAR;
14249  regnode *scan;
14250  GET_RE_DEBUG_FLAGS_DECL;
14251
14252  PERL_ARGS_ASSERT_REGTAIL;
14253 #ifndef DEBUGGING
14254  PERL_UNUSED_ARG(depth);
14255 #endif
14256
14257  if (SIZE_ONLY)
14258   return;
14259
14260  /* Find last node. */
14261  scan = p;
14262  for (;;) {
14263   regnode * const temp = regnext(scan);
14264   DEBUG_PARSE_r({
14265    SV * const mysv=sv_newmortal();
14266    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14267    regprop(RExC_rx, mysv, scan);
14268    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14269     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14270      (temp == NULL ? "->" : ""),
14271      (temp == NULL ? PL_reg_name[OP(val)] : "")
14272    );
14273   });
14274   if (temp == NULL)
14275    break;
14276   scan = temp;
14277  }
14278
14279  if (reg_off_by_arg[OP(scan)]) {
14280   ARG_SET(scan, val - scan);
14281  }
14282  else {
14283   NEXT_OFF(scan) = val - scan;
14284  }
14285 }
14286
14287 #ifdef DEBUGGING
14288 /*
14289 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14290 - Look for optimizable sequences at the same time.
14291 - currently only looks for EXACT chains.
14292
14293 This is experimental code. The idea is to use this routine to perform
14294 in place optimizations on branches and groups as they are constructed,
14295 with the long term intention of removing optimization from study_chunk so
14296 that it is purely analytical.
14297
14298 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14299 to control which is which.
14300
14301 */
14302 /* TODO: All four parms should be const */
14303
14304 STATIC U8
14305 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14306 {
14307  dVAR;
14308  regnode *scan;
14309  U8 exact = PSEUDO;
14310 #ifdef EXPERIMENTAL_INPLACESCAN
14311  I32 min = 0;
14312 #endif
14313  GET_RE_DEBUG_FLAGS_DECL;
14314
14315  PERL_ARGS_ASSERT_REGTAIL_STUDY;
14316
14317
14318  if (SIZE_ONLY)
14319   return exact;
14320
14321  /* Find last node. */
14322
14323  scan = p;
14324  for (;;) {
14325   regnode * const temp = regnext(scan);
14326 #ifdef EXPERIMENTAL_INPLACESCAN
14327   if (PL_regkind[OP(scan)] == EXACT) {
14328    bool has_exactf_sharp_s; /* Unexamined in this routine */
14329    if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14330     return EXACT;
14331   }
14332 #endif
14333   if ( exact ) {
14334    switch (OP(scan)) {
14335     case EXACT:
14336     case EXACTF:
14337     case EXACTFA:
14338     case EXACTFU:
14339     case EXACTFU_SS:
14340     case EXACTFU_TRICKYFOLD:
14341     case EXACTFL:
14342       if( exact == PSEUDO )
14343        exact= OP(scan);
14344       else if ( exact != OP(scan) )
14345        exact= 0;
14346     case NOTHING:
14347      break;
14348     default:
14349      exact= 0;
14350    }
14351   }
14352   DEBUG_PARSE_r({
14353    SV * const mysv=sv_newmortal();
14354    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14355    regprop(RExC_rx, mysv, scan);
14356    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14357     SvPV_nolen_const(mysv),
14358     REG_NODE_NUM(scan),
14359     PL_reg_name[exact]);
14360   });
14361   if (temp == NULL)
14362    break;
14363   scan = temp;
14364  }
14365  DEBUG_PARSE_r({
14366   SV * const mysv_val=sv_newmortal();
14367   DEBUG_PARSE_MSG("");
14368   regprop(RExC_rx, mysv_val, val);
14369   PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14370      SvPV_nolen_const(mysv_val),
14371      (IV)REG_NODE_NUM(val),
14372      (IV)(val - scan)
14373   );
14374  });
14375  if (reg_off_by_arg[OP(scan)]) {
14376   ARG_SET(scan, val - scan);
14377  }
14378  else {
14379   NEXT_OFF(scan) = val - scan;
14380  }
14381
14382  return exact;
14383 }
14384 #endif
14385
14386 /*
14387  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14388  */
14389 #ifdef DEBUGGING
14390 static void
14391 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14392 {
14393  int bit;
14394  int set=0;
14395  regex_charset cs;
14396
14397  for (bit=0; bit<32; bit++) {
14398   if (flags & (1<<bit)) {
14399    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
14400     continue;
14401    }
14402    if (!set++ && lead)
14403     PerlIO_printf(Perl_debug_log, "%s",lead);
14404    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14405   }
14406  }
14407  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14408    if (!set++ && lead) {
14409     PerlIO_printf(Perl_debug_log, "%s",lead);
14410    }
14411    switch (cs) {
14412     case REGEX_UNICODE_CHARSET:
14413      PerlIO_printf(Perl_debug_log, "UNICODE");
14414      break;
14415     case REGEX_LOCALE_CHARSET:
14416      PerlIO_printf(Perl_debug_log, "LOCALE");
14417      break;
14418     case REGEX_ASCII_RESTRICTED_CHARSET:
14419      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14420      break;
14421     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14422      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14423      break;
14424     default:
14425      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14426      break;
14427    }
14428  }
14429  if (lead)  {
14430   if (set)
14431    PerlIO_printf(Perl_debug_log, "\n");
14432   else
14433    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14434  }
14435 }
14436 #endif
14437
14438 void
14439 Perl_regdump(pTHX_ const regexp *r)
14440 {
14441 #ifdef DEBUGGING
14442  dVAR;
14443  SV * const sv = sv_newmortal();
14444  SV *dsv= sv_newmortal();
14445  RXi_GET_DECL(r,ri);
14446  GET_RE_DEBUG_FLAGS_DECL;
14447
14448  PERL_ARGS_ASSERT_REGDUMP;
14449
14450  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14451
14452  /* Header fields of interest. */
14453  if (r->anchored_substr) {
14454   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14455    RE_SV_DUMPLEN(r->anchored_substr), 30);
14456   PerlIO_printf(Perl_debug_log,
14457      "anchored %s%s at %"IVdf" ",
14458      s, RE_SV_TAIL(r->anchored_substr),
14459      (IV)r->anchored_offset);
14460  } else if (r->anchored_utf8) {
14461   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14462    RE_SV_DUMPLEN(r->anchored_utf8), 30);
14463   PerlIO_printf(Perl_debug_log,
14464      "anchored utf8 %s%s at %"IVdf" ",
14465      s, RE_SV_TAIL(r->anchored_utf8),
14466      (IV)r->anchored_offset);
14467  }
14468  if (r->float_substr) {
14469   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14470    RE_SV_DUMPLEN(r->float_substr), 30);
14471   PerlIO_printf(Perl_debug_log,
14472      "floating %s%s at %"IVdf"..%"UVuf" ",
14473      s, RE_SV_TAIL(r->float_substr),
14474      (IV)r->float_min_offset, (UV)r->float_max_offset);
14475  } else if (r->float_utf8) {
14476   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14477    RE_SV_DUMPLEN(r->float_utf8), 30);
14478   PerlIO_printf(Perl_debug_log,
14479      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14480      s, RE_SV_TAIL(r->float_utf8),
14481      (IV)r->float_min_offset, (UV)r->float_max_offset);
14482  }
14483  if (r->check_substr || r->check_utf8)
14484   PerlIO_printf(Perl_debug_log,
14485      (const char *)
14486      (r->check_substr == r->float_substr
14487      && r->check_utf8 == r->float_utf8
14488      ? "(checking floating" : "(checking anchored"));
14489  if (r->extflags & RXf_NOSCAN)
14490   PerlIO_printf(Perl_debug_log, " noscan");
14491  if (r->extflags & RXf_CHECK_ALL)
14492   PerlIO_printf(Perl_debug_log, " isall");
14493  if (r->check_substr || r->check_utf8)
14494   PerlIO_printf(Perl_debug_log, ") ");
14495
14496  if (ri->regstclass) {
14497   regprop(r, sv, ri->regstclass);
14498   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14499  }
14500  if (r->extflags & RXf_ANCH) {
14501   PerlIO_printf(Perl_debug_log, "anchored");
14502   if (r->extflags & RXf_ANCH_BOL)
14503    PerlIO_printf(Perl_debug_log, "(BOL)");
14504   if (r->extflags & RXf_ANCH_MBOL)
14505    PerlIO_printf(Perl_debug_log, "(MBOL)");
14506   if (r->extflags & RXf_ANCH_SBOL)
14507    PerlIO_printf(Perl_debug_log, "(SBOL)");
14508   if (r->extflags & RXf_ANCH_GPOS)
14509    PerlIO_printf(Perl_debug_log, "(GPOS)");
14510   PerlIO_putc(Perl_debug_log, ' ');
14511  }
14512  if (r->extflags & RXf_GPOS_SEEN)
14513   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14514  if (r->intflags & PREGf_SKIP)
14515   PerlIO_printf(Perl_debug_log, "plus ");
14516  if (r->intflags & PREGf_IMPLICIT)
14517   PerlIO_printf(Perl_debug_log, "implicit ");
14518  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14519  if (r->extflags & RXf_EVAL_SEEN)
14520   PerlIO_printf(Perl_debug_log, "with eval ");
14521  PerlIO_printf(Perl_debug_log, "\n");
14522  DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14523 #else
14524  PERL_ARGS_ASSERT_REGDUMP;
14525  PERL_UNUSED_CONTEXT;
14526  PERL_UNUSED_ARG(r);
14527 #endif /* DEBUGGING */
14528 }
14529
14530 /*
14531 - regprop - printable representation of opcode
14532 */
/* If 'do_sep' is true, append "][" (wrapped in the PL_colors[1] /
 * PL_colors[0] strings) to 'sv', repeat the "^" when 'flags' has
 * ANYOF_INVERT set, and reset 'do_sep' to 0.  'do_sep' must be an lvalue.
 * The arguments are parenthesized in the expansion so that expressions
 * may be passed safely. */
#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
STMT_START { \
    if (do_sep) {                                                     \
        Perl_sv_catpvf(aTHX_ (sv),"%s][%s",PL_colors[1],PL_colors[0]); \
        if ((flags) & ANYOF_INVERT)                                   \
            /*make sure the invert info is in each */                 \
            sv_catpvs((sv), "^");                                     \
        (do_sep) = 0;                                                 \
    }                                                                 \
} STMT_END
14543
14544 void
14545 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14546 {
14547 #ifdef DEBUGGING
14548  dVAR;
14549  int k;
14550
14551  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14552  static const char * const anyofs[] = {
14553 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14554  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14555  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14556  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14557  || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14558  || _CC_VERTSPACE != 16
14559   #error Need to adjust order of anyofs[]
14560 #endif
14561   "[\\w]",
14562   "[\\W]",
14563   "[\\d]",
14564   "[\\D]",
14565   "[:alpha:]",
14566   "[:^alpha:]",
14567   "[:lower:]",
14568   "[:^lower:]",
14569   "[:upper:]",
14570   "[:^upper:]",
14571   "[:punct:]",
14572   "[:^punct:]",
14573   "[:print:]",
14574   "[:^print:]",
14575   "[:alnum:]",
14576   "[:^alnum:]",
14577   "[:graph:]",
14578   "[:^graph:]",
14579   "[:cased:]",
14580   "[:^cased:]",
14581   "[\\s]",
14582   "[\\S]",
14583   "[:blank:]",
14584   "[:^blank:]",
14585   "[:xdigit:]",
14586   "[:^xdigit:]",
14587   "[:space:]",
14588   "[:^space:]",
14589   "[:cntrl:]",
14590   "[:^cntrl:]",
14591   "[:ascii:]",
14592   "[:^ascii:]",
14593   "[\\v]",
14594   "[\\V]"
14595  };
14596  RXi_GET_DECL(prog,progi);
14597  GET_RE_DEBUG_FLAGS_DECL;
14598
14599  PERL_ARGS_ASSERT_REGPROP;
14600
14601  sv_setpvs(sv, "");
14602
14603  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
14604   /* It would be nice to FAIL() here, but this may be called from
14605   regexec.c, and it would be hard to supply pRExC_state. */
14606   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14607  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14608
14609  k = PL_regkind[OP(o)];
14610
14611  if (k == EXACT) {
14612   sv_catpvs(sv, " ");
14613   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14614   * is a crude hack but it may be the best for now since
14615   * we have no flag "this EXACTish node was UTF-8"
14616   * --jhi */
14617   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14618     PERL_PV_ESCAPE_UNI_DETECT |
14619     PERL_PV_ESCAPE_NONASCII   |
14620     PERL_PV_PRETTY_ELLIPSES   |
14621     PERL_PV_PRETTY_LTGT       |
14622     PERL_PV_PRETTY_NOCLEAR
14623     );
14624  } else if (k == TRIE) {
14625   /* print the details of the trie in dumpuntil instead, as
14626   * progi->data isn't available here */
14627   const char op = OP(o);
14628   const U32 n = ARG(o);
14629   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14630    (reg_ac_data *)progi->data->data[n] :
14631    NULL;
14632   const reg_trie_data * const trie
14633    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14634
14635   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14636   DEBUG_TRIE_COMPILE_r(
14637    Perl_sv_catpvf(aTHX_ sv,
14638     "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14639     (UV)trie->startstate,
14640     (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14641     (UV)trie->wordcount,
14642     (UV)trie->minlen,
14643     (UV)trie->maxlen,
14644     (UV)TRIE_CHARCOUNT(trie),
14645     (UV)trie->uniquecharcount
14646    )
14647   );
14648   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14649    int i;
14650    int rangestart = -1;
14651    U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14652    sv_catpvs(sv, "[");
14653    for (i = 0; i <= 256; i++) {
14654     if (i < 256 && BITMAP_TEST(bitmap,i)) {
14655      if (rangestart == -1)
14656       rangestart = i;
14657     } else if (rangestart != -1) {
14658      if (i <= rangestart + 3)
14659       for (; rangestart < i; rangestart++)
14660        put_byte(sv, rangestart);
14661      else {
14662       put_byte(sv, rangestart);
14663       sv_catpvs(sv, "-");
14664       put_byte(sv, i - 1);
14665      }
14666      rangestart = -1;
14667     }
14668    }
14669    sv_catpvs(sv, "]");
14670   }
14671
14672  } else if (k == CURLY) {
14673   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14674    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14675   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14676  }
14677  else if (k == WHILEM && o->flags)   /* Ordinal/of */
14678   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14679  else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14680   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14681   if ( RXp_PAREN_NAMES(prog) ) {
14682    if ( k != REF || (OP(o) < NREF)) {
14683     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14684     SV **name= av_fetch(list, ARG(o), 0 );
14685     if (name)
14686      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14687    }
14688    else {
14689     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14690     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14691     I32 *nums=(I32*)SvPVX(sv_dat);
14692     SV **name= av_fetch(list, nums[0], 0 );
14693     I32 n;
14694     if (name) {
14695      for ( n=0; n<SvIVX(sv_dat); n++ ) {
14696       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14697          (n ? "," : ""), (IV)nums[n]);
14698      }
14699      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14700     }
14701    }
14702   }
14703  } else if (k == GOSUB)
14704   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14705  else if (k == VERB) {
14706   if (!o->flags)
14707    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14708       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14709  } else if (k == LOGICAL)
14710   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14711  else if (k == ANYOF) {
14712   int i, rangestart = -1;
14713   const U8 flags = ANYOF_FLAGS(o);
14714   int do_sep = 0;
14715
14716
14717   if (flags & ANYOF_LOCALE)
14718    sv_catpvs(sv, "{loc}");
14719   if (flags & ANYOF_LOC_FOLD)
14720    sv_catpvs(sv, "{i}");
14721   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14722   if (flags & ANYOF_INVERT)
14723    sv_catpvs(sv, "^");
14724
14725   /* output what the standard cp 0-255 bitmap matches */
14726   for (i = 0; i <= 256; i++) {
14727    if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14728     if (rangestart == -1)
14729      rangestart = i;
14730    } else if (rangestart != -1) {
14731     if (i <= rangestart + 3)
14732      for (; rangestart < i; rangestart++)
14733       put_byte(sv, rangestart);
14734     else {
14735      put_byte(sv, rangestart);
14736      sv_catpvs(sv, "-");
14737      put_byte(sv, i - 1);
14738     }
14739     do_sep = 1;
14740     rangestart = -1;
14741    }
14742   }
14743
14744   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14745   /* output any special charclass tests (used entirely under use locale) */
14746   if (ANYOF_CLASS_TEST_ANY_SET(o))
14747    for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14748     if (ANYOF_CLASS_TEST(o,i)) {
14749      sv_catpv(sv, anyofs[i]);
14750      do_sep = 1;
14751     }
14752
14753   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14754
14755   if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14756    sv_catpvs(sv, "{non-utf8-latin1-all}");
14757   }
14758
14759   /* output information about the unicode matching */
14760   if (flags & ANYOF_UNICODE_ALL)
14761    sv_catpvs(sv, "{unicode_all}");
14762   else if (ANYOF_NONBITMAP(o))
14763    sv_catpvs(sv, "{unicode}");
14764   if (flags & ANYOF_NONBITMAP_NON_UTF8)
14765    sv_catpvs(sv, "{outside bitmap}");
14766
14767   if (ANYOF_NONBITMAP(o)) {
14768    SV *lv; /* Set if there is something outside the bit map */
14769    SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14770    bool byte_output = FALSE;   /* If something in the bitmap has been
14771           output */
14772
14773    if (lv && lv != &PL_sv_undef) {
14774     if (sw) {
14775      U8 s[UTF8_MAXBYTES_CASE+1];
14776
14777      for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14778       uvchr_to_utf8(s, i);
14779
14780       if (i < 256
14781        && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
14782                things already
14783                output as part
14784                of the bitmap */
14785        && swash_fetch(sw, s, TRUE))
14786       {
14787        if (rangestart == -1)
14788         rangestart = i;
14789       } else if (rangestart != -1) {
14790        byte_output = TRUE;
14791        if (i <= rangestart + 3)
14792         for (; rangestart < i; rangestart++) {
14793          put_byte(sv, rangestart);
14794         }
14795        else {
14796         put_byte(sv, rangestart);
14797         sv_catpvs(sv, "-");
14798         put_byte(sv, i-1);
14799        }
14800        rangestart = -1;
14801       }
14802      }
14803     }
14804
14805     {
14806      char *s = savesvpv(lv);
14807      char * const origs = s;
14808
14809      while (*s && *s != '\n')
14810       s++;
14811
14812      if (*s == '\n') {
14813       const char * const t = ++s;
14814
14815       if (byte_output) {
14816        sv_catpvs(sv, " ");
14817       }
14818
14819       while (*s) {
14820        if (*s == '\n') {
14821
14822         /* Truncate very long output */
14823         if (s - origs > 256) {
14824          Perl_sv_catpvf(aTHX_ sv,
14825             "%.*s...",
14826             (int) (s - origs - 1),
14827             t);
14828          goto out_dump;
14829         }
14830         *s = ' ';
14831        }
14832        else if (*s == '\t') {
14833         *s = '-';
14834        }
14835        s++;
14836       }
14837       if (s[-1] == ' ')
14838        s[-1] = 0;
14839
14840       sv_catpv(sv, t);
14841      }
14842
14843     out_dump:
14844
14845      Safefree(origs);
14846     }
14847     SvREFCNT_dec_NN(lv);
14848    }
14849   }
14850
14851   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14852  }
14853  else if (k == POSIXD || k == NPOSIXD) {
14854   U8 index = FLAGS(o) * 2;
14855   if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14856    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14857   }
14858   else {
14859    sv_catpv(sv, anyofs[index]);
14860   }
14861  }
14862  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14863   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14864 #else
14865  PERL_UNUSED_CONTEXT;
14866  PERL_UNUSED_ARG(sv);
14867  PERL_UNUSED_ARG(o);
14868  PERL_UNUSED_ARG(prog);
14869 #endif /* DEBUGGING */
14870 }
14871
14872 SV *
14873 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14874 {    /* Assume that RE_INTUIT is set */
14875  dVAR;
14876  struct regexp *const prog = ReANY(r);
14877  GET_RE_DEBUG_FLAGS_DECL;
14878
14879  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14880  PERL_UNUSED_CONTEXT;
14881
14882  DEBUG_COMPILE_r(
14883   {
14884    const char * const s = SvPV_nolen_const(prog->check_substr
14885      ? prog->check_substr : prog->check_utf8);
14886
14887    if (!PL_colorset) reginitcolors();
14888    PerlIO_printf(Perl_debug_log,
14889      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14890      PL_colors[4],
14891      prog->check_substr ? "" : "utf8 ",
14892      PL_colors[5],PL_colors[0],
14893      s,
14894      PL_colors[1],
14895      (strlen(s) > 60 ? "..." : ""));
14896   } );
14897
14898  return prog->check_substr ? prog->check_substr : prog->check_utf8;
14899 }
14900
14901 /*
14902    pregfree()
14903
   Handles refcounting and freeing the perl core regexp structure. When
   it is necessary to actually free the structure, the first thing it
   does is call the 'free' method of the regexp_engine associated with
   the regexp, allowing the handling of the void *pprivate; member
   first. (This routine is not overridable by extensions, which is why
   the extension's free is called first.)
14910
14911    See regdupe and regdupe_internal if you change anything here.
14912 */
14913 #ifndef PERL_IN_XSUB_RE
14914 void
14915 Perl_pregfree(pTHX_ REGEXP *r)
14916 {
14917  SvREFCNT_dec(r);
14918 }
14919
14920 void
14921 Perl_pregfree2(pTHX_ REGEXP *rx)
14922 {
14923  dVAR;
14924  struct regexp *const r = ReANY(rx);
14925  GET_RE_DEBUG_FLAGS_DECL;
14926
14927  PERL_ARGS_ASSERT_PREGFREE2;
14928
14929  if (r->mother_re) {
14930   ReREFCNT_dec(r->mother_re);
14931  } else {
14932   CALLREGFREE_PVT(rx); /* free the private data */
14933   SvREFCNT_dec(RXp_PAREN_NAMES(r));
14934   Safefree(r->xpv_len_u.xpvlenu_pv);
14935  }
14936  if (r->substrs) {
14937   SvREFCNT_dec(r->anchored_substr);
14938   SvREFCNT_dec(r->anchored_utf8);
14939   SvREFCNT_dec(r->float_substr);
14940   SvREFCNT_dec(r->float_utf8);
14941   Safefree(r->substrs);
14942  }
14943  RX_MATCH_COPY_FREE(rx);
14944 #ifdef PERL_ANY_COW
14945  SvREFCNT_dec(r->saved_copy);
14946 #endif
14947  Safefree(r->offs);
14948  SvREFCNT_dec(r->qr_anoncv);
14949  rx->sv_u.svu_rx = 0;
14950 }
14951
14952 /*  reg_temp_copy()
14953
14954  This is a hacky workaround to the structural issue of match results
14955  being stored in the regexp structure which is in turn stored in
14956  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14957  could be PL_curpm in multiple contexts, and could require multiple
14958  result sets being associated with the pattern simultaneously, such
14959  as when doing a recursive match with (??{$qr})
14960
 The solution is to make a lightweight copy of the regexp structure
 when a qr// is returned from the code executed by (??{$qr}).  This
 lightweight copy doesn't actually own any of its data except for
 the startp/endp offsets and the actual regexp structure itself.
14965
14966 */
14967
14968
14969 REGEXP *
14970 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14971 {
14972  struct regexp *ret;
14973  struct regexp *const r = ReANY(rx);
14974  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14975
14976  PERL_ARGS_ASSERT_REG_TEMP_COPY;
14977
14978  if (!ret_x)
14979   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14980  else {
14981   SvOK_off((SV *)ret_x);
14982   if (islv) {
14983    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14984    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
14985    made both spots point to the same regexp body.) */
14986    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14987    assert(!SvPVX(ret_x));
14988    ret_x->sv_u.svu_rx = temp->sv_any;
14989    temp->sv_any = NULL;
14990    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14991    SvREFCNT_dec_NN(temp);
14992    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14993    ing below will not set it. */
14994    SvCUR_set(ret_x, SvCUR(rx));
14995   }
14996  }
14997  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14998  sv_force_normal(sv) is called.  */
14999  SvFAKE_on(ret_x);
15000  ret = ReANY(ret_x);
15001
15002  SvFLAGS(ret_x) |= SvUTF8(rx);
15003  /* We share the same string buffer as the original regexp, on which we
15004  hold a reference count, incremented when mother_re is set below.
15005  The string pointer is copied here, being part of the regexp struct.
15006  */
15007  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15008   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15009  if (r->offs) {
15010   const I32 npar = r->nparens+1;
15011   Newx(ret->offs, npar, regexp_paren_pair);
15012   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15013  }
15014  if (r->substrs) {
15015   Newx(ret->substrs, 1, struct reg_substr_data);
15016   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15017
15018   SvREFCNT_inc_void(ret->anchored_substr);
15019   SvREFCNT_inc_void(ret->anchored_utf8);
15020   SvREFCNT_inc_void(ret->float_substr);
15021   SvREFCNT_inc_void(ret->float_utf8);
15022
15023   /* check_substr and check_utf8, if non-NULL, point to either their
15024   anchored or float namesakes, and don't hold a second reference.  */
15025  }
15026  RX_MATCH_COPIED_off(ret_x);
15027 #ifdef PERL_ANY_COW
15028  ret->saved_copy = NULL;
15029 #endif
15030  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15031  SvREFCNT_inc_void(ret->qr_anoncv);
15032
15033  return ret_x;
15034 }
15035 #endif
15036
15037 /* regfree_internal()
15038
15039    Free the private data in a regexp. This is overloadable by
15040    extensions. Perl takes care of the regexp structure in pregfree(),
15041    this covers the *pprivate pointer which technically perl doesn't
15042    know about, however of course we have to handle the
15043    regexp_internal structure when no extension is in use.
15044
15045    Note this is called before freeing anything in the regexp
15046    structure.
15047  */
15048
15049 void
15050 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15051 {
15052  dVAR;
15053  struct regexp *const r = ReANY(rx);
15054  RXi_GET_DECL(r,ri);
15055  GET_RE_DEBUG_FLAGS_DECL;
15056
15057  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15058
15059  DEBUG_COMPILE_r({
15060   if (!PL_colorset)
15061    reginitcolors();
15062   {
15063    SV *dsv= sv_newmortal();
15064    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15065     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15066    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15067     PL_colors[4],PL_colors[5],s);
15068   }
15069  });
15070 #ifdef RE_TRACK_PATTERN_OFFSETS
15071  if (ri->u.offsets)
15072   Safefree(ri->u.offsets);             /* 20010421 MJD */
15073 #endif
15074  if (ri->code_blocks) {
15075   int n;
15076   for (n = 0; n < ri->num_code_blocks; n++)
15077    SvREFCNT_dec(ri->code_blocks[n].src_regex);
15078   Safefree(ri->code_blocks);
15079  }
15080
15081  if (ri->data) {
15082   int n = ri->data->count;
15083
15084   while (--n >= 0) {
15085   /* If you add a ->what type here, update the comment in regcomp.h */
15086    switch (ri->data->what[n]) {
15087    case 'a':
15088    case 'r':
15089    case 's':
15090    case 'S':
15091    case 'u':
15092     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15093     break;
15094    case 'f':
15095     Safefree(ri->data->data[n]);
15096     break;
15097    case 'l':
15098    case 'L':
15099     break;
15100    case 'T':
15101     { /* Aho Corasick add-on structure for a trie node.
15102      Used in stclass optimization only */
15103      U32 refcount;
15104      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15105      OP_REFCNT_LOCK;
15106      refcount = --aho->refcount;
15107      OP_REFCNT_UNLOCK;
15108      if ( !refcount ) {
15109       PerlMemShared_free(aho->states);
15110       PerlMemShared_free(aho->fail);
15111       /* do this last!!!! */
15112       PerlMemShared_free(ri->data->data[n]);
15113       PerlMemShared_free(ri->regstclass);
15114      }
15115     }
15116     break;
15117    case 't':
15118     {
15119      /* trie structure. */
15120      U32 refcount;
15121      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15122      OP_REFCNT_LOCK;
15123      refcount = --trie->refcount;
15124      OP_REFCNT_UNLOCK;
15125      if ( !refcount ) {
15126       PerlMemShared_free(trie->charmap);
15127       PerlMemShared_free(trie->states);
15128       PerlMemShared_free(trie->trans);
15129       if (trie->bitmap)
15130        PerlMemShared_free(trie->bitmap);
15131       if (trie->jump)
15132        PerlMemShared_free(trie->jump);
15133       PerlMemShared_free(trie->wordinfo);
15134       /* do this last!!!! */
15135       PerlMemShared_free(ri->data->data[n]);
15136      }
15137     }
15138     break;
15139    default:
15140     Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15141    }
15142   }
15143   Safefree(ri->data->what);
15144   Safefree(ri->data);
15145  }
15146
15147  Safefree(ri);
15148 }
15149
/* Typed wrappers around sv_dup_inc() for AVs and HVs.  The arguments are
 * parenthesized so that an expression argument is cast and passed as a
 * whole. */
#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)(s),(t)))
#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)(s),(t)))
/* NULL-tolerant savepvn().  Note that p is evaluated twice, so it must not
 * have side effects. */
#define SAVEPVN(p,n) ((p) ? savepvn((p),(n)) : NULL)
15153
/*
   re_dup - duplicate a regexp.

   This routine is expected to clone a given regexp structure. It is only
   compiled under USE_ITHREADS.

   After all of the core data stored in struct regexp is duplicated,
   the regexp_engine.dupe method is used to copy any private data
   stored in the *pprivate pointer. This allows extensions to handle
   any duplication they need to do.

   See pregfree() and regfree_internal() if you change anything here.
*/
15167 #if defined(USE_ITHREADS)
15168 #ifndef PERL_IN_XSUB_RE
15169 void
15170 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15171 {
15172  dVAR;
15173  I32 npar;
15174  const struct regexp *r = ReANY(sstr);
15175  struct regexp *ret = ReANY(dstr);
15176
15177  PERL_ARGS_ASSERT_RE_DUP_GUTS;
15178
15179  npar = r->nparens+1;
15180  Newx(ret->offs, npar, regexp_paren_pair);
15181  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15182
15183  if (ret->substrs) {
15184   /* Do it this way to avoid reading from *r after the StructCopy().
15185   That way, if any of the sv_dup_inc()s dislodge *r from the L1
15186   cache, it doesn't matter.  */
15187   const bool anchored = r->check_substr
15188    ? r->check_substr == r->anchored_substr
15189    : r->check_utf8 == r->anchored_utf8;
15190   Newx(ret->substrs, 1, struct reg_substr_data);
15191   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15192
15193   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15194   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15195   ret->float_substr = sv_dup_inc(ret->float_substr, param);
15196   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15197
15198   /* check_substr and check_utf8, if non-NULL, point to either their
15199   anchored or float namesakes, and don't hold a second reference.  */
15200
15201   if (ret->check_substr) {
15202    if (anchored) {
15203     assert(r->check_utf8 == r->anchored_utf8);
15204     ret->check_substr = ret->anchored_substr;
15205     ret->check_utf8 = ret->anchored_utf8;
15206    } else {
15207     assert(r->check_substr == r->float_substr);
15208     assert(r->check_utf8 == r->float_utf8);
15209     ret->check_substr = ret->float_substr;
15210     ret->check_utf8 = ret->float_utf8;
15211    }
15212   } else if (ret->check_utf8) {
15213    if (anchored) {
15214     ret->check_utf8 = ret->anchored_utf8;
15215    } else {
15216     ret->check_utf8 = ret->float_utf8;
15217    }
15218   }
15219  }
15220
15221  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15222  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15223
15224  if (ret->pprivate)
15225   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15226
15227  if (RX_MATCH_COPIED(dstr))
15228   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15229  else
15230   ret->subbeg = NULL;
15231 #ifdef PERL_ANY_COW
15232  ret->saved_copy = NULL;
15233 #endif
15234
15235  /* Whether mother_re be set or no, we need to copy the string.  We
15236  cannot refrain from copying it when the storage points directly to
15237  our mother regexp, because that's
15238    1: a buffer in a different thread
15239    2: something we no longer hold a reference on
15240    so we need to copy it locally.  */
15241  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15242  ret->mother_re   = NULL;
15243  ret->gofs = 0;
15244 }
15245 #endif /* PERL_IN_XSUB_RE */
15246
/*
   regdupe_internal()

   This is the internal complement to regdupe() which is used to copy
   the structure pointed to by the *pprivate pointer in the regexp.
   This is the core version of the extension-overridable cloning hook.
   The regexp structure being duplicated will be copied by perl prior
   to this and will be provided as the regexp *r argument, however
   with the /old/ structure's pprivate pointer value. Thus this routine
   may override any copying normally done by perl.

   It returns a pointer to the new regexp_internal structure.
*/
15260
15261 void *
15262 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15263 {
15264  dVAR;
15265  struct regexp *const r = ReANY(rx);
15266  regexp_internal *reti;
15267  int len;
15268  RXi_GET_DECL(r,ri);
15269
15270  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15271
15272  len = ProgLen(ri);
15273
15274  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15275  Copy(ri->program, reti->program, len+1, regnode);
15276
15277  reti->num_code_blocks = ri->num_code_blocks;
15278  if (ri->code_blocks) {
15279   int n;
15280   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15281     struct reg_code_block);
15282   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15283     struct reg_code_block);
15284   for (n = 0; n < ri->num_code_blocks; n++)
15285    reti->code_blocks[n].src_regex = (REGEXP*)
15286      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15287  }
15288  else
15289   reti->code_blocks = NULL;
15290
15291  reti->regstclass = NULL;
15292
15293  if (ri->data) {
15294   struct reg_data *d;
15295   const int count = ri->data->count;
15296   int i;
15297
15298   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15299     char, struct reg_data);
15300   Newx(d->what, count, U8);
15301
15302   d->count = count;
15303   for (i = 0; i < count; i++) {
15304    d->what[i] = ri->data->what[i];
15305    switch (d->what[i]) {
15306     /* see also regcomp.h and regfree_internal() */
15307    case 'a': /* actually an AV, but the dup function is identical.  */
15308    case 'r':
15309    case 's':
15310    case 'S':
15311    case 'u': /* actually an HV, but the dup function is identical.  */
15312     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15313     break;
15314    case 'f':
15315     /* This is cheating. */
15316     Newx(d->data[i], 1, struct regnode_charclass_class);
15317     StructCopy(ri->data->data[i], d->data[i],
15318        struct regnode_charclass_class);
15319     reti->regstclass = (regnode*)d->data[i];
15320     break;
15321    case 'T':
15322     /* Trie stclasses are readonly and can thus be shared
15323     * without duplication. We free the stclass in pregfree
15324     * when the corresponding reg_ac_data struct is freed.
15325     */
15326     reti->regstclass= ri->regstclass;
15327     /* Fall through */
15328    case 't':
15329     OP_REFCNT_LOCK;
15330     ((reg_trie_data*)ri->data->data[i])->refcount++;
15331     OP_REFCNT_UNLOCK;
15332     /* Fall through */
15333    case 'l':
15334    case 'L':
15335     d->data[i] = ri->data->data[i];
15336     break;
15337    default:
15338     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15339    }
15340   }
15341
15342   reti->data = d;
15343  }
15344  else
15345   reti->data = NULL;
15346
15347  reti->name_list_idx = ri->name_list_idx;
15348
15349 #ifdef RE_TRACK_PATTERN_OFFSETS
15350  if (ri->u.offsets) {
15351   Newx(reti->u.offsets, 2*len+1, U32);
15352   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15353  }
15354 #else
15355  SetProgLen(reti,len);
15356 #endif
15357
15358  return (void*)reti;
15359 }
15360
15361 #endif    /* USE_ITHREADS */
15362
15363 #ifndef PERL_IN_XSUB_RE
15364
15365 /*
15366  - regnext - dig the "next" pointer out of a node
15367  */
15368 regnode *
15369 Perl_regnext(pTHX_ regnode *p)
15370 {
15371  dVAR;
15372  I32 offset;
15373
15374  if (!p)
15375   return(NULL);
15376
15377  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
15378   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15379  }
15380
15381  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15382  if (offset == 0)
15383   return(NULL);
15384
15385  return(p+offset);
15386 }
15387 #endif
15388
15389 STATIC void
15390 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15391 {
15392  va_list args;
15393  STRLEN l1 = strlen(pat1);
15394  STRLEN l2 = strlen(pat2);
15395  char buf[512];
15396  SV *msv;
15397  const char *message;
15398
15399  PERL_ARGS_ASSERT_RE_CROAK2;
15400
15401  if (l1 > 510)
15402   l1 = 510;
15403  if (l1 + l2 > 510)
15404   l2 = 510 - l1;
15405  Copy(pat1, buf, l1 , char);
15406  Copy(pat2, buf + l1, l2 , char);
15407  buf[l1 + l2] = '\n';
15408  buf[l1 + l2 + 1] = '\0';
15409 #ifdef I_STDARG
15410  /* ANSI variant takes additional second argument */
15411  va_start(args, pat2);
15412 #else
15413  va_start(args);
15414 #endif
15415  msv = vmess(buf, &args);
15416  va_end(args);
15417  message = SvPV_const(msv,l1);
15418  if (l1 > 512)
15419   l1 = 512;
15420  Copy(message, buf, l1 , char);
15421  buf[l1-1] = '\0';   /* Overwrite \n */
15422  Perl_croak(aTHX_ "%s", buf);
15423 }
15424
15425 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15426
15427 #ifndef PERL_IN_XSUB_RE
15428 void
15429 Perl_save_re_context(pTHX)
15430 {
15431  dVAR;
15432
15433  struct re_save_state *state;
15434
15435  SAVEVPTR(PL_curcop);
15436  SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15437
15438  state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15439  PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15440  SSPUSHUV(SAVEt_RE_STATE);
15441
15442  Copy(&PL_reg_state, state, 1, struct re_save_state);
15443
15444  PL_reg_oldsaved = NULL;
15445  PL_reg_oldsavedlen = 0;
15446  PL_reg_oldsavedoffset = 0;
15447  PL_reg_oldsavedcoffset = 0;
15448  PL_reg_maxiter = 0;
15449  PL_reg_leftiter = 0;
15450  PL_reg_poscache = NULL;
15451  PL_reg_poscache_size = 0;
15452 #ifdef PERL_ANY_COW
15453  PL_nrs = NULL;
15454 #endif
15455
15456  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15457  if (PL_curpm) {
15458   const REGEXP * const rx = PM_GETRE(PL_curpm);
15459   if (rx) {
15460    U32 i;
15461    for (i = 1; i <= RX_NPARENS(rx); i++) {
15462     char digits[TYPE_CHARS(long)];
15463     const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15464     GV *const *const gvp
15465      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15466
15467     if (gvp) {
15468      GV * const gv = *gvp;
15469      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15470       save_scalar(gv);
15471     }
15472    }
15473   }
15474  }
15475 }
15476 #endif
15477
15478 #ifdef DEBUGGING
15479
15480 STATIC void
15481 S_put_byte(pTHX_ SV *sv, int c)
15482 {
15483  PERL_ARGS_ASSERT_PUT_BYTE;
15484
15485  /* Our definition of isPRINT() ignores locales, so only bytes that are
15486  not part of UTF-8 are considered printable. I assume that the same
15487  holds for UTF-EBCDIC.
15488  Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15489  which Wikipedia says:
15490
15491  EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15492  ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15493  identical, to the ASCII delete (DEL) or rubout control character. ...
15494  it is typically mapped to hexadecimal code 9F, in order to provide a
15495  unique character mapping in both directions)
15496
15497  So the old condition can be simplified to !isPRINT(c)  */
15498  if (!isPRINT(c)) {
15499   if (c < 256) {
15500    Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15501   }
15502   else {
15503    Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15504   }
15505  }
15506  else {
15507   const char string = c;
15508   if (c == '-' || c == ']' || c == '\\' || c == '^')
15509    sv_catpvs(sv, "\\");
15510   sv_catpvn(sv, &string, 1);
15511  }
15512 }
15513
15514
/* Helpers for S_dumpuntil(); both rely on that function's local names
 * (optstart, node, r, start, last, sv, indent, depth).
 *
 * Each expands to exactly one statement and is to be followed by a
 * semicolon at the point of use, so it is safe in unbraced if/else arms.
 */

/* If a run of OPTIMIZED nodes is being tracked, report how many nodes it
 * covered (under the optimise debug flag) and stop tracking it. */
#define CLEAR_OPTSTART \
    STMT_START { \
        if (optstart) { \
            DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
            optstart=NULL; \
        } \
    } STMT_END

/* Dump the nodes in [b, e) one level deeper and continue from wherever the
 * nested dump stopped. */
#define DUMPUNTIL(b,e) \
    STMT_START { \
        CLEAR_OPTSTART; \
        node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); \
    } STMT_END
15522
15523 STATIC const regnode *
15524 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15525    const regnode *last, const regnode *plast,
15526    SV* sv, I32 indent, U32 depth)
15527 {
15528  dVAR;
15529  U8 op = PSEUDO; /* Arbitrary non-END op. */
15530  const regnode *next;
15531  const regnode *optstart= NULL;
15532
15533  RXi_GET_DECL(r,ri);
15534  GET_RE_DEBUG_FLAGS_DECL;
15535
15536  PERL_ARGS_ASSERT_DUMPUNTIL;
15537
15538 #ifdef DEBUG_DUMPUNTIL
15539  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15540   last ? last-start : 0,plast ? plast-start : 0);
15541 #endif
15542
15543  if (plast && plast < last)
15544   last= plast;
15545
15546  while (PL_regkind[op] != END && (!last || node < last)) {
15547   /* While that wasn't END last time... */
15548   NODE_ALIGN(node);
15549   op = OP(node);
15550   if (op == CLOSE || op == WHILEM)
15551    indent--;
15552   next = regnext((regnode *)node);
15553
15554   /* Where, what. */
15555   if (OP(node) == OPTIMIZED) {
15556    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15557     optstart = node;
15558    else
15559     goto after_print;
15560   } else
15561    CLEAR_OPTSTART;
15562
15563   regprop(r, sv, node);
15564   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15565      (int)(2*indent + 1), "", SvPVX_const(sv));
15566
15567   if (OP(node) != OPTIMIZED) {
15568    if (next == NULL)  /* Next ptr. */
15569     PerlIO_printf(Perl_debug_log, " (0)");
15570    else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15571     PerlIO_printf(Perl_debug_log, " (FAIL)");
15572    else
15573     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15574    (void)PerlIO_putc(Perl_debug_log, '\n');
15575   }
15576
15577  after_print:
15578   if (PL_regkind[(U8)op] == BRANCHJ) {
15579    assert(next);
15580    {
15581     const regnode *nnode = (OP(next) == LONGJMP
15582          ? regnext((regnode *)next)
15583          : next);
15584     if (last && nnode > last)
15585      nnode = last;
15586     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15587    }
15588   }
15589   else if (PL_regkind[(U8)op] == BRANCH) {
15590    assert(next);
15591    DUMPUNTIL(NEXTOPER(node), next);
15592   }
15593   else if ( PL_regkind[(U8)op]  == TRIE ) {
15594    const regnode *this_trie = node;
15595    const char op = OP(node);
15596    const U32 n = ARG(node);
15597    const reg_ac_data * const ac = op>=AHOCORASICK ?
15598    (reg_ac_data *)ri->data->data[n] :
15599    NULL;
15600    const reg_trie_data * const trie =
15601     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15602 #ifdef DEBUGGING
15603    AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15604 #endif
15605    const regnode *nextbranch= NULL;
15606    I32 word_idx;
15607    sv_setpvs(sv, "");
15608    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15609     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15610
15611     PerlIO_printf(Perl_debug_log, "%*s%s ",
15612     (int)(2*(indent+3)), "",
15613      elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15614        PL_colors[0], PL_colors[1],
15615        (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15616        PERL_PV_PRETTY_ELLIPSES    |
15617        PERL_PV_PRETTY_LTGT
15618        )
15619        : "???"
15620     );
15621     if (trie->jump) {
15622      U16 dist= trie->jump[word_idx+1];
15623      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15624         (UV)((dist ? this_trie + dist : next) - start));
15625      if (dist) {
15626       if (!nextbranch)
15627        nextbranch= this_trie + trie->jump[0];
15628       DUMPUNTIL(this_trie + dist, nextbranch);
15629      }
15630      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15631       nextbranch= regnext((regnode *)nextbranch);
15632     } else {
15633      PerlIO_printf(Perl_debug_log, "\n");
15634     }
15635    }
15636    if (last && next > last)
15637     node= last;
15638    else
15639     node= next;
15640   }
15641   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15642    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15643      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15644   }
15645   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15646    assert(next);
15647    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15648   }
15649   else if ( op == PLUS || op == STAR) {
15650    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15651   }
15652   else if (PL_regkind[(U8)op] == ANYOF) {
15653    /* arglen 1 + class block */
15654    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15655      ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15656    node = NEXTOPER(node);
15657   }
15658   else if (PL_regkind[(U8)op] == EXACT) {
15659    /* Literal string, where present. */
15660    node += NODE_SZ_STR(node) - 1;
15661    node = NEXTOPER(node);
15662   }
15663   else {
15664    node = NEXTOPER(node);
15665    node += regarglen[(U8)op];
15666   }
15667   if (op == CURLYX || op == OPEN)
15668    indent++;
15669  }
15670  CLEAR_OPTSTART;
15671 #ifdef DEBUG_DUMPUNTIL
15672  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15673 #endif
15674  return node;
15675 }
15676
15677 #endif /* DEBUGGING */
15678
15679 /*
15680  * Local variables:
15681  * c-indentation-style: bsd
15682  * c-basic-offset: 4
15683  * indent-tabs-mode: nil
15684  * End:
15685  *
15686  * ex: set ts=8 sts=4 sw=4 et:
15687  */