src/5023001/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #undef PERL_IN_XSUB_RE
  76 #define PERL_IN_XSUB_RE 1
  77 #include "perl.h"
  78 #undef PERL_IN_XSUB_RE
  79
  80 #ifndef PERL_IN_XSUB_RE
  81 #include "re_defs.h"
  82 #endif
  83
  84 #define REG_COMP_C
  85 #ifdef PERL_IN_XSUB_RE
  86 #  include "re_comp.h"
  87 EXTERN_C const struct regexp_engine my_reg_engine;
  88 #else
  89 #  include "regcomp.h"
  90 #endif
  91
  92 #include "dquote_static.c"
  93 #include "inline_invlist.c"
  94 #include "unicode_constants.h"
  95
  96 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  99  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 100 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 101 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 102
 103 #ifndef STATIC
 104 #define STATIC static
 105 #endif
 106
 107 #ifndef MIN
 108 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 109 #endif
 110
 111 /* this is a chain of data about sub patterns we are processing that
 112    need to be handled separately/specially in study_chunk. Its so
 113    we can simulate recursion without losing state.  */
 114 struct scan_frame;
 115 typedef struct scan_frame {
 116  regnode *last_regnode;      /* last node to process in this frame */
 117  regnode *next_regnode;      /* next node to process when last is reached */
 118  U32 prev_recursed_depth;
 119  I32 stopparen;              /* what stopparen do we use */
 120  U32 is_top_frame;           /* what flags do we use? */
 121
 122  struct scan_frame *this_prev_frame; /* this previous frame */
 123  struct scan_frame *prev_frame;      /* previous frame */
 124  struct scan_frame *next_frame;      /* next frame */
 125 } scan_frame;
 126
 127 /* Certain characters are output as a sequence with the first being a
 128  * backslash. */
 129 #define isBACKSLASHED_PUNCT(c)                                              \
 130      ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
 131
 132
 133 struct RExC_state_t {
 134  U32  flags;   /* RXf_* are we folding, multilining? */
 135  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 136  char *precomp;  /* uncompiled string. */
 137  REGEXP *rx_sv;   /* The SV that is the regexp. */
 138  regexp *rx;                    /* perl core regexp structure */
 139  regexp_internal *rxi;           /* internal data for regexp object
 140           pprivate field */
 141  char *start;   /* Start of input for compile */
 142  char *end;   /* End of input for compile */
 143  char *parse;   /* Input-scan pointer. */
 144  SSize_t whilem_seen;  /* number of WHILEM in this expr */
 145  regnode *emit_start;  /* Start of emitted-code area */
 146  regnode *emit_bound;  /* First regnode outside of the
 147           allocated space */
 148  regnode *emit;   /* Code-emit pointer; if = &emit_dummy,
 149           implies compiling, so don't emit */
 150  regnode_ssc emit_dummy;  /* placeholder for emit to point to;
 151           large enough for the largest
 152           non-EXACTish node, so can use it as
 153           scratch in pass1 */
 154  I32  naughty;  /* How bad is this pattern? */
 155  I32  sawback;  /* Did we see \1, ...? */
 156  U32  seen;
 157  SSize_t size;   /* Code size. */
 158  I32                npar;            /* Capture buffer count, (OPEN) plus
 159           one. ("par" 0 is the whole
 160           pattern)*/
 161  I32  nestroot;  /* root parens we are in - used by
 162           accept */
 163  I32  extralen;
 164  I32  seen_zerolen;
 165  regnode **open_parens;  /* pointers to open parens */
 166  regnode **close_parens;  /* pointers to close parens */
 167  regnode *opend;   /* END node in program */
 168  I32  utf8;  /* whether the pattern is utf8 or not */
 169  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 170         /* XXX use this for future optimisation of case
 171         * where pattern must be upgraded to utf8. */
 172  I32  uni_semantics; /* If a d charset modifier should use unicode
 173         rules, even if the pattern is not in
 174         utf8 */
 175  HV  *paren_names;  /* Paren names */
 176
 177  regnode **recurse;  /* Recurse regops */
 178  I32  recurse_count;  /* Number of recurse regops */
 179  U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 180           through */
 181  U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 182  I32  in_lookbehind;
 183  I32  contains_locale;
 184  I32  contains_i;
 185  I32  override_recoding;
 186 #ifdef EBCDIC
 187  I32  recode_x_to_native;
 188 #endif
 189  I32  in_multi_char_class;
 190  struct reg_code_block *code_blocks; /* positions of literal (?{})
 191            within pattern */
 192  int  num_code_blocks; /* size of code_blocks[] */
 193  int  code_index;  /* next code_blocks[] slot */
 194  SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 195  scan_frame *frame_head;
 196  scan_frame *frame_last;
 197  U32         frame_count;
 198  U32         strict;
 199 #ifdef ADD_TO_REGEXEC
 200  char  *starttry;  /* -Dr: where regtry was called. */
 201 #define RExC_starttry (pRExC_state->starttry)
 202 #endif
 203  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 204 #ifdef DEBUGGING
 205  const char  *lastparse;
 206  I32         lastnum;
 207  AV          *paren_name_list;       /* idx -> name */
 208  U32         study_chunk_recursed_count;
 209  SV          *mysv1;
 210  SV          *mysv2;
 211 #define RExC_lastparse (pRExC_state->lastparse)
 212 #define RExC_lastnum (pRExC_state->lastnum)
 213 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 214 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 215 #define RExC_mysv (pRExC_state->mysv1)
 216 #define RExC_mysv1 (pRExC_state->mysv1)
 217 #define RExC_mysv2 (pRExC_state->mysv2)
 218
 219 #endif
 220 };
 221
 222 #define RExC_flags (pRExC_state->flags)
 223 #define RExC_pm_flags (pRExC_state->pm_flags)
 224 #define RExC_precomp (pRExC_state->precomp)
 225 #define RExC_rx_sv (pRExC_state->rx_sv)
 226 #define RExC_rx  (pRExC_state->rx)
 227 #define RExC_rxi (pRExC_state->rxi)
 228 #define RExC_start (pRExC_state->start)
 229 #define RExC_end (pRExC_state->end)
 230 #define RExC_parse (pRExC_state->parse)
 231 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 232 #ifdef RE_TRACK_PATTERN_OFFSETS
 233 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
 234               others */
 235 #endif
 236 #define RExC_emit (pRExC_state->emit)
 237 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 238 #define RExC_emit_start (pRExC_state->emit_start)
 239 #define RExC_emit_bound (pRExC_state->emit_bound)
 240 #define RExC_sawback (pRExC_state->sawback)
 241 #define RExC_seen (pRExC_state->seen)
 242 #define RExC_size (pRExC_state->size)
 243 #define RExC_maxlen        (pRExC_state->maxlen)
 244 #define RExC_npar (pRExC_state->npar)
 245 #define RExC_nestroot   (pRExC_state->nestroot)
 246 #define RExC_extralen (pRExC_state->extralen)
 247 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 248 #define RExC_utf8 (pRExC_state->utf8)
 249 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 250 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 251 #define RExC_open_parens (pRExC_state->open_parens)
 252 #define RExC_close_parens (pRExC_state->close_parens)
 253 #define RExC_opend (pRExC_state->opend)
 254 #define RExC_paren_names (pRExC_state->paren_names)
 255 #define RExC_recurse (pRExC_state->recurse)
 256 #define RExC_recurse_count (pRExC_state->recurse_count)
 257 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 258 #define RExC_study_chunk_recursed_bytes  \
 259         (pRExC_state->study_chunk_recursed_bytes)
 260 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 261 #define RExC_contains_locale (pRExC_state->contains_locale)
 262 #define RExC_contains_i (pRExC_state->contains_i)
 263 #define RExC_override_recoding (pRExC_state->override_recoding)
 264 #ifdef EBCDIC
 265 #   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 266 #endif
 267 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 268 #define RExC_frame_head (pRExC_state->frame_head)
 269 #define RExC_frame_last (pRExC_state->frame_last)
 270 #define RExC_frame_count (pRExC_state->frame_count)
 271 #define RExC_strict (pRExC_state->strict)
 272
 273 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 274  * a flag to disable back-off on the fixed/floating substrings - if it's
 275  * a high complexity pattern we assume the benefit of avoiding a full match
 276  * is worth the cost of checking for the substrings even if they rarely help.
 277  */
 278 #define RExC_naughty (pRExC_state->naughty)
 279 #define TOO_NAUGHTY (10)
 280 #define MARK_NAUGHTY(add) \
 281  if (RExC_naughty < TOO_NAUGHTY) \
 282   RExC_naughty += (add)
 283 #define MARK_NAUGHTY_EXP(exp, add) \
 284  if (RExC_naughty < TOO_NAUGHTY) \
 285   RExC_naughty += RExC_naughty / (exp) + (add)
 286
 287 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 288 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 289   ((*s) == '{' && regcurly(s)))
 290
 291 /*
 292  * Flags to be passed up and down.
 293  */
 294 #define WORST  0 /* Worst case. */
 295 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 296
 297 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 298  * character.  (There needs to be a case: in the switch statement in regexec.c
 299  * for any node marked SIMPLE.)  Note that this is not the same thing as
 300  * REGNODE_SIMPLE */
 301 #define SIMPLE  0x02
 302 #define SPSTART  0x04 /* Starts with * or + */
 303 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 304 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 305 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 306
 307 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 308
 309 /* whether trie related optimizations are enabled */
 310 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 311 #define TRIE_STUDY_OPT
 312 #define FULL_TRIE_STUDY
 313 #define TRIE_STCLASS
 314 #endif
 315
 316
 317
 318 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 319 #define PBITVAL(paren) (1 << ((paren) & 7))
 320 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 321 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 322 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 323
 324 #define REQUIRE_UTF8 STMT_START {                                       \
 325          if (!UTF) {                           \
 326           *flagp = RESTART_UTF8;            \
 327           return NULL;                      \
 328          }                                     \
 329       } STMT_END
 330
 331 /* This converts the named class defined in regcomp.h to its equivalent class
 332  * number defined in handy.h. */
 333 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 334 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 335
 336 #define _invlist_union_complement_2nd(a, b, output) \
 337       _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 338 #define _invlist_intersection_complement_2nd(a, b, output) \
 339     _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 340
 341 /* About scan_data_t.
 342
 343   During optimisation we recurse through the regexp program performing
 344   various inplace (keyhole style) optimisations. In addition study_chunk
 345   and scan_commit populate this data structure with information about
 346   what strings MUST appear in the pattern. We look for the longest
 347   string that must appear at a fixed location, and we look for the
 348   longest string that may appear at a floating location. So for instance
 349   in the pattern:
 350
 351  /FOO[xX]A.*B[xX]BAR/
 352
 353   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 354   strings (because they follow a .* construct). study_chunk will identify
 355   both FOO and BAR as being the longest fixed and floating strings respectively.
 356
 357   The strings can be composites, for instance
 358
 359  /(f)(o)(o)/
 360
 361   will result in a composite fixed substring 'foo'.
 362
 363   For each string some basic information is maintained:
 364
 365   - offset or min_offset
 366  This is the position the string must appear at, or not before.
 367  It also implicitly (when combined with minlenp) tells us how many
 368  characters must match before the string we are searching for.
 369  Likewise when combined with minlenp and the length of the string it
 370  tells us how many characters must appear after the string we have
 371  found.
 372
 373   - max_offset
 374  Only used for floating strings. This is the rightmost point that
 375  the string can appear at. If set to SSize_t_MAX it indicates that the
 376  string can occur infinitely far to the right.
 377
 378   - minlenp
 379  A pointer to the minimum number of characters of the pattern that the
 380  string was found inside. This is important as in the case of positive
 381  lookahead or positive lookbehind we can have multiple patterns
 382  involved. Consider
 383
 384  /(?=FOO).*F/
 385
 386  The minimum length of the pattern overall is 3, the minimum length
 387  of the lookahead part is 3, but the minimum length of the part that
 388  will actually match is 1. So 'FOO's minimum length is 3, but the
 389  minimum length for the F is 1. This is important as the minimum length
 390  is used to determine offsets in front of and behind the string being
 391  looked for.  Since strings can be composites this is the length of the
 392  pattern at the time it was committed with a scan_commit. Note that
 393  the length is calculated by study_chunk, so that the minimum lengths
 394  are not known until the full pattern has been compiled, thus the
 395  pointer to the value.
 396
 397   - lookbehind
 398
 399  In the case of lookbehind the string being searched for can be
 400  offset past the start point of the final matching string.
 401  If this value was just blithely removed from the min_offset it would
 402  invalidate some of the calculations for how many chars must match
 403  before or after (as they are derived from min_offset and minlen and
 404  the length of the string being searched for).
 405  When the final pattern is compiled and the data is moved from the
 406  scan_data_t structure into the regexp structure the information
 407  about lookbehind is factored in, with the information that would
 408  have been lost precalculated in the end_shift field for the
 409  associated string.
 410
 411   The fields pos_min and pos_delta are used to store the minimum offset
 412   and the delta to the maximum offset at the current point in the pattern.
 413
 414 */
 415
 416 typedef struct scan_data_t {
 417  /*I32 len_min;      unused */
 418  /*I32 len_delta;    unused */
 419  SSize_t pos_min;
 420  SSize_t pos_delta;
 421  SV *last_found;
 422  SSize_t last_end;     /* min value, <0 unless valid. */
 423  SSize_t last_start_min;
 424  SSize_t last_start_max;
 425  SV **longest;     /* Either &l_fixed, or &l_float. */
 426  SV *longest_fixed;      /* longest fixed string found in pattern */
 427  SSize_t offset_fixed;   /* offset where it starts */
 428  SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 429  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 430  SV *longest_float;      /* longest floating string found in pattern */
 431  SSize_t offset_float_min; /* earliest point in string it can appear */
 432  SSize_t offset_float_max; /* latest point in string it can appear */
 433  SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 434  SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 435  I32 flags;
 436  I32 whilem_c;
 437  SSize_t *last_closep;
 438  regnode_ssc *start_class;
 439 } scan_data_t;
 440
 441 /*
 442  * Forward declarations for pregcomp()'s friends.
 443  */
 444
 445 static const scan_data_t zero_scan_data =
 446   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 447
 448 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 449 #define SF_BEFORE_SEOL  0x0001
 450 #define SF_BEFORE_MEOL  0x0002
 451 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 452 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 453
 454 #define SF_FIX_SHIFT_EOL (+2)
 455 #define SF_FL_SHIFT_EOL  (+4)
 456
 457 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 458 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 459
 460 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 461 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 462 #define SF_IS_INF  0x0040
 463 #define SF_HAS_PAR  0x0080
 464 #define SF_IN_PAR  0x0100
 465 #define SF_HAS_EVAL  0x0200
 466 #define SCF_DO_SUBSTR  0x0400
 467 #define SCF_DO_STCLASS_AND 0x0800
 468 #define SCF_DO_STCLASS_OR 0x1000
 469 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 470 #define SCF_WHILEM_VISITED_POS 0x2000
 471
 472 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 473 #define SCF_SEEN_ACCEPT         0x8000
 474 #define SCF_TRIE_DOING_RESTUDY 0x10000
 475 #define SCF_IN_DEFINE          0x20000
 476
 477
 478
 479
 480 #define UTF cBOOL(RExC_utf8)
 481
 482 /* The enums for all these are ordered so things work out correctly */
 483 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 484 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 485              == REGEX_DEPENDS_CHARSET)
 486 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 487 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 488              >= REGEX_UNICODE_CHARSET)
 489 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 490            == REGEX_ASCII_RESTRICTED_CHARSET)
 491 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 492            >= REGEX_ASCII_RESTRICTED_CHARSET)
 493 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 494           == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 495
 496 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 497
 498 /* For programs that want to be strictly Unicode compatible by dying if any
 499  * attempt is made to match a non-Unicode code point against a Unicode
 500  * property.  */
 501 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 502
 503 #define OOB_NAMEDCLASS  -1
 504
 505 /* There is no code point that is out-of-bounds, so this is problematic.  But
 506  * its only current use is to initialize a variable that is always set before
 507  * looked at. */
 508 #define OOB_UNICODE  0xDEADBEEF
 509
 510 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 511 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 512
 513
 514 /* length of regex to show in messages that don't mark a position within */
 515 #define RegexLengthToShowInErrorMessages 127
 516
 517 /*
 518  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 519  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 520  * op/pragma/warn/regcomp.
 521  */
 522 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 523 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 524
 525 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 526       " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 527
 528 #define REPORT_LOCATION_ARGS(offset)            \
 529     UTF8fARG(UTF, offset, RExC_precomp), \
 530     UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 531
 532 /* Used to point after bad bytes for an error message, but avoid skipping
 533  * past a nul byte. */
 534 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
 535
 536 /*
 537  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 538  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 539  * "...".
 540  */
 541 #define _FAIL(code) STMT_START {     \
 542  const char *ellipses = "";      \
 543  IV len = RExC_end - RExC_precomp;     \
 544                   \
 545  if (!SIZE_ONLY)       \
 546   SAVEFREESV(RExC_rx_sv);      \
 547  if (len > RegexLengthToShowInErrorMessages) {   \
 548   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 549   len = RegexLengthToShowInErrorMessages - 10;   \
 550   ellipses = "...";      \
 551  }         \
 552  code;                                                               \
 553 } STMT_END
 554
 555 #define FAIL(msg) _FAIL(       \
 556  Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",     \
 557    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 558
 559 #define FAIL2(msg,arg) _FAIL(       \
 560  Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",     \
 561    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 562
 563 /*
 564  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 565  */
 566 #define Simple_vFAIL(m) STMT_START {     \
 567  const IV offset =                                                   \
 568   (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
 569  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 570    m, REPORT_LOCATION_ARGS(offset)); \
 571 } STMT_END
 572
 573 /*
 574  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 575  */
 576 #define vFAIL(m) STMT_START {    \
 577  if (!SIZE_ONLY)     \
 578   SAVEFREESV(RExC_rx_sv);    \
 579  Simple_vFAIL(m);     \
 580 } STMT_END
 581
 582 /*
 583  * Like Simple_vFAIL(), but accepts two arguments.
 584  */
 585 #define Simple_vFAIL2(m,a1) STMT_START {   \
 586  const IV offset = RExC_parse - RExC_precomp;   \
 587  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,   \
 588      REPORT_LOCATION_ARGS(offset)); \
 589 } STMT_END
 590
 591 /*
 592  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 593  */
 594 #define vFAIL2(m,a1) STMT_START {   \
 595  if (!SIZE_ONLY)     \
 596   SAVEFREESV(RExC_rx_sv);    \
 597  Simple_vFAIL2(m, a1);    \
 598 } STMT_END
 599
 600
 601 /*
 602  * Like Simple_vFAIL(), but accepts three arguments.
 603  */
 604 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 605  const IV offset = RExC_parse - RExC_precomp;  \
 606  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 607    REPORT_LOCATION_ARGS(offset)); \
 608 } STMT_END
 609
 610 /*
 611  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 612  */
 613 #define vFAIL3(m,a1,a2) STMT_START {   \
 614  if (!SIZE_ONLY)     \
 615   SAVEFREESV(RExC_rx_sv);    \
 616  Simple_vFAIL3(m, a1, a2);    \
 617 } STMT_END
 618
 619 /*
 620  * Like Simple_vFAIL(), but accepts four arguments.
 621  */
 622 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 623  const IV offset = RExC_parse - RExC_precomp;  \
 624  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,  \
 625    REPORT_LOCATION_ARGS(offset)); \
 626 } STMT_END
 627
 628 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 629  if (!SIZE_ONLY)     \
 630   SAVEFREESV(RExC_rx_sv);    \
 631  Simple_vFAIL4(m, a1, a2, a3);   \
 632 } STMT_END
 633
 634 /* A specialized version of vFAIL2 that works with UTF8f */
 635 #define vFAIL2utf8f(m, a1) STMT_START { \
 636  const IV offset = RExC_parse - RExC_precomp;   \
 637  if (!SIZE_ONLY)                                \
 638   SAVEFREESV(RExC_rx_sv);                    \
 639  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 640    REPORT_LOCATION_ARGS(offset));         \
 641 } STMT_END
 642
 643 /* These have asserts in them because of [perl #122671] Many warnings in
 644  * regcomp.c can occur twice.  If they get output in pass1 and later in that
 645  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
 646  * would get output again.  So they should be output in pass2, and these
 647  * asserts make sure new warnings follow that paradigm. */
 648
 649 /* m is not necessarily a "literal string", in this macro */
 650 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 651  const IV offset = loc - RExC_precomp;                               \
 652  __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 653    m, REPORT_LOCATION_ARGS(offset));       \
 654 } STMT_END
 655
 656 #define ckWARNreg(loc,m) STMT_START {     \
 657  const IV offset = loc - RExC_precomp;    \
 658  __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 659    REPORT_LOCATION_ARGS(offset));  \
 660 } STMT_END
 661
 662 #define vWARN(loc, m) STMT_START {            \
 663  const IV offset = loc - RExC_precomp;    \
 664  __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 665    REPORT_LOCATION_ARGS(offset));         \
 666 } STMT_END
 667
 668 #define vWARN_dep(loc, m) STMT_START {            \
 669  const IV offset = loc - RExC_precomp;    \
 670  __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 671    REPORT_LOCATION_ARGS(offset));         \
 672 } STMT_END
 673
 674 #define ckWARNdep(loc,m) STMT_START {            \
 675  const IV offset = loc - RExC_precomp;    \
 676  __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 677    m REPORT_LOCATION,      \
 678    REPORT_LOCATION_ARGS(offset));  \
 679 } STMT_END
 680
 681 #define ckWARNregdep(loc,m) STMT_START {    \
 682  const IV offset = loc - RExC_precomp;    \
 683  __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 684    m REPORT_LOCATION,      \
 685    REPORT_LOCATION_ARGS(offset));  \
 686 } STMT_END
 687
 688 #define ckWARN2reg_d(loc,m, a1) STMT_START {    \
 689  const IV offset = loc - RExC_precomp;    \
 690  __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),   \
 691    m REPORT_LOCATION,      \
 692    a1, REPORT_LOCATION_ARGS(offset)); \
 693 } STMT_END
 694
 695 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 696  const IV offset = loc - RExC_precomp;    \
 697  __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 698    a1, REPORT_LOCATION_ARGS(offset)); \
 699 } STMT_END
 700
 701 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 702  const IV offset = loc - RExC_precomp;    \
 703  __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 704    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 705 } STMT_END
 706
 707 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 708  const IV offset = loc - RExC_precomp;    \
 709  __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 710    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 711 } STMT_END
 712
 713 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 714  const IV offset = loc - RExC_precomp;    \
 715  __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 716    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 717 } STMT_END
 718
 719 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 720  const IV offset = loc - RExC_precomp;    \
 721  __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 722    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 723 } STMT_END
 724
 725 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 726  const IV offset = loc - RExC_precomp;    \
 727  __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 728    a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 729 } STMT_END
 730
 731 /* Macros for recording node offsets.   20001227 mjd@plover.com
 732  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 733  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 734  * Element 0 holds the number n.
 735  * Position is 1 indexed.
 736  */
 737 #ifndef RE_TRACK_PATTERN_OFFSETS
 738 #define Set_Node_Offset_To_R(node,byte)
 739 #define Set_Node_Offset(node,byte)
 740 #define Set_Cur_Node_Offset
 741 #define Set_Node_Length_To_R(node,len)
 742 #define Set_Node_Length(node,len)
 743 #define Set_Node_Cur_Length(node,start)
 744 #define Node_Offset(n)
 745 #define Node_Length(n)
 746 #define Set_Node_Offset_Length(node,offset,len)
 747 #define ProgLen(ri) ri->u.proglen
 748 #define SetProgLen(ri,x) ri->u.proglen = x
 749 #else
 750 #define ProgLen(ri) ri->u.offsets[0]
 751 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 752 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 753  if (! SIZE_ONLY) {       \
 754   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 755      __LINE__, (int)(node), (int)(byte)));  \
 756   if((node) < 0) {      \
 757    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 758           (int)(node));                  \
 759   } else {       \
 760    RExC_offsets[2*(node)-1] = (byte);    \
 761   }        \
 762  }         \
 763 } STMT_END
 764
 765 #define Set_Node_Offset(node,byte) \
 766  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 767 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 768
 769 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 770  if (! SIZE_ONLY) {       \
 771   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 772     __LINE__, (int)(node), (int)(len)));   \
 773   if((node) < 0) {      \
 774    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 775           (int)(node));                  \
 776   } else {       \
 777    RExC_offsets[2*(node)] = (len);    \
 778   }        \
 779  }         \
 780 } STMT_END
 781
 782 #define Set_Node_Length(node,len) \
 783  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 784 #define Set_Node_Cur_Length(node, start)                \
 785  Set_Node_Length(node, RExC_parse - start)
 786
 787 /* Get offsets and lengths */
 788 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 789 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 790
 791 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 792  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 793  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 794 } STMT_END
 795 #endif
 796
 797 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 798 #define EXPERIMENTAL_INPLACESCAN
 799 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 800
 801 #define DEBUG_RExC_seen() \
 802   DEBUG_OPTIMISE_MORE_r({                                             \
 803    PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 804                    \
 805    if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 806     PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 807                    \
 808    if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 809     PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 810                    \
 811    if (RExC_seen & REG_GPOS_SEEN)                                  \
 812     PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 813                    \
 814    if (RExC_seen & REG_RECURSE_SEEN)                               \
 815     PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 816                    \
 817    if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 818     PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 819                    \
 820    if (RExC_seen & REG_VERBARG_SEEN)                               \
 821     PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 822                    \
 823    if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 824     PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 825                    \
 826    if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 827     PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 828                    \
 829    if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 830     PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 831                    \
 832    if (RExC_seen & REG_GOSTART_SEEN)                               \
 833     PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 834                    \
 835    if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 836     PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 837                    \
 838    PerlIO_printf(Perl_debug_log,"\n");                             \
 839   });
 840
 841 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
 842   if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
 843
 844 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str)                    \
 845  if ( ( flags ) ) {                                                      \
 846   PerlIO_printf(Perl_debug_log, "%s", open_str);                      \
 847   DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL);                     \
 848   DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL);                     \
 849   DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF);                             \
 850   DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR);                            \
 851   DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR);                             \
 852   DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL);                           \
 853   DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR);                         \
 854   DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND);                    \
 855   DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR);                     \
 856   DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS);                        \
 857   DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS);                \
 858   DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY);                      \
 859   DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT);                       \
 860   DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY);                \
 861   DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE);                         \
 862   PerlIO_printf(Perl_debug_log, "%s", close_str);                     \
 863  }
 864
 865
 866 #define DEBUG_STUDYDATA(str,data,depth)                              \
 867 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 868  PerlIO_printf(Perl_debug_log,                                    \
 869   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 870   " Flags: 0x%"UVXf,                                           \
 871   (int)(depth)*2, "",                                          \
 872   (IV)((data)->pos_min),                                       \
 873   (IV)((data)->pos_delta),                                     \
 874   (UV)((data)->flags)                                          \
 875  );                                                               \
 876  DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]");                 \
 877  PerlIO_printf(Perl_debug_log,                                    \
 878   " Whilem_c: %"IVdf" Lcp: %"IVdf" %s",                        \
 879   (IV)((data)->whilem_c),                                      \
 880   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 881   is_inf ? "INF " : ""                                         \
 882  );                                                               \
 883  if ((data)->last_found)                                          \
 884   PerlIO_printf(Perl_debug_log,                                \
 885    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 886    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 887    SvPVX_const((data)->last_found),                         \
 888    (IV)((data)->last_end),                                  \
 889    (IV)((data)->last_start_min),                            \
 890    (IV)((data)->last_start_max),                            \
 891    ((data)->longest &&                                      \
 892    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 893    SvPVX_const((data)->longest_fixed),                      \
 894    (IV)((data)->offset_fixed),                              \
 895    ((data)->longest &&                                      \
 896    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 897    SvPVX_const((data)->longest_float),                      \
 898    (IV)((data)->offset_float_min),                          \
 899    (IV)((data)->offset_float_max)                           \
 900   );                                                           \
 901  PerlIO_printf(Perl_debug_log,"\n");                              \
 902 });
 903
 904 /* is c a control character for which we have a mnemonic? */
 905 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 906
 907 STATIC const char *
 908 S_cntrl_to_mnemonic(const U8 c)
 909 {
 910  /* Returns the mnemonic string that represents character 'c', if one
 911  * exists; NULL otherwise.  The only ones that exist for the purposes of
 912  * this routine are a few control characters */
 913
 914  switch (c) {
 915   case '\a':       return "\\a";
 916   case '\b':       return "\\b";
 917   case ESC_NATIVE: return "\\e";
 918   case '\f':       return "\\f";
 919   case '\n':       return "\\n";
 920   case '\r':       return "\\r";
 921   case '\t':       return "\\t";
 922  }
 923
 924  return NULL;
 925 }
 926
 927 /* Mark that we cannot extend a found fixed substring at this point.
 928    Update the longest found anchored substring and the longest found
 929    floating substrings if needed. */
 930
 931 STATIC void
 932 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 933      SSize_t *minlenp, int is_inf)
 934 {
 935  const STRLEN l = CHR_SVLEN(data->last_found);
 936  const STRLEN old_l = CHR_SVLEN(*data->longest);
 937  GET_RE_DEBUG_FLAGS_DECL;
 938
 939  PERL_ARGS_ASSERT_SCAN_COMMIT;
 940
 941  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 942   SvSetMagicSV(*data->longest, data->last_found);
 943   if (*data->longest == data->longest_fixed) {
 944    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 945    if (data->flags & SF_BEFORE_EOL)
 946     data->flags
 947      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 948    else
 949     data->flags &= ~SF_FIX_BEFORE_EOL;
 950    data->minlen_fixed=minlenp;
 951    data->lookbehind_fixed=0;
 952   }
 953   else { /* *data->longest == data->longest_float */
 954    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 955    data->offset_float_max = (l
 956       ? data->last_start_max
 957       : (data->pos_delta > SSize_t_MAX - data->pos_min
 958           ? SSize_t_MAX
 959           : data->pos_min + data->pos_delta));
 960    if (is_inf
 961     || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 962     data->offset_float_max = SSize_t_MAX;
 963    if (data->flags & SF_BEFORE_EOL)
 964     data->flags
 965      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 966    else
 967     data->flags &= ~SF_FL_BEFORE_EOL;
 968    data->minlen_float=minlenp;
 969    data->lookbehind_float=0;
 970   }
 971  }
 972  SvCUR_set(data->last_found, 0);
 973  {
 974   SV * const sv = data->last_found;
 975   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 976    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 977    if (mg)
 978     mg->mg_len = 0;
 979   }
 980  }
 981  data->last_end = -1;
 982  data->flags &= ~SF_BEFORE_EOL;
 983  DEBUG_STUDYDATA("commit: ",data,0);
 984 }
 985
 986 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 987  * list that describes which code points it matches */
 988
 989 STATIC void
 990 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 991 {
 992  /* Set the SSC 'ssc' to match an empty string or any code point */
 993
 994  PERL_ARGS_ASSERT_SSC_ANYTHING;
 995
 996  assert(is_ANYOF_SYNTHETIC(ssc));
 997
 998  ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 999  _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
1000  ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1001 }
1002
1003 STATIC int
1004 S_ssc_is_anything(const regnode_ssc *ssc)
1005 {
1006  /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1007  * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1008  * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1009  * in any way, so there's no point in using it */
1010
1011  UV start, end;
1012  bool ret;
1013
1014  PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1015
1016  assert(is_ANYOF_SYNTHETIC(ssc));
1017
1018  if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1019   return FALSE;
1020  }
1021
1022  /* See if the list consists solely of the range 0 - Infinity */
1023  invlist_iterinit(ssc->invlist);
1024  ret = invlist_iternext(ssc->invlist, &start, &end)
1025   && start == 0
1026   && end == UV_MAX;
1027
1028  invlist_iterfinish(ssc->invlist);
1029
1030  if (ret) {
1031   return TRUE;
1032  }
1033
1034  /* If e.g., both \w and \W are set, matches everything */
1035  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1036   int i;
1037   for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1038    if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1039     return TRUE;
1040    }
1041   }
1042  }
1043
1044  return FALSE;
1045 }
1046
1047 STATIC void
1048 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1049 {
1050  /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1051  * string, any code point, or any posix class under locale */
1052
1053  PERL_ARGS_ASSERT_SSC_INIT;
1054
1055  Zero(ssc, 1, regnode_ssc);
1056  set_ANYOF_SYNTHETIC(ssc);
1057  ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1058  ssc_anything(ssc);
1059
1060  /* If any portion of the regex is to operate under locale rules that aren't
1061  * fully known at compile time, initialization includes it.  The reason
1062  * this isn't done for all regexes is that the optimizer was written under
1063  * the assumption that locale was all-or-nothing.  Given the complexity and
1064  * lack of documentation in the optimizer, and that there are inadequate
1065  * test cases for locale, many parts of it may not work properly, it is
1066  * safest to avoid locale unless necessary. */
1067  if (RExC_contains_locale) {
1068   ANYOF_POSIXL_SETALL(ssc);
1069  }
1070  else {
1071   ANYOF_POSIXL_ZERO(ssc);
1072  }
1073 }
1074
1075 STATIC int
1076 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1077       const regnode_ssc *ssc)
1078 {
1079  /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1080  * to the list of code points matched, and locale posix classes; hence does
1081  * not check its flags) */
1082
1083  UV start, end;
1084  bool ret;
1085
1086  PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1087
1088  assert(is_ANYOF_SYNTHETIC(ssc));
1089
1090  invlist_iterinit(ssc->invlist);
1091  ret = invlist_iternext(ssc->invlist, &start, &end)
1092   && start == 0
1093   && end == UV_MAX;
1094
1095  invlist_iterfinish(ssc->invlist);
1096
1097  if (! ret) {
1098   return FALSE;
1099  }
1100
1101  if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1102   return FALSE;
1103  }
1104
1105  return TRUE;
1106 }
1107
1108 STATIC SV*
1109 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1110        const regnode_charclass* const node)
1111 {
1112  /* Returns a mortal inversion list defining which code points are matched
1113  * by 'node', which is of type ANYOF.  Handles complementing the result if
1114  * appropriate.  If some code points aren't knowable at this time, the
1115  * returned list must, and will, contain every code point that is a
1116  * possibility. */
1117
1118  SV* invlist = sv_2mortal(_new_invlist(0));
1119  SV* only_utf8_locale_invlist = NULL;
1120  unsigned int i;
1121  const U32 n = ARG(node);
1122  bool new_node_has_latin1 = FALSE;
1123
1124  PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1125
1126  /* Look at the data structure created by S_set_ANYOF_arg() */
1127  if (n != ANYOF_ONLY_HAS_BITMAP) {
1128   SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1129   AV * const av = MUTABLE_AV(SvRV(rv));
1130   SV **const ary = AvARRAY(av);
1131   assert(RExC_rxi->data->what[n] == 's');
1132
1133   if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1134    invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1135   }
1136   else if (ary[0] && ary[0] != &PL_sv_undef) {
1137
1138    /* Here, no compile-time swash, and there are things that won't be
1139    * known until runtime -- we have to assume it could be anything */
1140    return _add_range_to_invlist(invlist, 0, UV_MAX);
1141   }
1142   else if (ary[3] && ary[3] != &PL_sv_undef) {
1143
1144    /* Here no compile-time swash, and no run-time only data.  Use the
1145    * node's inversion list */
1146    invlist = sv_2mortal(invlist_clone(ary[3]));
1147   }
1148
1149   /* Get the code points valid only under UTF-8 locales */
1150   if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1151    && ary[2] && ary[2] != &PL_sv_undef)
1152   {
1153    only_utf8_locale_invlist = ary[2];
1154   }
1155  }
1156
1157  /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1158  * code points, and an inversion list for the others, but if there are code
1159  * points that should match only conditionally on the target string being
1160  * UTF-8, those are placed in the inversion list, and not the bitmap.
1161  * Since there are circumstances under which they could match, they are
1162  * included in the SSC.  But if the ANYOF node is to be inverted, we have
1163  * to exclude them here, so that when we invert below, the end result
1164  * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1165  * have to do this here before we add the unconditionally matched code
1166  * points */
1167  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1168   _invlist_intersection_complement_2nd(invlist,
1169            PL_UpperLatin1,
1170            &invlist);
1171  }
1172
1173  /* Add in the points from the bit map */
1174  for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1175   if (ANYOF_BITMAP_TEST(node, i)) {
1176    invlist = add_cp_to_invlist(invlist, i);
1177    new_node_has_latin1 = TRUE;
1178   }
1179  }
1180
1181  /* If this can match all upper Latin1 code points, have to add them
1182  * as well */
1183  if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1184   _invlist_union(invlist, PL_UpperLatin1, &invlist);
1185  }
1186
1187  /* Similarly for these */
1188  if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1189   _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1190  }
1191
1192  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1193   _invlist_invert(invlist);
1194  }
1195  else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1196
1197   /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1198   * locale.  We can skip this if there are no 0-255 at all. */
1199   _invlist_union(invlist, PL_Latin1, &invlist);
1200  }
1201
1202  /* Similarly add the UTF-8 locale possible matches.  These have to be
1203  * deferred until after the non-UTF-8 locale ones are taken care of just
1204  * above, or it leads to wrong results under ANYOF_INVERT */
1205  if (only_utf8_locale_invlist) {
1206   _invlist_union_maybe_complement_2nd(invlist,
1207            only_utf8_locale_invlist,
1208            ANYOF_FLAGS(node) & ANYOF_INVERT,
1209            &invlist);
1210  }
1211
1212  return invlist;
1213 }
1214
1215 /* These two functions currently do the exact same thing */
1216 #define ssc_init_zero  ssc_init
1217
1218 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1219 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1220
1221 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1222  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1223  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1224
1225 STATIC void
1226 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1227     const regnode_charclass *and_with)
1228 {
1229  /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1230  * another SSC or a regular ANYOF class.  Can create false positives. */
1231
1232  SV* anded_cp_list;
1233  U8  anded_flags;
1234
1235  PERL_ARGS_ASSERT_SSC_AND;
1236
1237  assert(is_ANYOF_SYNTHETIC(ssc));
1238
1239  /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1240  * the code point inversion list and just the relevant flags */
1241  if (is_ANYOF_SYNTHETIC(and_with)) {
1242   anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1243   anded_flags = ANYOF_FLAGS(and_with);
1244
1245   /* XXX This is a kludge around what appears to be deficiencies in the
1246   * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1247   * there are paths through the optimizer where it doesn't get weeded
1248   * out when it should.  And if we don't make some extra provision for
1249   * it like the code just below, it doesn't get added when it should.
1250   * This solution is to add it only when AND'ing, which is here, and
1251   * only when what is being AND'ed is the pristine, original node
1252   * matching anything.  Thus it is like adding it to ssc_anything() but
1253   * only when the result is to be AND'ed.  Probably the same solution
1254   * could be adopted for the same problem we have with /l matching,
1255   * which is solved differently in S_ssc_init(), and that would lead to
1256   * fewer false positives than that solution has.  But if this solution
1257   * creates bugs, the consequences are only that a warning isn't raised
1258   * that should be; while the consequences for having /l bugs is
1259   * incorrect matches */
1260   if (ssc_is_anything((regnode_ssc *)and_with)) {
1261    anded_flags |= ANYOF_WARN_SUPER;
1262   }
1263  }
1264  else {
1265   anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1266   anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1267  }
1268
1269  ANYOF_FLAGS(ssc) &= anded_flags;
1270
1271  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1272  * C2 is the list of code points in 'and-with'; P2, its posix classes.
1273  * 'and_with' may be inverted.  When not inverted, we have the situation of
1274  * computing:
1275  *  (C1 | P1) & (C2 | P2)
1276  *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1277  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1278  *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1279  *                    <=  ((C1 & C2) | P1 | P2)
1280  * Alternatively, the last few steps could be:
1281  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1282  *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1283  *                    <=  (C1 | C2 | (P1 & P2))
1284  * We favor the second approach if either P1 or P2 is non-empty.  This is
1285  * because these components are a barrier to doing optimizations, as what
1286  * they match cannot be known until the moment of matching as they are
1287  * dependent on the current locale, 'AND"ing them likely will reduce or
1288  * eliminate them.
1289  * But we can do better if we know that C1,P1 are in their initial state (a
1290  * frequent occurrence), each matching everything:
1291  *  (<everything>) & (C2 | P2) =  C2 | P2
1292  * Similarly, if C2,P2 are in their initial state (again a frequent
1293  * occurrence), the result is a no-op
1294  *  (C1 | P1) & (<everything>) =  C1 | P1
1295  *
1296  * Inverted, we have
1297  *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1298  *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1299  *                         <=  (C1 & ~C2) | (P1 & ~P2)
1300  * */
1301
1302  if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1303   && ! is_ANYOF_SYNTHETIC(and_with))
1304  {
1305   unsigned int i;
1306
1307   ssc_intersection(ssc,
1308       anded_cp_list,
1309       FALSE /* Has already been inverted */
1310       );
1311
1312   /* If either P1 or P2 is empty, the intersection will be also; can skip
1313   * the loop */
1314   if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1315    ANYOF_POSIXL_ZERO(ssc);
1316   }
1317   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1318
1319    /* Note that the Posix class component P from 'and_with' actually
1320    * looks like:
1321    *      P = Pa | Pb | ... | Pn
1322    * where each component is one posix class, such as in [\w\s].
1323    * Thus
1324    *      ~P = ~(Pa | Pb | ... | Pn)
1325    *         = ~Pa & ~Pb & ... & ~Pn
1326    *        <= ~Pa | ~Pb | ... | ~Pn
1327    * The last is something we can easily calculate, but unfortunately
1328    * is likely to have many false positives.  We could do better
1329    * in some (but certainly not all) instances if two classes in
1330    * P have known relationships.  For example
1331    *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1332    * So
1333    *      :lower: & :print: = :lower:
1334    * And similarly for classes that must be disjoint.  For example,
1335    * since \s and \w can have no elements in common based on rules in
1336    * the POSIX standard,
1337    *      \w & ^\S = nothing
1338    * Unfortunately, some vendor locales do not meet the Posix
1339    * standard, in particular almost everything by Microsoft.
1340    * The loop below just changes e.g., \w into \W and vice versa */
1341
1342    regnode_charclass_posixl temp;
1343    int add = 1;    /* To calculate the index of the complement */
1344
1345    ANYOF_POSIXL_ZERO(&temp);
1346    for (i = 0; i < ANYOF_MAX; i++) {
1347     assert(i % 2 != 0
1348      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1349      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1350
1351     if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1352      ANYOF_POSIXL_SET(&temp, i + add);
1353     }
1354     add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1355    }
1356    ANYOF_POSIXL_AND(&temp, ssc);
1357
1358   } /* else ssc already has no posixes */
1359  } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1360   in its initial state */
1361  else if (! is_ANYOF_SYNTHETIC(and_with)
1362    || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1363  {
1364   /* But if 'ssc' is in its initial state, the result is just 'and_with';
1365   * copy it over 'ssc' */
1366   if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1367    if (is_ANYOF_SYNTHETIC(and_with)) {
1368     StructCopy(and_with, ssc, regnode_ssc);
1369    }
1370    else {
1371     ssc->invlist = anded_cp_list;
1372     ANYOF_POSIXL_ZERO(ssc);
1373     if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1374      ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1375     }
1376    }
1377   }
1378   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1379     || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1380   {
1381    /* One or the other of P1, P2 is non-empty. */
1382    if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1383     ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1384    }
1385    ssc_union(ssc, anded_cp_list, FALSE);
1386   }
1387   else { /* P1 = P2 = empty */
1388    ssc_intersection(ssc, anded_cp_list, FALSE);
1389   }
1390  }
1391 }
1392
1393 STATIC void
1394 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1395    const regnode_charclass *or_with)
1396 {
1397  /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1398  * another SSC or a regular ANYOF class.  Can create false positives if
1399  * 'or_with' is to be inverted. */
1400
1401  SV* ored_cp_list;
1402  U8 ored_flags;
1403
1404  PERL_ARGS_ASSERT_SSC_OR;
1405
1406  assert(is_ANYOF_SYNTHETIC(ssc));
1407
1408  /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1409  * the code point inversion list and just the relevant flags */
1410  if (is_ANYOF_SYNTHETIC(or_with)) {
1411   ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1412   ored_flags = ANYOF_FLAGS(or_with);
1413  }
1414  else {
1415   ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1416   ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1417  }
1418
1419  ANYOF_FLAGS(ssc) |= ored_flags;
1420
1421  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1422  * C2 is the list of code points in 'or-with'; P2, its posix classes.
1423  * 'or_with' may be inverted.  When not inverted, we have the simple
1424  * situation of computing:
1425  *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1426  * If P1|P2 yields a situation with both a class and its complement are
1427  * set, like having both \w and \W, this matches all code points, and we
1428  * can delete these from the P component of the ssc going forward.  XXX We
1429  * might be able to delete all the P components, but I (khw) am not certain
1430  * about this, and it is better to be safe.
1431  *
1432  * Inverted, we have
1433  *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1434  *                         <=  (C1 | P1) | ~C2
1435  *                         <=  (C1 | ~C2) | P1
1436  * (which results in actually simpler code than the non-inverted case)
1437  * */
1438
1439  if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1440   && ! is_ANYOF_SYNTHETIC(or_with))
1441  {
1442   /* We ignore P2, leaving P1 going forward */
1443  }   /* else  Not inverted */
1444  else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1445   ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1446   if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1447    unsigned int i;
1448    for (i = 0; i < ANYOF_MAX; i += 2) {
1449     if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1450     {
1451      ssc_match_all_cp(ssc);
1452      ANYOF_POSIXL_CLEAR(ssc, i);
1453      ANYOF_POSIXL_CLEAR(ssc, i+1);
1454     }
1455    }
1456   }
1457  }
1458
1459  ssc_union(ssc,
1460    ored_cp_list,
1461    FALSE /* Already has been inverted */
1462    );
1463 }
1464
1465 PERL_STATIC_INLINE void
1466 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1467 {
1468  PERL_ARGS_ASSERT_SSC_UNION;
1469
1470  assert(is_ANYOF_SYNTHETIC(ssc));
1471
1472  _invlist_union_maybe_complement_2nd(ssc->invlist,
1473           invlist,
1474           invert2nd,
1475           &ssc->invlist);
1476 }
1477
1478 PERL_STATIC_INLINE void
1479 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1480       SV* const invlist,
1481       const bool invert2nd)
1482 {
1483  PERL_ARGS_ASSERT_SSC_INTERSECTION;
1484
1485  assert(is_ANYOF_SYNTHETIC(ssc));
1486
1487  _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1488            invlist,
1489            invert2nd,
1490            &ssc->invlist);
1491 }
1492
1493 PERL_STATIC_INLINE void
1494 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1495 {
1496  PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1497
1498  assert(is_ANYOF_SYNTHETIC(ssc));
1499
1500  ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1501 }
1502
1503 PERL_STATIC_INLINE void
1504 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1505 {
1506  /* AND just the single code point 'cp' into the SSC 'ssc' */
1507
1508  SV* cp_list = _new_invlist(2);
1509
1510  PERL_ARGS_ASSERT_SSC_CP_AND;
1511
1512  assert(is_ANYOF_SYNTHETIC(ssc));
1513
1514  cp_list = add_cp_to_invlist(cp_list, cp);
1515  ssc_intersection(ssc, cp_list,
1516      FALSE /* Not inverted */
1517      );
1518  SvREFCNT_dec_NN(cp_list);
1519 }
1520
1521 PERL_STATIC_INLINE void
1522 S_ssc_clear_locale(regnode_ssc *ssc)
1523 {
1524  /* Set the SSC 'ssc' to not match any locale things */
1525  PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1526
1527  assert(is_ANYOF_SYNTHETIC(ssc));
1528
1529  ANYOF_POSIXL_ZERO(ssc);
1530  ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1531 }
1532
1533 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1534
1535 STATIC bool
1536 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1537 {
1538  /* The synthetic start class is used to hopefully quickly winnow down
1539  * places where a pattern could start a match in the target string.  If it
1540  * doesn't really narrow things down that much, there isn't much point to
1541  * having the overhead of using it.  This function uses some very crude
1542  * heuristics to decide if to use the ssc or not.
1543  *
1544  * It returns TRUE if 'ssc' rules out more than half what it considers to
1545  * be the "likely" possible matches, but of course it doesn't know what the
1546  * actual things being matched are going to be; these are only guesses
1547  *
1548  * For /l matches, it assumes that the only likely matches are going to be
1549  *      in the 0-255 range, uniformly distributed, so half of that is 127
1550  * For /a and /d matches, it assumes that the likely matches will be just
1551  *      the ASCII range, so half of that is 63
1552  * For /u and there isn't anything matching above the Latin1 range, it
1553  *      assumes that that is the only range likely to be matched, and uses
1554  *      half that as the cut-off: 127.  If anything matches above Latin1,
1555  *      it assumes that all of Unicode could match (uniformly), except for
1556  *      non-Unicode code points and things in the General Category "Other"
1557  *      (unassigned, private use, surrogates, controls and formats).  This
1558  *      is a much large number. */
1559
1560  const U32 max_match = (LOC)
1561       ? 127
1562       : (! UNI_SEMANTICS)
1563        ? 63
1564        : (invlist_highest(ssc->invlist) < 256)
1565        ? 127
1566        : ((NON_OTHER_COUNT + 1) / 2) - 1;
1567  U32 count = 0;      /* Running total of number of code points matched by
1568       'ssc' */
1569  UV start, end;      /* Start and end points of current range in inversion
1570       list */
1571
1572  PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1573
1574  invlist_iterinit(ssc->invlist);
1575  while (invlist_iternext(ssc->invlist, &start, &end)) {
1576
1577   /* /u is the only thing that we expect to match above 255; so if not /u
1578   * and even if there are matches above 255, ignore them.  This catches
1579   * things like \d under /d which does match the digits above 255, but
1580   * since the pattern is /d, it is not likely to be expecting them */
1581   if (! UNI_SEMANTICS) {
1582    if (start > 255) {
1583     break;
1584    }
1585    end = MIN(end, 255);
1586   }
1587   count += end - start + 1;
1588   if (count > max_match) {
1589    invlist_iterfinish(ssc->invlist);
1590    return FALSE;
1591   }
1592  }
1593
1594  return TRUE;
1595 }
1596
1597
1598 STATIC void
1599 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1600 {
1601  /* The inversion list in the SSC is marked mortal; now we need a more
1602  * permanent copy, which is stored the same way that is done in a regular
1603  * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1604  * map */
1605
1606  SV* invlist = invlist_clone(ssc->invlist);
1607
1608  PERL_ARGS_ASSERT_SSC_FINALIZE;
1609
1610  assert(is_ANYOF_SYNTHETIC(ssc));
1611
1612  /* The code in this file assumes that all but these flags aren't relevant
1613  * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1614  * by the time we reach here */
1615  assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1616
1617  populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1618
1619  set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1620         NULL, NULL, NULL, FALSE);
1621
1622  /* Make sure is clone-safe */
1623  ssc->invlist = NULL;
1624
1625  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1626   ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1627  }
1628
1629  assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1630 }
1631
1632 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1633 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1634 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1635 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1636        ? (TRIE_LIST_CUR( idx ) - 1)           \
1637        : 0 )
1638
1639
1640 #ifdef DEBUGGING
1641 /*
1642    dump_trie(trie,widecharmap,revcharmap)
1643    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1644    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1645
1646    These routines dump out a trie in a somewhat readable format.
1647    The _interim_ variants are used for debugging the interim
1648    tables that are used to generate the final compressed
1649    representation which is what dump_trie expects.
1650
1651    Part of the reason for their existence is to provide a form
1652    of documentation as to how the different representations function.
1653
1654 */
1655
1656 /*
1657   Dumps the final compressed table form of the trie to Perl_debug_log.
1658   Used for debugging make_trie().
1659 */
1660
1661 STATIC void
1662 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1663    AV *revcharmap, U32 depth)
1664 {
1665  U32 state;
1666  SV *sv=sv_newmortal();
1667  int colwidth= widecharmap ? 6 : 4;
1668  U16 word;
1669  GET_RE_DEBUG_FLAGS_DECL;
1670
1671  PERL_ARGS_ASSERT_DUMP_TRIE;
1672
1673  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1674   (int)depth * 2 + 2,"",
1675   "Match","Base","Ofs" );
1676
1677  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1678   SV ** const tmp = av_fetch( revcharmap, state, 0);
1679   if ( tmp ) {
1680    PerlIO_printf( Perl_debug_log, "%*s",
1681     colwidth,
1682     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1683        PL_colors[0], PL_colors[1],
1684        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1685        PERL_PV_ESCAPE_FIRSTCHAR
1686     )
1687    );
1688   }
1689  }
1690  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1691   (int)depth * 2 + 2,"");
1692
1693  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1694   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1695  PerlIO_printf( Perl_debug_log, "\n");
1696
1697  for( state = 1 ; state < trie->statecount ; state++ ) {
1698   const U32 base = trie->states[ state ].trans.base;
1699
1700   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1701          (int)depth * 2 + 2,"", (UV)state);
1702
1703   if ( trie->states[ state ].wordnum ) {
1704    PerlIO_printf( Perl_debug_log, " W%4X",
1705           trie->states[ state ].wordnum );
1706   } else {
1707    PerlIO_printf( Perl_debug_log, "%6s", "" );
1708   }
1709
1710   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1711
1712   if ( base ) {
1713    U32 ofs = 0;
1714
1715    while( ( base + ofs  < trie->uniquecharcount ) ||
1716     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1717      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1718                  != state))
1719      ofs++;
1720
1721    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1722
1723    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1724     if ( ( base + ofs >= trie->uniquecharcount )
1725       && ( base + ofs - trie->uniquecharcount
1726               < trie->lasttrans )
1727       && trie->trans[ base + ofs
1728          - trie->uniquecharcount ].check == state )
1729     {
1730     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1731      colwidth,
1732      (UV)trie->trans[ base + ofs
1733            - trie->uniquecharcount ].next );
1734     } else {
1735      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1736     }
1737    }
1738
1739    PerlIO_printf( Perl_debug_log, "]");
1740
1741   }
1742   PerlIO_printf( Perl_debug_log, "\n" );
1743  }
1744  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1745         (int)depth*2, "");
1746  for (word=1; word <= trie->wordcount; word++) {
1747   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1748    (int)word, (int)(trie->wordinfo[word].prev),
1749    (int)(trie->wordinfo[word].len));
1750  }
1751  PerlIO_printf(Perl_debug_log, "\n" );
1752 }
1753 /*
1754   Dumps a fully constructed but uncompressed trie in list form.
1755   List tries normally only are used for construction when the number of
1756   possible chars (trie->uniquecharcount) is very high.
1757   Used for debugging make_trie().
1758 */
1759 STATIC void
1760 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1761       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1762       U32 depth)
1763 {
1764  U32 state;
1765  SV *sv=sv_newmortal();
1766  int colwidth= widecharmap ? 6 : 4;
1767  GET_RE_DEBUG_FLAGS_DECL;
1768
1769  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1770
1771  /* print out the table precompression.  */
1772  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1773   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1774   "------:-----+-----------------\n" );
1775
1776  for( state=1 ; state < next_alloc ; state ++ ) {
1777   U16 charid;
1778
1779   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1780    (int)depth * 2 + 2,"", (UV)state  );
1781   if ( ! trie->states[ state ].wordnum ) {
1782    PerlIO_printf( Perl_debug_log, "%5s| ","");
1783   } else {
1784    PerlIO_printf( Perl_debug_log, "W%4x| ",
1785     trie->states[ state ].wordnum
1786    );
1787   }
1788   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1789    SV ** const tmp = av_fetch( revcharmap,
1790           TRIE_LIST_ITEM(state,charid).forid, 0);
1791    if ( tmp ) {
1792     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1793      colwidth,
1794      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1795        colwidth,
1796        PL_colors[0], PL_colors[1],
1797        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1798        | PERL_PV_ESCAPE_FIRSTCHAR
1799      ) ,
1800      TRIE_LIST_ITEM(state,charid).forid,
1801      (UV)TRIE_LIST_ITEM(state,charid).newstate
1802     );
1803     if (!(charid % 10))
1804      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1805       (int)((depth * 2) + 14), "");
1806    }
1807   }
1808   PerlIO_printf( Perl_debug_log, "\n");
1809  }
1810 }
1811
1812 /*
1813   Dumps a fully constructed but uncompressed trie in table form.
1814   This is the normal DFA style state transition table, with a few
1815   twists to facilitate compression later.
1816   Used for debugging make_trie().
1817 */
1818 STATIC void
1819 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1820       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1821       U32 depth)
1822 {
1823  U32 state;
1824  U16 charid;
1825  SV *sv=sv_newmortal();
1826  int colwidth= widecharmap ? 6 : 4;
1827  GET_RE_DEBUG_FLAGS_DECL;
1828
1829  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1830
1831  /*
1832  print out the table precompression so that we can do a visual check
1833  that they are identical.
1834  */
1835
1836  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1837
1838  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1839   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1840   if ( tmp ) {
1841    PerlIO_printf( Perl_debug_log, "%*s",
1842     colwidth,
1843     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1844        PL_colors[0], PL_colors[1],
1845        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1846        PERL_PV_ESCAPE_FIRSTCHAR
1847     )
1848    );
1849   }
1850  }
1851
1852  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1853
1854  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1855   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1856  }
1857
1858  PerlIO_printf( Perl_debug_log, "\n" );
1859
1860  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1861
1862   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1863    (int)depth * 2 + 2,"",
1864    (UV)TRIE_NODENUM( state ) );
1865
1866   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1867    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1868    if (v)
1869     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1870    else
1871     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1872   }
1873   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1874    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1875            (UV)trie->trans[ state ].check );
1876   } else {
1877    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1878            (UV)trie->trans[ state ].check,
1879    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1880   }
1881  }
1882 }
1883
1884 #endif
1885
1886
1887 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1888   startbranch: the first branch in the whole branch sequence
1889   first      : start branch of sequence of branch-exact nodes.
1890    May be the same as startbranch
1891   last       : Thing following the last branch.
1892    May be the same as tail.
1893   tail       : item following the branch sequence
1894   count      : words in the sequence
1895   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
1896   depth      : indent depth
1897
1898 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1899
1900 A trie is an N'ary tree where the branches are determined by digital
1901 decomposition of the key. IE, at the root node you look up the 1st character and
1902 follow that branch repeat until you find the end of the branches. Nodes can be
1903 marked as "accepting" meaning they represent a complete word. Eg:
1904
1905   /he|she|his|hers/
1906
1907 would convert into the following structure. Numbers represent states, letters
1908 following numbers represent valid transitions on the letter from that state, if
1909 the number is in square brackets it represents an accepting state, otherwise it
1910 will be in parenthesis.
1911
1912  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1913  |    |
1914  |   (2)
1915  |    |
1916  (1)   +-i->(6)-+-s->[7]
1917  |
1918  +-s->(3)-+-h->(4)-+-e->[5]
1919
1920  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1921
1922 This shows that when matching against the string 'hers' we will begin at state 1
1923 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1924 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1925 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1926 single traverse. We store a mapping from accepting to state to which word was
1927 matched, and then when we have multiple possibilities we try to complete the
1928 rest of the regex in the order in which they occurred in the alternation.
1929
1930 The only prior NFA like behaviour that would be changed by the TRIE support is
1931 the silent ignoring of duplicate alternations which are of the form:
1932
1933  / (DUPE|DUPE) X? (?{ ... }) Y /x
1934
1935 Thus EVAL blocks following a trie may be called a different number of times with
1936 and without the optimisation. With the optimisations dupes will be silently
1937 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1938 the following demonstrates:
1939
1940  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1941
1942 which prints out 'word' three times, but
1943
1944  'words'=~/(word|word|word)(?{ print $1 })S/
1945
1946 which doesnt print it out at all. This is due to other optimisations kicking in.
1947
1948 Example of what happens on a structural level:
1949
1950 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1951
1952    1: CURLYM[1] {1,32767}(18)
1953    5:   BRANCH(8)
1954    6:     EXACT <ac>(16)
1955    8:   BRANCH(11)
1956    9:     EXACT <ad>(16)
1957   11:   BRANCH(14)
1958   12:     EXACT <ab>(16)
1959   16:   SUCCEED(0)
1960   17:   NOTHING(18)
1961   18: END(0)
1962
1963 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1964 and should turn into:
1965
1966    1: CURLYM[1] {1,32767}(18)
1967    5:   TRIE(16)
1968   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1969   <ac>
1970   <ad>
1971   <ab>
1972   16:   SUCCEED(0)
1973   17:   NOTHING(18)
1974   18: END(0)
1975
1976 Cases where tail != last would be like /(?foo|bar)baz/:
1977
1978    1: BRANCH(4)
1979    2:   EXACT <foo>(8)
1980    4: BRANCH(7)
1981    5:   EXACT <bar>(8)
1982    7: TAIL(8)
1983    8: EXACT <baz>(10)
1984   10: END(0)
1985
1986 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1987 and would end up looking like:
1988
1989  1: TRIE(8)
1990  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1991   <foo>
1992   <bar>
1993    7: TAIL(8)
1994    8: EXACT <baz>(10)
1995   10: END(0)
1996
1997  d = uvchr_to_utf8_flags(d, uv, 0);
1998
1999 is the recommended Unicode-aware way of saying
2000
2001  *(d++) = uv;
2002 */
2003
2004 #define TRIE_STORE_REVCHAR(val)                                            \
2005  STMT_START {                                                           \
2006   if (UTF) {          \
2007    SV *zlopp = newSV(7); /* XXX: optimize me */                   \
2008    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
2009    unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2010    SvCUR_set(zlopp, kapow - flrbbbbb);       \
2011    SvPOK_on(zlopp);         \
2012    SvUTF8_on(zlopp);         \
2013    av_push(revcharmap, zlopp);        \
2014   } else {          \
2015    char ooooff = (char)val;                                           \
2016    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
2017   }           \
2018   } STMT_END
2019
2020 /* This gets the next character from the input, folding it if not already
2021  * folded. */
2022 #define TRIE_READ_CHAR STMT_START {                                           \
2023  wordlen++;                                                                \
2024  if ( UTF ) {                                                              \
2025   /* if it is UTF then it is either already folded, or does not need    \
2026   * folding */                                                         \
2027   uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2028  }                                                                         \
2029  else if (folder == PL_fold_latin1) {                                      \
2030   /* This folder implies Unicode rules, which in the range expressible  \
2031   *  by not UTF is the lower case, with the two exceptions, one of     \
2032   *  which should have been taken care of before calling this */       \
2033   assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2034   uvc = toLOWER_L1(*uc);                                                \
2035   if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2036   len = 1;                                                              \
2037  } else {                                                                  \
2038   /* raw data, will be folded later if needed */                        \
2039   uvc = (U32)*uc;                                                       \
2040   len = 1;                                                              \
2041  }                                                                         \
2042 } STMT_END
2043
2044
2045
2046 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2047  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2048   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
2049   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2050  }                                                           \
2051  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2052  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2053  TRIE_LIST_CUR( state )++;                                   \
2054 } STMT_END
2055
2056 #define TRIE_LIST_NEW(state) STMT_START {                       \
2057  Newxz( trie->states[ state ].trans.list,               \
2058   4, reg_trie_trans_le );                                 \
2059  TRIE_LIST_CUR( state ) = 1;                                \
2060  TRIE_LIST_LEN( state ) = 4;                                \
2061 } STMT_END
2062
2063 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2064  U16 dupe= trie->states[ state ].wordnum;                    \
2065  regnode * const noper_next = regnext( noper );              \
2066                 \
2067  DEBUG_r({                                                   \
2068   /* store the word for dumping */                        \
2069   SV* tmp;                                                \
2070   if (OP(noper) != NOTHING)                               \
2071    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2072   else                                                    \
2073    tmp = newSVpvn_utf8( "", 0, UTF );   \
2074   av_push( trie_words, tmp );                             \
2075  });                                                         \
2076                 \
2077  curword++;                                                  \
2078  trie->wordinfo[curword].prev   = 0;                         \
2079  trie->wordinfo[curword].len    = wordlen;                   \
2080  trie->wordinfo[curword].accept = state;                     \
2081                 \
2082  if ( noper_next < tail ) {                                  \
2083   if (!trie->jump)                                        \
2084    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2085             sizeof(U16) ); \
2086   trie->jump[curword] = (U16)(noper_next - convert);      \
2087   if (!jumper)                                            \
2088    jumper = noper_next;                                \
2089   if (!nextbranch)                                        \
2090    nextbranch= regnext(cur);                           \
2091  }                                                           \
2092                 \
2093  if ( dupe ) {                                               \
2094   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2095   /* chain, so that when the bits of chain are later    */\
2096   /* linked together, the dups appear in the chain      */\
2097   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2098   trie->wordinfo[dupe].prev = curword;                    \
2099  } else {                                                    \
2100   /* we haven't inserted this word yet.                */ \
2101   trie->states[ state ].wordnum = curword;                \
2102  }                                                           \
2103 } STMT_END
2104
2105
2106 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
2107  ( ( base + charid >=  ucharcount     \
2108   && base + charid < ubound     \
2109   && state == trie->trans[ base - ucharcount + charid ].check \
2110   && trie->trans[ base - ucharcount + charid ].next )  \
2111   ? trie->trans[ base - ucharcount + charid ].next  \
2112   : ( state==1 ? special : 0 )     \
2113  )
2114
2115 #define MADE_TRIE       1
2116 #define MADE_JUMP_TRIE  2
2117 #define MADE_EXACT_TRIE 4
2118
2119 STATIC I32
2120 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2121     regnode *first, regnode *last, regnode *tail,
2122     U32 word_count, U32 flags, U32 depth)
2123 {
2124  /* first pass, loop through and scan words */
2125  reg_trie_data *trie;
2126  HV *widecharmap = NULL;
2127  AV *revcharmap = newAV();
2128  regnode *cur;
2129  STRLEN len = 0;
2130  UV uvc = 0;
2131  U16 curword = 0;
2132  U32 next_alloc = 0;
2133  regnode *jumper = NULL;
2134  regnode *nextbranch = NULL;
2135  regnode *convert = NULL;
2136  U32 *prev_states; /* temp array mapping each state to previous one */
2137  /* we just use folder as a flag in utf8 */
2138  const U8 * folder = NULL;
2139
2140 #ifdef DEBUGGING
2141  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2142  AV *trie_words = NULL;
2143  /* along with revcharmap, this only used during construction but both are
2144  * useful during debugging so we store them in the struct when debugging.
2145  */
2146 #else
2147  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2148  STRLEN trie_charcount=0;
2149 #endif
2150  SV *re_trie_maxbuff;
2151  GET_RE_DEBUG_FLAGS_DECL;
2152
2153  PERL_ARGS_ASSERT_MAKE_TRIE;
2154 #ifndef DEBUGGING
2155  PERL_UNUSED_ARG(depth);
2156 #endif
2157
2158  switch (flags) {
2159   case EXACT: case EXACTL: break;
2160   case EXACTFA:
2161   case EXACTFU_SS:
2162   case EXACTFU:
2163   case EXACTFLU8: folder = PL_fold_latin1; break;
2164   case EXACTF:  folder = PL_fold; break;
2165   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2166  }
2167
2168  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2169  trie->refcount = 1;
2170  trie->startstate = 1;
2171  trie->wordcount = word_count;
2172  RExC_rxi->data->data[ data_slot ] = (void*)trie;
2173  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2174  if (flags == EXACT || flags == EXACTL)
2175   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2176  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2177      trie->wordcount+1, sizeof(reg_trie_wordinfo));
2178
2179  DEBUG_r({
2180   trie_words = newAV();
2181  });
2182
2183  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2184  assert(re_trie_maxbuff);
2185  if (!SvIOK(re_trie_maxbuff)) {
2186   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2187  }
2188  DEBUG_TRIE_COMPILE_r({
2189   PerlIO_printf( Perl_debug_log,
2190   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2191   (int)depth * 2 + 2, "",
2192   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2193   REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2194  });
2195
2196    /* Find the node we are going to overwrite */
2197  if ( first == startbranch && OP( last ) != BRANCH ) {
2198   /* whole branch chain */
2199   convert = first;
2200  } else {
2201   /* branch sub-chain */
2202   convert = NEXTOPER( first );
2203  }
2204
2205  /*  -- First loop and Setup --
2206
2207  We first traverse the branches and scan each word to determine if it
2208  contains widechars, and how many unique chars there are, this is
2209  important as we have to build a table with at least as many columns as we
2210  have unique chars.
2211
2212  We use an array of integers to represent the character codes 0..255
2213  (trie->charmap) and we use a an HV* to store Unicode characters. We use
2214  the native representation of the character value as the key and IV's for
2215  the coded index.
2216
2217  *TODO* If we keep track of how many times each character is used we can
2218  remap the columns so that the table compression later on is more
2219  efficient in terms of memory by ensuring the most common value is in the
2220  middle and the least common are on the outside.  IMO this would be better
2221  than a most to least common mapping as theres a decent chance the most
2222  common letter will share a node with the least common, meaning the node
2223  will not be compressible. With a middle is most common approach the worst
2224  case is when we have the least common nodes twice.
2225
2226  */
2227
2228  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2229   regnode *noper = NEXTOPER( cur );
2230   const U8 *uc = (U8*)STRING( noper );
2231   const U8 *e  = uc + STR_LEN( noper );
2232   int foldlen = 0;
2233   U32 wordlen      = 0;         /* required init */
2234   STRLEN minchars = 0;
2235   STRLEN maxchars = 0;
2236   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2237            bitmap?*/
2238
2239   if (OP(noper) == NOTHING) {
2240    regnode *noper_next= regnext(noper);
2241    if (noper_next != tail && OP(noper_next) == flags) {
2242     noper = noper_next;
2243     uc= (U8*)STRING(noper);
2244     e= uc + STR_LEN(noper);
2245     trie->minlen= STR_LEN(noper);
2246    } else {
2247     trie->minlen= 0;
2248     continue;
2249    }
2250   }
2251
2252   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2253    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2254           regardless of encoding */
2255    if (OP( noper ) == EXACTFU_SS) {
2256     /* false positives are ok, so just set this */
2257     TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2258    }
2259   }
2260   for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2261           branch */
2262    TRIE_CHARCOUNT(trie)++;
2263    TRIE_READ_CHAR;
2264
2265    /* TRIE_READ_CHAR returns the current character, or its fold if /i
2266    * is in effect.  Under /i, this character can match itself, or
2267    * anything that folds to it.  If not under /i, it can match just
2268    * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2269    * all fold to k, and all are single characters.   But some folds
2270    * expand to more than one character, so for example LATIN SMALL
2271    * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2272    * the string beginning at 'uc' is 'ffi', it could be matched by
2273    * three characters, or just by the one ligature character. (It
2274    * could also be matched by two characters: LATIN SMALL LIGATURE FF
2275    * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2276    * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2277    * match.)  The trie needs to know the minimum and maximum number
2278    * of characters that could match so that it can use size alone to
2279    * quickly reject many match attempts.  The max is simple: it is
2280    * the number of folded characters in this branch (since a fold is
2281    * never shorter than what folds to it. */
2282
2283    maxchars++;
2284
2285    /* And the min is equal to the max if not under /i (indicated by
2286    * 'folder' being NULL), or there are no multi-character folds.  If
2287    * there is a multi-character fold, the min is incremented just
2288    * once, for the character that folds to the sequence.  Each
2289    * character in the sequence needs to be added to the list below of
2290    * characters in the trie, but we count only the first towards the
2291    * min number of characters needed.  This is done through the
2292    * variable 'foldlen', which is returned by the macros that look
2293    * for these sequences as the number of bytes the sequence
2294    * occupies.  Each time through the loop, we decrement 'foldlen' by
2295    * how many bytes the current char occupies.  Only when it reaches
2296    * 0 do we increment 'minchars' or look for another multi-character
2297    * sequence. */
2298    if (folder == NULL) {
2299     minchars++;
2300    }
2301    else if (foldlen > 0) {
2302     foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2303    }
2304    else {
2305     minchars++;
2306
2307     /* See if *uc is the beginning of a multi-character fold.  If
2308     * so, we decrement the length remaining to look at, to account
2309     * for the current character this iteration.  (We can use 'uc'
2310     * instead of the fold returned by TRIE_READ_CHAR because for
2311     * non-UTF, the latin1_safe macro is smart enough to account
2312     * for all the unfolded characters, and because for UTF, the
2313     * string will already have been folded earlier in the
2314     * compilation process */
2315     if (UTF) {
2316      if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2317       foldlen -= UTF8SKIP(uc);
2318      }
2319     }
2320     else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2321      foldlen--;
2322     }
2323    }
2324
2325    /* The current character (and any potential folds) should be added
2326    * to the possible matching characters for this position in this
2327    * branch */
2328    if ( uvc < 256 ) {
2329     if ( folder ) {
2330      U8 folded= folder[ (U8) uvc ];
2331      if ( !trie->charmap[ folded ] ) {
2332       trie->charmap[ folded ]=( ++trie->uniquecharcount );
2333       TRIE_STORE_REVCHAR( folded );
2334      }
2335     }
2336     if ( !trie->charmap[ uvc ] ) {
2337      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2338      TRIE_STORE_REVCHAR( uvc );
2339     }
2340     if ( set_bit ) {
2341      /* store the codepoint in the bitmap, and its folded
2342      * equivalent. */
2343      TRIE_BITMAP_SET(trie, uvc);
2344
2345      /* store the folded codepoint */
2346      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2347
2348      if ( !UTF ) {
2349       /* store first byte of utf8 representation of
2350       variant codepoints */
2351       if (! UVCHR_IS_INVARIANT(uvc)) {
2352        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2353       }
2354      }
2355      set_bit = 0; /* We've done our bit :-) */
2356     }
2357    } else {
2358
2359     /* XXX We could come up with the list of code points that fold
2360     * to this using PL_utf8_foldclosures, except not for
2361     * multi-char folds, as there may be multiple combinations
2362     * there that could work, which needs to wait until runtime to
2363     * resolve (The comment about LIGATURE FFI above is such an
2364     * example */
2365
2366     SV** svpp;
2367     if ( !widecharmap )
2368      widecharmap = newHV();
2369
2370     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2371
2372     if ( !svpp )
2373      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2374
2375     if ( !SvTRUE( *svpp ) ) {
2376      sv_setiv( *svpp, ++trie->uniquecharcount );
2377      TRIE_STORE_REVCHAR(uvc);
2378     }
2379    }
2380   } /* end loop through characters in this branch of the trie */
2381
2382   /* We take the min and max for this branch and combine to find the min
2383   * and max for all branches processed so far */
2384   if( cur == first ) {
2385    trie->minlen = minchars;
2386    trie->maxlen = maxchars;
2387   } else if (minchars < trie->minlen) {
2388    trie->minlen = minchars;
2389   } else if (maxchars > trie->maxlen) {
2390    trie->maxlen = maxchars;
2391   }
2392  } /* end first pass */
2393  DEBUG_TRIE_COMPILE_r(
2394   PerlIO_printf( Perl_debug_log,
2395     "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2396     (int)depth * 2 + 2,"",
2397     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2398     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2399     (int)trie->minlen, (int)trie->maxlen )
2400  );
2401
2402  /*
2403   We now know what we are dealing with in terms of unique chars and
2404   string sizes so we can calculate how much memory a naive
2405   representation using a flat table  will take. If it's over a reasonable
2406   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2407   conservative but potentially much slower representation using an array
2408   of lists.
2409
2410   At the end we convert both representations into the same compressed
2411   form that will be used in regexec.c for matching with. The latter
2412   is a form that cannot be used to construct with but has memory
2413   properties similar to the list form and access properties similar
2414   to the table form making it both suitable for fast searches and
2415   small enough that its feasable to store for the duration of a program.
2416
2417   See the comment in the code where the compressed table is produced
2418   inplace from the flat tabe representation for an explanation of how
2419   the compression works.
2420
2421  */
2422
2423
2424  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2425  prev_states[1] = 0;
2426
2427  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2428              > SvIV(re_trie_maxbuff) )
2429  {
2430   /*
2431    Second Pass -- Array Of Lists Representation
2432
2433    Each state will be represented by a list of charid:state records
2434    (reg_trie_trans_le) the first such element holds the CUR and LEN
2435    points of the allocated array. (See defines above).
2436
2437    We build the initial structure using the lists, and then convert
2438    it into the compressed table form which allows faster lookups
2439    (but cant be modified once converted).
2440   */
2441
2442   STRLEN transcount = 1;
2443
2444   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2445    "%*sCompiling trie using list compiler\n",
2446    (int)depth * 2 + 2, ""));
2447
2448   trie->states = (reg_trie_state *)
2449    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2450         sizeof(reg_trie_state) );
2451   TRIE_LIST_NEW(1);
2452   next_alloc = 2;
2453
2454   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2455
2456    regnode *noper   = NEXTOPER( cur );
2457    U8 *uc           = (U8*)STRING( noper );
2458    const U8 *e      = uc + STR_LEN( noper );
2459    U32 state        = 1;         /* required init */
2460    U16 charid       = 0;         /* sanity init */
2461    U32 wordlen      = 0;         /* required init */
2462
2463    if (OP(noper) == NOTHING) {
2464     regnode *noper_next= regnext(noper);
2465     if (noper_next != tail && OP(noper_next) == flags) {
2466      noper = noper_next;
2467      uc= (U8*)STRING(noper);
2468      e= uc + STR_LEN(noper);
2469     }
2470    }
2471
2472    if (OP(noper) != NOTHING) {
2473     for ( ; uc < e ; uc += len ) {
2474
2475      TRIE_READ_CHAR;
2476
2477      if ( uvc < 256 ) {
2478       charid = trie->charmap[ uvc ];
2479      } else {
2480       SV** const svpp = hv_fetch( widecharmap,
2481              (char*)&uvc,
2482              sizeof( UV ),
2483              0);
2484       if ( !svpp ) {
2485        charid = 0;
2486       } else {
2487        charid=(U16)SvIV( *svpp );
2488       }
2489      }
2490      /* charid is now 0 if we dont know the char read, or
2491      * nonzero if we do */
2492      if ( charid ) {
2493
2494       U16 check;
2495       U32 newstate = 0;
2496
2497       charid--;
2498       if ( !trie->states[ state ].trans.list ) {
2499        TRIE_LIST_NEW( state );
2500       }
2501       for ( check = 1;
2502        check <= TRIE_LIST_USED( state );
2503        check++ )
2504       {
2505        if ( TRIE_LIST_ITEM( state, check ).forid
2506                  == charid )
2507        {
2508         newstate = TRIE_LIST_ITEM( state, check ).newstate;
2509         break;
2510        }
2511       }
2512       if ( ! newstate ) {
2513        newstate = next_alloc++;
2514        prev_states[newstate] = state;
2515        TRIE_LIST_PUSH( state, charid, newstate );
2516        transcount++;
2517       }
2518       state = newstate;
2519      } else {
2520       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2521      }
2522     }
2523    }
2524    TRIE_HANDLE_WORD(state);
2525
2526   } /* end second pass */
2527
2528   /* next alloc is the NEXT state to be allocated */
2529   trie->statecount = next_alloc;
2530   trie->states = (reg_trie_state *)
2531    PerlMemShared_realloc( trie->states,
2532         next_alloc
2533         * sizeof(reg_trie_state) );
2534
2535   /* and now dump it out before we compress it */
2536   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2537               revcharmap, next_alloc,
2538               depth+1)
2539   );
2540
2541   trie->trans = (reg_trie_trans *)
2542    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2543   {
2544    U32 state;
2545    U32 tp = 0;
2546    U32 zp = 0;
2547
2548
2549    for( state=1 ; state < next_alloc ; state ++ ) {
2550     U32 base=0;
2551
2552     /*
2553     DEBUG_TRIE_COMPILE_MORE_r(
2554      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2555     );
2556     */
2557
2558     if (trie->states[state].trans.list) {
2559      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2560      U16 maxid=minid;
2561      U16 idx;
2562
2563      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2564       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2565       if ( forid < minid ) {
2566        minid=forid;
2567       } else if ( forid > maxid ) {
2568        maxid=forid;
2569       }
2570      }
2571      if ( transcount < tp + maxid - minid + 1) {
2572       transcount *= 2;
2573       trie->trans = (reg_trie_trans *)
2574        PerlMemShared_realloc( trie->trans,
2575              transcount
2576              * sizeof(reg_trie_trans) );
2577       Zero( trie->trans + (transcount / 2),
2578        transcount / 2,
2579        reg_trie_trans );
2580      }
2581      base = trie->uniquecharcount + tp - minid;
2582      if ( maxid == minid ) {
2583       U32 set = 0;
2584       for ( ; zp < tp ; zp++ ) {
2585        if ( ! trie->trans[ zp ].next ) {
2586         base = trie->uniquecharcount + zp - minid;
2587         trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2588                 1).newstate;
2589         trie->trans[ zp ].check = state;
2590         set = 1;
2591         break;
2592        }
2593       }
2594       if ( !set ) {
2595        trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2596                 1).newstate;
2597        trie->trans[ tp ].check = state;
2598        tp++;
2599        zp = tp;
2600       }
2601      } else {
2602       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2603        const U32 tid = base
2604           - trie->uniquecharcount
2605           + TRIE_LIST_ITEM( state, idx ).forid;
2606        trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2607                 idx ).newstate;
2608        trie->trans[ tid ].check = state;
2609       }
2610       tp += ( maxid - minid + 1 );
2611      }
2612      Safefree(trie->states[ state ].trans.list);
2613     }
2614     /*
2615     DEBUG_TRIE_COMPILE_MORE_r(
2616      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2617     );
2618     */
2619     trie->states[ state ].trans.base=base;
2620    }
2621    trie->lasttrans = tp + 1;
2622   }
2623  } else {
2624   /*
2625   Second Pass -- Flat Table Representation.
2626
2627   we dont use the 0 slot of either trans[] or states[] so we add 1 to
2628   each.  We know that we will need Charcount+1 trans at most to store
2629   the data (one row per char at worst case) So we preallocate both
2630   structures assuming worst case.
2631
2632   We then construct the trie using only the .next slots of the entry
2633   structs.
2634
2635   We use the .check field of the first entry of the node temporarily
2636   to make compression both faster and easier by keeping track of how
2637   many non zero fields are in the node.
2638
2639   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2640   transition.
2641
2642   There are two terms at use here: state as a TRIE_NODEIDX() which is
2643   a number representing the first entry of the node, and state as a
2644   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2645   and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2646   if there are 2 entrys per node. eg:
2647
2648    A B       A B
2649   1. 2 4    1. 3 7
2650   2. 0 3    3. 0 5
2651   3. 0 0    5. 0 0
2652   4. 0 0    7. 0 0
2653
2654   The table is internally in the right hand, idx form. However as we
2655   also have to deal with the states array which is indexed by nodenum
2656   we have to use TRIE_NODENUM() to convert.
2657
2658   */
2659   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2660    "%*sCompiling trie using table compiler\n",
2661    (int)depth * 2 + 2, ""));
2662
2663   trie->trans = (reg_trie_trans *)
2664    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2665         * trie->uniquecharcount + 1,
2666         sizeof(reg_trie_trans) );
2667   trie->states = (reg_trie_state *)
2668    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2669         sizeof(reg_trie_state) );
2670   next_alloc = trie->uniquecharcount + 1;
2671
2672
2673   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2674
2675    regnode *noper   = NEXTOPER( cur );
2676    const U8 *uc     = (U8*)STRING( noper );
2677    const U8 *e      = uc + STR_LEN( noper );
2678
2679    U32 state        = 1;         /* required init */
2680
2681    U16 charid       = 0;         /* sanity init */
2682    U32 accept_state = 0;         /* sanity init */
2683
2684    U32 wordlen      = 0;         /* required init */
2685
2686    if (OP(noper) == NOTHING) {
2687     regnode *noper_next= regnext(noper);
2688     if (noper_next != tail && OP(noper_next) == flags) {
2689      noper = noper_next;
2690      uc= (U8*)STRING(noper);
2691      e= uc + STR_LEN(noper);
2692     }
2693    }
2694
2695    if ( OP(noper) != NOTHING ) {
2696     for ( ; uc < e ; uc += len ) {
2697
2698      TRIE_READ_CHAR;
2699
2700      if ( uvc < 256 ) {
2701       charid = trie->charmap[ uvc ];
2702      } else {
2703       SV* const * const svpp = hv_fetch( widecharmap,
2704               (char*)&uvc,
2705               sizeof( UV ),
2706               0);
2707       charid = svpp ? (U16)SvIV(*svpp) : 0;
2708      }
2709      if ( charid ) {
2710       charid--;
2711       if ( !trie->trans[ state + charid ].next ) {
2712        trie->trans[ state + charid ].next = next_alloc;
2713        trie->trans[ state ].check++;
2714        prev_states[TRIE_NODENUM(next_alloc)]
2715          = TRIE_NODENUM(state);
2716        next_alloc += trie->uniquecharcount;
2717       }
2718       state = trie->trans[ state + charid ].next;
2719      } else {
2720       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2721      }
2722      /* charid is now 0 if we dont know the char read, or
2723      * nonzero if we do */
2724     }
2725    }
2726    accept_state = TRIE_NODENUM( state );
2727    TRIE_HANDLE_WORD(accept_state);
2728
2729   } /* end second pass */
2730
2731   /* and now dump it out before we compress it */
2732   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2733               revcharmap,
2734               next_alloc, depth+1));
2735
2736   {
2737   /*
2738   * Inplace compress the table.*
2739
2740   For sparse data sets the table constructed by the trie algorithm will
2741   be mostly 0/FAIL transitions or to put it another way mostly empty.
2742   (Note that leaf nodes will not contain any transitions.)
2743
2744   This algorithm compresses the tables by eliminating most such
2745   transitions, at the cost of a modest bit of extra work during lookup:
2746
2747   - Each states[] entry contains a .base field which indicates the
2748   index in the state[] array wheres its transition data is stored.
2749
2750   - If .base is 0 there are no valid transitions from that node.
2751
2752   - If .base is nonzero then charid is added to it to find an entry in
2753   the trans array.
2754
2755   -If trans[states[state].base+charid].check!=state then the
2756   transition is taken to be a 0/Fail transition. Thus if there are fail
2757   transitions at the front of the node then the .base offset will point
2758   somewhere inside the previous nodes data (or maybe even into a node
2759   even earlier), but the .check field determines if the transition is
2760   valid.
2761
2762   XXX - wrong maybe?
2763   The following process inplace converts the table to the compressed
2764   table: We first do not compress the root node 1,and mark all its
2765   .check pointers as 1 and set its .base pointer as 1 as well. This
2766   allows us to do a DFA construction from the compressed table later,
2767   and ensures that any .base pointers we calculate later are greater
2768   than 0.
2769
2770   - We set 'pos' to indicate the first entry of the second node.
2771
2772   - We then iterate over the columns of the node, finding the first and
2773   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2774   and set the .check pointers accordingly, and advance pos
2775   appropriately and repreat for the next node. Note that when we copy
2776   the next pointers we have to convert them from the original
2777   NODEIDX form to NODENUM form as the former is not valid post
2778   compression.
2779
2780   - If a node has no transitions used we mark its base as 0 and do not
2781   advance the pos pointer.
2782
2783   - If a node only has one transition we use a second pointer into the
2784   structure to fill in allocated fail transitions from other states.
2785   This pointer is independent of the main pointer and scans forward
2786   looking for null transitions that are allocated to a state. When it
2787   finds one it writes the single transition into the "hole".  If the
2788   pointer doesnt find one the single transition is appended as normal.
2789
2790   - Once compressed we can Renew/realloc the structures to release the
2791   excess space.
2792
2793   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2794   specifically Fig 3.47 and the associated pseudocode.
2795
2796   demq
2797   */
2798   const U32 laststate = TRIE_NODENUM( next_alloc );
2799   U32 state, charid;
2800   U32 pos = 0, zp=0;
2801   trie->statecount = laststate;
2802
2803   for ( state = 1 ; state < laststate ; state++ ) {
2804    U8 flag = 0;
2805    const U32 stateidx = TRIE_NODEIDX( state );
2806    const U32 o_used = trie->trans[ stateidx ].check;
2807    U32 used = trie->trans[ stateidx ].check;
2808    trie->trans[ stateidx ].check = 0;
2809
2810    for ( charid = 0;
2811     used && charid < trie->uniquecharcount;
2812     charid++ )
2813    {
2814     if ( flag || trie->trans[ stateidx + charid ].next ) {
2815      if ( trie->trans[ stateidx + charid ].next ) {
2816       if (o_used == 1) {
2817        for ( ; zp < pos ; zp++ ) {
2818         if ( ! trie->trans[ zp ].next ) {
2819          break;
2820         }
2821        }
2822        trie->states[ state ].trans.base
2823              = zp
2824              + trie->uniquecharcount
2825              - charid ;
2826        trie->trans[ zp ].next
2827         = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2828                + charid ].next );
2829        trie->trans[ zp ].check = state;
2830        if ( ++zp > pos ) pos = zp;
2831        break;
2832       }
2833       used--;
2834      }
2835      if ( !flag ) {
2836       flag = 1;
2837       trie->states[ state ].trans.base
2838          = pos + trie->uniquecharcount - charid ;
2839      }
2840      trie->trans[ pos ].next
2841       = SAFE_TRIE_NODENUM(
2842          trie->trans[ stateidx + charid ].next );
2843      trie->trans[ pos ].check = state;
2844      pos++;
2845     }
2846    }
2847   }
2848   trie->lasttrans = pos + 1;
2849   trie->states = (reg_trie_state *)
2850    PerlMemShared_realloc( trie->states, laststate
2851         * sizeof(reg_trie_state) );
2852   DEBUG_TRIE_COMPILE_MORE_r(
2853    PerlIO_printf( Perl_debug_log,
2854     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2855     (int)depth * 2 + 2,"",
2856     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2857      + 1 ),
2858     (IV)next_alloc,
2859     (IV)pos,
2860     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2861    );
2862
2863   } /* end table compress */
2864  }
2865  DEBUG_TRIE_COMPILE_MORE_r(
2866    PerlIO_printf(Perl_debug_log,
2867     "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2868     (int)depth * 2 + 2, "",
2869     (UV)trie->statecount,
2870     (UV)trie->lasttrans)
2871  );
2872  /* resize the trans array to remove unused space */
2873  trie->trans = (reg_trie_trans *)
2874   PerlMemShared_realloc( trie->trans, trie->lasttrans
2875        * sizeof(reg_trie_trans) );
2876
2877  {   /* Modify the program and insert the new TRIE node */
2878   U8 nodetype =(U8)(flags & 0xFF);
2879   char *str=NULL;
2880
2881 #ifdef DEBUGGING
2882   regnode *optimize = NULL;
2883 #ifdef RE_TRACK_PATTERN_OFFSETS
2884
2885   U32 mjd_offset = 0;
2886   U32 mjd_nodelen = 0;
2887 #endif /* RE_TRACK_PATTERN_OFFSETS */
2888 #endif /* DEBUGGING */
2889   /*
2890   This means we convert either the first branch or the first Exact,
2891   depending on whether the thing following (in 'last') is a branch
2892   or not and whther first is the startbranch (ie is it a sub part of
2893   the alternation or is it the whole thing.)
2894   Assuming its a sub part we convert the EXACT otherwise we convert
2895   the whole branch sequence, including the first.
2896   */
2897   /* Find the node we are going to overwrite */
2898   if ( first != startbranch || OP( last ) == BRANCH ) {
2899    /* branch sub-chain */
2900    NEXT_OFF( first ) = (U16)(last - first);
2901 #ifdef RE_TRACK_PATTERN_OFFSETS
2902    DEBUG_r({
2903     mjd_offset= Node_Offset((convert));
2904     mjd_nodelen= Node_Length((convert));
2905    });
2906 #endif
2907    /* whole branch chain */
2908   }
2909 #ifdef RE_TRACK_PATTERN_OFFSETS
2910   else {
2911    DEBUG_r({
2912     const  regnode *nop = NEXTOPER( convert );
2913     mjd_offset= Node_Offset((nop));
2914     mjd_nodelen= Node_Length((nop));
2915    });
2916   }
2917   DEBUG_OPTIMISE_r(
2918    PerlIO_printf(Perl_debug_log,
2919     "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2920     (int)depth * 2 + 2, "",
2921     (UV)mjd_offset, (UV)mjd_nodelen)
2922   );
2923 #endif
2924   /* But first we check to see if there is a common prefix we can
2925   split out as an EXACT and put in front of the TRIE node.  */
2926   trie->startstate= 1;
2927   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2928    U32 state;
2929    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2930     U32 ofs = 0;
2931     I32 idx = -1;
2932     U32 count = 0;
2933     const U32 base = trie->states[ state ].trans.base;
2934
2935     if ( trie->states[state].wordnum )
2936       count = 1;
2937
2938     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2939      if ( ( base + ofs >= trie->uniquecharcount ) &&
2940       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2941       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2942      {
2943       if ( ++count > 1 ) {
2944        SV **tmp = av_fetch( revcharmap, ofs, 0);
2945        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2946        if ( state == 1 ) break;
2947        if ( count == 2 ) {
2948         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2949         DEBUG_OPTIMISE_r(
2950          PerlIO_printf(Perl_debug_log,
2951           "%*sNew Start State=%"UVuf" Class: [",
2952           (int)depth * 2 + 2, "",
2953           (UV)state));
2954         if (idx >= 0) {
2955          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2956          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2957
2958          TRIE_BITMAP_SET(trie,*ch);
2959          if ( folder )
2960           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2961          DEBUG_OPTIMISE_r(
2962           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2963          );
2964         }
2965        }
2966        TRIE_BITMAP_SET(trie,*ch);
2967        if ( folder )
2968         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2969        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2970       }
2971       idx = ofs;
2972      }
2973     }
2974     if ( count == 1 ) {
2975      SV **tmp = av_fetch( revcharmap, idx, 0);
2976      STRLEN len;
2977      char *ch = SvPV( *tmp, len );
2978      DEBUG_OPTIMISE_r({
2979       SV *sv=sv_newmortal();
2980       PerlIO_printf( Perl_debug_log,
2981        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2982        (int)depth * 2 + 2, "",
2983        (UV)state, (UV)idx,
2984        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2985         PL_colors[0], PL_colors[1],
2986         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2987         PERL_PV_ESCAPE_FIRSTCHAR
2988        )
2989       );
2990      });
2991      if ( state==1 ) {
2992       OP( convert ) = nodetype;
2993       str=STRING(convert);
2994       STR_LEN(convert)=0;
2995      }
2996      STR_LEN(convert) += len;
2997      while (len--)
2998       *str++ = *ch++;
2999     } else {
3000 #ifdef DEBUGGING
3001      if (state>1)
3002       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
3003 #endif
3004      break;
3005     }
3006    }
3007    trie->prefixlen = (state-1);
3008    if (str) {
3009     regnode *n = convert+NODE_SZ_STR(convert);
3010     NEXT_OFF(convert) = NODE_SZ_STR(convert);
3011     trie->startstate = state;
3012     trie->minlen -= (state - 1);
3013     trie->maxlen -= (state - 1);
3014 #ifdef DEBUGGING
3015    /* At least the UNICOS C compiler choked on this
3016     * being argument to DEBUG_r(), so let's just have
3017     * it right here. */
3018    if (
3019 #ifdef PERL_EXT_RE_BUILD
3020     1
3021 #else
3022     DEBUG_r_TEST
3023 #endif
3024     ) {
3025     regnode *fix = convert;
3026     U32 word = trie->wordcount;
3027     mjd_nodelen++;
3028     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3029     while( ++fix < n ) {
3030      Set_Node_Offset_Length(fix, 0, 0);
3031     }
3032     while (word--) {
3033      SV ** const tmp = av_fetch( trie_words, word, 0 );
3034      if (tmp) {
3035       if ( STR_LEN(convert) <= SvCUR(*tmp) )
3036        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3037       else
3038        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3039      }
3040     }
3041    }
3042 #endif
3043     if (trie->maxlen) {
3044      convert = n;
3045     } else {
3046      NEXT_OFF(convert) = (U16)(tail - convert);
3047      DEBUG_r(optimize= n);
3048     }
3049    }
3050   }
3051   if (!jumper)
3052    jumper = last;
3053   if ( trie->maxlen ) {
3054    NEXT_OFF( convert ) = (U16)(tail - convert);
3055    ARG_SET( convert, data_slot );
3056    /* Store the offset to the first unabsorbed branch in
3057    jump[0], which is otherwise unused by the jump logic.
3058    We use this when dumping a trie and during optimisation. */
3059    if (trie->jump)
3060     trie->jump[0] = (U16)(nextbranch - convert);
3061
3062    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3063    *   and there is a bitmap
3064    *   and the first "jump target" node we found leaves enough room
3065    * then convert the TRIE node into a TRIEC node, with the bitmap
3066    * embedded inline in the opcode - this is hypothetically faster.
3067    */
3068    if ( !trie->states[trie->startstate].wordnum
3069     && trie->bitmap
3070     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3071    {
3072     OP( convert ) = TRIEC;
3073     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3074     PerlMemShared_free(trie->bitmap);
3075     trie->bitmap= NULL;
3076    } else
3077     OP( convert ) = TRIE;
3078
3079    /* store the type in the flags */
3080    convert->flags = nodetype;
3081    DEBUG_r({
3082    optimize = convert
3083      + NODE_STEP_REGNODE
3084      + regarglen[ OP( convert ) ];
3085    });
3086    /* XXX We really should free up the resource in trie now,
3087     as we won't use them - (which resources?) dmq */
3088   }
3089   /* needed for dumping*/
3090   DEBUG_r(if (optimize) {
3091    regnode *opt = convert;
3092
3093    while ( ++opt < optimize) {
3094     Set_Node_Offset_Length(opt,0,0);
3095    }
3096    /*
3097     Try to clean up some of the debris left after the
3098     optimisation.
3099    */
3100    while( optimize < jumper ) {
3101     mjd_nodelen += Node_Length((optimize));
3102     OP( optimize ) = OPTIMIZED;
3103     Set_Node_Offset_Length(optimize,0,0);
3104     optimize++;
3105    }
3106    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3107   });
3108  } /* end node insert */
3109  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
3110
3111  /*  Finish populating the prev field of the wordinfo array.  Walk back
3112  *  from each accept state until we find another accept state, and if
3113  *  so, point the first word's .prev field at the second word. If the
3114  *  second already has a .prev field set, stop now. This will be the
3115  *  case either if we've already processed that word's accept state,
3116  *  or that state had multiple words, and the overspill words were
3117  *  already linked up earlier.
3118  */
3119  {
3120   U16 word;
3121   U32 state;
3122   U16 prev;
3123
3124   for (word=1; word <= trie->wordcount; word++) {
3125    prev = 0;
3126    if (trie->wordinfo[word].prev)
3127     continue;
3128    state = trie->wordinfo[word].accept;
3129    while (state) {
3130     state = prev_states[state];
3131     if (!state)
3132      break;
3133     prev = trie->states[state].wordnum;
3134     if (prev)
3135      break;
3136    }
3137    trie->wordinfo[word].prev = prev;
3138   }
3139   Safefree(prev_states);
3140  }
3141
3142
3143  /* and now dump out the compressed format */
3144  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3145
3146  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3147 #ifdef DEBUGGING
3148  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3149  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3150 #else
3151  SvREFCNT_dec_NN(revcharmap);
3152 #endif
3153  return trie->jump
3154   ? MADE_JUMP_TRIE
3155   : trie->startstate>1
3156    ? MADE_EXACT_TRIE
3157    : MADE_TRIE;
3158 }
3159
3160 STATIC regnode *
3161 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3162 {
3163 /* The Trie is constructed and compressed now so we can build a fail array if
3164  * it's needed
3165
3166    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3167    3.32 in the
3168    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3169    Ullman 1985/88
3170    ISBN 0-201-10088-6
3171
3172    We find the fail state for each state in the trie, this state is the longest
3173    proper suffix of the current state's 'word' that is also a proper prefix of
3174    another word in our trie. State 1 represents the word '' and is thus the
3175    default fail state. This allows the DFA not to have to restart after its
3176    tried and failed a word at a given point, it simply continues as though it
3177    had been matching the other word in the first place.
3178    Consider
3179  'abcdgu'=~/abcdefg|cdgu/
3180    When we get to 'd' we are still matching the first word, we would encounter
3181    'g' which would fail, which would bring us to the state representing 'd' in
3182    the second word where we would try 'g' and succeed, proceeding to match
3183    'cdgu'.
3184  */
3185  /* add a fail transition */
3186  const U32 trie_offset = ARG(source);
3187  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3188  U32 *q;
3189  const U32 ucharcount = trie->uniquecharcount;
3190  const U32 numstates = trie->statecount;
3191  const U32 ubound = trie->lasttrans + ucharcount;
3192  U32 q_read = 0;
3193  U32 q_write = 0;
3194  U32 charid;
3195  U32 base = trie->states[ 1 ].trans.base;
3196  U32 *fail;
3197  reg_ac_data *aho;
3198  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3199  regnode *stclass;
3200  GET_RE_DEBUG_FLAGS_DECL;
3201
3202  PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3203  PERL_UNUSED_CONTEXT;
3204 #ifndef DEBUGGING
3205  PERL_UNUSED_ARG(depth);
3206 #endif
3207
3208  if ( OP(source) == TRIE ) {
3209   struct regnode_1 *op = (struct regnode_1 *)
3210    PerlMemShared_calloc(1, sizeof(struct regnode_1));
3211   StructCopy(source,op,struct regnode_1);
3212   stclass = (regnode *)op;
3213  } else {
3214   struct regnode_charclass *op = (struct regnode_charclass *)
3215    PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3216   StructCopy(source,op,struct regnode_charclass);
3217   stclass = (regnode *)op;
3218  }
3219  OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3220
3221  ARG_SET( stclass, data_slot );
3222  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3223  RExC_rxi->data->data[ data_slot ] = (void*)aho;
3224  aho->trie=trie_offset;
3225  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3226  Copy( trie->states, aho->states, numstates, reg_trie_state );
3227  Newxz( q, numstates, U32);
3228  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3229  aho->refcount = 1;
3230  fail = aho->fail;
3231  /* initialize fail[0..1] to be 1 so that we always have
3232  a valid final fail state */
3233  fail[ 0 ] = fail[ 1 ] = 1;
3234
3235  for ( charid = 0; charid < ucharcount ; charid++ ) {
3236   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3237   if ( newstate ) {
3238    q[ q_write ] = newstate;
3239    /* set to point at the root */
3240    fail[ q[ q_write++ ] ]=1;
3241   }
3242  }
3243  while ( q_read < q_write) {
3244   const U32 cur = q[ q_read++ % numstates ];
3245   base = trie->states[ cur ].trans.base;
3246
3247   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3248    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3249    if (ch_state) {
3250     U32 fail_state = cur;
3251     U32 fail_base;
3252     do {
3253      fail_state = fail[ fail_state ];
3254      fail_base = aho->states[ fail_state ].trans.base;
3255     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3256
3257     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3258     fail[ ch_state ] = fail_state;
3259     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3260     {
3261       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3262     }
3263     q[ q_write++ % numstates] = ch_state;
3264    }
3265   }
3266  }
3267  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3268  when we fail in state 1, this allows us to use the
3269  charclass scan to find a valid start char. This is based on the principle
3270  that theres a good chance the string being searched contains lots of stuff
3271  that cant be a start char.
3272  */
3273  fail[ 0 ] = fail[ 1 ] = 0;
3274  DEBUG_TRIE_COMPILE_r({
3275   PerlIO_printf(Perl_debug_log,
3276      "%*sStclass Failtable (%"UVuf" states): 0",
3277      (int)(depth * 2), "", (UV)numstates
3278   );
3279   for( q_read=1; q_read<numstates; q_read++ ) {
3280    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3281   }
3282   PerlIO_printf(Perl_debug_log, "\n");
3283  });
3284  Safefree(q);
3285  /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3286  return stclass;
3287 }
3288
3289
3290 #define DEBUG_PEEP(str,scan,depth) \
3291  DEBUG_OPTIMISE_r({if (scan){ \
3292  regnode *Next = regnext(scan); \
3293  regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3294  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3295   (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3296   Next ? (REG_NODE_NUM(Next)) : 0 ); \
3297  DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3298  PerlIO_printf(Perl_debug_log, "\n"); \
3299    }});
3300
3301 /* The below joins as many adjacent EXACTish nodes as possible into a single
3302  * one.  The regop may be changed if the node(s) contain certain sequences that
3303  * require special handling.  The joining is only done if:
3304  * 1) there is room in the current conglomerated node to entirely contain the
3305  *    next one.
3306  * 2) they are the exact same node type
3307  *
3308  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3309  * these get optimized out
3310  *
3311  * If a node is to match under /i (folded), the number of characters it matches
3312  * can be different than its character length if it contains a multi-character
3313  * fold.  *min_subtract is set to the total delta number of characters of the
3314  * input nodes.
3315  *
3316  * And *unfolded_multi_char is set to indicate whether or not the node contains
3317  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3318  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3319  * SMALL LETTER SHARP S, as only if the target string being matched against
3320  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3321  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3322  * whose components are all above the Latin1 range are not run-time locale
3323  * dependent, and have already been folded by the time this function is
3324  * called.)
3325  *
3326  * This is as good a place as any to discuss the design of handling these
3327  * multi-character fold sequences.  It's been wrong in Perl for a very long
3328  * time.  There are three code points in Unicode whose multi-character folds
3329  * were long ago discovered to mess things up.  The previous designs for
3330  * dealing with these involved assigning a special node for them.  This
3331  * approach doesn't always work, as evidenced by this example:
3332  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3333  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3334  * would match just the \xDF, it won't be able to handle the case where a
3335  * successful match would have to cross the node's boundary.  The new approach
3336  * that hopefully generally solves the problem generates an EXACTFU_SS node
3337  * that is "sss" in this case.
3338  *
3339  * It turns out that there are problems with all multi-character folds, and not
3340  * just these three.  Now the code is general, for all such cases.  The
3341  * approach taken is:
3342  * 1)   This routine examines each EXACTFish node that could contain multi-
3343  *      character folded sequences.  Since a single character can fold into
3344  *      such a sequence, the minimum match length for this node is less than
3345  *      the number of characters in the node.  This routine returns in
3346  *      *min_subtract how many characters to subtract from the the actual
3347  *      length of the string to get a real minimum match length; it is 0 if
3348  *      there are no multi-char foldeds.  This delta is used by the caller to
3349  *      adjust the min length of the match, and the delta between min and max,
3350  *      so that the optimizer doesn't reject these possibilities based on size
3351  *      constraints.
3352  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3353  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3354  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3355  *      there is a possible fold length change.  That means that a regular
3356  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3357  *      with length changes, and so can be processed faster.  regexec.c takes
3358  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3359  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3360  *      known until runtime).  This saves effort in regex matching.  However,
3361  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3362  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3363  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3364  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3365  *      possibilities for the non-UTF8 patterns are quite simple, except for
3366  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3367  *      members of a fold-pair, and arrays are set up for all of them so that
3368  *      the other member of the pair can be found quickly.  Code elsewhere in
3369  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3370  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3371  *      described in the next item.
3372  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3373  *      validity of the fold won't be known until runtime, and so must remain
3374  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3375  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3376  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3377  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3378  *      The reason this is a problem is that the optimizer part of regexec.c
3379  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3380  *      that a character in the pattern corresponds to at most a single
3381  *      character in the target string.  (And I do mean character, and not byte
3382  *      here, unlike other parts of the documentation that have never been
3383  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3384  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3385  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3386  *      nodes, violate the assumption, and they are the only instances where it
3387  *      is violated.  I'm reluctant to try to change the assumption, as the
3388  *      code involved is impenetrable to me (khw), so instead the code here
3389  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3390  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3391  *      boolean indicating whether or not the node contains such a fold.  When
3392  *      it is true, the caller sets a flag that later causes the optimizer in
3393  *      this file to not set values for the floating and fixed string lengths,
3394  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3395  *      assumption.  Thus, there is no optimization based on string lengths for
3396  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3397  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3398  *      assumption is wrong only in these cases is that all other non-UTF-8
3399  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3400  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3401  *      EXACTF nodes because we don't know at compile time if it actually
3402  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3403  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3404  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3405  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3406  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3407  *      string would require the pattern to be forced into UTF-8, the overhead
3408  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3409  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3410  *      locale.)
3411  *
3412  *      Similarly, the code that generates tries doesn't currently handle
3413  *      not-already-folded multi-char folds, and it looks like a pain to change
3414  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3415  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3416  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3417  *      using /iaa matching will be doing so almost entirely with ASCII
3418  *      strings, so this should rarely be encountered in practice */
3419
3420 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3421  if (PL_regkind[OP(scan)] == EXACT) \
3422   join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3423
3424 STATIC U32
3425 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3426     UV *min_subtract, bool *unfolded_multi_char,
3427     U32 flags,regnode *val, U32 depth)
3428 {
3429  /* Merge several consecutive EXACTish nodes into one. */
3430  regnode *n = regnext(scan);
3431  U32 stringok = 1;
3432  regnode *next = scan + NODE_SZ_STR(scan);
3433  U32 merged = 0;
3434  U32 stopnow = 0;
3435 #ifdef DEBUGGING
3436  regnode *stop = scan;
3437  GET_RE_DEBUG_FLAGS_DECL;
3438 #else
3439  PERL_UNUSED_ARG(depth);
3440 #endif
3441
3442  PERL_ARGS_ASSERT_JOIN_EXACT;
3443 #ifndef EXPERIMENTAL_INPLACESCAN
3444  PERL_UNUSED_ARG(flags);
3445  PERL_UNUSED_ARG(val);
3446 #endif
3447  DEBUG_PEEP("join",scan,depth);
3448
3449  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3450  * EXACT ones that are mergeable to the current one. */
3451  while (n
3452   && (PL_regkind[OP(n)] == NOTHING
3453    || (stringok && OP(n) == OP(scan)))
3454   && NEXT_OFF(n)
3455   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3456  {
3457
3458   if (OP(n) == TAIL || n > next)
3459    stringok = 0;
3460   if (PL_regkind[OP(n)] == NOTHING) {
3461    DEBUG_PEEP("skip:",n,depth);
3462    NEXT_OFF(scan) += NEXT_OFF(n);
3463    next = n + NODE_STEP_REGNODE;
3464 #ifdef DEBUGGING
3465    if (stringok)
3466     stop = n;
3467 #endif
3468    n = regnext(n);
3469   }
3470   else if (stringok) {
3471    const unsigned int oldl = STR_LEN(scan);
3472    regnode * const nnext = regnext(n);
3473
3474    /* XXX I (khw) kind of doubt that this works on platforms (should
3475    * Perl ever run on one) where U8_MAX is above 255 because of lots
3476    * of other assumptions */
3477    /* Don't join if the sum can't fit into a single node */
3478    if (oldl + STR_LEN(n) > U8_MAX)
3479     break;
3480
3481    DEBUG_PEEP("merg",n,depth);
3482    merged++;
3483
3484    NEXT_OFF(scan) += NEXT_OFF(n);
3485    STR_LEN(scan) += STR_LEN(n);
3486    next = n + NODE_SZ_STR(n);
3487    /* Now we can overwrite *n : */
3488    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3489 #ifdef DEBUGGING
3490    stop = next - 1;
3491 #endif
3492    n = nnext;
3493    if (stopnow) break;
3494   }
3495
3496 #ifdef EXPERIMENTAL_INPLACESCAN
3497   if (flags && !NEXT_OFF(n)) {
3498    DEBUG_PEEP("atch", val, depth);
3499    if (reg_off_by_arg[OP(n)]) {
3500     ARG_SET(n, val - n);
3501    }
3502    else {
3503     NEXT_OFF(n) = val - n;
3504    }
3505    stopnow = 1;
3506   }
3507 #endif
3508  }
3509
3510  *min_subtract = 0;
3511  *unfolded_multi_char = FALSE;
3512
3513  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3514  * can now analyze for sequences of problematic code points.  (Prior to
3515  * this final joining, sequences could have been split over boundaries, and
3516  * hence missed).  The sequences only happen in folding, hence for any
3517  * non-EXACT EXACTish node */
3518  if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3519   U8* s0 = (U8*) STRING(scan);
3520   U8* s = s0;
3521   U8* s_end = s0 + STR_LEN(scan);
3522
3523   int total_count_delta = 0;  /* Total delta number of characters that
3524          multi-char folds expand to */
3525
3526   /* One pass is made over the node's string looking for all the
3527   * possibilities.  To avoid some tests in the loop, there are two main
3528   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3529   * non-UTF-8 */
3530   if (UTF) {
3531    U8* folded = NULL;
3532
3533    if (OP(scan) == EXACTFL) {
3534     U8 *d;
3535
3536     /* An EXACTFL node would already have been changed to another
3537     * node type unless there is at least one character in it that
3538     * is problematic; likely a character whose fold definition
3539     * won't be known until runtime, and so has yet to be folded.
3540     * For all but the UTF-8 locale, folds are 1-1 in length, but
3541     * to handle the UTF-8 case, we need to create a temporary
3542     * folded copy using UTF-8 locale rules in order to analyze it.
3543     * This is because our macros that look to see if a sequence is
3544     * a multi-char fold assume everything is folded (otherwise the
3545     * tests in those macros would be too complicated and slow).
3546     * Note that here, the non-problematic folds will have already
3547     * been done, so we can just copy such characters.  We actually
3548     * don't completely fold the EXACTFL string.  We skip the
3549     * unfolded multi-char folds, as that would just create work
3550     * below to figure out the size they already are */
3551
3552     Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3553     d = folded;
3554     while (s < s_end) {
3555      STRLEN s_len = UTF8SKIP(s);
3556      if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3557       Copy(s, d, s_len, U8);
3558       d += s_len;
3559      }
3560      else if (is_FOLDS_TO_MULTI_utf8(s)) {
3561       *unfolded_multi_char = TRUE;
3562       Copy(s, d, s_len, U8);
3563       d += s_len;
3564      }
3565      else if (isASCII(*s)) {
3566       *(d++) = toFOLD(*s);
3567      }
3568      else {
3569       STRLEN len;
3570       _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3571       d += len;
3572      }
3573      s += s_len;
3574     }
3575
3576     /* Point the remainder of the routine to look at our temporary
3577     * folded copy */
3578     s = folded;
3579     s_end = d;
3580    } /* End of creating folded copy of EXACTFL string */
3581
3582    /* Examine the string for a multi-character fold sequence.  UTF-8
3583    * patterns have all characters pre-folded by the time this code is
3584    * executed */
3585    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3586          length sequence we are looking for is 2 */
3587    {
3588     int count = 0;  /* How many characters in a multi-char fold */
3589     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3590     if (! len) {    /* Not a multi-char fold: get next char */
3591      s += UTF8SKIP(s);
3592      continue;
3593     }
3594
3595     /* Nodes with 'ss' require special handling, except for
3596     * EXACTFA-ish for which there is no multi-char fold to this */
3597     if (len == 2 && *s == 's' && *(s+1) == 's'
3598      && OP(scan) != EXACTFA
3599      && OP(scan) != EXACTFA_NO_TRIE)
3600     {
3601      count = 2;
3602      if (OP(scan) != EXACTFL) {
3603       OP(scan) = EXACTFU_SS;
3604      }
3605      s += 2;
3606     }
3607     else { /* Here is a generic multi-char fold. */
3608      U8* multi_end  = s + len;
3609
3610      /* Count how many characters are in it.  In the case of
3611      * /aa, no folds which contain ASCII code points are
3612      * allowed, so check for those, and skip if found. */
3613      if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3614       count = utf8_length(s, multi_end);
3615       s = multi_end;
3616      }
3617      else {
3618       while (s < multi_end) {
3619        if (isASCII(*s)) {
3620         s++;
3621         goto next_iteration;
3622        }
3623        else {
3624         s += UTF8SKIP(s);
3625        }
3626        count++;
3627       }
3628      }
3629     }
3630
3631     /* The delta is how long the sequence is minus 1 (1 is how long
3632     * the character that folds to the sequence is) */
3633     total_count_delta += count - 1;
3634    next_iteration: ;
3635    }
3636
3637    /* We created a temporary folded copy of the string in EXACTFL
3638    * nodes.  Therefore we need to be sure it doesn't go below zero,
3639    * as the real string could be shorter */
3640    if (OP(scan) == EXACTFL) {
3641     int total_chars = utf8_length((U8*) STRING(scan),
3642           (U8*) STRING(scan) + STR_LEN(scan));
3643     if (total_count_delta > total_chars) {
3644      total_count_delta = total_chars;
3645     }
3646    }
3647
3648    *min_subtract += total_count_delta;
3649    Safefree(folded);
3650   }
3651   else if (OP(scan) == EXACTFA) {
3652
3653    /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3654    * fold to the ASCII range (and there are no existing ones in the
3655    * upper latin1 range).  But, as outlined in the comments preceding
3656    * this function, we need to flag any occurrences of the sharp s.
3657    * This character forbids trie formation (because of added
3658    * complexity) */
3659    while (s < s_end) {
3660     if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3661      OP(scan) = EXACTFA_NO_TRIE;
3662      *unfolded_multi_char = TRUE;
3663      break;
3664     }
3665     s++;
3666     continue;
3667    }
3668   }
3669   else {
3670
3671    /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3672    * folds that are all Latin1.  As explained in the comments
3673    * preceding this function, we look also for the sharp s in EXACTF
3674    * and EXACTFL nodes; it can be in the final position.  Otherwise
3675    * we can stop looking 1 byte earlier because have to find at least
3676    * two characters for a multi-fold */
3677    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3678        ? s_end
3679        : s_end -1;
3680
3681    while (s < upper) {
3682     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3683     if (! len) {    /* Not a multi-char fold. */
3684      if (*s == LATIN_SMALL_LETTER_SHARP_S
3685       && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3686      {
3687       *unfolded_multi_char = TRUE;
3688      }
3689      s++;
3690      continue;
3691     }
3692
3693     if (len == 2
3694      && isALPHA_FOLD_EQ(*s, 's')
3695      && isALPHA_FOLD_EQ(*(s+1), 's'))
3696     {
3697
3698      /* EXACTF nodes need to know that the minimum length
3699      * changed so that a sharp s in the string can match this
3700      * ss in the pattern, but they remain EXACTF nodes, as they
3701      * won't match this unless the target string is is UTF-8,
3702      * which we don't know until runtime.  EXACTFL nodes can't
3703      * transform into EXACTFU nodes */
3704      if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3705       OP(scan) = EXACTFU_SS;
3706      }
3707     }
3708
3709     *min_subtract += len - 1;
3710     s += len;
3711    }
3712   }
3713  }
3714
3715 #ifdef DEBUGGING
3716  /* Allow dumping but overwriting the collection of skipped
3717  * ops and/or strings with fake optimized ops */
3718  n = scan + NODE_SZ_STR(scan);
3719  while (n <= stop) {
3720   OP(n) = OPTIMIZED;
3721   FLAGS(n) = 0;
3722   NEXT_OFF(n) = 0;
3723   n++;
3724  }
3725 #endif
3726  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3727  return stopnow;
3728 }
3729
3730 /* REx optimizer.  Converts nodes into quicker variants "in place".
3731    Finds fixed substrings.  */
3732
3733 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3734    to the position after last scanned or to NULL. */
3735
3736 #define INIT_AND_WITHP \
3737  assert(!and_withp); \
3738  Newx(and_withp,1, regnode_ssc); \
3739  SAVEFREEPV(and_withp)
3740
3741
3742 static void
3743 S_unwind_scan_frames(pTHX_ const void *p)
3744 {
3745  scan_frame *f= (scan_frame *)p;
3746  do {
3747   scan_frame *n= f->next_frame;
3748   Safefree(f);
3749   f= n;
3750  } while (f);
3751 }
3752
3753
3754 STATIC SSize_t
3755 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3756       SSize_t *minlenp, SSize_t *deltap,
3757       regnode *last,
3758       scan_data_t *data,
3759       I32 stopparen,
3760       U32 recursed_depth,
3761       regnode_ssc *and_withp,
3762       U32 flags, U32 depth)
3763       /* scanp: Start here (read-write). */
3764       /* deltap: Write maxlen-minlen here. */
3765       /* last: Stop before this one. */
3766       /* data: string data about the pattern */
3767       /* stopparen: treat close N as END */
3768       /* recursed: which subroutines have we recursed into */
3769       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3770 {
3771  /* There must be at least this number of characters to match */
3772  SSize_t min = 0;
3773  I32 pars = 0, code;
3774  regnode *scan = *scanp, *next;
3775  SSize_t delta = 0;
3776  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3777  int is_inf_internal = 0;  /* The studied chunk is infinite */
3778  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3779  scan_data_t data_fake;
3780  SV *re_trie_maxbuff = NULL;
3781  regnode *first_non_open = scan;
3782  SSize_t stopmin = SSize_t_MAX;
3783  scan_frame *frame = NULL;
3784  GET_RE_DEBUG_FLAGS_DECL;
3785
3786  PERL_ARGS_ASSERT_STUDY_CHUNK;
3787
3788
3789  if ( depth == 0 ) {
3790   while (first_non_open && OP(first_non_open) == OPEN)
3791    first_non_open=regnext(first_non_open);
3792  }
3793
3794
3795   fake_study_recurse:
3796  DEBUG_r(
3797   RExC_study_chunk_recursed_count++;
3798  );
3799  DEBUG_OPTIMISE_MORE_r(
3800  {
3801   PerlIO_printf(Perl_debug_log,
3802    "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3803    (int)(depth*2), "", (long)stopparen,
3804    (unsigned long)RExC_study_chunk_recursed_count,
3805    (unsigned long)depth, (unsigned long)recursed_depth,
3806    scan,
3807    last);
3808   if (recursed_depth) {
3809    U32 i;
3810    U32 j;
3811    for ( j = 0 ; j < recursed_depth ; j++ ) {
3812     for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3813      if (
3814       PAREN_TEST(RExC_study_chunk_recursed +
3815         ( j * RExC_study_chunk_recursed_bytes), i )
3816       && (
3817        !j ||
3818        !PAREN_TEST(RExC_study_chunk_recursed +
3819         (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3820       )
3821      ) {
3822       PerlIO_printf(Perl_debug_log," %d",(int)i);
3823       break;
3824      }
3825     }
3826     if ( j + 1 < recursed_depth ) {
3827      PerlIO_printf(Perl_debug_log, ",");
3828     }
3829    }
3830   }
3831   PerlIO_printf(Perl_debug_log,"\n");
3832  }
3833  );
3834  while ( scan && OP(scan) != END && scan < last ){
3835   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3836         node length to get a real minimum (because
3837         the folded version may be shorter) */
3838   bool unfolded_multi_char = FALSE;
3839   /* Peephole optimizer: */
3840   DEBUG_STUDYDATA("Peep:", data, depth);
3841   DEBUG_PEEP("Peep", scan, depth);
3842
3843
3844   /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3845   * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3846   * by a different invocation of reg() -- Yves
3847   */
3848   JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3849
3850   /* Follow the next-chain of the current node and optimize
3851   away all the NOTHINGs from it.  */
3852   if (OP(scan) != CURLYX) {
3853    const int max = (reg_off_by_arg[OP(scan)]
3854      ? I32_MAX
3855      /* I32 may be smaller than U16 on CRAYs! */
3856      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3857    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3858    int noff;
3859    regnode *n = scan;
3860
3861    /* Skip NOTHING and LONGJMP. */
3862    while ((n = regnext(n))
3863     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3864      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3865     && off + noff < max)
3866     off += noff;
3867    if (reg_off_by_arg[OP(scan)])
3868     ARG(scan) = off;
3869    else
3870     NEXT_OFF(scan) = off;
3871   }
3872
3873   /* The principal pseudo-switch.  Cannot be a switch, since we
3874   look into several different things.  */
3875   if ( OP(scan) == DEFINEP ) {
3876    SSize_t minlen = 0;
3877    SSize_t deltanext = 0;
3878    SSize_t fake_last_close = 0;
3879    I32 f = SCF_IN_DEFINE;
3880
3881    StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3882    scan = regnext(scan);
3883    assert( OP(scan) == IFTHEN );
3884    DEBUG_PEEP("expect IFTHEN", scan, depth);
3885
3886    data_fake.last_closep= &fake_last_close;
3887    minlen = *minlenp;
3888    next = regnext(scan);
3889    scan = NEXTOPER(NEXTOPER(scan));
3890    DEBUG_PEEP("scan", scan, depth);
3891    DEBUG_PEEP("next", next, depth);
3892
3893    /* we suppose the run is continuous, last=next...
3894    * NOTE we dont use the return here! */
3895    (void)study_chunk(pRExC_state, &scan, &minlen,
3896        &deltanext, next, &data_fake, stopparen,
3897        recursed_depth, NULL, f, depth+1);
3898
3899    scan = next;
3900   } else
3901   if (
3902    OP(scan) == BRANCH  ||
3903    OP(scan) == BRANCHJ ||
3904    OP(scan) == IFTHEN
3905   ) {
3906    next = regnext(scan);
3907    code = OP(scan);
3908
3909    /* The op(next)==code check below is to see if we
3910    * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3911    * IFTHEN is special as it might not appear in pairs.
3912    * Not sure whether BRANCH-BRANCHJ is possible, regardless
3913    * we dont handle it cleanly. */
3914    if (OP(next) == code || code == IFTHEN) {
3915     /* NOTE - There is similar code to this block below for
3916     * handling TRIE nodes on a re-study.  If you change stuff here
3917     * check there too. */
3918     SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3919     regnode_ssc accum;
3920     regnode * const startbranch=scan;
3921
3922     if (flags & SCF_DO_SUBSTR) {
3923      /* Cannot merge strings after this. */
3924      scan_commit(pRExC_state, data, minlenp, is_inf);
3925     }
3926
3927     if (flags & SCF_DO_STCLASS)
3928      ssc_init_zero(pRExC_state, &accum);
3929
3930     while (OP(scan) == code) {
3931      SSize_t deltanext, minnext, fake;
3932      I32 f = 0;
3933      regnode_ssc this_class;
3934
3935      DEBUG_PEEP("Branch", scan, depth);
3936
3937      num++;
3938      StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3939      if (data) {
3940       data_fake.whilem_c = data->whilem_c;
3941       data_fake.last_closep = data->last_closep;
3942      }
3943      else
3944       data_fake.last_closep = &fake;
3945
3946      data_fake.pos_delta = delta;
3947      next = regnext(scan);
3948
3949      scan = NEXTOPER(scan); /* everything */
3950      if (code != BRANCH)    /* everything but BRANCH */
3951       scan = NEXTOPER(scan);
3952
3953      if (flags & SCF_DO_STCLASS) {
3954       ssc_init(pRExC_state, &this_class);
3955       data_fake.start_class = &this_class;
3956       f = SCF_DO_STCLASS_AND;
3957      }
3958      if (flags & SCF_WHILEM_VISITED_POS)
3959       f |= SCF_WHILEM_VISITED_POS;
3960
3961      /* we suppose the run is continuous, last=next...*/
3962      minnext = study_chunk(pRExC_state, &scan, minlenp,
3963          &deltanext, next, &data_fake, stopparen,
3964          recursed_depth, NULL, f,depth+1);
3965
3966      if (min1 > minnext)
3967       min1 = minnext;
3968      if (deltanext == SSize_t_MAX) {
3969       is_inf = is_inf_internal = 1;
3970       max1 = SSize_t_MAX;
3971      } else if (max1 < minnext + deltanext)
3972       max1 = minnext + deltanext;
3973      scan = next;
3974      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3975       pars++;
3976      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3977       if ( stopmin > minnext)
3978        stopmin = min + min1;
3979       flags &= ~SCF_DO_SUBSTR;
3980       if (data)
3981        data->flags |= SCF_SEEN_ACCEPT;
3982      }
3983      if (data) {
3984       if (data_fake.flags & SF_HAS_EVAL)
3985        data->flags |= SF_HAS_EVAL;
3986       data->whilem_c = data_fake.whilem_c;
3987      }
3988      if (flags & SCF_DO_STCLASS)
3989       ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3990     }
3991     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3992      min1 = 0;
3993     if (flags & SCF_DO_SUBSTR) {
3994      data->pos_min += min1;
3995      if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3996       data->pos_delta = SSize_t_MAX;
3997      else
3998       data->pos_delta += max1 - min1;
3999      if (max1 != min1 || is_inf)
4000       data->longest = &(data->longest_float);
4001     }
4002     min += min1;
4003     if (delta == SSize_t_MAX
4004     || SSize_t_MAX - delta - (max1 - min1) < 0)
4005      delta = SSize_t_MAX;
4006     else
4007      delta += max1 - min1;
4008     if (flags & SCF_DO_STCLASS_OR) {
4009      ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4010      if (min1) {
4011       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4012       flags &= ~SCF_DO_STCLASS;
4013      }
4014     }
4015     else if (flags & SCF_DO_STCLASS_AND) {
4016      if (min1) {
4017       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4018       flags &= ~SCF_DO_STCLASS;
4019      }
4020      else {
4021       /* Switch to OR mode: cache the old value of
4022       * data->start_class */
4023       INIT_AND_WITHP;
4024       StructCopy(data->start_class, and_withp, regnode_ssc);
4025       flags &= ~SCF_DO_STCLASS_AND;
4026       StructCopy(&accum, data->start_class, regnode_ssc);
4027       flags |= SCF_DO_STCLASS_OR;
4028      }
4029     }
4030
4031     if (PERL_ENABLE_TRIE_OPTIMISATION &&
4032       OP( startbranch ) == BRANCH )
4033     {
4034     /* demq.
4035
4036     Assuming this was/is a branch we are dealing with: 'scan'
4037     now points at the item that follows the branch sequence,
4038     whatever it is. We now start at the beginning of the
4039     sequence and look for subsequences of
4040
4041     BRANCH->EXACT=>x1
4042     BRANCH->EXACT=>x2
4043     tail
4044
4045     which would be constructed from a pattern like
4046     /A|LIST|OF|WORDS/
4047
4048     If we can find such a subsequence we need to turn the first
4049     element into a trie and then add the subsequent branch exact
4050     strings to the trie.
4051
4052     We have two cases
4053
4054      1. patterns where the whole set of branches can be
4055       converted.
4056
4057      2. patterns where only a subset can be converted.
4058
4059     In case 1 we can replace the whole set with a single regop
4060     for the trie. In case 2 we need to keep the start and end
4061     branches so
4062
4063      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4064      becomes BRANCH TRIE; BRANCH X;
4065
4066     There is an additional case, that being where there is a
4067     common prefix, which gets split out into an EXACT like node
4068     preceding the TRIE node.
4069
4070     If x(1..n)==tail then we can do a simple trie, if not we make
4071     a "jump" trie, such that when we match the appropriate word
4072     we "jump" to the appropriate tail node. Essentially we turn
4073     a nested if into a case structure of sorts.
4074
4075     */
4076
4077      int made=0;
4078      if (!re_trie_maxbuff) {
4079       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4080       if (!SvIOK(re_trie_maxbuff))
4081        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4082      }
4083      if ( SvIV(re_trie_maxbuff)>=0  ) {
4084       regnode *cur;
4085       regnode *first = (regnode *)NULL;
4086       regnode *last = (regnode *)NULL;
4087       regnode *tail = scan;
4088       U8 trietype = 0;
4089       U32 count=0;
4090
4091       /* var tail is used because there may be a TAIL
4092       regop in the way. Ie, the exacts will point to the
4093       thing following the TAIL, but the last branch will
4094       point at the TAIL. So we advance tail. If we
4095       have nested (?:) we may have to move through several
4096       tails.
4097       */
4098
4099       while ( OP( tail ) == TAIL ) {
4100        /* this is the TAIL generated by (?:) */
4101        tail = regnext( tail );
4102       }
4103
4104
4105       DEBUG_TRIE_COMPILE_r({
4106        regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4107        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4108        (int)depth * 2 + 2, "",
4109        "Looking for TRIE'able sequences. Tail node is: ",
4110        SvPV_nolen_const( RExC_mysv )
4111        );
4112       });
4113
4114       /*
4115
4116        Step through the branches
4117         cur represents each branch,
4118         noper is the first thing to be matched as part
4119          of that branch
4120         noper_next is the regnext() of that node.
4121
4122        We normally handle a case like this
4123        /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4124        support building with NOJUMPTRIE, which restricts
4125        the trie logic to structures like /FOO|BAR/.
4126
4127        If noper is a trieable nodetype then the branch is
4128        a possible optimization target. If we are building
4129        under NOJUMPTRIE then we require that noper_next is
4130        the same as scan (our current position in the regex
4131        program).
4132
4133        Once we have two or more consecutive such branches
4134        we can create a trie of the EXACT's contents and
4135        stitch it in place into the program.
4136
4137        If the sequence represents all of the branches in
4138        the alternation we replace the entire thing with a
4139        single TRIE node.
4140
4141        Otherwise when it is a subsequence we need to
4142        stitch it in place and replace only the relevant
4143        branches. This means the first branch has to remain
4144        as it is used by the alternation logic, and its
4145        next pointer, and needs to be repointed at the item
4146        on the branch chain following the last branch we
4147        have optimized away.
4148
4149        This could be either a BRANCH, in which case the
4150        subsequence is internal, or it could be the item
4151        following the branch sequence in which case the
4152        subsequence is at the end (which does not
4153        necessarily mean the first node is the start of the
4154        alternation).
4155
4156        TRIE_TYPE(X) is a define which maps the optype to a
4157        trietype.
4158
4159         optype          |  trietype
4160         ----------------+-----------
4161         NOTHING         | NOTHING
4162         EXACT           | EXACT
4163         EXACTFU         | EXACTFU
4164         EXACTFU_SS      | EXACTFU
4165         EXACTFA         | EXACTFA
4166         EXACTL          | EXACTL
4167         EXACTFLU8       | EXACTFLU8
4168
4169
4170       */
4171 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4172      ? NOTHING                                            \
4173      : ( EXACT == (X) )                                   \
4174       ? EXACT                                            \
4175       : ( EXACTFU == (X) || EXACTFU_SS == (X) )          \
4176       ? EXACTFU                                        \
4177       : ( EXACTFA == (X) )                             \
4178        ? EXACTFA                                      \
4179        : ( EXACTL == (X) )                            \
4180        ? EXACTL                                     \
4181        : ( EXACTFLU8 == (X) )                        \
4182         ? EXACTFLU8                                 \
4183         : 0 )
4184
4185       /* dont use tail as the end marker for this traverse */
4186       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4187        regnode * const noper = NEXTOPER( cur );
4188        U8 noper_type = OP( noper );
4189        U8 noper_trietype = TRIE_TYPE( noper_type );
4190 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4191        regnode * const noper_next = regnext( noper );
4192        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4193        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4194 #endif
4195
4196        DEBUG_TRIE_COMPILE_r({
4197         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4198         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4199         (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4200
4201         regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4202         PerlIO_printf( Perl_debug_log, " -> %s",
4203          SvPV_nolen_const(RExC_mysv));
4204
4205         if ( noper_next ) {
4206         regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4207         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4208          SvPV_nolen_const(RExC_mysv));
4209         }
4210         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4211         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4212         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4213         );
4214        });
4215
4216        /* Is noper a trieable nodetype that can be merged
4217        * with the current trie (if there is one)? */
4218        if ( noper_trietype
4219         &&
4220         (
4221           ( noper_trietype == NOTHING)
4222           || ( trietype == NOTHING )
4223           || ( trietype == noper_trietype )
4224         )
4225 #ifdef NOJUMPTRIE
4226         && noper_next == tail
4227 #endif
4228         && count < U16_MAX)
4229        {
4230         /* Handle mergable triable node Either we are
4231         * the first node in a new trieable sequence,
4232         * in which case we do some bookkeeping,
4233         * otherwise we update the end pointer. */
4234         if ( !first ) {
4235          first = cur;
4236          if ( noper_trietype == NOTHING ) {
4237 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4238           regnode * const noper_next = regnext( noper );
4239           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4240           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4241 #endif
4242
4243           if ( noper_next_trietype ) {
4244            trietype = noper_next_trietype;
4245           } else if (noper_next_type)  {
4246            /* a NOTHING regop is 1 regop wide.
4247            * We need at least two for a trie
4248            * so we can't merge this in */
4249            first = NULL;
4250           }
4251          } else {
4252           trietype = noper_trietype;
4253          }
4254         } else {
4255          if ( trietype == NOTHING )
4256           trietype = noper_trietype;
4257          last = cur;
4258         }
4259         if (first)
4260          count++;
4261        } /* end handle mergable triable node */
4262        else {
4263         /* handle unmergable node -
4264         * noper may either be a triable node which can
4265         * not be tried together with the current trie,
4266         * or a non triable node */
4267         if ( last ) {
4268          /* If last is set and trietype is not
4269          * NOTHING then we have found at least two
4270          * triable branch sequences in a row of a
4271          * similar trietype so we can turn them
4272          * into a trie. If/when we allow NOTHING to
4273          * start a trie sequence this condition
4274          * will be required, and it isn't expensive
4275          * so we leave it in for now. */
4276          if ( trietype && trietype != NOTHING )
4277           make_trie( pRExC_state,
4278             startbranch, first, cur, tail,
4279             count, trietype, depth+1 );
4280          last = NULL; /* note: we clear/update
4281              first, trietype etc below,
4282              so we dont do it here */
4283         }
4284         if ( noper_trietype
4285 #ifdef NOJUMPTRIE
4286          && noper_next == tail
4287 #endif
4288         ){
4289          /* noper is triable, so we can start a new
4290          * trie sequence */
4291          count = 1;
4292          first = cur;
4293          trietype = noper_trietype;
4294         } else if (first) {
4295          /* if we already saw a first but the
4296          * current node is not triable then we have
4297          * to reset the first information. */
4298          count = 0;
4299          first = NULL;
4300          trietype = 0;
4301         }
4302        } /* end handle unmergable node */
4303       } /* loop over branches */
4304       DEBUG_TRIE_COMPILE_r({
4305        regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4306        PerlIO_printf( Perl_debug_log,
4307        "%*s- %s (%d) <SCAN FINISHED>\n",
4308        (int)depth * 2 + 2,
4309        "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4310
4311       });
4312       if ( last && trietype ) {
4313        if ( trietype != NOTHING ) {
4314         /* the last branch of the sequence was part of
4315         * a trie, so we have to construct it here
4316         * outside of the loop */
4317         made= make_trie( pRExC_state, startbranch,
4318             first, scan, tail, count,
4319             trietype, depth+1 );
4320 #ifdef TRIE_STUDY_OPT
4321         if ( ((made == MADE_EXACT_TRIE &&
4322          startbranch == first)
4323          || ( first_non_open == first )) &&
4324          depth==0 ) {
4325          flags |= SCF_TRIE_RESTUDY;
4326          if ( startbranch == first
4327           && scan == tail )
4328          {
4329           RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4330          }
4331         }
4332 #endif
4333        } else {
4334         /* at this point we know whatever we have is a
4335         * NOTHING sequence/branch AND if 'startbranch'
4336         * is 'first' then we can turn the whole thing
4337         * into a NOTHING
4338         */
4339         if ( startbranch == first ) {
4340          regnode *opt;
4341          /* the entire thing is a NOTHING sequence,
4342          * something like this: (?:|) So we can
4343          * turn it into a plain NOTHING op. */
4344          DEBUG_TRIE_COMPILE_r({
4345           regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4346           PerlIO_printf( Perl_debug_log,
4347           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4348           "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4349
4350          });
4351          OP(startbranch)= NOTHING;
4352          NEXT_OFF(startbranch)= tail - startbranch;
4353          for ( opt= startbranch + 1; opt < tail ; opt++ )
4354           OP(opt)= OPTIMIZED;
4355         }
4356        }
4357       } /* end if ( last) */
4358      } /* TRIE_MAXBUF is non zero */
4359
4360     } /* do trie */
4361
4362    }
4363    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4364     scan = NEXTOPER(NEXTOPER(scan));
4365    } else   /* single branch is optimized. */
4366     scan = NEXTOPER(scan);
4367    continue;
4368   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4369    I32 paren = 0;
4370    regnode *start = NULL;
4371    regnode *end = NULL;
4372    U32 my_recursed_depth= recursed_depth;
4373
4374
4375    if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4376     /* Do setup, note this code has side effects beyond
4377     * the rest of this block. Specifically setting
4378     * RExC_recurse[] must happen at least once during
4379     * study_chunk(). */
4380     if (OP(scan) == GOSUB) {
4381      paren = ARG(scan);
4382      RExC_recurse[ARG2L(scan)] = scan;
4383      start = RExC_open_parens[paren-1];
4384      end   = RExC_close_parens[paren-1];
4385     } else {
4386      start = RExC_rxi->program + 1;
4387      end   = RExC_opend;
4388     }
4389     /* NOTE we MUST always execute the above code, even
4390     * if we do nothing with a GOSUB/GOSTART */
4391     if (
4392      ( flags & SCF_IN_DEFINE )
4393      ||
4394      (
4395       (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4396       &&
4397       ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4398      )
4399     ) {
4400      /* no need to do anything here if we are in a define. */
4401      /* or we are after some kind of infinite construct
4402      * so we can skip recursing into this item.
4403      * Since it is infinite we will not change the maxlen
4404      * or delta, and if we miss something that might raise
4405      * the minlen it will merely pessimise a little.
4406      *
4407      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4408      * might result in a minlen of 1 and not of 4,
4409      * but this doesn't make us mismatch, just try a bit
4410      * harder than we should.
4411      * */
4412      scan= regnext(scan);
4413      continue;
4414     }
4415
4416     if (
4417      !recursed_depth
4418      ||
4419      !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4420     ) {
4421      /* it is quite possible that there are more efficient ways
4422      * to do this. We maintain a bitmap per level of recursion
4423      * of which patterns we have entered so we can detect if a
4424      * pattern creates a possible infinite loop. When we
4425      * recurse down a level we copy the previous levels bitmap
4426      * down. When we are at recursion level 0 we zero the top
4427      * level bitmap. It would be nice to implement a different
4428      * more efficient way of doing this. In particular the top
4429      * level bitmap may be unnecessary.
4430      */
4431      if (!recursed_depth) {
4432       Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4433      } else {
4434       Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4435        RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4436        RExC_study_chunk_recursed_bytes, U8);
4437      }
4438      /* we havent recursed into this paren yet, so recurse into it */
4439      DEBUG_STUDYDATA("set:", data,depth);
4440      PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4441      my_recursed_depth= recursed_depth + 1;
4442     } else {
4443      DEBUG_STUDYDATA("inf:", data,depth);
4444      /* some form of infinite recursion, assume infinite length
4445      * */
4446      if (flags & SCF_DO_SUBSTR) {
4447       scan_commit(pRExC_state, data, minlenp, is_inf);
4448       data->longest = &(data->longest_float);
4449      }
4450      is_inf = is_inf_internal = 1;
4451      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4452       ssc_anything(data->start_class);
4453      flags &= ~SCF_DO_STCLASS;
4454
4455      start= NULL; /* reset start so we dont recurse later on. */
4456     }
4457    } else {
4458     paren = stopparen;
4459     start = scan + 2;
4460     end = regnext(scan);
4461    }
4462    if (start) {
4463     scan_frame *newframe;
4464     assert(end);
4465     if (!RExC_frame_last) {
4466      Newxz(newframe, 1, scan_frame);
4467      SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4468      RExC_frame_head= newframe;
4469      RExC_frame_count++;
4470     } else if (!RExC_frame_last->next_frame) {
4471      Newxz(newframe,1,scan_frame);
4472      RExC_frame_last->next_frame= newframe;
4473      newframe->prev_frame= RExC_frame_last;
4474      RExC_frame_count++;
4475     } else {
4476      newframe= RExC_frame_last->next_frame;
4477     }
4478     RExC_frame_last= newframe;
4479
4480     newframe->next_regnode = regnext(scan);
4481     newframe->last_regnode = last;
4482     newframe->stopparen = stopparen;
4483     newframe->prev_recursed_depth = recursed_depth;
4484     newframe->this_prev_frame= frame;
4485
4486     DEBUG_STUDYDATA("frame-new:",data,depth);
4487     DEBUG_PEEP("fnew", scan, depth);
4488
4489     frame = newframe;
4490     scan =  start;
4491     stopparen = paren;
4492     last = end;
4493     depth = depth + 1;
4494     recursed_depth= my_recursed_depth;
4495
4496     continue;
4497    }
4498   }
4499   else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4500    SSize_t l = STR_LEN(scan);
4501    UV uc;
4502    if (UTF) {
4503     const U8 * const s = (U8*)STRING(scan);
4504     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4505     l = utf8_length(s, s + l);
4506    } else {
4507     uc = *((U8*)STRING(scan));
4508    }
4509    min += l;
4510    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4511     /* The code below prefers earlier match for fixed
4512     offset, later match for variable offset.  */
4513     if (data->last_end == -1) { /* Update the start info. */
4514      data->last_start_min = data->pos_min;
4515      data->last_start_max = is_inf
4516       ? SSize_t_MAX : data->pos_min + data->pos_delta;
4517     }
4518     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4519     if (UTF)
4520      SvUTF8_on(data->last_found);
4521     {
4522      SV * const sv = data->last_found;
4523      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4524       mg_find(sv, PERL_MAGIC_utf8) : NULL;
4525      if (mg && mg->mg_len >= 0)
4526       mg->mg_len += utf8_length((U8*)STRING(scan),
4527            (U8*)STRING(scan)+STR_LEN(scan));
4528     }
4529     data->last_end = data->pos_min + l;
4530     data->pos_min += l; /* As in the first entry. */
4531     data->flags &= ~SF_BEFORE_EOL;
4532    }
4533
4534    /* ANDing the code point leaves at most it, and not in locale, and
4535    * can't match null string */
4536    if (flags & SCF_DO_STCLASS_AND) {
4537     ssc_cp_and(data->start_class, uc);
4538     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4539     ssc_clear_locale(data->start_class);
4540    }
4541    else if (flags & SCF_DO_STCLASS_OR) {
4542     ssc_add_cp(data->start_class, uc);
4543     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4544
4545     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4546     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4547    }
4548    flags &= ~SCF_DO_STCLASS;
4549   }
4550   else if (PL_regkind[OP(scan)] == EXACT) {
4551    /* But OP != EXACT!, so is EXACTFish */
4552    SSize_t l = STR_LEN(scan);
4553    const U8 * s = (U8*)STRING(scan);
4554
4555    /* Search for fixed substrings supports EXACT only. */
4556    if (flags & SCF_DO_SUBSTR) {
4557     assert(data);
4558     scan_commit(pRExC_state, data, minlenp, is_inf);
4559    }
4560    if (UTF) {
4561     l = utf8_length(s, s + l);
4562    }
4563    if (unfolded_multi_char) {
4564     RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4565    }
4566    min += l - min_subtract;
4567    assert (min >= 0);
4568    delta += min_subtract;
4569    if (flags & SCF_DO_SUBSTR) {
4570     data->pos_min += l - min_subtract;
4571     if (data->pos_min < 0) {
4572      data->pos_min = 0;
4573     }
4574     data->pos_delta += min_subtract;
4575     if (min_subtract) {
4576      data->longest = &(data->longest_float);
4577     }
4578    }
4579
4580    if (flags & SCF_DO_STCLASS) {
4581     SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4582
4583     assert(EXACTF_invlist);
4584     if (flags & SCF_DO_STCLASS_AND) {
4585      if (OP(scan) != EXACTFL)
4586       ssc_clear_locale(data->start_class);
4587      ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4588      ANYOF_POSIXL_ZERO(data->start_class);
4589      ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4590     }
4591     else {  /* SCF_DO_STCLASS_OR */
4592      ssc_union(data->start_class, EXACTF_invlist, FALSE);
4593      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4594
4595      /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4596      ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4597     }
4598     flags &= ~SCF_DO_STCLASS;
4599     SvREFCNT_dec(EXACTF_invlist);
4600    }
4601   }
4602   else if (REGNODE_VARIES(OP(scan))) {
4603    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4604    I32 fl = 0, f = flags;
4605    regnode * const oscan = scan;
4606    regnode_ssc this_class;
4607    regnode_ssc *oclass = NULL;
4608    I32 next_is_eval = 0;
4609
4610    switch (PL_regkind[OP(scan)]) {
4611    case WHILEM:  /* End of (?:...)* . */
4612     scan = NEXTOPER(scan);
4613     goto finish;
4614    case PLUS:
4615     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4616      next = NEXTOPER(scan);
4617      if (OP(next) == EXACT
4618       || OP(next) == EXACTL
4619       || (flags & SCF_DO_STCLASS))
4620      {
4621       mincount = 1;
4622       maxcount = REG_INFTY;
4623       next = regnext(scan);
4624       scan = NEXTOPER(scan);
4625       goto do_curly;
4626      }
4627     }
4628     if (flags & SCF_DO_SUBSTR)
4629      data->pos_min++;
4630     min++;
4631     /* FALLTHROUGH */
4632    case STAR:
4633     if (flags & SCF_DO_STCLASS) {
4634      mincount = 0;
4635      maxcount = REG_INFTY;
4636      next = regnext(scan);
4637      scan = NEXTOPER(scan);
4638      goto do_curly;
4639     }
4640     if (flags & SCF_DO_SUBSTR) {
4641      scan_commit(pRExC_state, data, minlenp, is_inf);
4642      /* Cannot extend fixed substrings */
4643      data->longest = &(data->longest_float);
4644     }
4645     is_inf = is_inf_internal = 1;
4646     scan = regnext(scan);
4647     goto optimize_curly_tail;
4648    case CURLY:
4649     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4650      && (scan->flags == stopparen))
4651     {
4652      mincount = 1;
4653      maxcount = 1;
4654     } else {
4655      mincount = ARG1(scan);
4656      maxcount = ARG2(scan);
4657     }
4658     next = regnext(scan);
4659     if (OP(scan) == CURLYX) {
4660      I32 lp = (data ? *(data->last_closep) : 0);
4661      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4662     }
4663     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4664     next_is_eval = (OP(scan) == EVAL);
4665    do_curly:
4666     if (flags & SCF_DO_SUBSTR) {
4667      if (mincount == 0)
4668       scan_commit(pRExC_state, data, minlenp, is_inf);
4669      /* Cannot extend fixed substrings */
4670      pos_before = data->pos_min;
4671     }
4672     if (data) {
4673      fl = data->flags;
4674      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4675      if (is_inf)
4676       data->flags |= SF_IS_INF;
4677     }
4678     if (flags & SCF_DO_STCLASS) {
4679      ssc_init(pRExC_state, &this_class);
4680      oclass = data->start_class;
4681      data->start_class = &this_class;
4682      f |= SCF_DO_STCLASS_AND;
4683      f &= ~SCF_DO_STCLASS_OR;
4684     }
4685     /* Exclude from super-linear cache processing any {n,m}
4686     regops for which the combination of input pos and regex
4687     pos is not enough information to determine if a match
4688     will be possible.
4689
4690     For example, in the regex /foo(bar\s*){4,8}baz/ with the
4691     regex pos at the \s*, the prospects for a match depend not
4692     only on the input position but also on how many (bar\s*)
4693     repeats into the {4,8} we are. */
4694    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4695      f &= ~SCF_WHILEM_VISITED_POS;
4696
4697     /* This will finish on WHILEM, setting scan, or on NULL: */
4698     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4699         last, data, stopparen, recursed_depth, NULL,
4700         (mincount == 0
4701         ? (f & ~SCF_DO_SUBSTR)
4702         : f)
4703         ,depth+1);
4704
4705     if (flags & SCF_DO_STCLASS)
4706      data->start_class = oclass;
4707     if (mincount == 0 || minnext == 0) {
4708      if (flags & SCF_DO_STCLASS_OR) {
4709       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4710      }
4711      else if (flags & SCF_DO_STCLASS_AND) {
4712       /* Switch to OR mode: cache the old value of
4713       * data->start_class */
4714       INIT_AND_WITHP;
4715       StructCopy(data->start_class, and_withp, regnode_ssc);
4716       flags &= ~SCF_DO_STCLASS_AND;
4717       StructCopy(&this_class, data->start_class, regnode_ssc);
4718       flags |= SCF_DO_STCLASS_OR;
4719       ANYOF_FLAGS(data->start_class)
4720             |= SSC_MATCHES_EMPTY_STRING;
4721      }
4722     } else {  /* Non-zero len */
4723      if (flags & SCF_DO_STCLASS_OR) {
4724       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4725       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4726      }
4727      else if (flags & SCF_DO_STCLASS_AND)
4728       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4729      flags &= ~SCF_DO_STCLASS;
4730     }
4731     if (!scan)   /* It was not CURLYX, but CURLY. */
4732      scan = next;
4733     if (!(flags & SCF_TRIE_DOING_RESTUDY)
4734      /* ? quantifier ok, except for (?{ ... }) */
4735      && (next_is_eval || !(mincount == 0 && maxcount == 1))
4736      && (minnext == 0) && (deltanext == 0)
4737      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4738      && maxcount <= REG_INFTY/3) /* Complement check for big
4739             count */
4740     {
4741      /* Fatal warnings may leak the regexp without this: */
4742      SAVEFREESV(RExC_rx_sv);
4743      Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4744       "Quantifier unexpected on zero-length expression "
4745       "in regex m/%"UTF8f"/",
4746       UTF8fARG(UTF, RExC_end - RExC_precomp,
4747         RExC_precomp));
4748      (void)ReREFCNT_inc(RExC_rx_sv);
4749     }
4750
4751     min += minnext * mincount;
4752     is_inf_internal |= deltanext == SSize_t_MAX
4753       || (maxcount == REG_INFTY && minnext + deltanext > 0);
4754     is_inf |= is_inf_internal;
4755     if (is_inf) {
4756      delta = SSize_t_MAX;
4757     } else {
4758      delta += (minnext + deltanext) * maxcount
4759        - minnext * mincount;
4760     }
4761     /* Try powerful optimization CURLYX => CURLYN. */
4762     if (  OP(oscan) == CURLYX && data
4763      && data->flags & SF_IN_PAR
4764      && !(data->flags & SF_HAS_EVAL)
4765      && !deltanext && minnext == 1 ) {
4766      /* Try to optimize to CURLYN.  */
4767      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4768      regnode * const nxt1 = nxt;
4769 #ifdef DEBUGGING
4770      regnode *nxt2;
4771 #endif
4772
4773      /* Skip open. */
4774      nxt = regnext(nxt);
4775      if (!REGNODE_SIMPLE(OP(nxt))
4776       && !(PL_regkind[OP(nxt)] == EXACT
4777        && STR_LEN(nxt) == 1))
4778       goto nogo;
4779 #ifdef DEBUGGING
4780      nxt2 = nxt;
4781 #endif
4782      nxt = regnext(nxt);
4783      if (OP(nxt) != CLOSE)
4784       goto nogo;
4785      if (RExC_open_parens) {
4786       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4787       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4788      }
4789      /* Now we know that nxt2 is the only contents: */
4790      oscan->flags = (U8)ARG(nxt);
4791      OP(oscan) = CURLYN;
4792      OP(nxt1) = NOTHING; /* was OPEN. */
4793
4794 #ifdef DEBUGGING
4795      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4796      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4797      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4798      OP(nxt) = OPTIMIZED; /* was CLOSE. */
4799      OP(nxt + 1) = OPTIMIZED; /* was count. */
4800      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4801 #endif
4802     }
4803    nogo:
4804
4805     /* Try optimization CURLYX => CURLYM. */
4806     if (  OP(oscan) == CURLYX && data
4807      && !(data->flags & SF_HAS_PAR)
4808      && !(data->flags & SF_HAS_EVAL)
4809      && !deltanext /* atom is fixed width */
4810      && minnext != 0 /* CURLYM can't handle zero width */
4811
4812       /* Nor characters whose fold at run-time may be
4813       * multi-character */
4814      && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4815     ) {
4816      /* XXXX How to optimize if data == 0? */
4817      /* Optimize to a simpler form.  */
4818      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4819      regnode *nxt2;
4820
4821      OP(oscan) = CURLYM;
4822      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4823        && (OP(nxt2) != WHILEM))
4824       nxt = nxt2;
4825      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4826      /* Need to optimize away parenths. */
4827      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4828       /* Set the parenth number.  */
4829       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4830
4831       oscan->flags = (U8)ARG(nxt);
4832       if (RExC_open_parens) {
4833        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4834        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4835       }
4836       OP(nxt1) = OPTIMIZED; /* was OPEN. */
4837       OP(nxt) = OPTIMIZED; /* was CLOSE. */
4838
4839 #ifdef DEBUGGING
4840       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4841       OP(nxt + 1) = OPTIMIZED; /* was count. */
4842       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4843       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4844 #endif
4845 #if 0
4846       while ( nxt1 && (OP(nxt1) != WHILEM)) {
4847        regnode *nnxt = regnext(nxt1);
4848        if (nnxt == nxt) {
4849         if (reg_off_by_arg[OP(nxt1)])
4850          ARG_SET(nxt1, nxt2 - nxt1);
4851         else if (nxt2 - nxt1 < U16_MAX)
4852          NEXT_OFF(nxt1) = nxt2 - nxt1;
4853         else
4854          OP(nxt) = NOTHING; /* Cannot beautify */
4855        }
4856        nxt1 = nnxt;
4857       }
4858 #endif
4859       /* Optimize again: */
4860       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4861          NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4862      }
4863      else
4864       oscan->flags = 0;
4865     }
4866     else if ((OP(oscan) == CURLYX)
4867       && (flags & SCF_WHILEM_VISITED_POS)
4868       /* See the comment on a similar expression above.
4869        However, this time it's not a subexpression
4870        we care about, but the expression itself. */
4871       && (maxcount == REG_INFTY)
4872       && data && ++data->whilem_c < 16) {
4873      /* This stays as CURLYX, we can put the count/of pair. */
4874      /* Find WHILEM (as in regexec.c) */
4875      regnode *nxt = oscan + NEXT_OFF(oscan);
4876
4877      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4878       nxt += ARG(nxt);
4879      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4880       | (RExC_whilem_seen << 4)); /* On WHILEM */
4881     }
4882     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4883      pars++;
4884     if (flags & SCF_DO_SUBSTR) {
4885      SV *last_str = NULL;
4886      STRLEN last_chrs = 0;
4887      int counted = mincount != 0;
4888
4889      if (data->last_end > 0 && mincount != 0) { /* Ends with a
4890                 string. */
4891       SSize_t b = pos_before >= data->last_start_min
4892        ? pos_before : data->last_start_min;
4893       STRLEN l;
4894       const char * const s = SvPV_const(data->last_found, l);
4895       SSize_t old = b - data->last_start_min;
4896
4897       if (UTF)
4898        old = utf8_hop((U8*)s, old) - (U8*)s;
4899       l -= old;
4900       /* Get the added string: */
4901       last_str = newSVpvn_utf8(s  + old, l, UTF);
4902       last_chrs = UTF ? utf8_length((U8*)(s + old),
4903            (U8*)(s + old + l)) : l;
4904       if (deltanext == 0 && pos_before == b) {
4905        /* What was added is a constant string */
4906        if (mincount > 1) {
4907
4908         SvGROW(last_str, (mincount * l) + 1);
4909         repeatcpy(SvPVX(last_str) + l,
4910           SvPVX_const(last_str), l,
4911           mincount - 1);
4912         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4913         /* Add additional parts. */
4914         SvCUR_set(data->last_found,
4915           SvCUR(data->last_found) - l);
4916         sv_catsv(data->last_found, last_str);
4917         {
4918          SV * sv = data->last_found;
4919          MAGIC *mg =
4920           SvUTF8(sv) && SvMAGICAL(sv) ?
4921           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4922          if (mg && mg->mg_len >= 0)
4923           mg->mg_len += last_chrs * (mincount-1);
4924         }
4925         last_chrs *= mincount;
4926         data->last_end += l * (mincount - 1);
4927        }
4928       } else {
4929        /* start offset must point into the last copy */
4930        data->last_start_min += minnext * (mincount - 1);
4931        data->last_start_max =
4932        is_inf
4933        ? SSize_t_MAX
4934        : data->last_start_max +
4935         (maxcount - 1) * (minnext + data->pos_delta);
4936       }
4937      }
4938      /* It is counted once already... */
4939      data->pos_min += minnext * (mincount - counted);
4940 #if 0
4941 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4942        " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4943        " maxcount=%"UVuf" mincount=%"UVuf"\n",
4944  (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4945  (UV)mincount);
4946 if (deltanext != SSize_t_MAX)
4947 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4948  (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4949   - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4950 #endif
4951      if (deltanext == SSize_t_MAX
4952       || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4953       data->pos_delta = SSize_t_MAX;
4954      else
4955       data->pos_delta += - counted * deltanext +
4956       (minnext + deltanext) * maxcount - minnext * mincount;
4957      if (mincount != maxcount) {
4958       /* Cannot extend fixed substrings found inside
4959        the group.  */
4960       scan_commit(pRExC_state, data, minlenp, is_inf);
4961       if (mincount && last_str) {
4962        SV * const sv = data->last_found;
4963        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4964         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4965
4966        if (mg)
4967         mg->mg_len = -1;
4968        sv_setsv(sv, last_str);
4969        data->last_end = data->pos_min;
4970        data->last_start_min = data->pos_min - last_chrs;
4971        data->last_start_max = is_inf
4972         ? SSize_t_MAX
4973         : data->pos_min + data->pos_delta - last_chrs;
4974       }
4975       data->longest = &(data->longest_float);
4976      }
4977      SvREFCNT_dec(last_str);
4978     }
4979     if (data && (fl & SF_HAS_EVAL))
4980      data->flags |= SF_HAS_EVAL;
4981    optimize_curly_tail:
4982     if (OP(oscan) != CURLYX) {
4983      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4984       && NEXT_OFF(next))
4985       NEXT_OFF(oscan) += NEXT_OFF(next);
4986     }
4987     continue;
4988
4989    default:
4990 #ifdef DEBUGGING
4991     Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4992                  OP(scan));
4993 #endif
4994    case REF:
4995    case CLUMP:
4996     if (flags & SCF_DO_SUBSTR) {
4997      /* Cannot expect anything... */
4998      scan_commit(pRExC_state, data, minlenp, is_inf);
4999      data->longest = &(data->longest_float);
5000     }
5001     is_inf = is_inf_internal = 1;
5002     if (flags & SCF_DO_STCLASS_OR) {
5003      if (OP(scan) == CLUMP) {
5004       /* Actually is any start char, but very few code points
5005       * aren't start characters */
5006       ssc_match_all_cp(data->start_class);
5007      }
5008      else {
5009       ssc_anything(data->start_class);
5010      }
5011     }
5012     flags &= ~SCF_DO_STCLASS;
5013     break;
5014    }
5015   }
5016   else if (OP(scan) == LNBREAK) {
5017    if (flags & SCF_DO_STCLASS) {
5018      if (flags & SCF_DO_STCLASS_AND) {
5019      ssc_intersection(data->start_class,
5020          PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5021      ssc_clear_locale(data->start_class);
5022      ANYOF_FLAGS(data->start_class)
5023             &= ~SSC_MATCHES_EMPTY_STRING;
5024     }
5025     else if (flags & SCF_DO_STCLASS_OR) {
5026      ssc_union(data->start_class,
5027        PL_XPosix_ptrs[_CC_VERTSPACE],
5028        FALSE);
5029      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5030
5031      /* See commit msg for
5032      * 749e076fceedeb708a624933726e7989f2302f6a */
5033      ANYOF_FLAGS(data->start_class)
5034             &= ~SSC_MATCHES_EMPTY_STRING;
5035     }
5036     flags &= ~SCF_DO_STCLASS;
5037    }
5038    min++;
5039    if (delta != SSize_t_MAX)
5040     delta++;    /* Because of the 2 char string cr-lf */
5041    if (flags & SCF_DO_SUBSTR) {
5042     /* Cannot expect anything... */
5043     scan_commit(pRExC_state, data, minlenp, is_inf);
5044      data->pos_min += 1;
5045     data->pos_delta += 1;
5046     data->longest = &(data->longest_float);
5047     }
5048   }
5049   else if (REGNODE_SIMPLE(OP(scan))) {
5050
5051    if (flags & SCF_DO_SUBSTR) {
5052     scan_commit(pRExC_state, data, minlenp, is_inf);
5053     data->pos_min++;
5054    }
5055    min++;
5056    if (flags & SCF_DO_STCLASS) {
5057     bool invert = 0;
5058     SV* my_invlist = NULL;
5059     U8 namedclass;
5060
5061     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5062     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5063
5064     /* Some of the logic below assumes that switching
5065     locale on will only add false positives. */
5066     switch (OP(scan)) {
5067
5068     default:
5069 #ifdef DEBUGGING
5070     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5071                  OP(scan));
5072 #endif
5073     case SANY:
5074      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5075       ssc_match_all_cp(data->start_class);
5076      break;
5077
5078     case REG_ANY:
5079      {
5080       SV* REG_ANY_invlist = _new_invlist(2);
5081       REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5082                '\n');
5083       if (flags & SCF_DO_STCLASS_OR) {
5084        ssc_union(data->start_class,
5085          REG_ANY_invlist,
5086          TRUE /* TRUE => invert, hence all but \n
5087            */
5088          );
5089       }
5090       else if (flags & SCF_DO_STCLASS_AND) {
5091        ssc_intersection(data->start_class,
5092            REG_ANY_invlist,
5093            TRUE  /* TRUE => invert */
5094            );
5095        ssc_clear_locale(data->start_class);
5096       }
5097       SvREFCNT_dec_NN(REG_ANY_invlist);
5098      }
5099      break;
5100
5101     case ANYOFL:
5102     case ANYOF:
5103      if (flags & SCF_DO_STCLASS_AND)
5104       ssc_and(pRExC_state, data->start_class,
5105         (regnode_charclass *) scan);
5106      else
5107       ssc_or(pRExC_state, data->start_class,
5108               (regnode_charclass *) scan);
5109      break;
5110
5111     case NPOSIXL:
5112      invert = 1;
5113      /* FALLTHROUGH */
5114
5115     case POSIXL:
5116      namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5117      if (flags & SCF_DO_STCLASS_AND) {
5118       bool was_there = cBOOL(
5119           ANYOF_POSIXL_TEST(data->start_class,
5120                 namedclass));
5121       ANYOF_POSIXL_ZERO(data->start_class);
5122       if (was_there) {    /* Do an AND */
5123        ANYOF_POSIXL_SET(data->start_class, namedclass);
5124       }
5125       /* No individual code points can now match */
5126       data->start_class->invlist
5127             = sv_2mortal(_new_invlist(0));
5128      }
5129      else {
5130       int complement = namedclass + ((invert) ? -1 : 1);
5131
5132       assert(flags & SCF_DO_STCLASS_OR);
5133
5134       /* If the complement of this class was already there,
5135       * the result is that they match all code points,
5136       * (\d + \D == everything).  Remove the classes from
5137       * future consideration.  Locale is not relevant in
5138       * this case */
5139       if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5140        ssc_match_all_cp(data->start_class);
5141        ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5142        ANYOF_POSIXL_CLEAR(data->start_class, complement);
5143       }
5144       else {  /* The usual case; just add this class to the
5145         existing set */
5146        ANYOF_POSIXL_SET(data->start_class, namedclass);
5147       }
5148      }
5149      break;
5150
5151     case NPOSIXA:   /* For these, we always know the exact set of
5152         what's matched */
5153      invert = 1;
5154      /* FALLTHROUGH */
5155     case POSIXA:
5156      if (FLAGS(scan) == _CC_ASCII) {
5157       my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5158      }
5159      else {
5160       _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5161            PL_XPosix_ptrs[_CC_ASCII],
5162            &my_invlist);
5163      }
5164      goto join_posix;
5165
5166     case NPOSIXD:
5167     case NPOSIXU:
5168      invert = 1;
5169      /* FALLTHROUGH */
5170     case POSIXD:
5171     case POSIXU:
5172      my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5173
5174      /* NPOSIXD matches all upper Latin1 code points unless the
5175      * target string being matched is UTF-8, which is
5176      * unknowable until match time.  Since we are going to
5177      * invert, we want to get rid of all of them so that the
5178      * inversion will match all */
5179      if (OP(scan) == NPOSIXD) {
5180       _invlist_subtract(my_invlist, PL_UpperLatin1,
5181           &my_invlist);
5182      }
5183
5184     join_posix:
5185
5186      if (flags & SCF_DO_STCLASS_AND) {
5187       ssc_intersection(data->start_class, my_invlist, invert);
5188       ssc_clear_locale(data->start_class);
5189      }
5190      else {
5191       assert(flags & SCF_DO_STCLASS_OR);
5192       ssc_union(data->start_class, my_invlist, invert);
5193      }
5194      SvREFCNT_dec(my_invlist);
5195     }
5196     if (flags & SCF_DO_STCLASS_OR)
5197      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5198     flags &= ~SCF_DO_STCLASS;
5199    }
5200   }
5201   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5202    data->flags |= (OP(scan) == MEOL
5203        ? SF_BEFORE_MEOL
5204        : SF_BEFORE_SEOL);
5205    scan_commit(pRExC_state, data, minlenp, is_inf);
5206
5207   }
5208   else if (  PL_regkind[OP(scan)] == BRANCHJ
5209     /* Lookbehind, or need to calculate parens/evals/stclass: */
5210     && (scan->flags || data || (flags & SCF_DO_STCLASS))
5211     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5212   {
5213    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5214     || OP(scan) == UNLESSM )
5215    {
5216     /* Negative Lookahead/lookbehind
5217     In this case we can't do fixed string optimisation.
5218     */
5219
5220     SSize_t deltanext, minnext, fake = 0;
5221     regnode *nscan;
5222     regnode_ssc intrnl;
5223     int f = 0;
5224
5225     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5226     if (data) {
5227      data_fake.whilem_c = data->whilem_c;
5228      data_fake.last_closep = data->last_closep;
5229     }
5230     else
5231      data_fake.last_closep = &fake;
5232     data_fake.pos_delta = delta;
5233     if ( flags & SCF_DO_STCLASS && !scan->flags
5234      && OP(scan) == IFMATCH ) { /* Lookahead */
5235      ssc_init(pRExC_state, &intrnl);
5236      data_fake.start_class = &intrnl;
5237      f |= SCF_DO_STCLASS_AND;
5238     }
5239     if (flags & SCF_WHILEM_VISITED_POS)
5240      f |= SCF_WHILEM_VISITED_POS;
5241     next = regnext(scan);
5242     nscan = NEXTOPER(NEXTOPER(scan));
5243     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5244          last, &data_fake, stopparen,
5245          recursed_depth, NULL, f, depth+1);
5246     if (scan->flags) {
5247      if (deltanext) {
5248       FAIL("Variable length lookbehind not implemented");
5249      }
5250      else if (minnext > (I32)U8_MAX) {
5251       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5252        (UV)U8_MAX);
5253      }
5254      scan->flags = (U8)minnext;
5255     }
5256     if (data) {
5257      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5258       pars++;
5259      if (data_fake.flags & SF_HAS_EVAL)
5260       data->flags |= SF_HAS_EVAL;
5261      data->whilem_c = data_fake.whilem_c;
5262     }
5263     if (f & SCF_DO_STCLASS_AND) {
5264      if (flags & SCF_DO_STCLASS_OR) {
5265       /* OR before, AND after: ideally we would recurse with
5266       * data_fake to get the AND applied by study of the
5267       * remainder of the pattern, and then derecurse;
5268       * *** HACK *** for now just treat as "no information".
5269       * See [perl #56690].
5270       */
5271       ssc_init(pRExC_state, data->start_class);
5272      }  else {
5273       /* AND before and after: combine and continue.  These
5274       * assertions are zero-length, so can match an EMPTY
5275       * string */
5276       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5277       ANYOF_FLAGS(data->start_class)
5278             |= SSC_MATCHES_EMPTY_STRING;
5279      }
5280     }
5281    }
5282 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5283    else {
5284     /* Positive Lookahead/lookbehind
5285     In this case we can do fixed string optimisation,
5286     but we must be careful about it. Note in the case of
5287     lookbehind the positions will be offset by the minimum
5288     length of the pattern, something we won't know about
5289     until after the recurse.
5290     */
5291     SSize_t deltanext, fake = 0;
5292     regnode *nscan;
5293     regnode_ssc intrnl;
5294     int f = 0;
5295     /* We use SAVEFREEPV so that when the full compile
5296      is finished perl will clean up the allocated
5297      minlens when it's all done. This way we don't
5298      have to worry about freeing them when we know
5299      they wont be used, which would be a pain.
5300     */
5301     SSize_t *minnextp;
5302     Newx( minnextp, 1, SSize_t );
5303     SAVEFREEPV(minnextp);
5304
5305     if (data) {
5306      StructCopy(data, &data_fake, scan_data_t);
5307      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5308       f |= SCF_DO_SUBSTR;
5309       if (scan->flags)
5310        scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5311       data_fake.last_found=newSVsv(data->last_found);
5312      }
5313     }
5314     else
5315      data_fake.last_closep = &fake;
5316     data_fake.flags = 0;
5317     data_fake.pos_delta = delta;
5318     if (is_inf)
5319      data_fake.flags |= SF_IS_INF;
5320     if ( flags & SCF_DO_STCLASS && !scan->flags
5321      && OP(scan) == IFMATCH ) { /* Lookahead */
5322      ssc_init(pRExC_state, &intrnl);
5323      data_fake.start_class = &intrnl;
5324      f |= SCF_DO_STCLASS_AND;
5325     }
5326     if (flags & SCF_WHILEM_VISITED_POS)
5327      f |= SCF_WHILEM_VISITED_POS;
5328     next = regnext(scan);
5329     nscan = NEXTOPER(NEXTOPER(scan));
5330
5331     *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5332           &deltanext, last, &data_fake,
5333           stopparen, recursed_depth, NULL,
5334           f,depth+1);
5335     if (scan->flags) {
5336      if (deltanext) {
5337       FAIL("Variable length lookbehind not implemented");
5338      }
5339      else if (*minnextp > (I32)U8_MAX) {
5340       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5341        (UV)U8_MAX);
5342      }
5343      scan->flags = (U8)*minnextp;
5344     }
5345
5346     *minnextp += min;
5347
5348     if (f & SCF_DO_STCLASS_AND) {
5349      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5350      ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5351     }
5352     if (data) {
5353      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5354       pars++;
5355      if (data_fake.flags & SF_HAS_EVAL)
5356       data->flags |= SF_HAS_EVAL;
5357      data->whilem_c = data_fake.whilem_c;
5358      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5359       if (RExC_rx->minlen<*minnextp)
5360        RExC_rx->minlen=*minnextp;
5361       scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5362       SvREFCNT_dec_NN(data_fake.last_found);
5363
5364       if ( data_fake.minlen_fixed != minlenp )
5365       {
5366        data->offset_fixed= data_fake.offset_fixed;
5367        data->minlen_fixed= data_fake.minlen_fixed;
5368        data->lookbehind_fixed+= scan->flags;
5369       }
5370       if ( data_fake.minlen_float != minlenp )
5371       {
5372        data->minlen_float= data_fake.minlen_float;
5373        data->offset_float_min=data_fake.offset_float_min;
5374        data->offset_float_max=data_fake.offset_float_max;
5375        data->lookbehind_float+= scan->flags;
5376       }
5377      }
5378     }
5379    }
5380 #endif
5381   }
5382   else if (OP(scan) == OPEN) {
5383    if (stopparen != (I32)ARG(scan))
5384     pars++;
5385   }
5386   else if (OP(scan) == CLOSE) {
5387    if (stopparen == (I32)ARG(scan)) {
5388     break;
5389    }
5390    if ((I32)ARG(scan) == is_par) {
5391     next = regnext(scan);
5392
5393     if ( next && (OP(next) != WHILEM) && next < last)
5394      is_par = 0;  /* Disable optimization */
5395    }
5396    if (data)
5397     *(data->last_closep) = ARG(scan);
5398   }
5399   else if (OP(scan) == EVAL) {
5400     if (data)
5401      data->flags |= SF_HAS_EVAL;
5402   }
5403   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5404    if (flags & SCF_DO_SUBSTR) {
5405     scan_commit(pRExC_state, data, minlenp, is_inf);
5406     flags &= ~SCF_DO_SUBSTR;
5407    }
5408    if (data && OP(scan)==ACCEPT) {
5409     data->flags |= SCF_SEEN_ACCEPT;
5410     if (stopmin > min)
5411      stopmin = min;
5412    }
5413   }
5414   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5415   {
5416     if (flags & SCF_DO_SUBSTR) {
5417      scan_commit(pRExC_state, data, minlenp, is_inf);
5418      data->longest = &(data->longest_float);
5419     }
5420     is_inf = is_inf_internal = 1;
5421     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5422      ssc_anything(data->start_class);
5423     flags &= ~SCF_DO_STCLASS;
5424   }
5425   else if (OP(scan) == GPOS) {
5426    if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5427     !(delta || is_inf || (data && data->pos_delta)))
5428    {
5429     if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5430      RExC_rx->intflags |= PREGf_ANCH_GPOS;
5431     if (RExC_rx->gofs < (STRLEN)min)
5432      RExC_rx->gofs = min;
5433    } else {
5434     RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5435     RExC_rx->gofs = 0;
5436    }
5437   }
5438 #ifdef TRIE_STUDY_OPT
5439 #ifdef FULL_TRIE_STUDY
5440   else if (PL_regkind[OP(scan)] == TRIE) {
5441    /* NOTE - There is similar code to this block above for handling
5442    BRANCH nodes on the initial study.  If you change stuff here
5443    check there too. */
5444    regnode *trie_node= scan;
5445    regnode *tail= regnext(scan);
5446    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5447    SSize_t max1 = 0, min1 = SSize_t_MAX;
5448    regnode_ssc accum;
5449
5450    if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5451     /* Cannot merge strings after this. */
5452     scan_commit(pRExC_state, data, minlenp, is_inf);
5453    }
5454    if (flags & SCF_DO_STCLASS)
5455     ssc_init_zero(pRExC_state, &accum);
5456
5457    if (!trie->jump) {
5458     min1= trie->minlen;
5459     max1= trie->maxlen;
5460    } else {
5461     const regnode *nextbranch= NULL;
5462     U32 word;
5463
5464     for ( word=1 ; word <= trie->wordcount ; word++)
5465     {
5466      SSize_t deltanext=0, minnext=0, f = 0, fake;
5467      regnode_ssc this_class;
5468
5469      StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5470      if (data) {
5471       data_fake.whilem_c = data->whilem_c;
5472       data_fake.last_closep = data->last_closep;
5473      }
5474      else
5475       data_fake.last_closep = &fake;
5476      data_fake.pos_delta = delta;
5477      if (flags & SCF_DO_STCLASS) {
5478       ssc_init(pRExC_state, &this_class);
5479       data_fake.start_class = &this_class;
5480       f = SCF_DO_STCLASS_AND;
5481      }
5482      if (flags & SCF_WHILEM_VISITED_POS)
5483       f |= SCF_WHILEM_VISITED_POS;
5484
5485      if (trie->jump[word]) {
5486       if (!nextbranch)
5487        nextbranch = trie_node + trie->jump[0];
5488       scan= trie_node + trie->jump[word];
5489       /* We go from the jump point to the branch that follows
5490       it. Note this means we need the vestigal unused
5491       branches even though they arent otherwise used. */
5492       minnext = study_chunk(pRExC_state, &scan, minlenp,
5493        &deltanext, (regnode *)nextbranch, &data_fake,
5494        stopparen, recursed_depth, NULL, f,depth+1);
5495      }
5496      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5497       nextbranch= regnext((regnode*)nextbranch);
5498
5499      if (min1 > (SSize_t)(minnext + trie->minlen))
5500       min1 = minnext + trie->minlen;
5501      if (deltanext == SSize_t_MAX) {
5502       is_inf = is_inf_internal = 1;
5503       max1 = SSize_t_MAX;
5504      } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5505       max1 = minnext + deltanext + trie->maxlen;
5506
5507      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5508       pars++;
5509      if (data_fake.flags & SCF_SEEN_ACCEPT) {
5510       if ( stopmin > min + min1)
5511        stopmin = min + min1;
5512       flags &= ~SCF_DO_SUBSTR;
5513       if (data)
5514        data->flags |= SCF_SEEN_ACCEPT;
5515      }
5516      if (data) {
5517       if (data_fake.flags & SF_HAS_EVAL)
5518        data->flags |= SF_HAS_EVAL;
5519       data->whilem_c = data_fake.whilem_c;
5520      }
5521      if (flags & SCF_DO_STCLASS)
5522       ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5523     }
5524    }
5525    if (flags & SCF_DO_SUBSTR) {
5526     data->pos_min += min1;
5527     data->pos_delta += max1 - min1;
5528     if (max1 != min1 || is_inf)
5529      data->longest = &(data->longest_float);
5530    }
5531    min += min1;
5532    if (delta != SSize_t_MAX)
5533     delta += max1 - min1;
5534    if (flags & SCF_DO_STCLASS_OR) {
5535     ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5536     if (min1) {
5537      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5538      flags &= ~SCF_DO_STCLASS;
5539     }
5540    }
5541    else if (flags & SCF_DO_STCLASS_AND) {
5542     if (min1) {
5543      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5544      flags &= ~SCF_DO_STCLASS;
5545     }
5546     else {
5547      /* Switch to OR mode: cache the old value of
5548      * data->start_class */
5549      INIT_AND_WITHP;
5550      StructCopy(data->start_class, and_withp, regnode_ssc);
5551      flags &= ~SCF_DO_STCLASS_AND;
5552      StructCopy(&accum, data->start_class, regnode_ssc);
5553      flags |= SCF_DO_STCLASS_OR;
5554     }
5555    }
5556    scan= tail;
5557    continue;
5558   }
5559 #else
5560   else if (PL_regkind[OP(scan)] == TRIE) {
5561    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5562    U8*bang=NULL;
5563
5564    min += trie->minlen;
5565    delta += (trie->maxlen - trie->minlen);
5566    flags &= ~SCF_DO_STCLASS; /* xxx */
5567    if (flags & SCF_DO_SUBSTR) {
5568     /* Cannot expect anything... */
5569     scan_commit(pRExC_state, data, minlenp, is_inf);
5570      data->pos_min += trie->minlen;
5571      data->pos_delta += (trie->maxlen - trie->minlen);
5572     if (trie->maxlen != trie->minlen)
5573      data->longest = &(data->longest_float);
5574     }
5575     if (trie->jump) /* no more substrings -- for now /grr*/
5576    flags &= ~SCF_DO_SUBSTR;
5577   }
5578 #endif /* old or new */
5579 #endif /* TRIE_STUDY_OPT */
5580
5581   /* Else: zero-length, ignore. */
5582   scan = regnext(scan);
5583  }
5584  /* If we are exiting a recursion we can unset its recursed bit
5585  * and allow ourselves to enter it again - no danger of an
5586  * infinite loop there.
5587  if (stopparen > -1 && recursed) {
5588   DEBUG_STUDYDATA("unset:", data,depth);
5589   PAREN_UNSET( recursed, stopparen);
5590  }
5591  */
5592  if (frame) {
5593   depth = depth - 1;
5594
5595   DEBUG_STUDYDATA("frame-end:",data,depth);
5596   DEBUG_PEEP("fend", scan, depth);
5597
5598   /* restore previous context */
5599   last = frame->last_regnode;
5600   scan = frame->next_regnode;
5601   stopparen = frame->stopparen;
5602   recursed_depth = frame->prev_recursed_depth;
5603
5604   RExC_frame_last = frame->prev_frame;
5605   frame = frame->this_prev_frame;
5606   goto fake_study_recurse;
5607  }
5608
5609   finish:
5610  assert(!frame);
5611  DEBUG_STUDYDATA("pre-fin:",data,depth);
5612
5613  *scanp = scan;
5614  *deltap = is_inf_internal ? SSize_t_MAX : delta;
5615
5616  if (flags & SCF_DO_SUBSTR && is_inf)
5617   data->pos_delta = SSize_t_MAX - data->pos_min;
5618  if (is_par > (I32)U8_MAX)
5619   is_par = 0;
5620  if (is_par && pars==1 && data) {
5621   data->flags |= SF_IN_PAR;
5622   data->flags &= ~SF_HAS_PAR;
5623  }
5624  else if (pars && data) {
5625   data->flags |= SF_HAS_PAR;
5626   data->flags &= ~SF_IN_PAR;
5627  }
5628  if (flags & SCF_DO_STCLASS_OR)
5629   ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5630  if (flags & SCF_TRIE_RESTUDY)
5631   data->flags |=  SCF_TRIE_RESTUDY;
5632
5633  DEBUG_STUDYDATA("post-fin:",data,depth);
5634
5635  {
5636   SSize_t final_minlen= min < stopmin ? min : stopmin;
5637
5638   if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5639    if (final_minlen > SSize_t_MAX - delta)
5640     RExC_maxlen = SSize_t_MAX;
5641    else if (RExC_maxlen < final_minlen + delta)
5642     RExC_maxlen = final_minlen + delta;
5643   }
5644   return final_minlen;
5645  }
5646  NOT_REACHED; /* NOTREACHED */
5647 }
5648
5649 STATIC U32
5650 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5651 {
5652  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5653
5654  PERL_ARGS_ASSERT_ADD_DATA;
5655
5656  Renewc(RExC_rxi->data,
5657   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5658   char, struct reg_data);
5659  if(count)
5660   Renew(RExC_rxi->data->what, count + n, U8);
5661  else
5662   Newx(RExC_rxi->data->what, n, U8);
5663  RExC_rxi->data->count = count + n;
5664  Copy(s, RExC_rxi->data->what + count, n, U8);
5665  return count;
5666 }
5667
5668 /*XXX: todo make this not included in a non debugging perl, but appears to be
5669  * used anyway there, in 'use re' */
5670 #ifndef PERL_IN_XSUB_RE
5671 void
5672 Perl_reginitcolors(pTHX)
5673 {
5674  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5675  if (s) {
5676   char *t = savepv(s);
5677   int i = 0;
5678   PL_colors[0] = t;
5679   while (++i < 6) {
5680    t = strchr(t, '\t');
5681    if (t) {
5682     *t = '\0';
5683     PL_colors[i] = ++t;
5684    }
5685    else
5686     PL_colors[i] = t = (char *)"";
5687   }
5688  } else {
5689   int i = 0;
5690   while (i < 6)
5691    PL_colors[i++] = (char *)"";
5692  }
5693  PL_colorset = 1;
5694 }
5695 #endif
5696
5697
5698 #ifdef TRIE_STUDY_OPT
5699 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5700  STMT_START {                                            \
5701   if (                                                \
5702    (data.flags & SCF_TRIE_RESTUDY)               \
5703    && ! restudied++                              \
5704   ) {                                                 \
5705    dOsomething;                                    \
5706    goto reStudy;                                   \
5707   }                                                   \
5708  } STMT_END
5709 #else
5710 #define CHECK_RESTUDY_GOTO_butfirst
5711 #endif
5712
5713 /*
5714  * pregcomp - compile a regular expression into internal code
5715  *
5716  * Decides which engine's compiler to call based on the hint currently in
5717  * scope
5718  */
5719
5720 #ifndef PERL_IN_XSUB_RE
5721
5722 /* return the currently in-scope regex engine (or the default if none)  */
5723
5724 regexp_engine const *
5725 Perl_current_re_engine(pTHX)
5726 {
5727  if (IN_PERL_COMPILETIME) {
5728   HV * const table = GvHV(PL_hintgv);
5729   SV **ptr;
5730
5731   if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5732    return &reh_regexp_engine;
5733   ptr = hv_fetchs(table, "regcomp", FALSE);
5734   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5735    return &reh_regexp_engine;
5736   return INT2PTR(regexp_engine*,SvIV(*ptr));
5737  }
5738  else {
5739   SV *ptr;
5740   if (!PL_curcop->cop_hints_hash)
5741    return &reh_regexp_engine;
5742   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5743   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5744    return &reh_regexp_engine;
5745   return INT2PTR(regexp_engine*,SvIV(ptr));
5746  }
5747 }
5748
5749
5750 REGEXP *
5751 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5752 {
5753  regexp_engine const *eng = current_re_engine();
5754  GET_RE_DEBUG_FLAGS_DECL;
5755
5756  PERL_ARGS_ASSERT_PREGCOMP;
5757
5758  /* Dispatch a request to compile a regexp to correct regexp engine. */
5759  DEBUG_COMPILE_r({
5760   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5761       PTR2UV(eng));
5762  });
5763  return CALLREGCOMP_ENG(eng, pattern, flags);
5764 }
5765 #endif
5766
5767 /* public(ish) entry point for the perl core's own regex compiling code.
5768  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5769  * pattern rather than a list of OPs, and uses the internal engine rather
5770  * than the current one */
5771
5772 REGEXP *
5773 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5774 {
5775  SV *pat = pattern; /* defeat constness! */
5776  PERL_ARGS_ASSERT_RE_COMPILE;
5777  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5778 #ifdef PERL_IN_XSUB_RE
5779         &my_reg_engine,
5780 #else
5781         &reh_regexp_engine,
5782 #endif
5783         NULL, NULL, rx_flags, 0);
5784 }
5785
5786
5787 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5788  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5789  * point to the realloced string and length.
5790  *
5791  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5792  * stuff added */
5793
5794 static void
5795 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5796      char **pat_p, STRLEN *plen_p, int num_code_blocks)
5797 {
5798  U8 *const src = (U8*)*pat_p;
5799  U8 *dst, *d;
5800  int n=0;
5801  STRLEN s = 0;
5802  bool do_end = 0;
5803  GET_RE_DEBUG_FLAGS_DECL;
5804
5805  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5806   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5807
5808  Newx(dst, *plen_p * 2 + 1, U8);
5809  d = dst;
5810
5811  while (s < *plen_p) {
5812   append_utf8_from_native_byte(src[s], &d);
5813   if (n < num_code_blocks) {
5814    if (!do_end && pRExC_state->code_blocks[n].start == s) {
5815     pRExC_state->code_blocks[n].start = d - dst - 1;
5816     assert(*(d - 1) == '(');
5817     do_end = 1;
5818    }
5819    else if (do_end && pRExC_state->code_blocks[n].end == s) {
5820     pRExC_state->code_blocks[n].end = d - dst - 1;
5821     assert(*(d - 1) == ')');
5822     do_end = 0;
5823     n++;
5824    }
5825   }
5826   s++;
5827  }
5828  *d = '\0';
5829  *plen_p = d - dst;
5830  *pat_p = (char*) dst;
5831  SAVEFREEPV(*pat_p);
5832  RExC_orig_utf8 = RExC_utf8 = 1;
5833 }
5834
5835
5836
5837 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5838  * while recording any code block indices, and handling overloading,
5839  * nested qr// objects etc.  If pat is null, it will allocate a new
5840  * string, or just return the first arg, if there's only one.
5841  *
5842  * Returns the malloced/updated pat.
5843  * patternp and pat_count is the array of SVs to be concatted;
5844  * oplist is the optional list of ops that generated the SVs;
5845  * recompile_p is a pointer to a boolean that will be set if
5846  *   the regex will need to be recompiled.
5847  * delim, if non-null is an SV that will be inserted between each element
5848  */
5849
5850 static SV*
5851 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5852     SV *pat, SV ** const patternp, int pat_count,
5853     OP *oplist, bool *recompile_p, SV *delim)
5854 {
5855  SV **svp;
5856  int n = 0;
5857  bool use_delim = FALSE;
5858  bool alloced = FALSE;
5859
5860  /* if we know we have at least two args, create an empty string,
5861  * then concatenate args to that. For no args, return an empty string */
5862  if (!pat && pat_count != 1) {
5863   pat = newSVpvs("");
5864   SAVEFREESV(pat);
5865   alloced = TRUE;
5866  }
5867
5868  for (svp = patternp; svp < patternp + pat_count; svp++) {
5869   SV *sv;
5870   SV *rx  = NULL;
5871   STRLEN orig_patlen = 0;
5872   bool code = 0;
5873   SV *msv = use_delim ? delim : *svp;
5874   if (!msv) msv = &PL_sv_undef;
5875
5876   /* if we've got a delimiter, we go round the loop twice for each
5877   * svp slot (except the last), using the delimiter the second
5878   * time round */
5879   if (use_delim) {
5880    svp--;
5881    use_delim = FALSE;
5882   }
5883   else if (delim)
5884    use_delim = TRUE;
5885
5886   if (SvTYPE(msv) == SVt_PVAV) {
5887    /* we've encountered an interpolated array within
5888    * the pattern, e.g. /...@a..../. Expand the list of elements,
5889    * then recursively append elements.
5890    * The code in this block is based on S_pushav() */
5891
5892    AV *const av = (AV*)msv;
5893    const SSize_t maxarg = AvFILL(av) + 1;
5894    SV **array;
5895
5896    if (oplist) {
5897     assert(oplist->op_type == OP_PADAV
5898      || oplist->op_type == OP_RV2AV);
5899     oplist = OpSIBLING(oplist);
5900    }
5901
5902    if (SvRMAGICAL(av)) {
5903     SSize_t i;
5904
5905     Newx(array, maxarg, SV*);
5906     SAVEFREEPV(array);
5907     for (i=0; i < maxarg; i++) {
5908      SV ** const svp = av_fetch(av, i, FALSE);
5909      array[i] = svp ? *svp : &PL_sv_undef;
5910     }
5911    }
5912    else
5913     array = AvARRAY(av);
5914
5915    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5916         array, maxarg, NULL, recompile_p,
5917         /* $" */
5918         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5919
5920    continue;
5921   }
5922
5923
5924   /* we make the assumption here that each op in the list of
5925   * op_siblings maps to one SV pushed onto the stack,
5926   * except for code blocks, with have both an OP_NULL and
5927   * and OP_CONST.
5928   * This allows us to match up the list of SVs against the
5929   * list of OPs to find the next code block.
5930   *
5931   * Note that       PUSHMARK PADSV PADSV ..
5932   * is optimised to
5933   *                 PADRANGE PADSV  PADSV  ..
5934   * so the alignment still works. */
5935
5936   if (oplist) {
5937    if (oplist->op_type == OP_NULL
5938     && (oplist->op_flags & OPf_SPECIAL))
5939    {
5940     assert(n < pRExC_state->num_code_blocks);
5941     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5942     pRExC_state->code_blocks[n].block = oplist;
5943     pRExC_state->code_blocks[n].src_regex = NULL;
5944     n++;
5945     code = 1;
5946     oplist = OpSIBLING(oplist); /* skip CONST */
5947     assert(oplist);
5948    }
5949    oplist = OpSIBLING(oplist);;
5950   }
5951
5952   /* apply magic and QR overloading to arg */
5953
5954   SvGETMAGIC(msv);
5955   if (SvROK(msv) && SvAMAGIC(msv)) {
5956    SV *sv = AMG_CALLunary(msv, regexp_amg);
5957    if (sv) {
5958     if (SvROK(sv))
5959      sv = SvRV(sv);
5960     if (SvTYPE(sv) != SVt_REGEXP)
5961      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5962     msv = sv;
5963    }
5964   }
5965
5966   /* try concatenation overload ... */
5967   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5968     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5969   {
5970    sv_setsv(pat, sv);
5971    /* overloading involved: all bets are off over literal
5972    * code. Pretend we haven't seen it */
5973    pRExC_state->num_code_blocks -= n;
5974    n = 0;
5975   }
5976   else  {
5977    /* ... or failing that, try "" overload */
5978    while (SvAMAGIC(msv)
5979      && (sv = AMG_CALLunary(msv, string_amg))
5980      && sv != msv
5981      &&  !(   SvROK(msv)
5982       && SvROK(sv)
5983       && SvRV(msv) == SvRV(sv))
5984    ) {
5985     msv = sv;
5986     SvGETMAGIC(msv);
5987    }
5988    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5989     msv = SvRV(msv);
5990
5991    if (pat) {
5992     /* this is a partially unrolled
5993     *     sv_catsv_nomg(pat, msv);
5994     * that allows us to adjust code block indices if
5995     * needed */
5996     STRLEN dlen;
5997     char *dst = SvPV_force_nomg(pat, dlen);
5998     orig_patlen = dlen;
5999     if (SvUTF8(msv) && !SvUTF8(pat)) {
6000      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6001      sv_setpvn(pat, dst, dlen);
6002      SvUTF8_on(pat);
6003     }
6004     sv_catsv_nomg(pat, msv);
6005     rx = msv;
6006    }
6007    else
6008     pat = msv;
6009
6010    if (code)
6011     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6012   }
6013
6014   /* extract any code blocks within any embedded qr//'s */
6015   if (rx && SvTYPE(rx) == SVt_REGEXP
6016    && RX_ENGINE((REGEXP*)rx)->op_comp)
6017   {
6018
6019    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6020    if (ri->num_code_blocks) {
6021     int i;
6022     /* the presence of an embedded qr// with code means
6023     * we should always recompile: the text of the
6024     * qr// may not have changed, but it may be a
6025     * different closure than last time */
6026     *recompile_p = 1;
6027     Renew(pRExC_state->code_blocks,
6028      pRExC_state->num_code_blocks + ri->num_code_blocks,
6029      struct reg_code_block);
6030     pRExC_state->num_code_blocks += ri->num_code_blocks;
6031
6032     for (i=0; i < ri->num_code_blocks; i++) {
6033      struct reg_code_block *src, *dst;
6034      STRLEN offset =  orig_patlen
6035       + ReANY((REGEXP *)rx)->pre_prefix;
6036      assert(n < pRExC_state->num_code_blocks);
6037      src = &ri->code_blocks[i];
6038      dst = &pRExC_state->code_blocks[n];
6039      dst->start     = src->start + offset;
6040      dst->end     = src->end   + offset;
6041      dst->block     = src->block;
6042      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6043            src->src_regex
6044             ? src->src_regex
6045             : (REGEXP*)rx);
6046      n++;
6047     }
6048    }
6049   }
6050  }
6051  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6052  if (alloced)
6053   SvSETMAGIC(pat);
6054
6055  return pat;
6056 }
6057
6058
6059
6060 /* see if there are any run-time code blocks in the pattern.
6061  * False positives are allowed */
6062
6063 static bool
6064 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6065      char *pat, STRLEN plen)
6066 {
6067  int n = 0;
6068  STRLEN s;
6069
6070  PERL_UNUSED_CONTEXT;
6071
6072  for (s = 0; s < plen; s++) {
6073   if (n < pRExC_state->num_code_blocks
6074    && s == pRExC_state->code_blocks[n].start)
6075   {
6076    s = pRExC_state->code_blocks[n].end;
6077    n++;
6078    continue;
6079   }
6080   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6081   * positives here */
6082   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6083    (pat[s+2] == '{'
6084     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6085   )
6086    return 1;
6087  }
6088  return 0;
6089 }
6090
6091 /* Handle run-time code blocks. We will already have compiled any direct
6092  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6093  * copy of it, but with any literal code blocks blanked out and
6094  * appropriate chars escaped; then feed it into
6095  *
6096  *    eval "qr'modified_pattern'"
6097  *
6098  * For example,
6099  *
6100  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6101  *
6102  * becomes
6103  *
6104  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6105  *
6106  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6107  * and merge them with any code blocks of the original regexp.
6108  *
6109  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6110  * instead, just save the qr and return FALSE; this tells our caller that
6111  * the original pattern needs upgrading to utf8.
6112  */
6113
6114 static bool
6115 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6116  char *pat, STRLEN plen)
6117 {
6118  SV *qr;
6119
6120  GET_RE_DEBUG_FLAGS_DECL;
6121
6122  if (pRExC_state->runtime_code_qr) {
6123   /* this is the second time we've been called; this should
6124   * only happen if the main pattern got upgraded to utf8
6125   * during compilation; re-use the qr we compiled first time
6126   * round (which should be utf8 too)
6127   */
6128   qr = pRExC_state->runtime_code_qr;
6129   pRExC_state->runtime_code_qr = NULL;
6130   assert(RExC_utf8 && SvUTF8(qr));
6131  }
6132  else {
6133   int n = 0;
6134   STRLEN s;
6135   char *p, *newpat;
6136   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6137   SV *sv, *qr_ref;
6138   dSP;
6139
6140   /* determine how many extra chars we need for ' and \ escaping */
6141   for (s = 0; s < plen; s++) {
6142    if (pat[s] == '\'' || pat[s] == '\\')
6143     newlen++;
6144   }
6145
6146   Newx(newpat, newlen, char);
6147   p = newpat;
6148   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6149
6150   for (s = 0; s < plen; s++) {
6151    if (n < pRExC_state->num_code_blocks
6152     && s == pRExC_state->code_blocks[n].start)
6153    {
6154     /* blank out literal code block */
6155     assert(pat[s] == '(');
6156     while (s <= pRExC_state->code_blocks[n].end) {
6157      *p++ = '_';
6158      s++;
6159     }
6160     s--;
6161     n++;
6162     continue;
6163    }
6164    if (pat[s] == '\'' || pat[s] == '\\')
6165     *p++ = '\\';
6166    *p++ = pat[s];
6167   }
6168   *p++ = '\'';
6169   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6170    *p++ = 'x';
6171   *p++ = '\0';
6172   DEBUG_COMPILE_r({
6173    PerlIO_printf(Perl_debug_log,
6174     "%sre-parsing pattern for runtime code:%s %s\n",
6175     PL_colors[4],PL_colors[5],newpat);
6176   });
6177
6178   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6179   Safefree(newpat);
6180
6181   ENTER;
6182   SAVETMPS;
6183   save_re_context();
6184   PUSHSTACKi(PERLSI_REQUIRE);
6185   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6186   * parsing qr''; normally only q'' does this. It also alters
6187   * hints handling */
6188   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6189   SvREFCNT_dec_NN(sv);
6190   SPAGAIN;
6191   qr_ref = POPs;
6192   PUTBACK;
6193   {
6194    SV * const errsv = ERRSV;
6195    if (SvTRUE_NN(errsv))
6196    {
6197     Safefree(pRExC_state->code_blocks);
6198     /* use croak_sv ? */
6199     Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6200    }
6201   }
6202   assert(SvROK(qr_ref));
6203   qr = SvRV(qr_ref);
6204   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6205   /* the leaving below frees the tmp qr_ref.
6206   * Give qr a life of its own */
6207   SvREFCNT_inc(qr);
6208   POPSTACK;
6209   FREETMPS;
6210   LEAVE;
6211
6212  }
6213
6214  if (!RExC_utf8 && SvUTF8(qr)) {
6215   /* first time through; the pattern got upgraded; save the
6216   * qr for the next time through */
6217   assert(!pRExC_state->runtime_code_qr);
6218   pRExC_state->runtime_code_qr = qr;
6219   return 0;
6220  }
6221
6222
6223  /* extract any code blocks within the returned qr//  */
6224
6225
6226  /* merge the main (r1) and run-time (r2) code blocks into one */
6227  {
6228   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6229   struct reg_code_block *new_block, *dst;
6230   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6231   int i1 = 0, i2 = 0;
6232
6233   if (!r2->num_code_blocks) /* we guessed wrong */
6234   {
6235    SvREFCNT_dec_NN(qr);
6236    return 1;
6237   }
6238
6239   Newx(new_block,
6240    r1->num_code_blocks + r2->num_code_blocks,
6241    struct reg_code_block);
6242   dst = new_block;
6243
6244   while (    i1 < r1->num_code_blocks
6245     || i2 < r2->num_code_blocks)
6246   {
6247    struct reg_code_block *src;
6248    bool is_qr = 0;
6249
6250    if (i1 == r1->num_code_blocks) {
6251     src = &r2->code_blocks[i2++];
6252     is_qr = 1;
6253    }
6254    else if (i2 == r2->num_code_blocks)
6255     src = &r1->code_blocks[i1++];
6256    else if (  r1->code_blocks[i1].start
6257      < r2->code_blocks[i2].start)
6258    {
6259     src = &r1->code_blocks[i1++];
6260     assert(src->end < r2->code_blocks[i2].start);
6261    }
6262    else {
6263     assert(  r1->code_blocks[i1].start
6264      > r2->code_blocks[i2].start);
6265     src = &r2->code_blocks[i2++];
6266     is_qr = 1;
6267     assert(src->end < r1->code_blocks[i1].start);
6268    }
6269
6270    assert(pat[src->start] == '(');
6271    assert(pat[src->end]   == ')');
6272    dst->start     = src->start;
6273    dst->end     = src->end;
6274    dst->block     = src->block;
6275    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6276          : src->src_regex;
6277    dst++;
6278   }
6279   r1->num_code_blocks += r2->num_code_blocks;
6280   Safefree(r1->code_blocks);
6281   r1->code_blocks = new_block;
6282  }
6283
6284  SvREFCNT_dec_NN(qr);
6285  return 1;
6286 }
6287
6288
6289 STATIC bool
6290 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6291      SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6292      SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6293      STRLEN longest_length, bool eol, bool meol)
6294 {
6295  /* This is the common code for setting up the floating and fixed length
6296  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6297  * as to whether succeeded or not */
6298
6299  I32 t;
6300  SSize_t ml;
6301
6302  if (! (longest_length
6303   || (eol /* Can't have SEOL and MULTI */
6304    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6305   )
6306    /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6307   || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6308  {
6309   return FALSE;
6310  }
6311
6312  /* copy the information about the longest from the reg_scan_data
6313   over to the program. */
6314  if (SvUTF8(sv_longest)) {
6315   *rx_utf8 = sv_longest;
6316   *rx_substr = NULL;
6317  } else {
6318   *rx_substr = sv_longest;
6319   *rx_utf8 = NULL;
6320  }
6321  /* end_shift is how many chars that must be matched that
6322   follow this item. We calculate it ahead of time as once the
6323   lookbehind offset is added in we lose the ability to correctly
6324   calculate it.*/
6325  ml = minlen ? *(minlen) : (SSize_t)longest_length;
6326  *rx_end_shift = ml - offset
6327   - longest_length + (SvTAIL(sv_longest) != 0)
6328   + lookbehind;
6329
6330  t = (eol/* Can't have SEOL and MULTI */
6331   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6332  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6333
6334  return TRUE;
6335 }
6336
6337 /*
6338  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6339  * regular expression into internal code.
6340  * The pattern may be passed either as:
6341  *    a list of SVs (patternp plus pat_count)
6342  *    a list of OPs (expr)
6343  * If both are passed, the SV list is used, but the OP list indicates
6344  * which SVs are actually pre-compiled code blocks
6345  *
6346  * The SVs in the list have magic and qr overloading applied to them (and
6347  * the list may be modified in-place with replacement SVs in the latter
6348  * case).
6349  *
6350  * If the pattern hasn't changed from old_re, then old_re will be
6351  * returned.
6352  *
6353  * eng is the current engine. If that engine has an op_comp method, then
6354  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6355  * do the initial concatenation of arguments and pass on to the external
6356  * engine.
6357  *
6358  * If is_bare_re is not null, set it to a boolean indicating whether the
6359  * arg list reduced (after overloading) to a single bare regex which has
6360  * been returned (i.e. /$qr/).
6361  *
6362  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6363  *
6364  * pm_flags contains the PMf_* flags, typically based on those from the
6365  * pm_flags field of the related PMOP. Currently we're only interested in
6366  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6367  *
6368  * We can't allocate space until we know how big the compiled form will be,
6369  * but we can't compile it (and thus know how big it is) until we've got a
6370  * place to put the code.  So we cheat:  we compile it twice, once with code
6371  * generation turned off and size counting turned on, and once "for real".
6372  * This also means that we don't allocate space until we are sure that the
6373  * thing really will compile successfully, and we never have to move the
6374  * code and thus invalidate pointers into it.  (Note that it has to be in
6375  * one piece because free() must be able to free it all.) [NB: not true in perl]
6376  *
6377  * Beware that the optimization-preparation code in here knows about some
6378  * of the structure of the compiled regexp.  [I'll say.]
6379  */
6380
6381 REGEXP *
6382 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6383      OP *expr, const regexp_engine* eng, REGEXP *old_re,
6384      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6385 {
6386  REGEXP *rx;
6387  struct regexp *r;
6388  regexp_internal *ri;
6389  STRLEN plen;
6390  char *exp;
6391  regnode *scan;
6392  I32 flags;
6393  SSize_t minlen = 0;
6394  U32 rx_flags;
6395  SV *pat;
6396  SV *code_blocksv = NULL;
6397  SV** new_patternp = patternp;
6398
6399  /* these are all flags - maybe they should be turned
6400  * into a single int with different bit masks */
6401  I32 sawlookahead = 0;
6402  I32 sawplus = 0;
6403  I32 sawopen = 0;
6404  I32 sawminmod = 0;
6405
6406  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6407  bool recompile = 0;
6408  bool runtime_code = 0;
6409  scan_data_t data;
6410  RExC_state_t RExC_state;
6411  RExC_state_t * const pRExC_state = &RExC_state;
6412 #ifdef TRIE_STUDY_OPT
6413  int restudied = 0;
6414  RExC_state_t copyRExC_state;
6415 #endif
6416  GET_RE_DEBUG_FLAGS_DECL;
6417
6418  PERL_ARGS_ASSERT_RE_OP_COMPILE;
6419
6420  DEBUG_r(if (!PL_colorset) reginitcolors());
6421
6422  /* Initialize these here instead of as-needed, as is quick and avoids
6423  * having to test them each time otherwise */
6424  if (! PL_AboveLatin1) {
6425   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6426   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6427   PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6428   PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6429   PL_HasMultiCharFold =
6430      _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6431
6432   /* This is calculated here, because the Perl program that generates the
6433   * static global ones doesn't currently have access to
6434   * NUM_ANYOF_CODE_POINTS */
6435   PL_InBitmap = _new_invlist(2);
6436   PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6437              NUM_ANYOF_CODE_POINTS - 1);
6438  }
6439
6440  pRExC_state->code_blocks = NULL;
6441  pRExC_state->num_code_blocks = 0;
6442
6443  if (is_bare_re)
6444   *is_bare_re = FALSE;
6445
6446  if (expr && (expr->op_type == OP_LIST ||
6447     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6448   /* allocate code_blocks if needed */
6449   OP *o;
6450   int ncode = 0;
6451
6452   for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6453    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6454     ncode++; /* count of DO blocks */
6455   if (ncode) {
6456    pRExC_state->num_code_blocks = ncode;
6457    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6458   }
6459  }
6460
6461  if (!pat_count) {
6462   /* compile-time pattern with just OP_CONSTs and DO blocks */
6463
6464   int n;
6465   OP *o;
6466
6467   /* find how many CONSTs there are */
6468   assert(expr);
6469   n = 0;
6470   if (expr->op_type == OP_CONST)
6471    n = 1;
6472   else
6473    for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6474     if (o->op_type == OP_CONST)
6475      n++;
6476    }
6477
6478   /* fake up an SV array */
6479
6480   assert(!new_patternp);
6481   Newx(new_patternp, n, SV*);
6482   SAVEFREEPV(new_patternp);
6483   pat_count = n;
6484
6485   n = 0;
6486   if (expr->op_type == OP_CONST)
6487    new_patternp[n] = cSVOPx_sv(expr);
6488   else
6489    for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6490     if (o->op_type == OP_CONST)
6491      new_patternp[n++] = cSVOPo_sv;
6492    }
6493
6494  }
6495
6496  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6497   "Assembling pattern from %d elements%s\n", pat_count,
6498    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6499
6500  /* set expr to the first arg op */
6501
6502  if (pRExC_state->num_code_blocks
6503   && expr->op_type != OP_CONST)
6504  {
6505    expr = cLISTOPx(expr)->op_first;
6506    assert(   expr->op_type == OP_PUSHMARK
6507     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6508     || expr->op_type == OP_PADRANGE);
6509    expr = OpSIBLING(expr);
6510  }
6511
6512  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6513       expr, &recompile, NULL);
6514
6515  /* handle bare (possibly after overloading) regex: foo =~ $re */
6516  {
6517   SV *re = pat;
6518   if (SvROK(re))
6519    re = SvRV(re);
6520   if (SvTYPE(re) == SVt_REGEXP) {
6521    if (is_bare_re)
6522     *is_bare_re = TRUE;
6523    SvREFCNT_inc(re);
6524    Safefree(pRExC_state->code_blocks);
6525    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6526     "Precompiled pattern%s\n",
6527      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6528
6529    return (REGEXP*)re;
6530   }
6531  }
6532
6533  exp = SvPV_nomg(pat, plen);
6534
6535  if (!eng->op_comp) {
6536   if ((SvUTF8(pat) && IN_BYTES)
6537     || SvGMAGICAL(pat) || SvAMAGIC(pat))
6538   {
6539    /* make a temporary copy; either to convert to bytes,
6540    * or to avoid repeating get-magic / overloaded stringify */
6541    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6542           (IN_BYTES ? 0 : SvUTF8(pat)));
6543   }
6544   Safefree(pRExC_state->code_blocks);
6545   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6546  }
6547
6548  /* ignore the utf8ness if the pattern is 0 length */
6549  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6550  RExC_uni_semantics = 0;
6551  RExC_contains_locale = 0;
6552  RExC_contains_i = 0;
6553  RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6554  pRExC_state->runtime_code_qr = NULL;
6555  RExC_frame_head= NULL;
6556  RExC_frame_last= NULL;
6557  RExC_frame_count= 0;
6558
6559  DEBUG_r({
6560   RExC_mysv1= sv_newmortal();
6561   RExC_mysv2= sv_newmortal();
6562  });
6563  DEBUG_COMPILE_r({
6564    SV *dsv= sv_newmortal();
6565    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6566    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6567       PL_colors[4],PL_colors[5],s);
6568   });
6569
6570   redo_first_pass:
6571  /* we jump here if we upgrade the pattern to utf8 and have to
6572  * recompile */
6573
6574  if ((pm_flags & PMf_USE_RE_EVAL)
6575     /* this second condition covers the non-regex literal case,
6576     * i.e.  $foo =~ '(?{})'. */
6577     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6578  )
6579   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6580
6581  /* return old regex if pattern hasn't changed */
6582  /* XXX: note in the below we have to check the flags as well as the
6583  * pattern.
6584  *
6585  * Things get a touch tricky as we have to compare the utf8 flag
6586  * independently from the compile flags.  */
6587
6588  if (   old_re
6589   && !recompile
6590   && !!RX_UTF8(old_re) == !!RExC_utf8
6591   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6592   && RX_PRECOMP(old_re)
6593   && RX_PRELEN(old_re) == plen
6594   && memEQ(RX_PRECOMP(old_re), exp, plen)
6595   && !runtime_code /* with runtime code, always recompile */ )
6596  {
6597   Safefree(pRExC_state->code_blocks);
6598   return old_re;
6599  }
6600
6601  rx_flags = orig_rx_flags;
6602
6603  if (rx_flags & PMf_FOLD) {
6604   RExC_contains_i = 1;
6605  }
6606  if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6607
6608   /* Set to use unicode semantics if the pattern is in utf8 and has the
6609   * 'depends' charset specified, as it means unicode when utf8  */
6610   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6611  }
6612
6613  RExC_precomp = exp;
6614  RExC_flags = rx_flags;
6615  RExC_pm_flags = pm_flags;
6616
6617  if (runtime_code) {
6618   if (TAINTING_get && TAINT_get)
6619    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6620
6621   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6622    /* whoops, we have a non-utf8 pattern, whilst run-time code
6623    * got compiled as utf8. Try again with a utf8 pattern */
6624    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6625          pRExC_state->num_code_blocks);
6626    goto redo_first_pass;
6627   }
6628  }
6629  assert(!pRExC_state->runtime_code_qr);
6630
6631  RExC_sawback = 0;
6632
6633  RExC_seen = 0;
6634  RExC_maxlen = 0;
6635  RExC_in_lookbehind = 0;
6636  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6637  RExC_extralen = 0;
6638  RExC_override_recoding = 0;
6639 #ifdef EBCDIC
6640  RExC_recode_x_to_native = 0;
6641 #endif
6642  RExC_in_multi_char_class = 0;
6643
6644  /* First pass: determine size, legality. */
6645  RExC_parse = exp;
6646  RExC_start = exp;
6647  RExC_end = exp + plen;
6648  RExC_naughty = 0;
6649  RExC_npar = 1;
6650  RExC_nestroot = 0;
6651  RExC_size = 0L;
6652  RExC_emit = (regnode *) &RExC_emit_dummy;
6653  RExC_whilem_seen = 0;
6654  RExC_open_parens = NULL;
6655  RExC_close_parens = NULL;
6656  RExC_opend = NULL;
6657  RExC_paren_names = NULL;
6658 #ifdef DEBUGGING
6659  RExC_paren_name_list = NULL;
6660 #endif
6661  RExC_recurse = NULL;
6662  RExC_study_chunk_recursed = NULL;
6663  RExC_study_chunk_recursed_bytes= 0;
6664  RExC_recurse_count = 0;
6665  pRExC_state->code_index = 0;
6666
6667  DEBUG_PARSE_r(
6668   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6669   RExC_lastnum=0;
6670   RExC_lastparse=NULL;
6671  );
6672  /* reg may croak on us, not giving us a chance to free
6673  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6674  need it to survive as long as the regexp (qr/(?{})/).
6675  We must check that code_blocksv is not already set, because we may
6676  have jumped back to restart the sizing pass. */
6677  if (pRExC_state->code_blocks && !code_blocksv) {
6678   code_blocksv = newSV_type(SVt_PV);
6679   SAVEFREESV(code_blocksv);
6680   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6681   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6682  }
6683  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6684   /* It's possible to write a regexp in ascii that represents Unicode
6685   codepoints outside of the byte range, such as via \x{100}. If we
6686   detect such a sequence we have to convert the entire pattern to utf8
6687   and then recompile, as our sizing calculation will have been based
6688   on 1 byte == 1 character, but we will need to use utf8 to encode
6689   at least some part of the pattern, and therefore must convert the whole
6690   thing.
6691   -- dmq */
6692   if (flags & RESTART_UTF8) {
6693    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6694          pRExC_state->num_code_blocks);
6695    goto redo_first_pass;
6696   }
6697   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6698  }
6699  if (code_blocksv)
6700   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6701
6702  DEBUG_PARSE_r({
6703   PerlIO_printf(Perl_debug_log,
6704    "Required size %"IVdf" nodes\n"
6705    "Starting second pass (creation)\n",
6706    (IV)RExC_size);
6707   RExC_lastnum=0;
6708   RExC_lastparse=NULL;
6709  });
6710
6711  /* The first pass could have found things that force Unicode semantics */
6712  if ((RExC_utf8 || RExC_uni_semantics)
6713   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6714  {
6715   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6716  }
6717
6718  /* Small enough for pointer-storage convention?
6719  If extralen==0, this means that we will not need long jumps. */
6720  if (RExC_size >= 0x10000L && RExC_extralen)
6721   RExC_size += RExC_extralen;
6722  else
6723   RExC_extralen = 0;
6724  if (RExC_whilem_seen > 15)
6725   RExC_whilem_seen = 15;
6726
6727  /* Allocate space and zero-initialize. Note, the two step process
6728  of zeroing when in debug mode, thus anything assigned has to
6729  happen after that */
6730  rx = (REGEXP*) newSV_type(SVt_REGEXP);
6731  r = ReANY(rx);
6732  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6733   char, regexp_internal);
6734  if ( r == NULL || ri == NULL )
6735   FAIL("Regexp out of space");
6736 #ifdef DEBUGGING
6737  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6738  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6739   char);
6740 #else
6741  /* bulk initialize base fields with 0. */
6742  Zero(ri, sizeof(regexp_internal), char);
6743 #endif
6744
6745  /* non-zero initialization begins here */
6746  RXi_SET( r, ri );
6747  r->engine= eng;
6748  r->extflags = rx_flags;
6749  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6750
6751  if (pm_flags & PMf_IS_QR) {
6752   ri->code_blocks = pRExC_state->code_blocks;
6753   ri->num_code_blocks = pRExC_state->num_code_blocks;
6754  }
6755  else
6756  {
6757   int n;
6758   for (n = 0; n < pRExC_state->num_code_blocks; n++)
6759    if (pRExC_state->code_blocks[n].src_regex)
6760     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6761   SAVEFREEPV(pRExC_state->code_blocks);
6762  }
6763
6764  {
6765   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6766   bool has_charset = (get_regex_charset(r->extflags)
6767              != REGEX_DEPENDS_CHARSET);
6768
6769   /* The caret is output if there are any defaults: if not all the STD
6770   * flags are set, or if no character set specifier is needed */
6771   bool has_default =
6772      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6773      || ! has_charset);
6774   bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6775             == REG_RUN_ON_COMMENT_SEEN);
6776   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6777        >> RXf_PMf_STD_PMMOD_SHIFT);
6778   const char *fptr = STD_PAT_MODS;        /*"msixn"*/
6779   char *p;
6780   /* Allocate for the worst case, which is all the std flags are turned
6781   * on.  If more precision is desired, we could do a population count of
6782   * the flags set.  This could be done with a small lookup table, or by
6783   * shifting, masking and adding, or even, when available, assembly
6784   * language for a machine-language population count.
6785   * We never output a minus, as all those are defaults, so are
6786   * covered by the caret */
6787   const STRLEN wraplen = plen + has_p + has_runon
6788    + has_default       /* If needs a caret */
6789
6790     /* If needs a character set specifier */
6791    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6792    + (sizeof(STD_PAT_MODS) - 1)
6793    + (sizeof("(?:)") - 1);
6794
6795   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6796   r->xpv_len_u.xpvlenu_pv = p;
6797   if (RExC_utf8)
6798    SvFLAGS(rx) |= SVf_UTF8;
6799   *p++='('; *p++='?';
6800
6801   /* If a default, cover it using the caret */
6802   if (has_default) {
6803    *p++= DEFAULT_PAT_MOD;
6804   }
6805   if (has_charset) {
6806    STRLEN len;
6807    const char* const name = get_regex_charset_name(r->extflags, &len);
6808    Copy(name, p, len, char);
6809    p += len;
6810   }
6811   if (has_p)
6812    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6813   {
6814    char ch;
6815    while((ch = *fptr++)) {
6816     if(reganch & 1)
6817      *p++ = ch;
6818     reganch >>= 1;
6819    }
6820   }
6821
6822   *p++ = ':';
6823   Copy(RExC_precomp, p, plen, char);
6824   assert ((RX_WRAPPED(rx) - p) < 16);
6825   r->pre_prefix = p - RX_WRAPPED(rx);
6826   p += plen;
6827   if (has_runon)
6828    *p++ = '\n';
6829   *p++ = ')';
6830   *p = 0;
6831   SvCUR_set(rx, p - RX_WRAPPED(rx));
6832  }
6833
6834  r->intflags = 0;
6835  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6836
6837  /* setup various meta data about recursion, this all requires
6838  * RExC_npar to be correctly set, and a bit later on we clear it */
6839  if (RExC_seen & REG_RECURSE_SEEN) {
6840   Newxz(RExC_open_parens, RExC_npar,regnode *);
6841   SAVEFREEPV(RExC_open_parens);
6842   Newxz(RExC_close_parens,RExC_npar,regnode *);
6843   SAVEFREEPV(RExC_close_parens);
6844  }
6845  if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6846   /* Note, RExC_npar is 1 + the number of parens in a pattern.
6847   * So its 1 if there are no parens. */
6848   RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6849           ((RExC_npar & 0x07) != 0);
6850   Newx(RExC_study_chunk_recursed,
6851    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6852   SAVEFREEPV(RExC_study_chunk_recursed);
6853  }
6854
6855  /* Useful during FAIL. */
6856 #ifdef RE_TRACK_PATTERN_OFFSETS
6857  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6858  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6859       "%s %"UVuf" bytes for offset annotations.\n",
6860       ri->u.offsets ? "Got" : "Couldn't get",
6861       (UV)((2*RExC_size+1) * sizeof(U32))));
6862 #endif
6863  SetProgLen(ri,RExC_size);
6864  RExC_rx_sv = rx;
6865  RExC_rx = r;
6866  RExC_rxi = ri;
6867  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6868
6869  /* Second pass: emit code. */
6870  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6871  RExC_pm_flags = pm_flags;
6872  RExC_parse = exp;
6873  RExC_end = exp + plen;
6874  RExC_naughty = 0;
6875  RExC_npar = 1;
6876  RExC_emit_start = ri->program;
6877  RExC_emit = ri->program;
6878  RExC_emit_bound = ri->program + RExC_size + 1;
6879  pRExC_state->code_index = 0;
6880
6881  *((char*) RExC_emit++) = (char) REG_MAGIC;
6882  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6883   ReREFCNT_dec(rx);
6884   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6885  }
6886  /* XXXX To minimize changes to RE engine we always allocate
6887  3-units-long substrs field. */
6888  Newx(r->substrs, 1, struct reg_substr_data);
6889  if (RExC_recurse_count) {
6890   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6891   SAVEFREEPV(RExC_recurse);
6892  }
6893
6894   reStudy:
6895  r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6896  DEBUG_r(
6897   RExC_study_chunk_recursed_count= 0;
6898  );
6899  Zero(r->substrs, 1, struct reg_substr_data);
6900  if (RExC_study_chunk_recursed) {
6901   Zero(RExC_study_chunk_recursed,
6902    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6903  }
6904
6905
6906 #ifdef TRIE_STUDY_OPT
6907  if (!restudied) {
6908   StructCopy(&zero_scan_data, &data, scan_data_t);
6909   copyRExC_state = RExC_state;
6910  } else {
6911   U32 seen=RExC_seen;
6912   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6913
6914   RExC_state = copyRExC_state;
6915   if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6916    RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6917   else
6918    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6919   StructCopy(&zero_scan_data, &data, scan_data_t);
6920  }
6921 #else
6922  StructCopy(&zero_scan_data, &data, scan_data_t);
6923 #endif
6924
6925  /* Dig out information for optimizations. */
6926  r->extflags = RExC_flags; /* was pm_op */
6927  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6928
6929  if (UTF)
6930   SvUTF8_on(rx); /* Unicode in it? */
6931  ri->regstclass = NULL;
6932  if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
6933   r->intflags |= PREGf_NAUGHTY;
6934  scan = ri->program + 1;  /* First BRANCH. */
6935
6936  /* testing for BRANCH here tells us whether there is "must appear"
6937  data in the pattern. If there is then we can use it for optimisations */
6938  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6939             */
6940   SSize_t fake;
6941   STRLEN longest_float_length, longest_fixed_length;
6942   regnode_ssc ch_class; /* pointed to by data */
6943   int stclass_flag;
6944   SSize_t last_close = 0; /* pointed to by data */
6945   regnode *first= scan;
6946   regnode *first_next= regnext(first);
6947   /*
6948   * Skip introductions and multiplicators >= 1
6949   * so that we can extract the 'meat' of the pattern that must
6950   * match in the large if() sequence following.
6951   * NOTE that EXACT is NOT covered here, as it is normally
6952   * picked up by the optimiser separately.
6953   *
6954   * This is unfortunate as the optimiser isnt handling lookahead
6955   * properly currently.
6956   *
6957   */
6958   while ((OP(first) == OPEN && (sawopen = 1)) ||
6959    /* An OR of *one* alternative - should not happen now. */
6960    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6961    /* for now we can't handle lookbehind IFMATCH*/
6962    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6963    (OP(first) == PLUS) ||
6964    (OP(first) == MINMOD) ||
6965    /* An {n,m} with n>0 */
6966    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6967    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6968   {
6969     /*
6970     * the only op that could be a regnode is PLUS, all the rest
6971     * will be regnode_1 or regnode_2.
6972     *
6973     * (yves doesn't think this is true)
6974     */
6975     if (OP(first) == PLUS)
6976      sawplus = 1;
6977     else {
6978      if (OP(first) == MINMOD)
6979       sawminmod = 1;
6980      first += regarglen[OP(first)];
6981     }
6982     first = NEXTOPER(first);
6983     first_next= regnext(first);
6984   }
6985
6986   /* Starting-point info. */
6987  again:
6988   DEBUG_PEEP("first:",first,0);
6989   /* Ignore EXACT as we deal with it later. */
6990   if (PL_regkind[OP(first)] == EXACT) {
6991    if (OP(first) == EXACT || OP(first) == EXACTL)
6992     NOOP; /* Empty, get anchored substr later. */
6993    else
6994     ri->regstclass = first;
6995   }
6996 #ifdef TRIE_STCLASS
6997   else if (PL_regkind[OP(first)] == TRIE &&
6998     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6999   {
7000    /* this can happen only on restudy */
7001    ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7002   }
7003 #endif
7004   else if (REGNODE_SIMPLE(OP(first)))
7005    ri->regstclass = first;
7006   else if (PL_regkind[OP(first)] == BOUND ||
7007     PL_regkind[OP(first)] == NBOUND)
7008    ri->regstclass = first;
7009   else if (PL_regkind[OP(first)] == BOL) {
7010    r->intflags |= (OP(first) == MBOL
7011       ? PREGf_ANCH_MBOL
7012       : PREGf_ANCH_SBOL);
7013    first = NEXTOPER(first);
7014    goto again;
7015   }
7016   else if (OP(first) == GPOS) {
7017    r->intflags |= PREGf_ANCH_GPOS;
7018    first = NEXTOPER(first);
7019    goto again;
7020   }
7021   else if ((!sawopen || !RExC_sawback) &&
7022    !sawlookahead &&
7023    (OP(first) == STAR &&
7024    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7025    !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7026   {
7027    /* turn .* into ^.* with an implied $*=1 */
7028    const int type =
7029     (OP(NEXTOPER(first)) == REG_ANY)
7030      ? PREGf_ANCH_MBOL
7031      : PREGf_ANCH_SBOL;
7032    r->intflags |= (type | PREGf_IMPLICIT);
7033    first = NEXTOPER(first);
7034    goto again;
7035   }
7036   if (sawplus && !sawminmod && !sawlookahead
7037    && (!sawopen || !RExC_sawback)
7038    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7039    /* x+ must match at the 1st pos of run of x's */
7040    r->intflags |= PREGf_SKIP;
7041
7042   /* Scan is after the zeroth branch, first is atomic matcher. */
7043 #ifdef TRIE_STUDY_OPT
7044   DEBUG_PARSE_r(
7045    if (!restudied)
7046     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7047        (IV)(first - scan + 1))
7048   );
7049 #else
7050   DEBUG_PARSE_r(
7051    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7052     (IV)(first - scan + 1))
7053   );
7054 #endif
7055
7056
7057   /*
7058   * If there's something expensive in the r.e., find the
7059   * longest literal string that must appear and make it the
7060   * regmust.  Resolve ties in favor of later strings, since
7061   * the regstart check works with the beginning of the r.e.
7062   * and avoiding duplication strengthens checking.  Not a
7063   * strong reason, but sufficient in the absence of others.
7064   * [Now we resolve ties in favor of the earlier string if
7065   * it happens that c_offset_min has been invalidated, since the
7066   * earlier string may buy us something the later one won't.]
7067   */
7068
7069   data.longest_fixed = newSVpvs("");
7070   data.longest_float = newSVpvs("");
7071   data.last_found = newSVpvs("");
7072   data.longest = &(data.longest_fixed);
7073   ENTER_with_name("study_chunk");
7074   SAVEFREESV(data.longest_fixed);
7075   SAVEFREESV(data.longest_float);
7076   SAVEFREESV(data.last_found);
7077   first = scan;
7078   if (!ri->regstclass) {
7079    ssc_init(pRExC_state, &ch_class);
7080    data.start_class = &ch_class;
7081    stclass_flag = SCF_DO_STCLASS_AND;
7082   } else    /* XXXX Check for BOUND? */
7083    stclass_flag = 0;
7084   data.last_closep = &last_close;
7085
7086   DEBUG_RExC_seen();
7087   minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7088        scan + RExC_size, /* Up to end */
7089    &data, -1, 0, NULL,
7090    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7091       | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7092    0);
7093
7094
7095   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7096
7097
7098   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7099    && data.last_start_min == 0 && data.last_end > 0
7100    && !RExC_seen_zerolen
7101    && !(RExC_seen & REG_VERBARG_SEEN)
7102    && !(RExC_seen & REG_GPOS_SEEN)
7103   ){
7104    r->extflags |= RXf_CHECK_ALL;
7105   }
7106   scan_commit(pRExC_state, &data,&minlen,0);
7107
7108   longest_float_length = CHR_SVLEN(data.longest_float);
7109
7110   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
7111     && data.offset_fixed == data.offset_float_min
7112     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7113    && S_setup_longest (aTHX_ pRExC_state,
7114          data.longest_float,
7115          &(r->float_utf8),
7116          &(r->float_substr),
7117          &(r->float_end_shift),
7118          data.lookbehind_float,
7119          data.offset_float_min,
7120          data.minlen_float,
7121          longest_float_length,
7122          cBOOL(data.flags & SF_FL_BEFORE_EOL),
7123          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7124   {
7125    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7126    r->float_max_offset = data.offset_float_max;
7127    if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7128     r->float_max_offset -= data.lookbehind_float;
7129    SvREFCNT_inc_simple_void_NN(data.longest_float);
7130   }
7131   else {
7132    r->float_substr = r->float_utf8 = NULL;
7133    longest_float_length = 0;
7134   }
7135
7136   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7137
7138   if (S_setup_longest (aTHX_ pRExC_state,
7139         data.longest_fixed,
7140         &(r->anchored_utf8),
7141         &(r->anchored_substr),
7142         &(r->anchored_end_shift),
7143         data.lookbehind_fixed,
7144         data.offset_fixed,
7145         data.minlen_fixed,
7146         longest_fixed_length,
7147         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7148         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7149   {
7150    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7151    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7152   }
7153   else {
7154    r->anchored_substr = r->anchored_utf8 = NULL;
7155    longest_fixed_length = 0;
7156   }
7157   LEAVE_with_name("study_chunk");
7158
7159   if (ri->regstclass
7160    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7161    ri->regstclass = NULL;
7162
7163   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7164    && stclass_flag
7165    && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7166    && is_ssc_worth_it(pRExC_state, data.start_class))
7167   {
7168    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7169
7170    ssc_finalize(pRExC_state, data.start_class);
7171
7172    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7173    StructCopy(data.start_class,
7174      (regnode_ssc*)RExC_rxi->data->data[n],
7175      regnode_ssc);
7176    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7177    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7178    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7179      regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7180      PerlIO_printf(Perl_debug_log,
7181          "synthetic stclass \"%s\".\n",
7182          SvPVX_const(sv));});
7183    data.start_class = NULL;
7184   }
7185
7186   /* A temporary algorithm prefers floated substr to fixed one to dig
7187   * more info. */
7188   if (longest_fixed_length > longest_float_length) {
7189    r->substrs->check_ix = 0;
7190    r->check_end_shift = r->anchored_end_shift;
7191    r->check_substr = r->anchored_substr;
7192    r->check_utf8 = r->anchored_utf8;
7193    r->check_offset_min = r->check_offset_max = r->anchored_offset;
7194    if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7195     r->intflags |= PREGf_NOSCAN;
7196   }
7197   else {
7198    r->substrs->check_ix = 1;
7199    r->check_end_shift = r->float_end_shift;
7200    r->check_substr = r->float_substr;
7201    r->check_utf8 = r->float_utf8;
7202    r->check_offset_min = r->float_min_offset;
7203    r->check_offset_max = r->float_max_offset;
7204   }
7205   if ((r->check_substr || r->check_utf8) ) {
7206    r->extflags |= RXf_USE_INTUIT;
7207    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7208     r->extflags |= RXf_INTUIT_TAIL;
7209   }
7210   r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7211
7212   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7213   if ( (STRLEN)minlen < longest_float_length )
7214    minlen= longest_float_length;
7215   if ( (STRLEN)minlen < longest_fixed_length )
7216    minlen= longest_fixed_length;
7217   */
7218  }
7219  else {
7220   /* Several toplevels. Best we can is to set minlen. */
7221   SSize_t fake;
7222   regnode_ssc ch_class;
7223   SSize_t last_close = 0;
7224
7225   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7226
7227   scan = ri->program + 1;
7228   ssc_init(pRExC_state, &ch_class);
7229   data.start_class = &ch_class;
7230   data.last_closep = &last_close;
7231
7232   DEBUG_RExC_seen();
7233   minlen = study_chunk(pRExC_state,
7234    &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7235    SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7236              ? SCF_TRIE_DOING_RESTUDY
7237              : 0),
7238    0);
7239
7240   CHECK_RESTUDY_GOTO_butfirst(NOOP);
7241
7242   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7243     = r->float_substr = r->float_utf8 = NULL;
7244
7245   if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7246    && is_ssc_worth_it(pRExC_state, data.start_class))
7247   {
7248    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7249
7250    ssc_finalize(pRExC_state, data.start_class);
7251
7252    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7253    StructCopy(data.start_class,
7254      (regnode_ssc*)RExC_rxi->data->data[n],
7255      regnode_ssc);
7256    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7257    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7258    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7259      regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7260      PerlIO_printf(Perl_debug_log,
7261          "synthetic stclass \"%s\".\n",
7262          SvPVX_const(sv));});
7263    data.start_class = NULL;
7264   }
7265  }
7266
7267  if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7268   r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7269   r->maxlen = REG_INFTY;
7270  }
7271  else {
7272   r->maxlen = RExC_maxlen;
7273  }
7274
7275  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7276  the "real" pattern. */
7277  DEBUG_OPTIMISE_r({
7278   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7279      (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7280  });
7281  r->minlenret = minlen;
7282  if (r->minlen < minlen)
7283   r->minlen = minlen;
7284
7285  if (RExC_seen & REG_GPOS_SEEN)
7286   r->intflags |= PREGf_GPOS_SEEN;
7287  if (RExC_seen & REG_LOOKBEHIND_SEEN)
7288   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7289             lookbehind */
7290  if (pRExC_state->num_code_blocks)
7291   r->extflags |= RXf_EVAL_SEEN;
7292  if (RExC_seen & REG_VERBARG_SEEN)
7293  {
7294   r->intflags |= PREGf_VERBARG_SEEN;
7295   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7296  }
7297  if (RExC_seen & REG_CUTGROUP_SEEN)
7298   r->intflags |= PREGf_CUTGROUP_SEEN;
7299  if (pm_flags & PMf_USE_RE_EVAL)
7300   r->intflags |= PREGf_USE_RE_EVAL;
7301  if (RExC_paren_names)
7302   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7303  else
7304   RXp_PAREN_NAMES(r) = NULL;
7305
7306  /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7307  * so it can be used in pp.c */
7308  if (r->intflags & PREGf_ANCH)
7309   r->extflags |= RXf_IS_ANCHORED;
7310
7311
7312  {
7313   /* this is used to identify "special" patterns that might result
7314   * in Perl NOT calling the regex engine and instead doing the match "itself",
7315   * particularly special cases in split//. By having the regex compiler
7316   * do this pattern matching at a regop level (instead of by inspecting the pattern)
7317   * we avoid weird issues with equivalent patterns resulting in different behavior,
7318   * AND we allow non Perl engines to get the same optimizations by the setting the
7319   * flags appropriately - Yves */
7320   regnode *first = ri->program + 1;
7321   U8 fop = OP(first);
7322   regnode *next = regnext(first);
7323   U8 nop = OP(next);
7324
7325   if (PL_regkind[fop] == NOTHING && nop == END)
7326    r->extflags |= RXf_NULL;
7327   else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7328    /* when fop is SBOL first->flags will be true only when it was
7329    * produced by parsing /\A/, and not when parsing /^/. This is
7330    * very important for the split code as there we want to
7331    * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7332    * See rt #122761 for more details. -- Yves */
7333    r->extflags |= RXf_START_ONLY;
7334   else if (fop == PLUS
7335     && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7336     && nop == END)
7337    r->extflags |= RXf_WHITE;
7338   else if ( r->extflags & RXf_SPLIT
7339     && (fop == EXACT || fop == EXACTL)
7340     && STR_LEN(first) == 1
7341     && *(STRING(first)) == ' '
7342     && nop == END )
7343    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7344
7345  }
7346
7347  if (RExC_contains_locale) {
7348   RXp_EXTFLAGS(r) |= RXf_TAINTED;
7349  }
7350
7351 #ifdef DEBUGGING
7352  if (RExC_paren_names) {
7353   ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7354   ri->data->data[ri->name_list_idx]
7355         = (void*)SvREFCNT_inc(RExC_paren_name_list);
7356  } else
7357 #endif
7358   ri->name_list_idx = 0;
7359
7360  if (RExC_recurse_count) {
7361   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7362    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7363    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7364   }
7365  }
7366  Newxz(r->offs, RExC_npar, regexp_paren_pair);
7367  /* assume we don't need to swap parens around before we match */
7368  DEBUG_TEST_r({
7369   PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7370    (unsigned long)RExC_study_chunk_recursed_count);
7371  });
7372  DEBUG_DUMP_r({
7373   DEBUG_RExC_seen();
7374   PerlIO_printf(Perl_debug_log,"Final program:\n");
7375   regdump(r);
7376  });
7377 #ifdef RE_TRACK_PATTERN_OFFSETS
7378  DEBUG_OFFSETS_r(if (ri->u.offsets) {
7379   const STRLEN len = ri->u.offsets[0];
7380   STRLEN i;
7381   GET_RE_DEBUG_FLAGS_DECL;
7382   PerlIO_printf(Perl_debug_log,
7383      "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7384   for (i = 1; i <= len; i++) {
7385    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7386     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7387     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7388    }
7389   PerlIO_printf(Perl_debug_log, "\n");
7390  });
7391 #endif
7392
7393 #ifdef USE_ITHREADS
7394  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7395  * by setting the regexp SV to readonly-only instead. If the
7396  * pattern's been recompiled, the USEDness should remain. */
7397  if (old_re && SvREADONLY(old_re))
7398   SvREADONLY_on(rx);
7399 #endif
7400  return rx;
7401 }
7402
7403
7404 SV*
7405 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7406      const U32 flags)
7407 {
7408  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7409
7410  PERL_UNUSED_ARG(value);
7411
7412  if (flags & RXapif_FETCH) {
7413   return reg_named_buff_fetch(rx, key, flags);
7414  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7415   Perl_croak_no_modify();
7416   return NULL;
7417  } else if (flags & RXapif_EXISTS) {
7418   return reg_named_buff_exists(rx, key, flags)
7419    ? &PL_sv_yes
7420    : &PL_sv_no;
7421  } else if (flags & RXapif_REGNAMES) {
7422   return reg_named_buff_all(rx, flags);
7423  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7424   return reg_named_buff_scalar(rx, flags);
7425  } else {
7426   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7427   return NULL;
7428  }
7429 }
7430
7431 SV*
7432 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7433       const U32 flags)
7434 {
7435  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7436  PERL_UNUSED_ARG(lastkey);
7437
7438  if (flags & RXapif_FIRSTKEY)
7439   return reg_named_buff_firstkey(rx, flags);
7440  else if (flags & RXapif_NEXTKEY)
7441   return reg_named_buff_nextkey(rx, flags);
7442  else {
7443   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7444            (int)flags);
7445   return NULL;
7446  }
7447 }
7448
7449 SV*
7450 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7451       const U32 flags)
7452 {
7453  AV *retarray = NULL;
7454  SV *ret;
7455  struct regexp *const rx = ReANY(r);
7456
7457  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7458
7459  if (flags & RXapif_ALL)
7460   retarray=newAV();
7461
7462  if (rx && RXp_PAREN_NAMES(rx)) {
7463   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7464   if (he_str) {
7465    IV i;
7466    SV* sv_dat=HeVAL(he_str);
7467    I32 *nums=(I32*)SvPVX(sv_dat);
7468    for ( i=0; i<SvIVX(sv_dat); i++ ) {
7469     if ((I32)(rx->nparens) >= nums[i]
7470      && rx->offs[nums[i]].start != -1
7471      && rx->offs[nums[i]].end != -1)
7472     {
7473      ret = newSVpvs("");
7474      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7475      if (!retarray)
7476       return ret;
7477     } else {
7478      if (retarray)
7479       ret = newSVsv(&PL_sv_undef);
7480     }
7481     if (retarray)
7482      av_push(retarray, ret);
7483    }
7484    if (retarray)
7485     return newRV_noinc(MUTABLE_SV(retarray));
7486   }
7487  }
7488  return NULL;
7489 }
7490
7491 bool
7492 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7493       const U32 flags)
7494 {
7495  struct regexp *const rx = ReANY(r);
7496
7497  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7498
7499  if (rx && RXp_PAREN_NAMES(rx)) {
7500   if (flags & RXapif_ALL) {
7501    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7502   } else {
7503    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7504    if (sv) {
7505     SvREFCNT_dec_NN(sv);
7506     return TRUE;
7507    } else {
7508     return FALSE;
7509    }
7510   }
7511  } else {
7512   return FALSE;
7513  }
7514 }
7515
7516 SV*
7517 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7518 {
7519  struct regexp *const rx = ReANY(r);
7520
7521  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7522
7523  if ( rx && RXp_PAREN_NAMES(rx) ) {
7524   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7525
7526   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7527  } else {
7528   return FALSE;
7529  }
7530 }
7531
7532 SV*
7533 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7534 {
7535  struct regexp *const rx = ReANY(r);
7536  GET_RE_DEBUG_FLAGS_DECL;
7537
7538  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7539
7540  if (rx && RXp_PAREN_NAMES(rx)) {
7541   HV *hv = RXp_PAREN_NAMES(rx);
7542   HE *temphe;
7543   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7544    IV i;
7545    IV parno = 0;
7546    SV* sv_dat = HeVAL(temphe);
7547    I32 *nums = (I32*)SvPVX(sv_dat);
7548    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7549     if ((I32)(rx->lastparen) >= nums[i] &&
7550      rx->offs[nums[i]].start != -1 &&
7551      rx->offs[nums[i]].end != -1)
7552     {
7553      parno = nums[i];
7554      break;
7555     }
7556    }
7557    if (parno || flags & RXapif_ALL) {
7558     return newSVhek(HeKEY_hek(temphe));
7559    }
7560   }
7561  }
7562  return NULL;
7563 }
7564
7565 SV*
7566 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7567 {
7568  SV *ret;
7569  AV *av;
7570  SSize_t length;
7571  struct regexp *const rx = ReANY(r);
7572
7573  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7574
7575  if (rx && RXp_PAREN_NAMES(rx)) {
7576   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7577    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7578   } else if (flags & RXapif_ONE) {
7579    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7580    av = MUTABLE_AV(SvRV(ret));
7581    length = av_tindex(av);
7582    SvREFCNT_dec_NN(ret);
7583    return newSViv(length + 1);
7584   } else {
7585    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7586             (int)flags);
7587    return NULL;
7588   }
7589  }
7590  return &PL_sv_undef;
7591 }
7592
7593 SV*
7594 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7595 {
7596  struct regexp *const rx = ReANY(r);
7597  AV *av = newAV();
7598
7599  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7600
7601  if (rx && RXp_PAREN_NAMES(rx)) {
7602   HV *hv= RXp_PAREN_NAMES(rx);
7603   HE *temphe;
7604   (void)hv_iterinit(hv);
7605   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7606    IV i;
7607    IV parno = 0;
7608    SV* sv_dat = HeVAL(temphe);
7609    I32 *nums = (I32*)SvPVX(sv_dat);
7610    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7611     if ((I32)(rx->lastparen) >= nums[i] &&
7612      rx->offs[nums[i]].start != -1 &&
7613      rx->offs[nums[i]].end != -1)
7614     {
7615      parno = nums[i];
7616      break;
7617     }
7618    }
7619    if (parno || flags & RXapif_ALL) {
7620     av_push(av, newSVhek(HeKEY_hek(temphe)));
7621    }
7622   }
7623  }
7624
7625  return newRV_noinc(MUTABLE_SV(av));
7626 }
7627
7628 void
7629 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7630        SV * const sv)
7631 {
7632  struct regexp *const rx = ReANY(r);
7633  char *s = NULL;
7634  SSize_t i = 0;
7635  SSize_t s1, t1;
7636  I32 n = paren;
7637
7638  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7639
7640  if (      n == RX_BUFF_IDX_CARET_PREMATCH
7641   || n == RX_BUFF_IDX_CARET_FULLMATCH
7642   || n == RX_BUFF_IDX_CARET_POSTMATCH
7643  )
7644  {
7645   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7646   if (!keepcopy) {
7647    /* on something like
7648    *    $r = qr/.../;
7649    *    /$qr/p;
7650    * the KEEPCOPY is set on the PMOP rather than the regex */
7651    if (PL_curpm && r == PM_GETRE(PL_curpm))
7652     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7653   }
7654   if (!keepcopy)
7655    goto ret_undef;
7656  }
7657
7658  if (!rx->subbeg)
7659   goto ret_undef;
7660
7661  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7662   /* no need to distinguish between them any more */
7663   n = RX_BUFF_IDX_FULLMATCH;
7664
7665  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7666   && rx->offs[0].start != -1)
7667  {
7668   /* $`, ${^PREMATCH} */
7669   i = rx->offs[0].start;
7670   s = rx->subbeg;
7671  }
7672  else
7673  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7674   && rx->offs[0].end != -1)
7675  {
7676   /* $', ${^POSTMATCH} */
7677   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7678   i = rx->sublen + rx->suboffset - rx->offs[0].end;
7679  }
7680  else
7681  if ( 0 <= n && n <= (I32)rx->nparens &&
7682   (s1 = rx->offs[n].start) != -1 &&
7683   (t1 = rx->offs[n].end) != -1)
7684  {
7685   /* $&, ${^MATCH},  $1 ... */
7686   i = t1 - s1;
7687   s = rx->subbeg + s1 - rx->suboffset;
7688  } else {
7689   goto ret_undef;
7690  }
7691
7692  assert(s >= rx->subbeg);
7693  assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7694  if (i >= 0) {
7695 #ifdef NO_TAINT_SUPPORT
7696   sv_setpvn(sv, s, i);
7697 #else
7698   const int oldtainted = TAINT_get;
7699   TAINT_NOT;
7700   sv_setpvn(sv, s, i);
7701   TAINT_set(oldtainted);
7702 #endif
7703   if (RXp_MATCH_UTF8(rx))
7704    SvUTF8_on(sv);
7705   else
7706    SvUTF8_off(sv);
7707   if (TAINTING_get) {
7708    if (RXp_MATCH_TAINTED(rx)) {
7709     if (SvTYPE(sv) >= SVt_PVMG) {
7710      MAGIC* const mg = SvMAGIC(sv);
7711      MAGIC* mgt;
7712      TAINT;
7713      SvMAGIC_set(sv, mg->mg_moremagic);
7714      SvTAINT(sv);
7715      if ((mgt = SvMAGIC(sv))) {
7716       mg->mg_moremagic = mgt;
7717       SvMAGIC_set(sv, mg);
7718      }
7719     } else {
7720      TAINT;
7721      SvTAINT(sv);
7722     }
7723    } else
7724     SvTAINTED_off(sv);
7725   }
7726  } else {
7727  ret_undef:
7728   sv_setsv(sv,&PL_sv_undef);
7729   return;
7730  }
7731 }
7732
7733 void
7734 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7735               SV const * const value)
7736 {
7737  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7738
7739  PERL_UNUSED_ARG(rx);
7740  PERL_UNUSED_ARG(paren);
7741  PERL_UNUSED_ARG(value);
7742
7743  if (!PL_localizing)
7744   Perl_croak_no_modify();
7745 }
7746
7747 I32
7748 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7749        const I32 paren)
7750 {
7751  struct regexp *const rx = ReANY(r);
7752  I32 i;
7753  I32 s1, t1;
7754
7755  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7756
7757  if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7758   || paren == RX_BUFF_IDX_CARET_FULLMATCH
7759   || paren == RX_BUFF_IDX_CARET_POSTMATCH
7760  )
7761  {
7762   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7763   if (!keepcopy) {
7764    /* on something like
7765    *    $r = qr/.../;
7766    *    /$qr/p;
7767    * the KEEPCOPY is set on the PMOP rather than the regex */
7768    if (PL_curpm && r == PM_GETRE(PL_curpm))
7769     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7770   }
7771   if (!keepcopy)
7772    goto warn_undef;
7773  }
7774
7775  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7776  switch (paren) {
7777  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7778  case RX_BUFF_IDX_PREMATCH:       /* $` */
7779   if (rx->offs[0].start != -1) {
7780       i = rx->offs[0].start;
7781       if (i > 0) {
7782         s1 = 0;
7783         t1 = i;
7784         goto getlen;
7785       }
7786    }
7787   return 0;
7788
7789  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7790  case RX_BUFF_IDX_POSTMATCH:       /* $' */
7791    if (rx->offs[0].end != -1) {
7792       i = rx->sublen - rx->offs[0].end;
7793       if (i > 0) {
7794         s1 = rx->offs[0].end;
7795         t1 = rx->sublen;
7796         goto getlen;
7797       }
7798    }
7799   return 0;
7800
7801  default: /* $& / ${^MATCH}, $1, $2, ... */
7802    if (paren <= (I32)rx->nparens &&
7803    (s1 = rx->offs[paren].start) != -1 &&
7804    (t1 = rx->offs[paren].end) != -1)
7805    {
7806    i = t1 - s1;
7807    goto getlen;
7808   } else {
7809   warn_undef:
7810    if (ckWARN(WARN_UNINITIALIZED))
7811     report_uninit((const SV *)sv);
7812    return 0;
7813   }
7814  }
7815   getlen:
7816  if (i > 0 && RXp_MATCH_UTF8(rx)) {
7817   const char * const s = rx->subbeg - rx->suboffset + s1;
7818   const U8 *ep;
7819   STRLEN el;
7820
7821   i = t1 - s1;
7822   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7823       i = el;
7824  }
7825  return i;
7826 }
7827
7828 SV*
7829 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7830 {
7831  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7832   PERL_UNUSED_ARG(rx);
7833   if (0)
7834    return NULL;
7835   else
7836    return newSVpvs("Regexp");
7837 }
7838
7839 /* Scans the name of a named buffer from the pattern.
7840  * If flags is REG_RSN_RETURN_NULL returns null.
7841  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7842  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7843  * to the parsed name as looked up in the RExC_paren_names hash.
7844  * If there is an error throws a vFAIL().. type exception.
7845  */
7846
7847 #define REG_RSN_RETURN_NULL    0
7848 #define REG_RSN_RETURN_NAME    1
7849 #define REG_RSN_RETURN_DATA    2
7850
7851 STATIC SV*
7852 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7853 {
7854  char *name_start = RExC_parse;
7855
7856  PERL_ARGS_ASSERT_REG_SCAN_NAME;
7857
7858  assert (RExC_parse <= RExC_end);
7859  if (RExC_parse == RExC_end) NOOP;
7860  else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7861   /* skip IDFIRST by using do...while */
7862   if (UTF)
7863    do {
7864     RExC_parse += UTF8SKIP(RExC_parse);
7865    } while (isWORDCHAR_utf8((U8*)RExC_parse));
7866   else
7867    do {
7868     RExC_parse++;
7869    } while (isWORDCHAR(*RExC_parse));
7870  } else {
7871   RExC_parse++; /* so the <- from the vFAIL is after the offending
7872       character */
7873   vFAIL("Group name must start with a non-digit word character");
7874  }
7875  if ( flags ) {
7876   SV* sv_name
7877    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7878        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7879   if ( flags == REG_RSN_RETURN_NAME)
7880    return sv_name;
7881   else if (flags==REG_RSN_RETURN_DATA) {
7882    HE *he_str = NULL;
7883    SV *sv_dat = NULL;
7884    if ( ! sv_name )      /* should not happen*/
7885     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7886    if (RExC_paren_names)
7887     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7888    if ( he_str )
7889     sv_dat = HeVAL(he_str);
7890    if ( ! sv_dat )
7891     vFAIL("Reference to nonexistent named group");
7892    return sv_dat;
7893   }
7894   else {
7895    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7896      (unsigned long) flags);
7897   }
7898   NOT_REACHED; /* NOTREACHED */
7899  }
7900  return NULL;
7901 }
7902
7903 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7904  int num;                                                    \
7905  if (RExC_lastparse!=RExC_parse) {                           \
7906   PerlIO_printf(Perl_debug_log, "%s",                     \
7907    Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
7908     RExC_end - RExC_parse, 16,                      \
7909     "", "",                                         \
7910     PERL_PV_ESCAPE_UNI_DETECT |                     \
7911     PERL_PV_PRETTY_ELLIPSES   |                     \
7912     PERL_PV_PRETTY_LTGT       |                     \
7913     PERL_PV_ESCAPE_RE         |                     \
7914     PERL_PV_PRETTY_EXACTSIZE                        \
7915    )                                                   \
7916   );                                                      \
7917  } else                                                      \
7918   PerlIO_printf(Perl_debug_log,"%16s","");                \
7919                 \
7920  if (SIZE_ONLY)                                              \
7921  num = RExC_size + 1;                                     \
7922  else                                                        \
7923  num=REG_NODE_NUM(RExC_emit);                             \
7924  if (RExC_lastnum!=num)                                      \
7925  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7926  else                                                        \
7927  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7928  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7929   (int)((depth*2)), "",                                   \
7930   (funcname)                                              \
7931  );                                                          \
7932  RExC_lastnum=num;                                           \
7933  RExC_lastparse=RExC_parse;                                  \
7934 })
7935
7936
7937
7938 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7939  DEBUG_PARSE_MSG((funcname));                            \
7940  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7941 })
7942 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7943  DEBUG_PARSE_MSG((funcname));                            \
7944  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7945 })
7946
7947 /* This section of code defines the inversion list object and its methods.  The
7948  * interfaces are highly subject to change, so as much as possible is static to
7949  * this file.  An inversion list is here implemented as a malloc'd C UV array
7950  * as an SVt_INVLIST scalar.
7951  *
7952  * An inversion list for Unicode is an array of code points, sorted by ordinal
7953  * number.  The zeroth element is the first code point in the list.  The 1th
7954  * element is the first element beyond that not in the list.  In other words,
7955  * the first range is
7956  *  invlist[0]..(invlist[1]-1)
7957  * The other ranges follow.  Thus every element whose index is divisible by two
7958  * marks the beginning of a range that is in the list, and every element not
7959  * divisible by two marks the beginning of a range not in the list.  A single
7960  * element inversion list that contains the single code point N generally
7961  * consists of two elements
7962  *  invlist[0] == N
7963  *  invlist[1] == N+1
7964  * (The exception is when N is the highest representable value on the
7965  * machine, in which case the list containing just it would be a single
7966  * element, itself.  By extension, if the last range in the list extends to
7967  * infinity, then the first element of that range will be in the inversion list
7968  * at a position that is divisible by two, and is the final element in the
7969  * list.)
7970  * Taking the complement (inverting) an inversion list is quite simple, if the
7971  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7972  * This implementation reserves an element at the beginning of each inversion
7973  * list to always contain 0; there is an additional flag in the header which
7974  * indicates if the list begins at the 0, or is offset to begin at the next
7975  * element.
7976  *
7977  * More about inversion lists can be found in "Unicode Demystified"
7978  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7979  * More will be coming when functionality is added later.
7980  *
7981  * The inversion list data structure is currently implemented as an SV pointing
7982  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7983  * array of UV whose memory management is automatically handled by the existing
7984  * facilities for SV's.
7985  *
7986  * Some of the methods should always be private to the implementation, and some
7987  * should eventually be made public */
7988
7989 /* The header definitions are in F<inline_invlist.c> */
7990
7991 PERL_STATIC_INLINE UV*
7992 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7993 {
7994  /* Returns a pointer to the first element in the inversion list's array.
7995  * This is called upon initialization of an inversion list.  Where the
7996  * array begins depends on whether the list has the code point U+0000 in it
7997  * or not.  The other parameter tells it whether the code that follows this
7998  * call is about to put a 0 in the inversion list or not.  The first
7999  * element is either the element reserved for 0, if TRUE, or the element
8000  * after it, if FALSE */
8001
8002  bool* offset = get_invlist_offset_addr(invlist);
8003  UV* zero_addr = (UV *) SvPVX(invlist);
8004
8005  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8006
8007  /* Must be empty */
8008  assert(! _invlist_len(invlist));
8009
8010  *zero_addr = 0;
8011
8012  /* 1^1 = 0; 1^0 = 1 */
8013  *offset = 1 ^ will_have_0;
8014  return zero_addr + *offset;
8015 }
8016
8017 PERL_STATIC_INLINE void
8018 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8019 {
8020  /* Sets the current number of elements stored in the inversion list.
8021  * Updates SvCUR correspondingly */
8022  PERL_UNUSED_CONTEXT;
8023  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8024
8025  assert(SvTYPE(invlist) == SVt_INVLIST);
8026
8027  SvCUR_set(invlist,
8028    (len == 0)
8029    ? 0
8030    : TO_INTERNAL_SIZE(len + offset));
8031  assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8032 }
8033
8034 #ifndef PERL_IN_XSUB_RE
8035
8036 PERL_STATIC_INLINE IV*
8037 S_get_invlist_previous_index_addr(SV* invlist)
8038 {
8039  /* Return the address of the IV that is reserved to hold the cached index
8040  * */
8041  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8042
8043  assert(SvTYPE(invlist) == SVt_INVLIST);
8044
8045  return &(((XINVLIST*) SvANY(invlist))->prev_index);
8046 }
8047
8048 PERL_STATIC_INLINE IV
8049 S_invlist_previous_index(SV* const invlist)
8050 {
8051  /* Returns cached index of previous search */
8052
8053  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8054
8055  return *get_invlist_previous_index_addr(invlist);
8056 }
8057
8058 PERL_STATIC_INLINE void
8059 S_invlist_set_previous_index(SV* const invlist, const IV index)
8060 {
8061  /* Caches <index> for later retrieval */
8062
8063  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8064
8065  assert(index == 0 || index < (int) _invlist_len(invlist));
8066
8067  *get_invlist_previous_index_addr(invlist) = index;
8068 }
8069
8070 PERL_STATIC_INLINE void
8071 S_invlist_trim(SV* const invlist)
8072 {
8073  PERL_ARGS_ASSERT_INVLIST_TRIM;
8074
8075  assert(SvTYPE(invlist) == SVt_INVLIST);
8076
8077  /* Change the length of the inversion list to how many entries it currently
8078  * has */
8079  SvPV_shrink_to_cur((SV *) invlist);
8080 }
8081
8082 PERL_STATIC_INLINE bool
8083 S_invlist_is_iterating(SV* const invlist)
8084 {
8085  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8086
8087  return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8088 }
8089
8090 #endif /* ifndef PERL_IN_XSUB_RE */
8091
8092 PERL_STATIC_INLINE UV
8093 S_invlist_max(SV* const invlist)
8094 {
8095  /* Returns the maximum number of elements storable in the inversion list's
8096  * array, without having to realloc() */
8097
8098  PERL_ARGS_ASSERT_INVLIST_MAX;
8099
8100  assert(SvTYPE(invlist) == SVt_INVLIST);
8101
8102  /* Assumes worst case, in which the 0 element is not counted in the
8103  * inversion list, so subtracts 1 for that */
8104  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8105   ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8106   : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8107 }
8108
8109 #ifndef PERL_IN_XSUB_RE
8110 SV*
8111 Perl__new_invlist(pTHX_ IV initial_size)
8112 {
8113
8114  /* Return a pointer to a newly constructed inversion list, with enough
8115  * space to store 'initial_size' elements.  If that number is negative, a
8116  * system default is used instead */
8117
8118  SV* new_list;
8119
8120  if (initial_size < 0) {
8121   initial_size = 10;
8122  }
8123
8124  /* Allocate the initial space */
8125  new_list = newSV_type(SVt_INVLIST);
8126
8127  /* First 1 is in case the zero element isn't in the list; second 1 is for
8128  * trailing NUL */
8129  SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8130  invlist_set_len(new_list, 0, 0);
8131
8132  /* Force iterinit() to be used to get iteration to work */
8133  *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8134
8135  *get_invlist_previous_index_addr(new_list) = 0;
8136
8137  return new_list;
8138 }
8139
8140 SV*
8141 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8142 {
8143  /* Return a pointer to a newly constructed inversion list, initialized to
8144  * point to <list>, which has to be in the exact correct inversion list
8145  * form, including internal fields.  Thus this is a dangerous routine that
8146  * should not be used in the wrong hands.  The passed in 'list' contains
8147  * several header fields at the beginning that are not part of the
8148  * inversion list body proper */
8149
8150  const STRLEN length = (STRLEN) list[0];
8151  const UV version_id =          list[1];
8152  const bool offset   =    cBOOL(list[2]);
8153 #define HEADER_LENGTH 3
8154  /* If any of the above changes in any way, you must change HEADER_LENGTH
8155  * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8156  *      perl -E 'say int(rand 2**31-1)'
8157  */
8158 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8159           data structure type, so that one being
8160           passed in can be validated to be an
8161           inversion list of the correct vintage.
8162          */
8163
8164  SV* invlist = newSV_type(SVt_INVLIST);
8165
8166  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8167
8168  if (version_id != INVLIST_VERSION_ID) {
8169   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8170  }
8171
8172  /* The generated array passed in includes header elements that aren't part
8173  * of the list proper, so start it just after them */
8174  SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8175
8176  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8177        shouldn't touch it */
8178
8179  *(get_invlist_offset_addr(invlist)) = offset;
8180
8181  /* The 'length' passed to us is the physical number of elements in the
8182  * inversion list.  But if there is an offset the logical number is one
8183  * less than that */
8184  invlist_set_len(invlist, length  - offset, offset);
8185
8186  invlist_set_previous_index(invlist, 0);
8187
8188  /* Initialize the iteration pointer. */
8189  invlist_iterfinish(invlist);
8190
8191  SvREADONLY_on(invlist);
8192
8193  return invlist;
8194 }
8195 #endif /* ifndef PERL_IN_XSUB_RE */
8196
8197 STATIC void
8198 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8199 {
8200  /* Grow the maximum size of an inversion list */
8201
8202  PERL_ARGS_ASSERT_INVLIST_EXTEND;
8203
8204  assert(SvTYPE(invlist) == SVt_INVLIST);
8205
8206  /* Add one to account for the zero element at the beginning which may not
8207  * be counted by the calling parameters */
8208  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8209 }
8210
8211 STATIC void
8212 S__append_range_to_invlist(pTHX_ SV* const invlist,
8213         const UV start, const UV end)
8214 {
8215    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8216  * the end of the inversion list.  The range must be above any existing
8217  * ones. */
8218
8219  UV* array;
8220  UV max = invlist_max(invlist);
8221  UV len = _invlist_len(invlist);
8222  bool offset;
8223
8224  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8225
8226  if (len == 0) { /* Empty lists must be initialized */
8227   offset = start != 0;
8228   array = _invlist_array_init(invlist, ! offset);
8229  }
8230  else {
8231   /* Here, the existing list is non-empty. The current max entry in the
8232   * list is generally the first value not in the set, except when the
8233   * set extends to the end of permissible values, in which case it is
8234   * the first entry in that final set, and so this call is an attempt to
8235   * append out-of-order */
8236
8237   UV final_element = len - 1;
8238   array = invlist_array(invlist);
8239   if (array[final_element] > start
8240    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8241   {
8242    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8243      array[final_element], start,
8244      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8245   }
8246
8247   /* Here, it is a legal append.  If the new range begins with the first
8248   * value not in the set, it is extending the set, so the new first
8249   * value not in the set is one greater than the newly extended range.
8250   * */
8251   offset = *get_invlist_offset_addr(invlist);
8252   if (array[final_element] == start) {
8253    if (end != UV_MAX) {
8254     array[final_element] = end + 1;
8255    }
8256    else {
8257     /* But if the end is the maximum representable on the machine,
8258     * just let the range that this would extend to have no end */
8259     invlist_set_len(invlist, len - 1, offset);
8260    }
8261    return;
8262   }
8263  }
8264
8265  /* Here the new range doesn't extend any existing set.  Add it */
8266
8267  len += 2; /* Includes an element each for the start and end of range */
8268
8269  /* If wll overflow the existing space, extend, which may cause the array to
8270  * be moved */
8271  if (max < len) {
8272   invlist_extend(invlist, len);
8273
8274   /* Have to set len here to avoid assert failure in invlist_array() */
8275   invlist_set_len(invlist, len, offset);
8276
8277   array = invlist_array(invlist);
8278  }
8279  else {
8280   invlist_set_len(invlist, len, offset);
8281  }
8282
8283  /* The next item on the list starts the range, the one after that is
8284  * one past the new range.  */
8285  array[len - 2] = start;
8286  if (end != UV_MAX) {
8287   array[len - 1] = end + 1;
8288  }
8289  else {
8290   /* But if the end is the maximum representable on the machine, just let
8291   * the range have no end */
8292   invlist_set_len(invlist, len - 1, offset);
8293  }
8294 }
8295
8296 #ifndef PERL_IN_XSUB_RE
8297
8298 IV
8299 Perl__invlist_search(SV* const invlist, const UV cp)
8300 {
8301  /* Searches the inversion list for the entry that contains the input code
8302  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8303  * return value is the index into the list's array of the range that
8304  * contains <cp> */
8305
8306  IV low = 0;
8307  IV mid;
8308  IV high = _invlist_len(invlist);
8309  const IV highest_element = high - 1;
8310  const UV* array;
8311
8312  PERL_ARGS_ASSERT__INVLIST_SEARCH;
8313
8314  /* If list is empty, return failure. */
8315  if (high == 0) {
8316   return -1;
8317  }
8318
8319  /* (We can't get the array unless we know the list is non-empty) */
8320  array = invlist_array(invlist);
8321
8322  mid = invlist_previous_index(invlist);
8323  assert(mid >=0 && mid <= highest_element);
8324
8325  /* <mid> contains the cache of the result of the previous call to this
8326  * function (0 the first time).  See if this call is for the same result,
8327  * or if it is for mid-1.  This is under the theory that calls to this
8328  * function will often be for related code points that are near each other.
8329  * And benchmarks show that caching gives better results.  We also test
8330  * here if the code point is within the bounds of the list.  These tests
8331  * replace others that would have had to be made anyway to make sure that
8332  * the array bounds were not exceeded, and these give us extra information
8333  * at the same time */
8334  if (cp >= array[mid]) {
8335   if (cp >= array[highest_element]) {
8336    return highest_element;
8337   }
8338
8339   /* Here, array[mid] <= cp < array[highest_element].  This means that
8340   * the final element is not the answer, so can exclude it; it also
8341   * means that <mid> is not the final element, so can refer to 'mid + 1'
8342   * safely */
8343   if (cp < array[mid + 1]) {
8344    return mid;
8345   }
8346   high--;
8347   low = mid + 1;
8348  }
8349  else { /* cp < aray[mid] */
8350   if (cp < array[0]) { /* Fail if outside the array */
8351    return -1;
8352   }
8353   high = mid;
8354   if (cp >= array[mid - 1]) {
8355    goto found_entry;
8356   }
8357  }
8358
8359  /* Binary search.  What we are looking for is <i> such that
8360  * array[i] <= cp < array[i+1]
8361  * The loop below converges on the i+1.  Note that there may not be an
8362  * (i+1)th element in the array, and things work nonetheless */
8363  while (low < high) {
8364   mid = (low + high) / 2;
8365   assert(mid <= highest_element);
8366   if (array[mid] <= cp) { /* cp >= array[mid] */
8367    low = mid + 1;
8368
8369    /* We could do this extra test to exit the loop early.
8370    if (cp < array[low]) {
8371     return mid;
8372    }
8373    */
8374   }
8375   else { /* cp < array[mid] */
8376    high = mid;
8377   }
8378  }
8379
8380   found_entry:
8381  high--;
8382  invlist_set_previous_index(invlist, high);
8383  return high;
8384 }
8385
8386 void
8387 Perl__invlist_populate_swatch(SV* const invlist,
8388        const UV start, const UV end, U8* swatch)
8389 {
8390  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8391  * but is used when the swash has an inversion list.  This makes this much
8392  * faster, as it uses a binary search instead of a linear one.  This is
8393  * intimately tied to that function, and perhaps should be in utf8.c,
8394  * except it is intimately tied to inversion lists as well.  It assumes
8395  * that <swatch> is all 0's on input */
8396
8397  UV current = start;
8398  const IV len = _invlist_len(invlist);
8399  IV i;
8400  const UV * array;
8401
8402  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8403
8404  if (len == 0) { /* Empty inversion list */
8405   return;
8406  }
8407
8408  array = invlist_array(invlist);
8409
8410  /* Find which element it is */
8411  i = _invlist_search(invlist, start);
8412
8413  /* We populate from <start> to <end> */
8414  while (current < end) {
8415   UV upper;
8416
8417   /* The inversion list gives the results for every possible code point
8418   * after the first one in the list.  Only those ranges whose index is
8419   * even are ones that the inversion list matches.  For the odd ones,
8420   * and if the initial code point is not in the list, we have to skip
8421   * forward to the next element */
8422   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8423    i++;
8424    if (i >= len) { /* Finished if beyond the end of the array */
8425     return;
8426    }
8427    current = array[i];
8428    if (current >= end) {   /* Finished if beyond the end of what we
8429          are populating */
8430     if (LIKELY(end < UV_MAX)) {
8431      return;
8432     }
8433
8434     /* We get here when the upper bound is the maximum
8435     * representable on the machine, and we are looking for just
8436     * that code point.  Have to special case it */
8437     i = len;
8438     goto join_end_of_list;
8439    }
8440   }
8441   assert(current >= start);
8442
8443   /* The current range ends one below the next one, except don't go past
8444   * <end> */
8445   i++;
8446   upper = (i < len && array[i] < end) ? array[i] : end;
8447
8448   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8449   * for each code point in it */
8450   for (; current < upper; current++) {
8451    const STRLEN offset = (STRLEN)(current - start);
8452    swatch[offset >> 3] |= 1 << (offset & 7);
8453   }
8454
8455  join_end_of_list:
8456
8457   /* Quit if at the end of the list */
8458   if (i >= len) {
8459
8460    /* But first, have to deal with the highest possible code point on
8461    * the platform.  The previous code assumes that <end> is one
8462    * beyond where we want to populate, but that is impossible at the
8463    * platform's infinity, so have to handle it specially */
8464    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8465    {
8466     const STRLEN offset = (STRLEN)(end - start);
8467     swatch[offset >> 3] |= 1 << (offset & 7);
8468    }
8469    return;
8470   }
8471
8472   /* Advance to the next range, which will be for code points not in the
8473   * inversion list */
8474   current = array[i];
8475  }
8476
8477  return;
8478 }
8479
8480 void
8481 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8482           const bool complement_b, SV** output)
8483 {
8484  /* Take the union of two inversion lists and point <output> to it.  *output
8485  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8486  * the reference count to that list will be decremented if not already a
8487  * temporary (mortal); otherwise *output will be made correspondingly
8488  * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8489  * second list is returned.  If <complement_b> is TRUE, the union is taken
8490  * of the complement (inversion) of <b> instead of b itself.
8491  *
8492  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8493  * Richard Gillam, published by Addison-Wesley, and explained at some
8494  * length there.  The preface says to incorporate its examples into your
8495  * code at your own risk.
8496  *
8497  * The algorithm is like a merge sort.
8498  *
8499  * XXX A potential performance improvement is to keep track as we go along
8500  * if only one of the inputs contributes to the result, meaning the other
8501  * is a subset of that one.  In that case, we can skip the final copy and
8502  * return the larger of the input lists, but then outside code might need
8503  * to keep track of whether to free the input list or not */
8504
8505  const UV* array_a;    /* a's array */
8506  const UV* array_b;
8507  UV len_a;     /* length of a's array */
8508  UV len_b;
8509
8510  SV* u;   /* the resulting union */
8511  UV* array_u;
8512  UV len_u;
8513
8514  UV i_a = 0;      /* current index into a's array */
8515  UV i_b = 0;
8516  UV i_u = 0;
8517
8518  /* running count, as explained in the algorithm source book; items are
8519  * stopped accumulating and are output when the count changes to/from 0.
8520  * The count is incremented when we start a range that's in the set, and
8521  * decremented when we start a range that's not in the set.  So its range
8522  * is 0 to 2.  Only when the count is zero is something not in the set.
8523  */
8524  UV count = 0;
8525
8526  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8527  assert(a != b);
8528
8529  /* If either one is empty, the union is the other one */
8530  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8531   bool make_temp = FALSE; /* Should we mortalize the result? */
8532
8533   if (*output == a) {
8534    if (a != NULL) {
8535     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8536      SvREFCNT_dec_NN(a);
8537     }
8538    }
8539   }
8540   if (*output != b) {
8541    *output = invlist_clone(b);
8542    if (complement_b) {
8543     _invlist_invert(*output);
8544    }
8545   } /* else *output already = b; */
8546
8547   if (make_temp) {
8548    sv_2mortal(*output);
8549   }
8550   return;
8551  }
8552  else if ((len_b = _invlist_len(b)) == 0) {
8553   bool make_temp = FALSE;
8554   if (*output == b) {
8555    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8556     SvREFCNT_dec_NN(b);
8557    }
8558   }
8559
8560   /* The complement of an empty list is a list that has everything in it,
8561   * so the union with <a> includes everything too */
8562   if (complement_b) {
8563    if (a == *output) {
8564     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8565      SvREFCNT_dec_NN(a);
8566     }
8567    }
8568    *output = _new_invlist(1);
8569    _append_range_to_invlist(*output, 0, UV_MAX);
8570   }
8571   else if (*output != a) {
8572    *output = invlist_clone(a);
8573   }
8574   /* else *output already = a; */
8575
8576   if (make_temp) {
8577    sv_2mortal(*output);
8578   }
8579   return;
8580  }
8581
8582  /* Here both lists exist and are non-empty */
8583  array_a = invlist_array(a);
8584  array_b = invlist_array(b);
8585
8586  /* If are to take the union of 'a' with the complement of b, set it
8587  * up so are looking at b's complement. */
8588  if (complement_b) {
8589
8590   /* To complement, we invert: if the first element is 0, remove it.  To
8591   * do this, we just pretend the array starts one later */
8592   if (array_b[0] == 0) {
8593    array_b++;
8594    len_b--;
8595   }
8596   else {
8597
8598    /* But if the first element is not zero, we pretend the list starts
8599    * at the 0 that is always stored immediately before the array. */
8600    array_b--;
8601    len_b++;
8602   }
8603  }
8604
8605  /* Size the union for the worst case: that the sets are completely
8606  * disjoint */
8607  u = _new_invlist(len_a + len_b);
8608
8609  /* Will contain U+0000 if either component does */
8610  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8611          || (len_b > 0 && array_b[0] == 0));
8612
8613  /* Go through each list item by item, stopping when exhausted one of
8614  * them */
8615  while (i_a < len_a && i_b < len_b) {
8616   UV cp;     /* The element to potentially add to the union's array */
8617   bool cp_in_set;   /* is it in the the input list's set or not */
8618
8619   /* We need to take one or the other of the two inputs for the union.
8620   * Since we are merging two sorted lists, we take the smaller of the
8621   * next items.  In case of a tie, we take the one that is in its set
8622   * first.  If we took one not in the set first, it would decrement the
8623   * count, possibly to 0 which would cause it to be output as ending the
8624   * range, and the next time through we would take the same number, and
8625   * output it again as beginning the next range.  By doing it the
8626   * opposite way, there is no possibility that the count will be
8627   * momentarily decremented to 0, and thus the two adjoining ranges will
8628   * be seamlessly merged.  (In a tie and both are in the set or both not
8629   * in the set, it doesn't matter which we take first.) */
8630   if (array_a[i_a] < array_b[i_b]
8631    || (array_a[i_a] == array_b[i_b]
8632     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8633   {
8634    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8635    cp= array_a[i_a++];
8636   }
8637   else {
8638    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8639    cp = array_b[i_b++];
8640   }
8641
8642   /* Here, have chosen which of the two inputs to look at.  Only output
8643   * if the running count changes to/from 0, which marks the
8644   * beginning/end of a range in that's in the set */
8645   if (cp_in_set) {
8646    if (count == 0) {
8647     array_u[i_u++] = cp;
8648    }
8649    count++;
8650   }
8651   else {
8652    count--;
8653    if (count == 0) {
8654     array_u[i_u++] = cp;
8655    }
8656   }
8657  }
8658
8659  /* Here, we are finished going through at least one of the lists, which
8660  * means there is something remaining in at most one.  We check if the list
8661  * that hasn't been exhausted is positioned such that we are in the middle
8662  * of a range in its set or not.  (i_a and i_b point to the element beyond
8663  * the one we care about.) If in the set, we decrement 'count'; if 0, there
8664  * is potentially more to output.
8665  * There are four cases:
8666  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
8667  *    in the union is entirely from the non-exhausted set.
8668  * 2) Both were in their sets, count is 2.  Nothing further should
8669  *    be output, as everything that remains will be in the exhausted
8670  *    list's set, hence in the union; decrementing to 1 but not 0 insures
8671  *    that
8672  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8673  *    Nothing further should be output because the union includes
8674  *    everything from the exhausted set.  Not decrementing ensures that.
8675  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8676  *    decrementing to 0 insures that we look at the remainder of the
8677  *    non-exhausted set */
8678  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8679   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8680  {
8681   count--;
8682  }
8683
8684  /* The final length is what we've output so far, plus what else is about to
8685  * be output.  (If 'count' is non-zero, then the input list we exhausted
8686  * has everything remaining up to the machine's limit in its set, and hence
8687  * in the union, so there will be no further output. */
8688  len_u = i_u;
8689  if (count == 0) {
8690   /* At most one of the subexpressions will be non-zero */
8691   len_u += (len_a - i_a) + (len_b - i_b);
8692  }
8693
8694  /* Set result to final length, which can change the pointer to array_u, so
8695  * re-find it */
8696  if (len_u != _invlist_len(u)) {
8697   invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8698   invlist_trim(u);
8699   array_u = invlist_array(u);
8700  }
8701
8702  /* When 'count' is 0, the list that was exhausted (if one was shorter than
8703  * the other) ended with everything above it not in its set.  That means
8704  * that the remaining part of the union is precisely the same as the
8705  * non-exhausted list, so can just copy it unchanged.  (If both list were
8706  * exhausted at the same time, then the operations below will be both 0.)
8707  */
8708  if (count == 0) {
8709   IV copy_count; /* At most one will have a non-zero copy count */
8710   if ((copy_count = len_a - i_a) > 0) {
8711    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8712   }
8713   else if ((copy_count = len_b - i_b) > 0) {
8714    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8715   }
8716  }
8717
8718  /*  We may be removing a reference to one of the inputs.  If so, the output
8719  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8720  *  count decremented) */
8721  if (a == *output || b == *output) {
8722   assert(! invlist_is_iterating(*output));
8723   if ((SvTEMP(*output))) {
8724    sv_2mortal(u);
8725   }
8726   else {
8727    SvREFCNT_dec_NN(*output);
8728   }
8729  }
8730
8731  *output = u;
8732
8733  return;
8734 }
8735
8736 void
8737 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8738            const bool complement_b, SV** i)
8739 {
8740  /* Take the intersection of two inversion lists and point <i> to it.  *i
8741  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8742  * the reference count to that list will be decremented if not already a
8743  * temporary (mortal); otherwise *i will be made correspondingly mortal.
8744  * The first list, <a>, may be NULL, in which case an empty list is
8745  * returned.  If <complement_b> is TRUE, the result will be the
8746  * intersection of <a> and the complement (or inversion) of <b> instead of
8747  * <b> directly.
8748  *
8749  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8750  * Richard Gillam, published by Addison-Wesley, and explained at some
8751  * length there.  The preface says to incorporate its examples into your
8752  * code at your own risk.  In fact, it had bugs
8753  *
8754  * The algorithm is like a merge sort, and is essentially the same as the
8755  * union above
8756  */
8757
8758  const UV* array_a;  /* a's array */
8759  const UV* array_b;
8760  UV len_a; /* length of a's array */
8761  UV len_b;
8762
8763  SV* r;       /* the resulting intersection */
8764  UV* array_r;
8765  UV len_r;
8766
8767  UV i_a = 0;      /* current index into a's array */
8768  UV i_b = 0;
8769  UV i_r = 0;
8770
8771  /* running count, as explained in the algorithm source book; items are
8772  * stopped accumulating and are output when the count changes to/from 2.
8773  * The count is incremented when we start a range that's in the set, and
8774  * decremented when we start a range that's not in the set.  So its range
8775  * is 0 to 2.  Only when the count is 2 is something in the intersection.
8776  */
8777  UV count = 0;
8778
8779  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8780  assert(a != b);
8781
8782  /* Special case if either one is empty */
8783  len_a = (a == NULL) ? 0 : _invlist_len(a);
8784  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8785   bool make_temp = FALSE;
8786
8787   if (len_a != 0 && complement_b) {
8788
8789    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8790    * be empty.  Here, also we are using 'b's complement, which hence
8791    * must be every possible code point.  Thus the intersection is
8792    * simply 'a'. */
8793    if (*i != a) {
8794     if (*i == b) {
8795      if (! (make_temp = cBOOL(SvTEMP(b)))) {
8796       SvREFCNT_dec_NN(b);
8797      }
8798     }
8799
8800     *i = invlist_clone(a);
8801    }
8802    /* else *i is already 'a' */
8803
8804    if (make_temp) {
8805     sv_2mortal(*i);
8806    }
8807    return;
8808   }
8809
8810   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8811   * intersection must be empty */
8812   if (*i == a) {
8813    if (! (make_temp = cBOOL(SvTEMP(a)))) {
8814     SvREFCNT_dec_NN(a);
8815    }
8816   }
8817   else if (*i == b) {
8818    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8819     SvREFCNT_dec_NN(b);
8820    }
8821   }
8822   *i = _new_invlist(0);
8823   if (make_temp) {
8824    sv_2mortal(*i);
8825   }
8826
8827   return;
8828  }
8829
8830  /* Here both lists exist and are non-empty */
8831  array_a = invlist_array(a);
8832  array_b = invlist_array(b);
8833
8834  /* If are to take the intersection of 'a' with the complement of b, set it
8835  * up so are looking at b's complement. */
8836  if (complement_b) {
8837
8838   /* To complement, we invert: if the first element is 0, remove it.  To
8839   * do this, we just pretend the array starts one later */
8840   if (array_b[0] == 0) {
8841    array_b++;
8842    len_b--;
8843   }
8844   else {
8845
8846    /* But if the first element is not zero, we pretend the list starts
8847    * at the 0 that is always stored immediately before the array. */
8848    array_b--;
8849    len_b++;
8850   }
8851  }
8852
8853  /* Size the intersection for the worst case: that the intersection ends up
8854  * fragmenting everything to be completely disjoint */
8855  r= _new_invlist(len_a + len_b);
8856
8857  /* Will contain U+0000 iff both components do */
8858  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8859          && len_b > 0 && array_b[0] == 0);
8860
8861  /* Go through each list item by item, stopping when exhausted one of
8862  * them */
8863  while (i_a < len_a && i_b < len_b) {
8864   UV cp;     /* The element to potentially add to the intersection's
8865      array */
8866   bool cp_in_set; /* Is it in the input list's set or not */
8867
8868   /* We need to take one or the other of the two inputs for the
8869   * intersection.  Since we are merging two sorted lists, we take the
8870   * smaller of the next items.  In case of a tie, we take the one that
8871   * is not in its set first (a difference from the union algorithm).  If
8872   * we took one in the set first, it would increment the count, possibly
8873   * to 2 which would cause it to be output as starting a range in the
8874   * intersection, and the next time through we would take that same
8875   * number, and output it again as ending the set.  By doing it the
8876   * opposite of this, there is no possibility that the count will be
8877   * momentarily incremented to 2.  (In a tie and both are in the set or
8878   * both not in the set, it doesn't matter which we take first.) */
8879   if (array_a[i_a] < array_b[i_b]
8880    || (array_a[i_a] == array_b[i_b]
8881     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8882   {
8883    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8884    cp= array_a[i_a++];
8885   }
8886   else {
8887    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8888    cp= array_b[i_b++];
8889   }
8890
8891   /* Here, have chosen which of the two inputs to look at.  Only output
8892   * if the running count changes to/from 2, which marks the
8893   * beginning/end of a range that's in the intersection */
8894   if (cp_in_set) {
8895    count++;
8896    if (count == 2) {
8897     array_r[i_r++] = cp;
8898    }
8899   }
8900   else {
8901    if (count == 2) {
8902     array_r[i_r++] = cp;
8903    }
8904    count--;
8905   }
8906  }
8907
8908  /* Here, we are finished going through at least one of the lists, which
8909  * means there is something remaining in at most one.  We check if the list
8910  * that has been exhausted is positioned such that we are in the middle
8911  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8912  * the ones we care about.)  There are four cases:
8913  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
8914  *    nothing left in the intersection.
8915  * 2) Both were in their sets, count is 2 and perhaps is incremented to
8916  *    above 2.  What should be output is exactly that which is in the
8917  *    non-exhausted set, as everything it has is also in the intersection
8918  *    set, and everything it doesn't have can't be in the intersection
8919  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8920  *    gets incremented to 2.  Like the previous case, the intersection is
8921  *    everything that remains in the non-exhausted set.
8922  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8923  *    remains 1.  And the intersection has nothing more. */
8924  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8925   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8926  {
8927   count++;
8928  }
8929
8930  /* The final length is what we've output so far plus what else is in the
8931  * intersection.  At most one of the subexpressions below will be non-zero
8932  * */
8933  len_r = i_r;
8934  if (count >= 2) {
8935   len_r += (len_a - i_a) + (len_b - i_b);
8936  }
8937
8938  /* Set result to final length, which can change the pointer to array_r, so
8939  * re-find it */
8940  if (len_r != _invlist_len(r)) {
8941   invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8942   invlist_trim(r);
8943   array_r = invlist_array(r);
8944  }
8945
8946  /* Finish outputting any remaining */
8947  if (count >= 2) { /* At most one will have a non-zero copy count */
8948   IV copy_count;
8949   if ((copy_count = len_a - i_a) > 0) {
8950    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8951   }
8952   else if ((copy_count = len_b - i_b) > 0) {
8953    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8954   }
8955  }
8956
8957  /*  We may be removing a reference to one of the inputs.  If so, the output
8958  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8959  *  count decremented) */
8960  if (a == *i || b == *i) {
8961   assert(! invlist_is_iterating(*i));
8962   if (SvTEMP(*i)) {
8963    sv_2mortal(r);
8964   }
8965   else {
8966    SvREFCNT_dec_NN(*i);
8967   }
8968  }
8969
8970  *i = r;
8971
8972  return;
8973 }
8974
8975 SV*
8976 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8977 {
8978  /* Add the range from 'start' to 'end' inclusive to the inversion list's
8979  * set.  A pointer to the inversion list is returned.  This may actually be
8980  * a new list, in which case the passed in one has been destroyed.  The
8981  * passed-in inversion list can be NULL, in which case a new one is created
8982  * with just the one range in it */
8983
8984  SV* range_invlist;
8985  UV len;
8986
8987  if (invlist == NULL) {
8988   invlist = _new_invlist(2);
8989   len = 0;
8990  }
8991  else {
8992   len = _invlist_len(invlist);
8993  }
8994
8995  /* If comes after the final entry actually in the list, can just append it
8996  * to the end, */
8997  if (len == 0
8998   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8999    && start >= invlist_array(invlist)[len - 1]))
9000  {
9001   _append_range_to_invlist(invlist, start, end);
9002   return invlist;
9003  }
9004
9005  /* Here, can't just append things, create and return a new inversion list
9006  * which is the union of this range and the existing inversion list */
9007  range_invlist = _new_invlist(2);
9008  _append_range_to_invlist(range_invlist, start, end);
9009
9010  _invlist_union(invlist, range_invlist, &invlist);
9011
9012  /* The temporary can be freed */
9013  SvREFCNT_dec_NN(range_invlist);
9014
9015  return invlist;
9016 }
9017
9018 SV*
9019 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9020         UV** other_elements_ptr)
9021 {
9022  /* Create and return an inversion list whose contents are to be populated
9023  * by the caller.  The caller gives the number of elements (in 'size') and
9024  * the very first element ('element0').  This function will set
9025  * '*other_elements_ptr' to an array of UVs, where the remaining elements
9026  * are to be placed.
9027  *
9028  * Obviously there is some trust involved that the caller will properly
9029  * fill in the other elements of the array.
9030  *
9031  * (The first element needs to be passed in, as the underlying code does
9032  * things differently depending on whether it is zero or non-zero) */
9033
9034  SV* invlist = _new_invlist(size);
9035  bool offset;
9036
9037  PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9038
9039  _append_range_to_invlist(invlist, element0, element0);
9040  offset = *get_invlist_offset_addr(invlist);
9041
9042  invlist_set_len(invlist, size, offset);
9043  *other_elements_ptr = invlist_array(invlist) + 1;
9044  return invlist;
9045 }
9046
9047 #endif
9048
9049 PERL_STATIC_INLINE SV*
9050 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9051  return _add_range_to_invlist(invlist, cp, cp);
9052 }
9053
9054 #ifndef PERL_IN_XSUB_RE
9055 void
9056 Perl__invlist_invert(pTHX_ SV* const invlist)
9057 {
9058  /* Complement the input inversion list.  This adds a 0 if the list didn't
9059  * have a zero; removes it otherwise.  As described above, the data
9060  * structure is set up so that this is very efficient */
9061
9062  PERL_ARGS_ASSERT__INVLIST_INVERT;
9063
9064  assert(! invlist_is_iterating(invlist));
9065
9066  /* The inverse of matching nothing is matching everything */
9067  if (_invlist_len(invlist) == 0) {
9068   _append_range_to_invlist(invlist, 0, UV_MAX);
9069   return;
9070  }
9071
9072  *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9073 }
9074
9075 #endif
9076
9077 PERL_STATIC_INLINE SV*
9078 S_invlist_clone(pTHX_ SV* const invlist)
9079 {
9080
9081  /* Return a new inversion list that is a copy of the input one, which is
9082  * unchanged.  The new list will not be mortal even if the old one was. */
9083
9084  /* Need to allocate extra space to accommodate Perl's addition of a
9085  * trailing NUL to SvPV's, since it thinks they are always strings */
9086  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9087  STRLEN physical_length = SvCUR(invlist);
9088  bool offset = *(get_invlist_offset_addr(invlist));
9089
9090  PERL_ARGS_ASSERT_INVLIST_CLONE;
9091
9092  *(get_invlist_offset_addr(new_invlist)) = offset;
9093  invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9094  Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9095
9096  return new_invlist;
9097 }
9098
9099 PERL_STATIC_INLINE STRLEN*
9100 S_get_invlist_iter_addr(SV* invlist)
9101 {
9102  /* Return the address of the UV that contains the current iteration
9103  * position */
9104
9105  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9106
9107  assert(SvTYPE(invlist) == SVt_INVLIST);
9108
9109  return &(((XINVLIST*) SvANY(invlist))->iterator);
9110 }
9111
9112 PERL_STATIC_INLINE void
9113 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9114 {
9115  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9116
9117  *get_invlist_iter_addr(invlist) = 0;
9118 }
9119
9120 PERL_STATIC_INLINE void
9121 S_invlist_iterfinish(SV* invlist)
9122 {
9123  /* Terminate iterator for invlist.  This is to catch development errors.
9124  * Any iteration that is interrupted before completed should call this
9125  * function.  Functions that add code points anywhere else but to the end
9126  * of an inversion list assert that they are not in the middle of an
9127  * iteration.  If they were, the addition would make the iteration
9128  * problematical: if the iteration hadn't reached the place where things
9129  * were being added, it would be ok */
9130
9131  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9132
9133  *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9134 }
9135
9136 STATIC bool
9137 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9138 {
9139  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9140  * This call sets in <*start> and <*end>, the next range in <invlist>.
9141  * Returns <TRUE> if successful and the next call will return the next
9142  * range; <FALSE> if was already at the end of the list.  If the latter,
9143  * <*start> and <*end> are unchanged, and the next call to this function
9144  * will start over at the beginning of the list */
9145
9146  STRLEN* pos = get_invlist_iter_addr(invlist);
9147  UV len = _invlist_len(invlist);
9148  UV *array;
9149
9150  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9151
9152  if (*pos >= len) {
9153   *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9154   return FALSE;
9155  }
9156
9157  array = invlist_array(invlist);
9158
9159  *start = array[(*pos)++];
9160
9161  if (*pos >= len) {
9162   *end = UV_MAX;
9163  }
9164  else {
9165   *end = array[(*pos)++] - 1;
9166  }
9167
9168  return TRUE;
9169 }
9170
9171 PERL_STATIC_INLINE UV
9172 S_invlist_highest(SV* const invlist)
9173 {
9174  /* Returns the highest code point that matches an inversion list.  This API
9175  * has an ambiguity, as it returns 0 under either the highest is actually
9176  * 0, or if the list is empty.  If this distinction matters to you, check
9177  * for emptiness before calling this function */
9178
9179  UV len = _invlist_len(invlist);
9180  UV *array;
9181
9182  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9183
9184  if (len == 0) {
9185   return 0;
9186  }
9187
9188  array = invlist_array(invlist);
9189
9190  /* The last element in the array in the inversion list always starts a
9191  * range that goes to infinity.  That range may be for code points that are
9192  * matched in the inversion list, or it may be for ones that aren't
9193  * matched.  In the latter case, the highest code point in the set is one
9194  * less than the beginning of this range; otherwise it is the final element
9195  * of this range: infinity */
9196  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9197   ? UV_MAX
9198   : array[len - 1] - 1;
9199 }
9200
9201 #ifndef PERL_IN_XSUB_RE
9202 SV *
9203 Perl__invlist_contents(pTHX_ SV* const invlist)
9204 {
9205  /* Get the contents of an inversion list into a string SV so that they can
9206  * be printed out.  It uses the format traditionally done for debug tracing
9207  */
9208
9209  UV start, end;
9210  SV* output = newSVpvs("\n");
9211
9212  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9213
9214  assert(! invlist_is_iterating(invlist));
9215
9216  invlist_iterinit(invlist);
9217  while (invlist_iternext(invlist, &start, &end)) {
9218   if (end == UV_MAX) {
9219    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9220   }
9221   else if (end != start) {
9222    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9223      start,       end);
9224   }
9225   else {
9226    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9227   }
9228  }
9229
9230  return output;
9231 }
9232 #endif
9233
9234 #ifndef PERL_IN_XSUB_RE
9235 void
9236 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9237       const char * const indent, SV* const invlist)
9238 {
9239  /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9240  * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9241  * the string 'indent'.  The output looks like this:
9242   [0] 0x000A .. 0x000D
9243   [2] 0x0085
9244   [4] 0x2028 .. 0x2029
9245   [6] 0x3104 .. INFINITY
9246  * This means that the first range of code points matched by the list are
9247  * 0xA through 0xD; the second range contains only the single code point
9248  * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9249  * are used to define each range (except if the final range extends to
9250  * infinity, only a single element is needed).  The array index of the
9251  * first element for the corresponding range is given in brackets. */
9252
9253  UV start, end;
9254  STRLEN count = 0;
9255
9256  PERL_ARGS_ASSERT__INVLIST_DUMP;
9257
9258  if (invlist_is_iterating(invlist)) {
9259   Perl_dump_indent(aTHX_ level, file,
9260    "%sCan't dump inversion list because is in middle of iterating\n",
9261    indent);
9262   return;
9263  }
9264
9265  invlist_iterinit(invlist);
9266  while (invlist_iternext(invlist, &start, &end)) {
9267   if (end == UV_MAX) {
9268    Perl_dump_indent(aTHX_ level, file,
9269          "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9270         indent, (UV)count, start);
9271   }
9272   else if (end != start) {
9273    Perl_dump_indent(aTHX_ level, file,
9274          "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9275         indent, (UV)count, start,         end);
9276   }
9277   else {
9278    Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9279            indent, (UV)count, start);
9280   }
9281   count += 2;
9282  }
9283 }
9284
9285 void
9286 Perl__load_PL_utf8_foldclosures (pTHX)
9287 {
9288  assert(! PL_utf8_foldclosures);
9289
9290  /* If the folds haven't been read in, call a fold function
9291  * to force that */
9292  if (! PL_utf8_tofold) {
9293   U8 dummy[UTF8_MAXBYTES_CASE+1];
9294
9295   /* This string is just a short named one above \xff */
9296   to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9297   assert(PL_utf8_tofold); /* Verify that worked */
9298  }
9299  PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9300 }
9301 #endif
9302
9303 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9304 bool
9305 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9306 {
9307  /* Return a boolean as to if the two passed in inversion lists are
9308  * identical.  The final argument, if TRUE, says to take the complement of
9309  * the second inversion list before doing the comparison */
9310
9311  const UV* array_a = invlist_array(a);
9312  const UV* array_b = invlist_array(b);
9313  UV len_a = _invlist_len(a);
9314  UV len_b = _invlist_len(b);
9315
9316  UV i = 0;      /* current index into the arrays */
9317  bool retval = TRUE;     /* Assume are identical until proven otherwise */
9318
9319  PERL_ARGS_ASSERT__INVLISTEQ;
9320
9321  /* If are to compare 'a' with the complement of b, set it
9322  * up so are looking at b's complement. */
9323  if (complement_b) {
9324
9325   /* The complement of nothing is everything, so <a> would have to have
9326   * just one element, starting at zero (ending at infinity) */
9327   if (len_b == 0) {
9328    return (len_a == 1 && array_a[0] == 0);
9329   }
9330   else if (array_b[0] == 0) {
9331
9332    /* Otherwise, to complement, we invert.  Here, the first element is
9333    * 0, just remove it.  To do this, we just pretend the array starts
9334    * one later */
9335
9336    array_b++;
9337    len_b--;
9338   }
9339   else {
9340
9341    /* But if the first element is not zero, we pretend the list starts
9342    * at the 0 that is always stored immediately before the array. */
9343    array_b--;
9344    len_b++;
9345   }
9346  }
9347
9348  /* Make sure that the lengths are the same, as well as the final element
9349  * before looping through the remainder.  (Thus we test the length, final,
9350  * and first elements right off the bat) */
9351  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9352   retval = FALSE;
9353  }
9354  else for (i = 0; i < len_a - 1; i++) {
9355   if (array_a[i] != array_b[i]) {
9356    retval = FALSE;
9357    break;
9358   }
9359  }
9360
9361  return retval;
9362 }
9363 #endif
9364
9365 /*
9366  * As best we can, determine the characters that can match the start of
9367  * the given EXACTF-ish node.
9368  *
9369  * Returns the invlist as a new SV*; it is the caller's responsibility to
9370  * call SvREFCNT_dec() when done with it.
9371  */
9372 STATIC SV*
9373 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9374 {
9375  const U8 * s = (U8*)STRING(node);
9376  SSize_t bytelen = STR_LEN(node);
9377  UV uc;
9378  /* Start out big enough for 2 separate code points */
9379  SV* invlist = _new_invlist(4);
9380
9381  PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9382
9383  if (! UTF) {
9384   uc = *s;
9385
9386   /* We punt and assume can match anything if the node begins
9387   * with a multi-character fold.  Things are complicated.  For
9388   * example, /ffi/i could match any of:
9389   *  "\N{LATIN SMALL LIGATURE FFI}"
9390   *  "\N{LATIN SMALL LIGATURE FF}I"
9391   *  "F\N{LATIN SMALL LIGATURE FI}"
9392   *  plus several other things; and making sure we have all the
9393   *  possibilities is hard. */
9394   if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9395    invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9396   }
9397   else {
9398    /* Any Latin1 range character can potentially match any
9399    * other depending on the locale */
9400    if (OP(node) == EXACTFL) {
9401     _invlist_union(invlist, PL_Latin1, &invlist);
9402    }
9403    else {
9404     /* But otherwise, it matches at least itself.  We can
9405     * quickly tell if it has a distinct fold, and if so,
9406     * it matches that as well */
9407     invlist = add_cp_to_invlist(invlist, uc);
9408     if (IS_IN_SOME_FOLD_L1(uc))
9409      invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9410    }
9411
9412    /* Some characters match above-Latin1 ones under /i.  This
9413    * is true of EXACTFL ones when the locale is UTF-8 */
9414    if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9415     && (! isASCII(uc) || (OP(node) != EXACTFA
9416          && OP(node) != EXACTFA_NO_TRIE)))
9417    {
9418     add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9419    }
9420   }
9421  }
9422  else {  /* Pattern is UTF-8 */
9423   U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9424   STRLEN foldlen = UTF8SKIP(s);
9425   const U8* e = s + bytelen;
9426   SV** listp;
9427
9428   uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9429
9430   /* The only code points that aren't folded in a UTF EXACTFish
9431   * node are are the problematic ones in EXACTFL nodes */
9432   if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9433    /* We need to check for the possibility that this EXACTFL
9434    * node begins with a multi-char fold.  Therefore we fold
9435    * the first few characters of it so that we can make that
9436    * check */
9437    U8 *d = folded;
9438    int i;
9439
9440    for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9441     if (isASCII(*s)) {
9442      *(d++) = (U8) toFOLD(*s);
9443      s++;
9444     }
9445     else {
9446      STRLEN len;
9447      to_utf8_fold(s, d, &len);
9448      d += len;
9449      s += UTF8SKIP(s);
9450     }
9451    }
9452
9453    /* And set up so the code below that looks in this folded
9454    * buffer instead of the node's string */
9455    e = d;
9456    foldlen = UTF8SKIP(folded);
9457    s = folded;
9458   }
9459
9460   /* When we reach here 's' points to the fold of the first
9461   * character(s) of the node; and 'e' points to far enough along
9462   * the folded string to be just past any possible multi-char
9463   * fold. 'foldlen' is the length in bytes of the first
9464   * character in 's'
9465   *
9466   * Unlike the non-UTF-8 case, the macro for determining if a
9467   * string is a multi-char fold requires all the characters to
9468   * already be folded.  This is because of all the complications
9469   * if not.  Note that they are folded anyway, except in EXACTFL
9470   * nodes.  Like the non-UTF case above, we punt if the node
9471   * begins with a multi-char fold  */
9472
9473   if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
9474    invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9475   }
9476   else {  /* Single char fold */
9477
9478    /* It matches all the things that fold to it, which are
9479    * found in PL_utf8_foldclosures (including itself) */
9480    invlist = add_cp_to_invlist(invlist, uc);
9481    if (! PL_utf8_foldclosures)
9482     _load_PL_utf8_foldclosures();
9483    if ((listp = hv_fetch(PL_utf8_foldclosures,
9484         (char *) s, foldlen, FALSE)))
9485    {
9486     AV* list = (AV*) *listp;
9487     IV k;
9488     for (k = 0; k <= av_tindex(list); k++) {
9489      SV** c_p = av_fetch(list, k, FALSE);
9490      UV c;
9491      assert(c_p);
9492
9493      c = SvUV(*c_p);
9494
9495      /* /aa doesn't allow folds between ASCII and non- */
9496      if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
9497       && isASCII(c) != isASCII(uc))
9498      {
9499       continue;
9500      }
9501
9502      invlist = add_cp_to_invlist(invlist, c);
9503     }
9504    }
9505   }
9506  }
9507
9508  return invlist;
9509 }
9510
9511 #undef HEADER_LENGTH
9512 #undef TO_INTERNAL_SIZE
9513 #undef FROM_INTERNAL_SIZE
9514 #undef INVLIST_VERSION_ID
9515
9516 /* End of inversion list object */
9517
9518 STATIC void
9519 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9520 {
9521  /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9522  * constructs, and updates RExC_flags with them.  On input, RExC_parse
9523  * should point to the first flag; it is updated on output to point to the
9524  * final ')' or ':'.  There needs to be at least one flag, or this will
9525  * abort */
9526
9527  /* for (?g), (?gc), and (?o) warnings; warning
9528  about (?c) will warn about (?g) -- japhy    */
9529
9530 #define WASTED_O  0x01
9531 #define WASTED_G  0x02
9532 #define WASTED_C  0x04
9533 #define WASTED_GC (WASTED_G|WASTED_C)
9534  I32 wastedflags = 0x00;
9535  U32 posflags = 0, negflags = 0;
9536  U32 *flagsp = &posflags;
9537  char has_charset_modifier = '\0';
9538  regex_charset cs;
9539  bool has_use_defaults = FALSE;
9540  const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9541  int x_mod_count = 0;
9542
9543  PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9544
9545  /* '^' as an initial flag sets certain defaults */
9546  if (UCHARAT(RExC_parse) == '^') {
9547   RExC_parse++;
9548   has_use_defaults = TRUE;
9549   STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9550   set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9551           ? REGEX_UNICODE_CHARSET
9552           : REGEX_DEPENDS_CHARSET);
9553  }
9554
9555  cs = get_regex_charset(RExC_flags);
9556  if (cs == REGEX_DEPENDS_CHARSET
9557   && (RExC_utf8 || RExC_uni_semantics))
9558  {
9559   cs = REGEX_UNICODE_CHARSET;
9560  }
9561
9562  while (*RExC_parse) {
9563   /* && strchr("iogcmsx", *RExC_parse) */
9564   /* (?g), (?gc) and (?o) are useless here
9565   and must be globally applied -- japhy */
9566   switch (*RExC_parse) {
9567
9568    /* Code for the imsxn flags */
9569    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9570
9571    case LOCALE_PAT_MOD:
9572     if (has_charset_modifier) {
9573      goto excess_modifier;
9574     }
9575     else if (flagsp == &negflags) {
9576      goto neg_modifier;
9577     }
9578     cs = REGEX_LOCALE_CHARSET;
9579     has_charset_modifier = LOCALE_PAT_MOD;
9580     break;
9581    case UNICODE_PAT_MOD:
9582     if (has_charset_modifier) {
9583      goto excess_modifier;
9584     }
9585     else if (flagsp == &negflags) {
9586      goto neg_modifier;
9587     }
9588     cs = REGEX_UNICODE_CHARSET;
9589     has_charset_modifier = UNICODE_PAT_MOD;
9590     break;
9591    case ASCII_RESTRICT_PAT_MOD:
9592     if (flagsp == &negflags) {
9593      goto neg_modifier;
9594     }
9595     if (has_charset_modifier) {
9596      if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9597       goto excess_modifier;
9598      }
9599      /* Doubled modifier implies more restricted */
9600      cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9601     }
9602     else {
9603      cs = REGEX_ASCII_RESTRICTED_CHARSET;
9604     }
9605     has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9606     break;
9607    case DEPENDS_PAT_MOD:
9608     if (has_use_defaults) {
9609      goto fail_modifiers;
9610     }
9611     else if (flagsp == &negflags) {
9612      goto neg_modifier;
9613     }
9614     else if (has_charset_modifier) {
9615      goto excess_modifier;
9616     }
9617
9618     /* The dual charset means unicode semantics if the
9619     * pattern (or target, not known until runtime) are
9620     * utf8, or something in the pattern indicates unicode
9621     * semantics */
9622     cs = (RExC_utf8 || RExC_uni_semantics)
9623      ? REGEX_UNICODE_CHARSET
9624      : REGEX_DEPENDS_CHARSET;
9625     has_charset_modifier = DEPENDS_PAT_MOD;
9626     break;
9627    excess_modifier:
9628     RExC_parse++;
9629     if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9630      vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9631     }
9632     else if (has_charset_modifier == *(RExC_parse - 1)) {
9633      vFAIL2("Regexp modifier \"%c\" may not appear twice",
9634           *(RExC_parse - 1));
9635     }
9636     else {
9637      vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9638     }
9639     NOT_REACHED; /*NOTREACHED*/
9640    neg_modifier:
9641     RExC_parse++;
9642     vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9643          *(RExC_parse - 1));
9644     NOT_REACHED; /*NOTREACHED*/
9645    case ONCE_PAT_MOD: /* 'o' */
9646    case GLOBAL_PAT_MOD: /* 'g' */
9647     if (PASS2 && ckWARN(WARN_REGEXP)) {
9648      const I32 wflagbit = *RExC_parse == 'o'
9649           ? WASTED_O
9650           : WASTED_G;
9651      if (! (wastedflags & wflagbit) ) {
9652       wastedflags |= wflagbit;
9653       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9654       vWARN5(
9655        RExC_parse + 1,
9656        "Useless (%s%c) - %suse /%c modifier",
9657        flagsp == &negflags ? "?-" : "?",
9658        *RExC_parse,
9659        flagsp == &negflags ? "don't " : "",
9660        *RExC_parse
9661       );
9662      }
9663     }
9664     break;
9665
9666    case CONTINUE_PAT_MOD: /* 'c' */
9667     if (PASS2 && ckWARN(WARN_REGEXP)) {
9668      if (! (wastedflags & WASTED_C) ) {
9669       wastedflags |= WASTED_GC;
9670       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9671       vWARN3(
9672        RExC_parse + 1,
9673        "Useless (%sc) - %suse /gc modifier",
9674        flagsp == &negflags ? "?-" : "?",
9675        flagsp == &negflags ? "don't " : ""
9676       );
9677      }
9678     }
9679     break;
9680    case KEEPCOPY_PAT_MOD: /* 'p' */
9681     if (flagsp == &negflags) {
9682      if (PASS2)
9683       ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9684     } else {
9685      *flagsp |= RXf_PMf_KEEPCOPY;
9686     }
9687     break;
9688    case '-':
9689     /* A flag is a default iff it is following a minus, so
9690     * if there is a minus, it means will be trying to
9691     * re-specify a default which is an error */
9692     if (has_use_defaults || flagsp == &negflags) {
9693      goto fail_modifiers;
9694     }
9695     flagsp = &negflags;
9696     wastedflags = 0;  /* reset so (?g-c) warns twice */
9697     break;
9698    case ':':
9699    case ')':
9700     RExC_flags |= posflags;
9701     RExC_flags &= ~negflags;
9702     set_regex_charset(&RExC_flags, cs);
9703     if (RExC_flags & RXf_PMf_FOLD) {
9704      RExC_contains_i = 1;
9705     }
9706     if (PASS2) {
9707      STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9708     }
9709     return;
9710     /*NOTREACHED*/
9711    default:
9712    fail_modifiers:
9713     RExC_parse += SKIP_IF_CHAR(RExC_parse);
9714     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9715     vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9716      UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9717     NOT_REACHED; /*NOTREACHED*/
9718   }
9719
9720   ++RExC_parse;
9721  }
9722
9723  if (PASS2) {
9724   STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9725  }
9726 }
9727
9728 /*
9729  - reg - regular expression, i.e. main body or parenthesized thing
9730  *
9731  * Caller must absorb opening parenthesis.
9732  *
9733  * Combining parenthesis handling with the base level of regular expression
9734  * is a trifle forced, but the need to tie the tails of the branches to what
9735  * follows makes it hard to avoid.
9736  */
9737 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9738 #ifdef DEBUGGING
9739 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9740 #else
9741 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9742 #endif
9743
9744 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9745    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9746    needs to be restarted.
9747    Otherwise would only return NULL if regbranch() returns NULL, which
9748    cannot happen.  */
9749 STATIC regnode *
9750 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9751  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9752  * 2 is like 1, but indicates that nextchar() has been called to advance
9753  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9754  * this flag alerts us to the need to check for that */
9755 {
9756  regnode *ret;  /* Will be the head of the group. */
9757  regnode *br;
9758  regnode *lastbr;
9759  regnode *ender = NULL;
9760  I32 parno = 0;
9761  I32 flags;
9762  U32 oregflags = RExC_flags;
9763  bool have_branch = 0;
9764  bool is_open = 0;
9765  I32 freeze_paren = 0;
9766  I32 after_freeze = 0;
9767  I32 num; /* numeric backreferences */
9768
9769  char * parse_start = RExC_parse; /* MJD */
9770  char * const oregcomp_parse = RExC_parse;
9771
9772  GET_RE_DEBUG_FLAGS_DECL;
9773
9774  PERL_ARGS_ASSERT_REG;
9775  DEBUG_PARSE("reg ");
9776
9777  *flagp = 0;    /* Tentatively. */
9778
9779
9780  /* Make an OPEN node, if parenthesized. */
9781  if (paren) {
9782
9783   /* Under /x, space and comments can be gobbled up between the '(' and
9784   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9785   * intervening space, as the sequence is a token, and a token should be
9786   * indivisible */
9787   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9788
9789   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9790    char *start_verb = RExC_parse;
9791    STRLEN verb_len = 0;
9792    char *start_arg = NULL;
9793    unsigned char op = 0;
9794    int argok = 1;
9795    int internal_argval = 0; /* internal_argval is only useful if
9796           !argok */
9797
9798    if (has_intervening_patws) {
9799     RExC_parse++;
9800     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9801    }
9802    while ( *RExC_parse && *RExC_parse != ')' ) {
9803     if ( *RExC_parse == ':' ) {
9804      start_arg = RExC_parse + 1;
9805      break;
9806     }
9807     RExC_parse++;
9808    }
9809    ++start_verb;
9810    verb_len = RExC_parse - start_verb;
9811    if ( start_arg ) {
9812     RExC_parse++;
9813     while ( *RExC_parse && *RExC_parse != ')' )
9814      RExC_parse++;
9815     if ( *RExC_parse != ')' )
9816      vFAIL("Unterminated verb pattern argument");
9817     if ( RExC_parse == start_arg )
9818      start_arg = NULL;
9819    } else {
9820     if ( *RExC_parse != ')' )
9821      vFAIL("Unterminated verb pattern");
9822    }
9823
9824    switch ( *start_verb ) {
9825    case 'A':  /* (*ACCEPT) */
9826     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9827      op = ACCEPT;
9828      internal_argval = RExC_nestroot;
9829     }
9830     break;
9831    case 'C':  /* (*COMMIT) */
9832     if ( memEQs(start_verb,verb_len,"COMMIT") )
9833      op = COMMIT;
9834     break;
9835    case 'F':  /* (*FAIL) */
9836     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9837      op = OPFAIL;
9838      argok = 0;
9839     }
9840     break;
9841    case ':':  /* (*:NAME) */
9842    case 'M':  /* (*MARK:NAME) */
9843     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9844      op = MARKPOINT;
9845      argok = -1;
9846     }
9847     break;
9848    case 'P':  /* (*PRUNE) */
9849     if ( memEQs(start_verb,verb_len,"PRUNE") )
9850      op = PRUNE;
9851     break;
9852    case 'S':   /* (*SKIP) */
9853     if ( memEQs(start_verb,verb_len,"SKIP") )
9854      op = SKIP;
9855     break;
9856    case 'T':  /* (*THEN) */
9857     /* [19:06] <TimToady> :: is then */
9858     if ( memEQs(start_verb,verb_len,"THEN") ) {
9859      op = CUTGROUP;
9860      RExC_seen |= REG_CUTGROUP_SEEN;
9861     }
9862     break;
9863    }
9864    if ( ! op ) {
9865     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9866     vFAIL2utf8f(
9867      "Unknown verb pattern '%"UTF8f"'",
9868      UTF8fARG(UTF, verb_len, start_verb));
9869    }
9870    if ( argok ) {
9871     if ( start_arg && internal_argval ) {
9872      vFAIL3("Verb pattern '%.*s' may not have an argument",
9873       verb_len, start_verb);
9874     } else if ( argok < 0 && !start_arg ) {
9875      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9876       verb_len, start_verb);
9877     } else {
9878      ret = reganode(pRExC_state, op, internal_argval);
9879      if ( ! internal_argval && ! SIZE_ONLY ) {
9880       if (start_arg) {
9881        SV *sv = newSVpvn( start_arg,
9882            RExC_parse - start_arg);
9883        ARG(ret) = add_data( pRExC_state,
9884             STR_WITH_LEN("S"));
9885        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9886        ret->flags = 0;
9887       } else {
9888        ret->flags = 1;
9889       }
9890      }
9891     }
9892     if (!internal_argval)
9893      RExC_seen |= REG_VERBARG_SEEN;
9894    } else if ( start_arg ) {
9895     vFAIL3("Verb pattern '%.*s' may not have an argument",
9896       verb_len, start_verb);
9897    } else {
9898     ret = reg_node(pRExC_state, op);
9899    }
9900    nextchar(pRExC_state);
9901    return ret;
9902   }
9903   else if (*RExC_parse == '?') { /* (?...) */
9904    bool is_logical = 0;
9905    const char * const seqstart = RExC_parse;
9906    const char * endptr;
9907    if (has_intervening_patws) {
9908     RExC_parse++;
9909     vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9910    }
9911
9912    RExC_parse++;
9913    paren = *RExC_parse++;
9914    ret = NULL;   /* For look-ahead/behind. */
9915    switch (paren) {
9916
9917    case 'P': /* (?P...) variants for those used to PCRE/Python */
9918     paren = *RExC_parse++;
9919     if ( paren == '<')         /* (?P<...>) named capture */
9920      goto named_capture;
9921     else if (paren == '>') {   /* (?P>name) named recursion */
9922      goto named_recursion;
9923     }
9924     else if (paren == '=') {   /* (?P=...)  named backref */
9925      /* this pretty much dupes the code for \k<NAME> in
9926      * regatom(), if you change this make sure you change that
9927      * */
9928      char* name_start = RExC_parse;
9929      U32 num = 0;
9930      SV *sv_dat = reg_scan_name(pRExC_state,
9931       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9932      if (RExC_parse == name_start || *RExC_parse != ')')
9933       /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9934       vFAIL2("Sequence %.3s... not terminated",parse_start);
9935
9936      if (!SIZE_ONLY) {
9937       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9938       RExC_rxi->data->data[num]=(void*)sv_dat;
9939       SvREFCNT_inc_simple_void(sv_dat);
9940      }
9941      RExC_sawback = 1;
9942      ret = reganode(pRExC_state,
9943         ((! FOLD)
9944          ? NREF
9945          : (ASCII_FOLD_RESTRICTED)
9946          ? NREFFA
9947          : (AT_LEAST_UNI_SEMANTICS)
9948           ? NREFFU
9949           : (LOC)
9950           ? NREFFL
9951           : NREFF),
9952          num);
9953      *flagp |= HASWIDTH;
9954
9955      Set_Node_Offset(ret, parse_start+1);
9956      Set_Node_Cur_Length(ret, parse_start);
9957
9958      nextchar(pRExC_state);
9959      return ret;
9960     }
9961     --RExC_parse;
9962     RExC_parse += SKIP_IF_CHAR(RExC_parse);
9963     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9964     vFAIL3("Sequence (%.*s...) not recognized",
9965         RExC_parse-seqstart, seqstart);
9966     NOT_REACHED; /*NOTREACHED*/
9967    case '<':           /* (?<...) */
9968     if (*RExC_parse == '!')
9969      paren = ',';
9970     else if (*RExC_parse != '=')
9971    named_capture:
9972     {               /* (?<...>) */
9973      char *name_start;
9974      SV *svname;
9975      paren= '>';
9976    case '\'':          /* (?'...') */
9977       name_start= RExC_parse;
9978       svname = reg_scan_name(pRExC_state,
9979       SIZE_ONLY    /* reverse test from the others */
9980       ? REG_RSN_RETURN_NAME
9981       : REG_RSN_RETURN_NULL);
9982      if (RExC_parse == name_start || *RExC_parse != paren)
9983       vFAIL2("Sequence (?%c... not terminated",
9984        paren=='>' ? '<' : paren);
9985      if (SIZE_ONLY) {
9986       HE *he_str;
9987       SV *sv_dat = NULL;
9988       if (!svname) /* shouldn't happen */
9989        Perl_croak(aTHX_
9990         "panic: reg_scan_name returned NULL");
9991       if (!RExC_paren_names) {
9992        RExC_paren_names= newHV();
9993        sv_2mortal(MUTABLE_SV(RExC_paren_names));
9994 #ifdef DEBUGGING
9995        RExC_paren_name_list= newAV();
9996        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9997 #endif
9998       }
9999       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10000       if ( he_str )
10001        sv_dat = HeVAL(he_str);
10002       if ( ! sv_dat ) {
10003        /* croak baby croak */
10004        Perl_croak(aTHX_
10005         "panic: paren_name hash element allocation failed");
10006       } else if ( SvPOK(sv_dat) ) {
10007        /* (?|...) can mean we have dupes so scan to check
10008        its already been stored. Maybe a flag indicating
10009        we are inside such a construct would be useful,
10010        but the arrays are likely to be quite small, so
10011        for now we punt -- dmq */
10012        IV count = SvIV(sv_dat);
10013        I32 *pv = (I32*)SvPVX(sv_dat);
10014        IV i;
10015        for ( i = 0 ; i < count ; i++ ) {
10016         if ( pv[i] == RExC_npar ) {
10017          count = 0;
10018          break;
10019         }
10020        }
10021        if ( count ) {
10022         pv = (I32*)SvGROW(sv_dat,
10023             SvCUR(sv_dat) + sizeof(I32)+1);
10024         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10025         pv[count] = RExC_npar;
10026         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10027        }
10028       } else {
10029        (void)SvUPGRADE(sv_dat,SVt_PVNV);
10030        sv_setpvn(sv_dat, (char *)&(RExC_npar),
10031                 sizeof(I32));
10032        SvIOK_on(sv_dat);
10033        SvIV_set(sv_dat, 1);
10034       }
10035 #ifdef DEBUGGING
10036       /* Yes this does cause a memory leak in debugging Perls
10037       * */
10038       if (!av_store(RExC_paren_name_list,
10039          RExC_npar, SvREFCNT_inc(svname)))
10040        SvREFCNT_dec_NN(svname);
10041 #endif
10042
10043       /*sv_dump(sv_dat);*/
10044      }
10045      nextchar(pRExC_state);
10046      paren = 1;
10047      goto capturing_parens;
10048     }
10049     RExC_seen |= REG_LOOKBEHIND_SEEN;
10050     RExC_in_lookbehind++;
10051     RExC_parse++;
10052     /* FALLTHROUGH */
10053    case '=':           /* (?=...) */
10054     RExC_seen_zerolen++;
10055     break;
10056    case '!':           /* (?!...) */
10057     RExC_seen_zerolen++;
10058     /* check if we're really just a "FAIL" assertion */
10059     --RExC_parse;
10060     nextchar(pRExC_state);
10061     if (*RExC_parse == ')') {
10062      ret=reg_node(pRExC_state, OPFAIL);
10063      nextchar(pRExC_state);
10064      return ret;
10065     }
10066     break;
10067    case '|':           /* (?|...) */
10068     /* branch reset, behave like a (?:...) except that
10069     buffers in alternations share the same numbers */
10070     paren = ':';
10071     after_freeze = freeze_paren = RExC_npar;
10072     break;
10073    case ':':           /* (?:...) */
10074    case '>':           /* (?>...) */
10075     break;
10076    case '$':           /* (?$...) */
10077    case '@':           /* (?@...) */
10078     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10079     break;
10080    case '0' :           /* (?0) */
10081    case 'R' :           /* (?R) */
10082     if (*RExC_parse != ')')
10083      FAIL("Sequence (?R) not terminated");
10084     ret = reg_node(pRExC_state, GOSTART);
10085      RExC_seen |= REG_GOSTART_SEEN;
10086     *flagp |= POSTPONED;
10087     nextchar(pRExC_state);
10088     return ret;
10089     /*notreached*/
10090    /* named and numeric backreferences */
10091    case '&':            /* (?&NAME) */
10092     parse_start = RExC_parse - 1;
10093    named_recursion:
10094     {
10095       SV *sv_dat = reg_scan_name(pRExC_state,
10096        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10097       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10098     }
10099     if (RExC_parse == RExC_end || *RExC_parse != ')')
10100      vFAIL("Sequence (?&... not terminated");
10101     goto gen_recurse_regop;
10102     /* NOTREACHED */
10103    case '+':
10104     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10105      RExC_parse++;
10106      vFAIL("Illegal pattern");
10107     }
10108     goto parse_recursion;
10109     /* NOTREACHED*/
10110    case '-': /* (?-1) */
10111     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10112      RExC_parse--; /* rewind to let it be handled later */
10113      goto parse_flags;
10114     }
10115     /* FALLTHROUGH */
10116    case '1': case '2': case '3': case '4': /* (?1) */
10117    case '5': case '6': case '7': case '8': case '9':
10118     RExC_parse--;
10119    parse_recursion:
10120     {
10121      bool is_neg = FALSE;
10122      UV unum;
10123      parse_start = RExC_parse - 1; /* MJD */
10124      if (*RExC_parse == '-') {
10125       RExC_parse++;
10126       is_neg = TRUE;
10127      }
10128      if (grok_atoUV(RExC_parse, &unum, &endptr)
10129       && unum <= I32_MAX
10130      ) {
10131       num = (I32)unum;
10132       RExC_parse = (char*)endptr;
10133      } else
10134       num = I32_MAX;
10135      if (is_neg) {
10136       /* Some limit for num? */
10137       num = -num;
10138      }
10139     }
10140     if (*RExC_parse!=')')
10141      vFAIL("Expecting close bracket");
10142
10143    gen_recurse_regop:
10144     if ( paren == '-' ) {
10145      /*
10146      Diagram of capture buffer numbering.
10147      Top line is the normal capture buffer numbers
10148      Bottom line is the negative indexing as from
10149      the X (the (?-2))
10150
10151      +   1 2    3 4 5 X          6 7
10152      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10153      -   5 4    3 2 1 X          x x
10154
10155      */
10156      num = RExC_npar + num;
10157      if (num < 1)  {
10158       RExC_parse++;
10159       vFAIL("Reference to nonexistent group");
10160      }
10161     } else if ( paren == '+' ) {
10162      num = RExC_npar + num - 1;
10163     }
10164
10165     ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10166     if (!SIZE_ONLY) {
10167      if (num > (I32)RExC_rx->nparens) {
10168       RExC_parse++;
10169       vFAIL("Reference to nonexistent group");
10170      }
10171      RExC_recurse_count++;
10172      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10173       "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10174        22, "|    |", (int)(depth * 2 + 1), "",
10175        (UV)ARG(ret), (IV)ARG2L(ret)));
10176     }
10177     RExC_seen |= REG_RECURSE_SEEN;
10178     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10179     Set_Node_Offset(ret, parse_start); /* MJD */
10180
10181     *flagp |= POSTPONED;
10182     nextchar(pRExC_state);
10183     return ret;
10184
10185    /* NOTREACHED */
10186
10187    case '?':           /* (??...) */
10188     is_logical = 1;
10189     if (*RExC_parse != '{') {
10190      RExC_parse += SKIP_IF_CHAR(RExC_parse);
10191      /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10192      vFAIL2utf8f(
10193       "Sequence (%"UTF8f"...) not recognized",
10194       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10195      NOT_REACHED; /*NOTREACHED*/
10196     }
10197     *flagp |= POSTPONED;
10198     paren = *RExC_parse++;
10199     /* FALLTHROUGH */
10200    case '{':           /* (?{...}) */
10201    {
10202     U32 n = 0;
10203     struct reg_code_block *cb;
10204
10205     RExC_seen_zerolen++;
10206
10207     if (   !pRExC_state->num_code_blocks
10208      || pRExC_state->code_index >= pRExC_state->num_code_blocks
10209      || pRExC_state->code_blocks[pRExC_state->code_index].start
10210       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10211        - RExC_start)
10212     ) {
10213      if (RExC_pm_flags & PMf_USE_RE_EVAL)
10214       FAIL("panic: Sequence (?{...}): no code block found\n");
10215      FAIL("Eval-group not allowed at runtime, use re 'eval'");
10216     }
10217     /* this is a pre-compiled code block (?{...}) */
10218     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10219     RExC_parse = RExC_start + cb->end;
10220     if (!SIZE_ONLY) {
10221      OP *o = cb->block;
10222      if (cb->src_regex) {
10223       n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10224       RExC_rxi->data->data[n] =
10225        (void*)SvREFCNT_inc((SV*)cb->src_regex);
10226       RExC_rxi->data->data[n+1] = (void*)o;
10227      }
10228      else {
10229       n = add_data(pRExC_state,
10230        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10231       RExC_rxi->data->data[n] = (void*)o;
10232      }
10233     }
10234     pRExC_state->code_index++;
10235     nextchar(pRExC_state);
10236
10237     if (is_logical) {
10238      regnode *eval;
10239      ret = reg_node(pRExC_state, LOGICAL);
10240
10241      eval = reg2Lanode(pRExC_state, EVAL,
10242          n,
10243
10244          /* for later propagation into (??{})
10245           * return value */
10246          RExC_flags & RXf_PMf_COMPILETIME
10247          );
10248      if (!SIZE_ONLY) {
10249       ret->flags = 2;
10250      }
10251      REGTAIL(pRExC_state, ret, eval);
10252      /* deal with the length of this later - MJD */
10253      return ret;
10254     }
10255     ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10256     Set_Node_Length(ret, RExC_parse - parse_start + 1);
10257     Set_Node_Offset(ret, parse_start);
10258     return ret;
10259    }
10260    case '(':           /* (?(?{...})...) and (?(?=...)...) */
10261    {
10262     int is_define= 0;
10263     const int DEFINE_len = sizeof("DEFINE") - 1;
10264     if (RExC_parse[0] == '?') {        /* (?(?...)) */
10265      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10266       || RExC_parse[1] == '<'
10267       || RExC_parse[1] == '{') { /* Lookahead or eval. */
10268       I32 flag;
10269       regnode *tail;
10270
10271       ret = reg_node(pRExC_state, LOGICAL);
10272       if (!SIZE_ONLY)
10273        ret->flags = 1;
10274
10275       tail = reg(pRExC_state, 1, &flag, depth+1);
10276       if (flag & RESTART_UTF8) {
10277        *flagp = RESTART_UTF8;
10278        return NULL;
10279       }
10280       REGTAIL(pRExC_state, ret, tail);
10281       goto insert_if;
10282      }
10283      /* Fall through to ‘Unknown switch condition’ at the
10284      end of the if/else chain. */
10285     }
10286     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
10287       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10288     {
10289      char ch = RExC_parse[0] == '<' ? '>' : '\'';
10290      char *name_start= RExC_parse++;
10291      U32 num = 0;
10292      SV *sv_dat=reg_scan_name(pRExC_state,
10293       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10294      if (RExC_parse == name_start || *RExC_parse != ch)
10295       vFAIL2("Sequence (?(%c... not terminated",
10296        (ch == '>' ? '<' : ch));
10297      RExC_parse++;
10298      if (!SIZE_ONLY) {
10299       num = add_data( pRExC_state, STR_WITH_LEN("S"));
10300       RExC_rxi->data->data[num]=(void*)sv_dat;
10301       SvREFCNT_inc_simple_void(sv_dat);
10302      }
10303      ret = reganode(pRExC_state,NGROUPP,num);
10304      goto insert_if_check_paren;
10305     }
10306     else if (RExC_end - RExC_parse >= DEFINE_len
10307       && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
10308     {
10309      ret = reganode(pRExC_state,DEFINEP,0);
10310      RExC_parse += DEFINE_len;
10311      is_define = 1;
10312      goto insert_if_check_paren;
10313     }
10314     else if (RExC_parse[0] == 'R') {
10315      RExC_parse++;
10316      parno = 0;
10317      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10318       UV uv;
10319       if (grok_atoUV(RExC_parse, &uv, &endptr)
10320        && uv <= I32_MAX
10321       ) {
10322        parno = (I32)uv;
10323        RExC_parse = (char*)endptr;
10324       }
10325       /* else "Switch condition not recognized" below */
10326      } else if (RExC_parse[0] == '&') {
10327       SV *sv_dat;
10328       RExC_parse++;
10329       sv_dat = reg_scan_name(pRExC_state,
10330        SIZE_ONLY
10331        ? REG_RSN_RETURN_NULL
10332        : REG_RSN_RETURN_DATA);
10333        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10334      }
10335      ret = reganode(pRExC_state,INSUBP,parno);
10336      goto insert_if_check_paren;
10337     }
10338     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10339      /* (?(1)...) */
10340      char c;
10341      char *tmp;
10342      UV uv;
10343      if (grok_atoUV(RExC_parse, &uv, &endptr)
10344       && uv <= I32_MAX
10345      ) {
10346       parno = (I32)uv;
10347       RExC_parse = (char*)endptr;
10348      }
10349      /* XXX else what? */
10350      ret = reganode(pRExC_state, GROUPP, parno);
10351
10352     insert_if_check_paren:
10353      if (*(tmp = nextchar(pRExC_state)) != ')') {
10354       /* nextchar also skips comments, so undo its work
10355       * and skip over the the next character.
10356       */
10357       RExC_parse = tmp;
10358       RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10359       vFAIL("Switch condition not recognized");
10360      }
10361     insert_if:
10362      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10363      br = regbranch(pRExC_state, &flags, 1,depth+1);
10364      if (br == NULL) {
10365       if (flags & RESTART_UTF8) {
10366        *flagp = RESTART_UTF8;
10367        return NULL;
10368       }
10369       FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10370        (UV) flags);
10371      } else
10372       REGTAIL(pRExC_state, br, reganode(pRExC_state,
10373               LONGJMP, 0));
10374      c = *nextchar(pRExC_state);
10375      if (flags&HASWIDTH)
10376       *flagp |= HASWIDTH;
10377      if (c == '|') {
10378       if (is_define)
10379        vFAIL("(?(DEFINE)....) does not allow branches");
10380
10381       /* Fake one for optimizer.  */
10382       lastbr = reganode(pRExC_state, IFTHEN, 0);
10383
10384       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10385        if (flags & RESTART_UTF8) {
10386         *flagp = RESTART_UTF8;
10387         return NULL;
10388        }
10389        FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10390         (UV) flags);
10391       }
10392       REGTAIL(pRExC_state, ret, lastbr);
10393       if (flags&HASWIDTH)
10394        *flagp |= HASWIDTH;
10395       c = *nextchar(pRExC_state);
10396      }
10397      else
10398       lastbr = NULL;
10399      if (c != ')') {
10400       if (RExC_parse>RExC_end)
10401        vFAIL("Switch (?(condition)... not terminated");
10402       else
10403        vFAIL("Switch (?(condition)... contains too many branches");
10404      }
10405      ender = reg_node(pRExC_state, TAIL);
10406      REGTAIL(pRExC_state, br, ender);
10407      if (lastbr) {
10408       REGTAIL(pRExC_state, lastbr, ender);
10409       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10410      }
10411      else
10412       REGTAIL(pRExC_state, ret, ender);
10413      RExC_size++; /* XXX WHY do we need this?!!
10414          For large programs it seems to be required
10415          but I can't figure out why. -- dmq*/
10416      return ret;
10417     }
10418     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10419     vFAIL("Unknown switch condition (?(...))");
10420    }
10421    case '[':           /* (?[ ... ]) */
10422     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10423           oregcomp_parse);
10424    case 0:
10425     RExC_parse--; /* for vFAIL to print correctly */
10426     vFAIL("Sequence (? incomplete");
10427     break;
10428    default: /* e.g., (?i) */
10429     --RExC_parse;
10430    parse_flags:
10431     parse_lparen_question_flags(pRExC_state);
10432     if (UCHARAT(RExC_parse) != ':') {
10433      if (*RExC_parse)
10434       nextchar(pRExC_state);
10435      *flagp = TRYAGAIN;
10436      return NULL;
10437     }
10438     paren = ':';
10439     nextchar(pRExC_state);
10440     ret = NULL;
10441     goto parse_rest;
10442    } /* end switch */
10443   }
10444   else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
10445   capturing_parens:
10446    parno = RExC_npar;
10447    RExC_npar++;
10448
10449    ret = reganode(pRExC_state, OPEN, parno);
10450    if (!SIZE_ONLY ){
10451     if (!RExC_nestroot)
10452      RExC_nestroot = parno;
10453     if (RExC_seen & REG_RECURSE_SEEN
10454      && !RExC_open_parens[parno-1])
10455     {
10456      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10457       "%*s%*s Setting open paren #%"IVdf" to %d\n",
10458       22, "|    |", (int)(depth * 2 + 1), "",
10459       (IV)parno, REG_NODE_NUM(ret)));
10460      RExC_open_parens[parno-1]= ret;
10461     }
10462    }
10463    Set_Node_Length(ret, 1); /* MJD */
10464    Set_Node_Offset(ret, RExC_parse); /* MJD */
10465    is_open = 1;
10466   } else {
10467    /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
10468    paren = ':';
10469    ret = NULL;
10470   }
10471  }
10472  else                        /* ! paren */
10473   ret = NULL;
10474
10475    parse_rest:
10476  /* Pick up the branches, linking them together. */
10477  parse_start = RExC_parse;   /* MJD */
10478  br = regbranch(pRExC_state, &flags, 1,depth+1);
10479
10480  /*     branch_len = (paren != 0); */
10481
10482  if (br == NULL) {
10483   if (flags & RESTART_UTF8) {
10484    *flagp = RESTART_UTF8;
10485    return NULL;
10486   }
10487   FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10488  }
10489  if (*RExC_parse == '|') {
10490   if (!SIZE_ONLY && RExC_extralen) {
10491    reginsert(pRExC_state, BRANCHJ, br, depth+1);
10492   }
10493   else {                  /* MJD */
10494    reginsert(pRExC_state, BRANCH, br, depth+1);
10495    Set_Node_Length(br, paren != 0);
10496    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10497   }
10498   have_branch = 1;
10499   if (SIZE_ONLY)
10500    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
10501  }
10502  else if (paren == ':') {
10503   *flagp |= flags&SIMPLE;
10504  }
10505  if (is_open) {    /* Starts with OPEN. */
10506   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10507  }
10508  else if (paren != '?')  /* Not Conditional */
10509   ret = br;
10510  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10511  lastbr = br;
10512  while (*RExC_parse == '|') {
10513   if (!SIZE_ONLY && RExC_extralen) {
10514    ender = reganode(pRExC_state, LONGJMP,0);
10515
10516    /* Append to the previous. */
10517    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10518   }
10519   if (SIZE_ONLY)
10520    RExC_extralen += 2;  /* Account for LONGJMP. */
10521   nextchar(pRExC_state);
10522   if (freeze_paren) {
10523    if (RExC_npar > after_freeze)
10524     after_freeze = RExC_npar;
10525    RExC_npar = freeze_paren;
10526   }
10527   br = regbranch(pRExC_state, &flags, 0, depth+1);
10528
10529   if (br == NULL) {
10530    if (flags & RESTART_UTF8) {
10531     *flagp = RESTART_UTF8;
10532     return NULL;
10533    }
10534    FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10535   }
10536   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10537   lastbr = br;
10538   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10539  }
10540
10541  if (have_branch || paren != ':') {
10542   /* Make a closing node, and hook it on the end. */
10543   switch (paren) {
10544   case ':':
10545    ender = reg_node(pRExC_state, TAIL);
10546    break;
10547   case 1: case 2:
10548    ender = reganode(pRExC_state, CLOSE, parno);
10549    if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10550     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10551       "%*s%*s Setting close paren #%"IVdf" to %d\n",
10552       22, "|    |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10553     RExC_close_parens[parno-1]= ender;
10554     if (RExC_nestroot == parno)
10555      RExC_nestroot = 0;
10556    }
10557    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10558    Set_Node_Length(ender,1); /* MJD */
10559    break;
10560   case '<':
10561   case ',':
10562   case '=':
10563   case '!':
10564    *flagp &= ~HASWIDTH;
10565    /* FALLTHROUGH */
10566   case '>':
10567    ender = reg_node(pRExC_state, SUCCEED);
10568    break;
10569   case 0:
10570    ender = reg_node(pRExC_state, END);
10571    if (!SIZE_ONLY) {
10572     assert(!RExC_opend); /* there can only be one! */
10573     RExC_opend = ender;
10574    }
10575    break;
10576   }
10577   DEBUG_PARSE_r(if (!SIZE_ONLY) {
10578    DEBUG_PARSE_MSG("lsbr");
10579    regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10580    regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10581    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10582       SvPV_nolen_const(RExC_mysv1),
10583       (IV)REG_NODE_NUM(lastbr),
10584       SvPV_nolen_const(RExC_mysv2),
10585       (IV)REG_NODE_NUM(ender),
10586       (IV)(ender - lastbr)
10587    );
10588   });
10589   REGTAIL(pRExC_state, lastbr, ender);
10590
10591   if (have_branch && !SIZE_ONLY) {
10592    char is_nothing= 1;
10593    if (depth==1)
10594     RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10595
10596    /* Hook the tails of the branches to the closing node. */
10597    for (br = ret; br; br = regnext(br)) {
10598     const U8 op = PL_regkind[OP(br)];
10599     if (op == BRANCH) {
10600      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10601      if ( OP(NEXTOPER(br)) != NOTHING
10602       || regnext(NEXTOPER(br)) != ender)
10603       is_nothing= 0;
10604     }
10605     else if (op == BRANCHJ) {
10606      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10607      /* for now we always disable this optimisation * /
10608      if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10609       || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10610      */
10611       is_nothing= 0;
10612     }
10613    }
10614    if (is_nothing) {
10615     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10616     DEBUG_PARSE_r(if (!SIZE_ONLY) {
10617      DEBUG_PARSE_MSG("NADA");
10618      regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10619      regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10620      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10621         SvPV_nolen_const(RExC_mysv1),
10622         (IV)REG_NODE_NUM(ret),
10623         SvPV_nolen_const(RExC_mysv2),
10624         (IV)REG_NODE_NUM(ender),
10625         (IV)(ender - ret)
10626      );
10627     });
10628     OP(br)= NOTHING;
10629     if (OP(ender) == TAIL) {
10630      NEXT_OFF(br)= 0;
10631      RExC_emit= br + 1;
10632     } else {
10633      regnode *opt;
10634      for ( opt= br + 1; opt < ender ; opt++ )
10635       OP(opt)= OPTIMIZED;
10636      NEXT_OFF(br)= ender - br;
10637     }
10638    }
10639   }
10640  }
10641
10642  {
10643   const char *p;
10644   static const char parens[] = "=!<,>";
10645
10646   if (paren && (p = strchr(parens, paren))) {
10647    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10648    int flag = (p - parens) > 1;
10649
10650    if (paren == '>')
10651     node = SUSPEND, flag = 0;
10652    reginsert(pRExC_state, node,ret, depth+1);
10653    Set_Node_Cur_Length(ret, parse_start);
10654    Set_Node_Offset(ret, parse_start + 1);
10655    ret->flags = flag;
10656    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10657   }
10658  }
10659
10660  /* Check for proper termination. */
10661  if (paren) {
10662   /* restore original flags, but keep (?p) */
10663   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10664   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10665    RExC_parse = oregcomp_parse;
10666    vFAIL("Unmatched (");
10667   }
10668  }
10669  else if (!paren && RExC_parse < RExC_end) {
10670   if (*RExC_parse == ')') {
10671    RExC_parse++;
10672    vFAIL("Unmatched )");
10673   }
10674   else
10675    FAIL("Junk on end of regexp"); /* "Can't happen". */
10676   NOT_REACHED; /* NOTREACHED */
10677  }
10678
10679  if (RExC_in_lookbehind) {
10680   RExC_in_lookbehind--;
10681  }
10682  if (after_freeze > RExC_npar)
10683   RExC_npar = after_freeze;
10684  return(ret);
10685 }
10686
10687 /*
10688  - regbranch - one alternative of an | operator
10689  *
10690  * Implements the concatenation operator.
10691  *
10692  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10693  * restarted.
10694  */
10695 STATIC regnode *
10696 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10697 {
10698  regnode *ret;
10699  regnode *chain = NULL;
10700  regnode *latest;
10701  I32 flags = 0, c = 0;
10702  GET_RE_DEBUG_FLAGS_DECL;
10703
10704  PERL_ARGS_ASSERT_REGBRANCH;
10705
10706  DEBUG_PARSE("brnc");
10707
10708  if (first)
10709   ret = NULL;
10710  else {
10711   if (!SIZE_ONLY && RExC_extralen)
10712    ret = reganode(pRExC_state, BRANCHJ,0);
10713   else {
10714    ret = reg_node(pRExC_state, BRANCH);
10715    Set_Node_Length(ret, 1);
10716   }
10717  }
10718
10719  if (!first && SIZE_ONLY)
10720   RExC_extralen += 1;   /* BRANCHJ */
10721
10722  *flagp = WORST;   /* Tentatively. */
10723
10724  RExC_parse--;
10725  nextchar(pRExC_state);
10726  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10727   flags &= ~TRYAGAIN;
10728   latest = regpiece(pRExC_state, &flags,depth+1);
10729   if (latest == NULL) {
10730    if (flags & TRYAGAIN)
10731     continue;
10732    if (flags & RESTART_UTF8) {
10733     *flagp = RESTART_UTF8;
10734     return NULL;
10735    }
10736    FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10737   }
10738   else if (ret == NULL)
10739    ret = latest;
10740   *flagp |= flags&(HASWIDTH|POSTPONED);
10741   if (chain == NULL)  /* First piece. */
10742    *flagp |= flags&SPSTART;
10743   else {
10744    /* FIXME adding one for every branch after the first is probably
10745    * excessive now we have TRIE support. (hv) */
10746    MARK_NAUGHTY(1);
10747    REGTAIL(pRExC_state, chain, latest);
10748   }
10749   chain = latest;
10750   c++;
10751  }
10752  if (chain == NULL) { /* Loop ran zero times. */
10753   chain = reg_node(pRExC_state, NOTHING);
10754   if (ret == NULL)
10755    ret = chain;
10756  }
10757  if (c == 1) {
10758   *flagp |= flags&SIMPLE;
10759  }
10760
10761  return ret;
10762 }
10763
10764 /*
10765  - regpiece - something followed by possible [*+?]
10766  *
10767  * Note that the branching code sequences used for ? and the general cases
10768  * of * and + are somewhat optimized:  they use the same NOTHING node as
10769  * both the endmarker for their branch list and the body of the last branch.
10770  * It might seem that this node could be dispensed with entirely, but the
10771  * endmarker role is not redundant.
10772  *
10773  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10774  * TRYAGAIN.
10775  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10776  * restarted.
10777  */
10778 STATIC regnode *
10779 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10780 {
10781  regnode *ret;
10782  char op;
10783  char *next;
10784  I32 flags;
10785  const char * const origparse = RExC_parse;
10786  I32 min;
10787  I32 max = REG_INFTY;
10788 #ifdef RE_TRACK_PATTERN_OFFSETS
10789  char *parse_start;
10790 #endif
10791  const char *maxpos = NULL;
10792  UV uv;
10793
10794  /* Save the original in case we change the emitted regop to a FAIL. */
10795  regnode * const orig_emit = RExC_emit;
10796
10797  GET_RE_DEBUG_FLAGS_DECL;
10798
10799  PERL_ARGS_ASSERT_REGPIECE;
10800
10801  DEBUG_PARSE("piec");
10802
10803  ret = regatom(pRExC_state, &flags,depth+1);
10804  if (ret == NULL) {
10805   if (flags & (TRYAGAIN|RESTART_UTF8))
10806    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10807   else
10808    FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10809   return(NULL);
10810  }
10811
10812  op = *RExC_parse;
10813
10814  if (op == '{' && regcurly(RExC_parse)) {
10815   maxpos = NULL;
10816 #ifdef RE_TRACK_PATTERN_OFFSETS
10817   parse_start = RExC_parse; /* MJD */
10818 #endif
10819   next = RExC_parse + 1;
10820   while (isDIGIT(*next) || *next == ',') {
10821    if (*next == ',') {
10822     if (maxpos)
10823      break;
10824     else
10825      maxpos = next;
10826    }
10827    next++;
10828   }
10829   if (*next == '}') {  /* got one */
10830    const char* endptr;
10831    if (!maxpos)
10832     maxpos = next;
10833    RExC_parse++;
10834    if (isDIGIT(*RExC_parse)) {
10835     if (!grok_atoUV(RExC_parse, &uv, &endptr))
10836      vFAIL("Invalid quantifier in {,}");
10837     if (uv >= REG_INFTY)
10838      vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10839     min = (I32)uv;
10840    } else {
10841     min = 0;
10842    }
10843    if (*maxpos == ',')
10844     maxpos++;
10845    else
10846     maxpos = RExC_parse;
10847    if (isDIGIT(*maxpos)) {
10848     if (!grok_atoUV(maxpos, &uv, &endptr))
10849      vFAIL("Invalid quantifier in {,}");
10850     if (uv >= REG_INFTY)
10851      vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10852     max = (I32)uv;
10853    } else {
10854     max = REG_INFTY;  /* meaning "infinity" */
10855    }
10856    RExC_parse = next;
10857    nextchar(pRExC_state);
10858    if (max < min) {    /* If can't match, warn and optimize to fail
10859         unconditionally */
10860     if (SIZE_ONLY) {
10861
10862      /* We can't back off the size because we have to reserve
10863      * enough space for all the things we are about to throw
10864      * away, but we can shrink it by the ammount we are about
10865      * to re-use here */
10866      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10867     }
10868     else {
10869      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10870      RExC_emit = orig_emit;
10871     }
10872     ret = reg_node(pRExC_state, OPFAIL);
10873     return ret;
10874    }
10875    else if (min == max
10876      && RExC_parse < RExC_end
10877      && (*RExC_parse == '?' || *RExC_parse == '+'))
10878    {
10879     if (PASS2) {
10880      ckWARN2reg(RExC_parse + 1,
10881        "Useless use of greediness modifier '%c'",
10882        *RExC_parse);
10883     }
10884     /* Absorb the modifier, so later code doesn't see nor use
10885      * it */
10886     nextchar(pRExC_state);
10887    }
10888
10889   do_curly:
10890    if ((flags&SIMPLE)) {
10891     MARK_NAUGHTY_EXP(2, 2);
10892     reginsert(pRExC_state, CURLY, ret, depth+1);
10893     Set_Node_Offset(ret, parse_start+1); /* MJD */
10894     Set_Node_Cur_Length(ret, parse_start);
10895    }
10896    else {
10897     regnode * const w = reg_node(pRExC_state, WHILEM);
10898
10899     w->flags = 0;
10900     REGTAIL(pRExC_state, ret, w);
10901     if (!SIZE_ONLY && RExC_extralen) {
10902      reginsert(pRExC_state, LONGJMP,ret, depth+1);
10903      reginsert(pRExC_state, NOTHING,ret, depth+1);
10904      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10905     }
10906     reginsert(pRExC_state, CURLYX,ret, depth+1);
10907         /* MJD hk */
10908     Set_Node_Offset(ret, parse_start+1);
10909     Set_Node_Length(ret,
10910         op == '{' ? (RExC_parse - parse_start) : 1);
10911
10912     if (!SIZE_ONLY && RExC_extralen)
10913      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10914     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10915     if (SIZE_ONLY)
10916      RExC_whilem_seen++, RExC_extralen += 3;
10917     MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
10918    }
10919    ret->flags = 0;
10920
10921    if (min > 0)
10922     *flagp = WORST;
10923    if (max > 0)
10924     *flagp |= HASWIDTH;
10925    if (!SIZE_ONLY) {
10926     ARG1_SET(ret, (U16)min);
10927     ARG2_SET(ret, (U16)max);
10928    }
10929    if (max == REG_INFTY)
10930     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10931
10932    goto nest_check;
10933   }
10934  }
10935
10936  if (!ISMULT1(op)) {
10937   *flagp = flags;
10938   return(ret);
10939  }
10940
10941 #if 0    /* Now runtime fix should be reliable. */
10942
10943  /* if this is reinstated, don't forget to put this back into perldiag:
10944
10945    =item Regexp *+ operand could be empty at {#} in regex m/%s/
10946
10947   (F) The part of the regexp subject to either the * or + quantifier
10948   could match an empty string. The {#} shows in the regular
10949   expression about where the problem was discovered.
10950
10951  */
10952
10953  if (!(flags&HASWIDTH) && op != '?')
10954  vFAIL("Regexp *+ operand could be empty");
10955 #endif
10956
10957 #ifdef RE_TRACK_PATTERN_OFFSETS
10958  parse_start = RExC_parse;
10959 #endif
10960  nextchar(pRExC_state);
10961
10962  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10963
10964  if (op == '*' && (flags&SIMPLE)) {
10965   reginsert(pRExC_state, STAR, ret, depth+1);
10966   ret->flags = 0;
10967   MARK_NAUGHTY(4);
10968   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10969  }
10970  else if (op == '*') {
10971   min = 0;
10972   goto do_curly;
10973  }
10974  else if (op == '+' && (flags&SIMPLE)) {
10975   reginsert(pRExC_state, PLUS, ret, depth+1);
10976   ret->flags = 0;
10977   MARK_NAUGHTY(3);
10978   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10979  }
10980  else if (op == '+') {
10981   min = 1;
10982   goto do_curly;
10983  }
10984  else if (op == '?') {
10985   min = 0; max = 1;
10986   goto do_curly;
10987  }
10988   nest_check:
10989  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10990   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10991   ckWARN2reg(RExC_parse,
10992     "%"UTF8f" matches null string many times",
10993     UTF8fARG(UTF, (RExC_parse >= origparse
10994         ? RExC_parse - origparse
10995         : 0),
10996     origparse));
10997   (void)ReREFCNT_inc(RExC_rx_sv);
10998  }
10999
11000  if (RExC_parse < RExC_end && *RExC_parse == '?') {
11001   nextchar(pRExC_state);
11002   reginsert(pRExC_state, MINMOD, ret, depth+1);
11003   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11004  }
11005  else
11006  if (RExC_parse < RExC_end && *RExC_parse == '+') {
11007   regnode *ender;
11008   nextchar(pRExC_state);
11009   ender = reg_node(pRExC_state, SUCCEED);
11010   REGTAIL(pRExC_state, ret, ender);
11011   reginsert(pRExC_state, SUSPEND, ret, depth+1);
11012   ret->flags = 0;
11013   ender = reg_node(pRExC_state, TAIL);
11014   REGTAIL(pRExC_state, ret, ender);
11015  }
11016
11017  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
11018   RExC_parse++;
11019   vFAIL("Nested quantifiers");
11020  }
11021
11022  return(ret);
11023 }
11024
11025 STATIC bool
11026 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
11027     regnode ** node_p,
11028     UV * code_point_p,
11029     int * cp_count,
11030     I32 * flagp,
11031     const U32 depth
11032  )
11033 {
11034  /* This routine teases apart the various meanings of \N and returns
11035   * accordingly.  The input parameters constrain which meaning(s) is/are valid
11036   * in the current context.
11037   *
11038   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
11039   *
11040   * If <code_point_p> is not NULL, the context is expecting the result to be a
11041   * single code point.  If this \N instance turns out to a single code point,
11042   * the function returns TRUE and sets *code_point_p to that code point.
11043   *
11044   * If <node_p> is not NULL, the context is expecting the result to be one of
11045   * the things representable by a regnode.  If this \N instance turns out to be
11046   * one such, the function generates the regnode, returns TRUE and sets *node_p
11047   * to point to that regnode.
11048   *
11049   * If this instance of \N isn't legal in any context, this function will
11050   * generate a fatal error and not return.
11051   *
11052   * On input, RExC_parse should point to the first char following the \N at the
11053   * time of the call.  On successful return, RExC_parse will have been updated
11054   * to point to just after the sequence identified by this routine.  Also
11055   * *flagp has been updated as needed.
11056   *
11057   * When there is some problem with the current context and this \N instance,
11058   * the function returns FALSE, without advancing RExC_parse, nor setting
11059   * *node_p, nor *code_point_p, nor *flagp.
11060   *
11061   * If <cp_count> is not NULL, the caller wants to know the length (in code
11062   * points) that this \N sequence matches.  This is set even if the function
11063   * returns FALSE, as detailed below.
11064   *
11065   * There are 5 possibilities here, as detailed in the next 5 paragraphs.
11066   *
11067   * Probably the most common case is for the \N to specify a single code point.
11068   * *cp_count will be set to 1, and *code_point_p will be set to that code
11069   * point.
11070   *
11071   * Another possibility is for the input to be an empty \N{}, which for
11072   * backwards compatibility we accept.  *cp_count will be set to 0. *node_p
11073   * will be set to a generated NOTHING node.
11074   *
11075   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
11076   * set to 0. *node_p will be set to a generated REG_ANY node.
11077   *
11078   * The fourth possibility is that \N resolves to a sequence of more than one
11079   * code points.  *cp_count will be set to the number of code points in the
11080   * sequence. *node_p * will be set to a generated node returned by this
11081   * function calling S_reg().
11082   *
11083   * The final possibility, which happens only when the fourth one would
11084   * otherwise be in effect, is that one of those code points requires the
11085   * pattern to be recompiled as UTF-8.  The function returns FALSE, and sets
11086   * the RESTART_UTF8 flag in *flagp.  When this happens, the caller needs to
11087   * desist from continuing parsing, and return this information to its caller.
11088   * This is not set for when there is only one code point, as this can be
11089   * called as part of an ANYOF node, and they can store above-Latin1 code
11090   * points without the pattern having to be in UTF-8.
11091   *
11092   * For non-single-quoted regexes, the tokenizer has resolved character and
11093   * sequence names inside \N{...} into their Unicode values, normalizing the
11094   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
11095   * hex-represented code points in the sequence.  This is done there because
11096   * the names can vary based on what charnames pragma is in scope at the time,
11097   * so we need a way to take a snapshot of what they resolve to at the time of
11098   * the original parse. [perl #56444].
11099   *
11100   * That parsing is skipped for single-quoted regexes, so we may here get
11101   * '\N{NAME}'.  This is a fatal error.  These names have to be resolved by the
11102   * parser.  But if the single-quoted regex is something like '\N{U+41}', that
11103   * is legal and handled here.  The code point is Unicode, and has to be
11104   * translated into the native character set for non-ASCII platforms.
11105   * the tokenizer passes the \N sequence through unchanged; this code will not
11106   * attempt to determine this nor expand those, instead raising a syntax error.
11107   */
11108
11109  char * endbrace;    /* points to '}' following the name */
11110  char *endchar; /* Points to '.' or '}' ending cur char in the input
11111       stream */
11112  char* p;            /* Temporary */
11113
11114  GET_RE_DEBUG_FLAGS_DECL;
11115
11116  PERL_ARGS_ASSERT_GROK_BSLASH_N;
11117
11118  GET_RE_DEBUG_FLAGS;
11119
11120  assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
11121  assert(! (node_p && cp_count));               /* At most 1 should be set */
11122
11123  if (cp_count) {     /* Initialize return for the most common case */
11124   *cp_count = 1;
11125  }
11126
11127  /* The [^\n] meaning of \N ignores spaces and comments under the /x
11128  * modifier.  The other meanings do not, so use a temporary until we find
11129  * out which we are being called with */
11130  p = (RExC_flags & RXf_PMf_EXTENDED)
11131   ? regpatws(pRExC_state, RExC_parse,
11132         TRUE) /* means recognize comments */
11133   : RExC_parse;
11134
11135  /* Disambiguate between \N meaning a named character versus \N meaning
11136  * [^\n].  The latter is assumed when the {...} following the \N is a legal
11137  * quantifier, or there is no a '{' at all */
11138  if (*p != '{' || regcurly(p)) {
11139   RExC_parse = p;
11140   if (cp_count) {
11141    *cp_count = -1;
11142   }
11143
11144   if (! node_p) {
11145    return FALSE;
11146   }
11147   RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
11148       current char */
11149   nextchar(pRExC_state);
11150   *node_p = reg_node(pRExC_state, REG_ANY);
11151   *flagp |= HASWIDTH|SIMPLE;
11152   MARK_NAUGHTY(1);
11153   Set_Node_Length(*node_p, 1); /* MJD */
11154   return TRUE;
11155  }
11156
11157  /* Here, we have decided it should be a named character or sequence */
11158
11159  /* The test above made sure that the next real character is a '{', but
11160  * under the /x modifier, it could be separated by space (or a comment and
11161  * \n) and this is not allowed (for consistency with \x{...} and the
11162  * tokenizer handling of \N{NAME}). */
11163  if (*RExC_parse != '{') {
11164   vFAIL("Missing braces on \\N{}");
11165  }
11166
11167  RExC_parse++; /* Skip past the '{' */
11168
11169  if (! (endbrace = strchr(RExC_parse, '}'))  /* no trailing brace */
11170   || ! (endbrace == RExC_parse  /* nothing between the {} */
11171    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
11172     && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11173              error msg) */
11174  {
11175   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11176   vFAIL("\\N{NAME} must be resolved by the lexer");
11177  }
11178
11179  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11180
11181  if (endbrace == RExC_parse) {   /* empty: \N{} */
11182   if (cp_count) {
11183    *cp_count = 0;
11184   }
11185   nextchar(pRExC_state);
11186   if (! node_p) {
11187    return FALSE;
11188   }
11189
11190   *node_p = reg_node(pRExC_state,NOTHING);
11191   return TRUE;
11192  }
11193
11194  RExC_parse += 2; /* Skip past the 'U+' */
11195
11196  endchar = RExC_parse + strcspn(RExC_parse, ".}");
11197
11198  /* Code points are separated by dots.  If none, there is only one code
11199  * point, and is terminated by the brace */
11200
11201  if (endchar >= endbrace) {
11202   STRLEN length_of_hex;
11203   I32 grok_hex_flags;
11204
11205   /* Here, exactly one code point.  If that isn't what is wanted, fail */
11206   if (! code_point_p) {
11207    RExC_parse = p;
11208    return FALSE;
11209   }
11210
11211   /* Convert code point from hex */
11212   length_of_hex = (STRLEN)(endchar - RExC_parse);
11213   grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11214       | PERL_SCAN_DISALLOW_PREFIX
11215
11216        /* No errors in the first pass (See [perl
11217        * #122671].)  We let the code below find the
11218        * errors when there are multiple chars. */
11219       | ((SIZE_ONLY)
11220        ? PERL_SCAN_SILENT_ILLDIGIT
11221        : 0);
11222
11223   /* This routine is the one place where both single- and double-quotish
11224   * \N{U+xxxx} are evaluated.  The value is a Unicode code point which
11225   * must be converted to native. */
11226   *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
11227           &length_of_hex,
11228           &grok_hex_flags,
11229           NULL));
11230
11231   /* The tokenizer should have guaranteed validity, but it's possible to
11232   * bypass it by using single quoting, so check.  Don't do the check
11233   * here when there are multiple chars; we do it below anyway. */
11234   if (length_of_hex == 0
11235    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11236   {
11237    RExC_parse += length_of_hex; /* Includes all the valid */
11238    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11239        ? UTF8SKIP(RExC_parse)
11240        : 1;
11241    /* Guard against malformed utf8 */
11242    if (RExC_parse >= endchar) {
11243     RExC_parse = endchar;
11244    }
11245    vFAIL("Invalid hexadecimal number in \\N{U+...}");
11246   }
11247
11248   RExC_parse = endbrace + 1;
11249   return TRUE;
11250  }
11251  else {  /* Is a multiple character sequence */
11252   SV * substitute_parse;
11253   STRLEN len;
11254   char *orig_end = RExC_end;
11255   I32 flags;
11256
11257   /* Count the code points, if desired, in the sequence */
11258   if (cp_count) {
11259    *cp_count = 0;
11260    while (RExC_parse < endbrace) {
11261     /* Point to the beginning of the next character in the sequence. */
11262     RExC_parse = endchar + 1;
11263     endchar = RExC_parse + strcspn(RExC_parse, ".}");
11264     (*cp_count)++;
11265    }
11266   }
11267
11268   /* Fail if caller doesn't want to handle a multi-code-point sequence.
11269   * But don't backup up the pointer if the caller want to know how many
11270   * code points there are (they can then handle things) */
11271   if (! node_p) {
11272    if (! cp_count) {
11273     RExC_parse = p;
11274    }
11275    return FALSE;
11276   }
11277
11278   /* What is done here is to convert this to a sub-pattern of the form
11279   * \x{char1}\x{char2}...  and then call reg recursively to parse it
11280   * (enclosing in "(?: ... )" ).  That way, it retains its atomicness,
11281   * while not having to worry about special handling that some code
11282   * points may have. */
11283
11284   substitute_parse = newSVpvs("?:");
11285
11286   while (RExC_parse < endbrace) {
11287
11288    /* Convert to notation the rest of the code understands */
11289    sv_catpv(substitute_parse, "\\x{");
11290    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
11291    sv_catpv(substitute_parse, "}");
11292
11293    /* Point to the beginning of the next character in the sequence. */
11294    RExC_parse = endchar + 1;
11295    endchar = RExC_parse + strcspn(RExC_parse, ".}");
11296
11297   }
11298   sv_catpv(substitute_parse, ")");
11299
11300   RExC_parse = SvPV(substitute_parse, len);
11301
11302   /* Don't allow empty number */
11303   if (len < (STRLEN) 8) {
11304    RExC_parse = endbrace;
11305    vFAIL("Invalid hexadecimal number in \\N{U+...}");
11306   }
11307   RExC_end = RExC_parse + len;
11308
11309   /* The values are Unicode, and therefore not subject to recoding, but
11310   * have to be converted to native on a non-Unicode (meaning non-ASCII)
11311   * platform. */
11312   RExC_override_recoding = 1;
11313 #ifdef EBCDIC
11314   RExC_recode_x_to_native = 1;
11315 #endif
11316
11317   if (node_p) {
11318    if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11319     if (flags & RESTART_UTF8) {
11320      *flagp = RESTART_UTF8;
11321      return FALSE;
11322     }
11323     FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11324      (UV) flags);
11325    }
11326    *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11327   }
11328
11329   /* Restore the saved values */
11330   RExC_parse = endbrace;
11331   RExC_end = orig_end;
11332   RExC_override_recoding = 0;
11333 #ifdef EBCDIC
11334   RExC_recode_x_to_native = 0;
11335 #endif
11336
11337   SvREFCNT_dec_NN(substitute_parse);
11338   nextchar(pRExC_state);
11339
11340   return TRUE;
11341  }
11342 }
11343
11344
11345 /*
11346  * reg_recode
11347  *
11348  * It returns the code point in utf8 for the value in *encp.
11349  *    value: a code value in the source encoding
11350  *    encp:  a pointer to an Encode object
11351  *
11352  * If the result from Encode is not a single character,
11353  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11354  */
11355 STATIC UV
11356 S_reg_recode(pTHX_ const char value, SV **encp)
11357 {
11358  STRLEN numlen = 1;
11359  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11360  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11361  const STRLEN newlen = SvCUR(sv);
11362  UV uv = UNICODE_REPLACEMENT;
11363
11364  PERL_ARGS_ASSERT_REG_RECODE;
11365
11366  if (newlen)
11367   uv = SvUTF8(sv)
11368    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11369    : *(U8*)s;
11370
11371  if (!newlen || numlen != newlen) {
11372   uv = UNICODE_REPLACEMENT;
11373   *encp = NULL;
11374  }
11375  return uv;
11376 }
11377
11378 PERL_STATIC_INLINE U8
11379 S_compute_EXACTish(RExC_state_t *pRExC_state)
11380 {
11381  U8 op;
11382
11383  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11384
11385  if (! FOLD) {
11386   return (LOC)
11387     ? EXACTL
11388     : EXACT;
11389  }
11390
11391  op = get_regex_charset(RExC_flags);
11392  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11393   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11394     been, so there is no hole */
11395  }
11396
11397  return op + EXACTF;
11398 }
11399
11400 PERL_STATIC_INLINE void
11401 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11402       regnode *node, I32* flagp, STRLEN len, UV code_point,
11403       bool downgradable)
11404 {
11405  /* This knows the details about sizing an EXACTish node, setting flags for
11406  * it (by setting <*flagp>, and potentially populating it with a single
11407  * character.
11408  *
11409  * If <len> (the length in bytes) is non-zero, this function assumes that
11410  * the node has already been populated, and just does the sizing.  In this
11411  * case <code_point> should be the final code point that has already been
11412  * placed into the node.  This value will be ignored except that under some
11413  * circumstances <*flagp> is set based on it.
11414  *
11415  * If <len> is zero, the function assumes that the node is to contain only
11416  * the single character given by <code_point> and calculates what <len>
11417  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11418  * additionally will populate the node's STRING with <code_point> or its
11419  * fold if folding.
11420  *
11421  * In both cases <*flagp> is appropriately set
11422  *
11423  * It knows that under FOLD, the Latin Sharp S and UTF characters above
11424  * 255, must be folded (the former only when the rules indicate it can
11425  * match 'ss')
11426  *
11427  * When it does the populating, it looks at the flag 'downgradable'.  If
11428  * true with a node that folds, it checks if the single code point
11429  * participates in a fold, and if not downgrades the node to an EXACT.
11430  * This helps the optimizer */
11431
11432  bool len_passed_in = cBOOL(len != 0);
11433  U8 character[UTF8_MAXBYTES_CASE+1];
11434
11435  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11436
11437  /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11438  * sizing difference, and is extra work that is thrown away */
11439  if (downgradable && ! PASS2) {
11440   downgradable = FALSE;
11441  }
11442
11443  if (! len_passed_in) {
11444   if (UTF) {
11445    if (UVCHR_IS_INVARIANT(code_point)) {
11446     if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11447      *character = (U8) code_point;
11448     }
11449     else { /* Here is /i and not /l. (toFOLD() is defined on just
11450       ASCII, which isn't the same thing as INVARIANT on
11451       EBCDIC, but it works there, as the extra invariants
11452       fold to themselves) */
11453      *character = toFOLD((U8) code_point);
11454
11455      /* We can downgrade to an EXACT node if this character
11456      * isn't a folding one.  Note that this assumes that
11457      * nothing above Latin1 folds to some other invariant than
11458      * one of these alphabetics; otherwise we would also have
11459      * to check:
11460      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11461      *      || ASCII_FOLD_RESTRICTED))
11462      */
11463      if (downgradable && PL_fold[code_point] == code_point) {
11464       OP(node) = EXACT;
11465      }
11466     }
11467     len = 1;
11468    }
11469    else if (FOLD && (! LOC
11470        || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11471    {   /* Folding, and ok to do so now */
11472     UV folded = _to_uni_fold_flags(
11473         code_point,
11474         character,
11475         &len,
11476         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11477              ? FOLD_FLAGS_NOMIX_ASCII
11478              : 0));
11479     if (downgradable
11480      && folded == code_point /* This quickly rules out many
11481            cases, avoiding the
11482            _invlist_contains_cp() overhead
11483            for those.  */
11484      && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11485     {
11486      OP(node) = (LOC)
11487        ? EXACTL
11488        : EXACT;
11489     }
11490    }
11491    else if (code_point <= MAX_UTF8_TWO_BYTE) {
11492
11493     /* Not folding this cp, and can output it directly */
11494     *character = UTF8_TWO_BYTE_HI(code_point);
11495     *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11496     len = 2;
11497    }
11498    else {
11499     uvchr_to_utf8( character, code_point);
11500     len = UTF8SKIP(character);
11501    }
11502   } /* Else pattern isn't UTF8.  */
11503   else if (! FOLD) {
11504    *character = (U8) code_point;
11505    len = 1;
11506   } /* Else is folded non-UTF8 */
11507   else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11508
11509    /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11510    * comments at join_exact()); */
11511    *character = (U8) code_point;
11512    len = 1;
11513
11514    /* Can turn into an EXACT node if we know the fold at compile time,
11515    * and it folds to itself and doesn't particpate in other folds */
11516    if (downgradable
11517     && ! LOC
11518     && PL_fold_latin1[code_point] == code_point
11519     && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11520      || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11521    {
11522     OP(node) = EXACT;
11523    }
11524   } /* else is Sharp s.  May need to fold it */
11525   else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11526    *character = 's';
11527    *(character + 1) = 's';
11528    len = 2;
11529   }
11530   else {
11531    *character = LATIN_SMALL_LETTER_SHARP_S;
11532    len = 1;
11533   }
11534  }
11535
11536  if (SIZE_ONLY) {
11537   RExC_size += STR_SZ(len);
11538  }
11539  else {
11540   RExC_emit += STR_SZ(len);
11541   STR_LEN(node) = len;
11542   if (! len_passed_in) {
11543    Copy((char *) character, STRING(node), len, char);
11544   }
11545  }
11546
11547  *flagp |= HASWIDTH;
11548
11549  /* A single character node is SIMPLE, except for the special-cased SHARP S
11550  * under /di. */
11551  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11552   && (code_point != LATIN_SMALL_LETTER_SHARP_S
11553    || ! FOLD || ! DEPENDS_SEMANTICS))
11554  {
11555   *flagp |= SIMPLE;
11556  }
11557
11558  /* The OP may not be well defined in PASS1 */
11559  if (PASS2 && OP(node) == EXACTFL) {
11560   RExC_contains_locale = 1;
11561  }
11562 }
11563
11564
11565 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11566  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11567
11568 static I32
11569 S_backref_value(char *p)
11570 {
11571  const char* endptr;
11572  UV val;
11573  if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
11574   return (I32)val;
11575  return I32_MAX;
11576 }
11577
11578
11579 /*
11580  - regatom - the lowest level
11581
11582    Try to identify anything special at the start of the pattern. If there
11583    is, then handle it as required. This may involve generating a single regop,
11584    such as for an assertion; or it may involve recursing, such as to
11585    handle a () structure.
11586
11587    If the string doesn't start with something special then we gobble up
11588    as much literal text as we can.
11589
11590    Once we have been able to handle whatever type of thing started the
11591    sequence, we return.
11592
11593    Note: we have to be careful with escapes, as they can be both literal
11594    and special, and in the case of \10 and friends, context determines which.
11595
11596    A summary of the code structure is:
11597
11598    switch (first_byte) {
11599   cases for each special:
11600    handle this special;
11601    break;
11602   case '\\':
11603    switch (2nd byte) {
11604     cases for each unambiguous special:
11605      handle this special;
11606      break;
11607     cases for each ambigous special/literal:
11608      disambiguate;
11609      if (special)  handle here
11610      else goto defchar;
11611     default: // unambiguously literal:
11612      goto defchar;
11613    }
11614   default:  // is a literal char
11615    // FALL THROUGH
11616   defchar:
11617    create EXACTish node for literal;
11618    while (more input and node isn't full) {
11619     switch (input_byte) {
11620     cases for each special;
11621      make sure parse pointer is set so that the next call to
11622       regatom will see this special first
11623      goto loopdone; // EXACTish node terminated by prev. char
11624     default:
11625      append char to EXACTISH node;
11626     }
11627     get next input byte;
11628    }
11629   loopdone:
11630    }
11631    return the generated node;
11632
11633    Specifically there are two separate switches for handling
11634    escape sequences, with the one for handling literal escapes requiring
11635    a dummy entry for all of the special escapes that are actually handled
11636    by the other.
11637
11638    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11639    TRYAGAIN.
11640    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11641    restarted.
11642    Otherwise does not return NULL.
11643 */
11644
11645 STATIC regnode *
11646 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11647 {
11648  regnode *ret = NULL;
11649  I32 flags = 0;
11650  char *parse_start = RExC_parse;
11651  U8 op;
11652  int invert = 0;
11653  U8 arg;
11654
11655  GET_RE_DEBUG_FLAGS_DECL;
11656
11657  *flagp = WORST;  /* Tentatively. */
11658
11659  DEBUG_PARSE("atom");
11660
11661  PERL_ARGS_ASSERT_REGATOM;
11662
11663   tryagain:
11664  switch ((U8)*RExC_parse) {
11665  case '^':
11666   RExC_seen_zerolen++;
11667   nextchar(pRExC_state);
11668   if (RExC_flags & RXf_PMf_MULTILINE)
11669    ret = reg_node(pRExC_state, MBOL);
11670   else
11671    ret = reg_node(pRExC_state, SBOL);
11672   Set_Node_Length(ret, 1); /* MJD */
11673   break;
11674  case '$':
11675   nextchar(pRExC_state);
11676   if (*RExC_parse)
11677    RExC_seen_zerolen++;
11678   if (RExC_flags & RXf_PMf_MULTILINE)
11679    ret = reg_node(pRExC_state, MEOL);
11680   else
11681    ret = reg_node(pRExC_state, SEOL);
11682   Set_Node_Length(ret, 1); /* MJD */
11683   break;
11684  case '.':
11685   nextchar(pRExC_state);
11686   if (RExC_flags & RXf_PMf_SINGLELINE)
11687    ret = reg_node(pRExC_state, SANY);
11688   else
11689    ret = reg_node(pRExC_state, REG_ANY);
11690   *flagp |= HASWIDTH|SIMPLE;
11691   MARK_NAUGHTY(1);
11692   Set_Node_Length(ret, 1); /* MJD */
11693   break;
11694  case '[':
11695  {
11696   char * const oregcomp_parse = ++RExC_parse;
11697   ret = regclass(pRExC_state, flagp,depth+1,
11698      FALSE, /* means parse the whole char class */
11699      TRUE, /* allow multi-char folds */
11700      FALSE, /* don't silence non-portable warnings. */
11701      (bool) RExC_strict,
11702      NULL);
11703   if (*RExC_parse != ']') {
11704    RExC_parse = oregcomp_parse;
11705    vFAIL("Unmatched [");
11706   }
11707   if (ret == NULL) {
11708    if (*flagp & RESTART_UTF8)
11709     return NULL;
11710    FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11711     (UV) *flagp);
11712   }
11713   nextchar(pRExC_state);
11714   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11715   break;
11716  }
11717  case '(':
11718   nextchar(pRExC_state);
11719   ret = reg(pRExC_state, 2, &flags,depth+1);
11720   if (ret == NULL) {
11721     if (flags & TRYAGAIN) {
11722      if (RExC_parse == RExC_end) {
11723       /* Make parent create an empty node if needed. */
11724       *flagp |= TRYAGAIN;
11725       return(NULL);
11726      }
11727      goto tryagain;
11728     }
11729     if (flags & RESTART_UTF8) {
11730      *flagp = RESTART_UTF8;
11731      return NULL;
11732     }
11733     FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11734                 (UV) flags);
11735   }
11736   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11737   break;
11738  case '|':
11739  case ')':
11740   if (flags & TRYAGAIN) {
11741    *flagp |= TRYAGAIN;
11742    return NULL;
11743   }
11744   vFAIL("Internal urp");
11745         /* Supposed to be caught earlier. */
11746   break;
11747  case '?':
11748  case '+':
11749  case '*':
11750   RExC_parse++;
11751   vFAIL("Quantifier follows nothing");
11752   break;
11753  case '\\':
11754   /* Special Escapes
11755
11756   This switch handles escape sequences that resolve to some kind
11757   of special regop and not to literal text. Escape sequnces that
11758   resolve to literal text are handled below in the switch marked
11759   "Literal Escapes".
11760
11761   Every entry in this switch *must* have a corresponding entry
11762   in the literal escape switch. However, the opposite is not
11763   required, as the default for this switch is to jump to the
11764   literal text handling code.
11765   */
11766   switch ((U8)*++RExC_parse) {
11767   /* Special Escapes */
11768   case 'A':
11769    RExC_seen_zerolen++;
11770    ret = reg_node(pRExC_state, SBOL);
11771    /* SBOL is shared with /^/ so we set the flags so we can tell
11772    * /\A/ from /^/ in split. We check ret because first pass we
11773    * have no regop struct to set the flags on. */
11774    if (PASS2)
11775     ret->flags = 1;
11776    *flagp |= SIMPLE;
11777    goto finish_meta_pat;
11778   case 'G':
11779    ret = reg_node(pRExC_state, GPOS);
11780    RExC_seen |= REG_GPOS_SEEN;
11781    *flagp |= SIMPLE;
11782    goto finish_meta_pat;
11783   case 'K':
11784    RExC_seen_zerolen++;
11785    ret = reg_node(pRExC_state, KEEPS);
11786    *flagp |= SIMPLE;
11787    /* XXX:dmq : disabling in-place substitution seems to
11788    * be necessary here to avoid cases of memory corruption, as
11789    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11790    */
11791    RExC_seen |= REG_LOOKBEHIND_SEEN;
11792    goto finish_meta_pat;
11793   case 'Z':
11794    ret = reg_node(pRExC_state, SEOL);
11795    *flagp |= SIMPLE;
11796    RExC_seen_zerolen++;  /* Do not optimize RE away */
11797    goto finish_meta_pat;
11798   case 'z':
11799    ret = reg_node(pRExC_state, EOS);
11800    *flagp |= SIMPLE;
11801    RExC_seen_zerolen++;  /* Do not optimize RE away */
11802    goto finish_meta_pat;
11803   case 'C':
11804    vFAIL("\\C no longer supported");
11805   case 'X':
11806    ret = reg_node(pRExC_state, CLUMP);
11807    *flagp |= HASWIDTH;
11808    goto finish_meta_pat;
11809
11810   case 'W':
11811    invert = 1;
11812    /* FALLTHROUGH */
11813   case 'w':
11814    arg = ANYOF_WORDCHAR;
11815    goto join_posix;
11816
11817   case 'B':
11818    invert = 1;
11819    /* FALLTHROUGH */
11820   case 'b':
11821   {
11822    regex_charset charset = get_regex_charset(RExC_flags);
11823
11824    RExC_seen_zerolen++;
11825    RExC_seen |= REG_LOOKBEHIND_SEEN;
11826    op = BOUND + charset;
11827
11828    if (op == BOUNDL) {
11829     RExC_contains_locale = 1;
11830    }
11831
11832    ret = reg_node(pRExC_state, op);
11833    *flagp |= SIMPLE;
11834    if (*(RExC_parse + 1) != '{') {
11835     FLAGS(ret) = TRADITIONAL_BOUND;
11836     if (PASS2 && op > BOUNDA) {  /* /aa is same as /a */
11837      OP(ret) = BOUNDA;
11838     }
11839    }
11840    else {
11841     STRLEN length;
11842     char name = *RExC_parse;
11843     char * endbrace;
11844     RExC_parse += 2;
11845     endbrace = strchr(RExC_parse, '}');
11846
11847     if (! endbrace) {
11848      vFAIL2("Missing right brace on \\%c{}", name);
11849     }
11850     /* XXX Need to decide whether to take spaces or not.  Should be
11851     * consistent with \p{}, but that currently is SPACE, which
11852     * means vertical too, which seems wrong
11853     * while (isBLANK(*RExC_parse)) {
11854      RExC_parse++;
11855     }*/
11856     if (endbrace == RExC_parse) {
11857      RExC_parse++;  /* After the '}' */
11858      vFAIL2("Empty \\%c{}", name);
11859     }
11860     length = endbrace - RExC_parse;
11861     /*while (isBLANK(*(RExC_parse + length - 1))) {
11862      length--;
11863     }*/
11864     switch (*RExC_parse) {
11865      case 'g':
11866       if (length != 1
11867        && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
11868       {
11869        goto bad_bound_type;
11870       }
11871       FLAGS(ret) = GCB_BOUND;
11872       break;
11873      case 's':
11874       if (length != 2 || *(RExC_parse + 1) != 'b') {
11875        goto bad_bound_type;
11876       }
11877       FLAGS(ret) = SB_BOUND;
11878       break;
11879      case 'w':
11880       if (length != 2 || *(RExC_parse + 1) != 'b') {
11881        goto bad_bound_type;
11882       }
11883       FLAGS(ret) = WB_BOUND;
11884       break;
11885      default:
11886      bad_bound_type:
11887       RExC_parse = endbrace;
11888       vFAIL2utf8f(
11889        "'%"UTF8f"' is an unknown bound type",
11890        UTF8fARG(UTF, length, endbrace - length));
11891       NOT_REACHED; /*NOTREACHED*/
11892     }
11893     RExC_parse = endbrace;
11894     RExC_uni_semantics = 1;
11895
11896     if (PASS2 && op >= BOUNDA) {  /* /aa is same as /a */
11897      OP(ret) = BOUNDU;
11898      length += 4;
11899
11900      /* Don't have to worry about UTF-8, in this message because
11901      * to get here the contents of the \b must be ASCII */
11902      ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
11903        "Using /u for '%.*s' instead of /%s",
11904        (unsigned) length,
11905        endbrace - length + 1,
11906        (charset == REGEX_ASCII_RESTRICTED_CHARSET)
11907        ? ASCII_RESTRICT_PAT_MODS
11908        : ASCII_MORE_RESTRICT_PAT_MODS);
11909     }
11910    }
11911
11912    if (PASS2 && invert) {
11913     OP(ret) += NBOUND - BOUND;
11914    }
11915    goto finish_meta_pat;
11916   }
11917
11918   case 'D':
11919    invert = 1;
11920    /* FALLTHROUGH */
11921   case 'd':
11922    arg = ANYOF_DIGIT;
11923    if (! DEPENDS_SEMANTICS) {
11924     goto join_posix;
11925    }
11926
11927    /* \d doesn't have any matches in the upper Latin1 range, hence /d
11928    * is equivalent to /u.  Changing to /u saves some branches at
11929    * runtime */
11930    op = POSIXU;
11931    goto join_posix_op_known;
11932
11933   case 'R':
11934    ret = reg_node(pRExC_state, LNBREAK);
11935    *flagp |= HASWIDTH|SIMPLE;
11936    goto finish_meta_pat;
11937
11938   case 'H':
11939    invert = 1;
11940    /* FALLTHROUGH */
11941   case 'h':
11942    arg = ANYOF_BLANK;
11943    op = POSIXU;
11944    goto join_posix_op_known;
11945
11946   case 'V':
11947    invert = 1;
11948    /* FALLTHROUGH */
11949   case 'v':
11950    arg = ANYOF_VERTWS;
11951    op = POSIXU;
11952    goto join_posix_op_known;
11953
11954   case 'S':
11955    invert = 1;
11956    /* FALLTHROUGH */
11957   case 's':
11958    arg = ANYOF_SPACE;
11959
11960   join_posix:
11961
11962    op = POSIXD + get_regex_charset(RExC_flags);
11963    if (op > POSIXA) {  /* /aa is same as /a */
11964     op = POSIXA;
11965    }
11966    else if (op == POSIXL) {
11967     RExC_contains_locale = 1;
11968    }
11969
11970   join_posix_op_known:
11971
11972    if (invert) {
11973     op += NPOSIXD - POSIXD;
11974    }
11975
11976    ret = reg_node(pRExC_state, op);
11977    if (! SIZE_ONLY) {
11978     FLAGS(ret) = namedclass_to_classnum(arg);
11979    }
11980
11981    *flagp |= HASWIDTH|SIMPLE;
11982    /* FALLTHROUGH */
11983
11984   finish_meta_pat:
11985    nextchar(pRExC_state);
11986    Set_Node_Length(ret, 2); /* MJD */
11987    break;
11988   case 'p':
11989   case 'P':
11990    {
11991 #ifdef DEBUGGING
11992     char* parse_start = RExC_parse - 2;
11993 #endif
11994
11995     RExC_parse--;
11996
11997     ret = regclass(pRExC_state, flagp,depth+1,
11998        TRUE, /* means just parse this element */
11999        FALSE, /* don't allow multi-char folds */
12000        FALSE, /* don't silence non-portable warnings.
12001           It would be a bug if these returned
12002           non-portables */
12003        (bool) RExC_strict,
12004        NULL);
12005     /* regclass() can only return RESTART_UTF8 if multi-char folds
12006     are allowed.  */
12007     if (!ret)
12008      FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12009       (UV) *flagp);
12010
12011     RExC_parse--;
12012
12013     Set_Node_Offset(ret, parse_start + 2);
12014     Set_Node_Cur_Length(ret, parse_start);
12015     nextchar(pRExC_state);
12016    }
12017    break;
12018   case 'N':
12019    /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
12020    * \N{...} evaluates to a sequence of more than one code points).
12021    * The function call below returns a regnode, which is our result.
12022    * The parameters cause it to fail if the \N{} evaluates to a
12023    * single code point; we handle those like any other literal.  The
12024    * reason that the multicharacter case is handled here and not as
12025    * part of the EXACtish code is because of quantifiers.  In
12026    * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
12027    * this way makes that Just Happen. dmq.
12028    * join_exact() will join this up with adjacent EXACTish nodes
12029    * later on, if appropriate. */
12030    ++RExC_parse;
12031    if (grok_bslash_N(pRExC_state,
12032        &ret,     /* Want a regnode returned */
12033        NULL,     /* Fail if evaluates to a single code
12034           point */
12035        NULL,     /* Don't need a count of how many code
12036           points */
12037        flagp,
12038        depth)
12039    ) {
12040     break;
12041    }
12042
12043    if (*flagp & RESTART_UTF8)
12044     return NULL;
12045    RExC_parse--;
12046    goto defchar;
12047
12048   case 'k':    /* Handle \k<NAME> and \k'NAME' */
12049  parse_named_seq:
12050   {
12051    char ch= RExC_parse[1];
12052    if (ch != '<' && ch != '\'' && ch != '{') {
12053     RExC_parse++;
12054     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12055     vFAIL2("Sequence %.2s... not terminated",parse_start);
12056    } else {
12057     /* this pretty much dupes the code for (?P=...) in reg(), if
12058     you change this make sure you change that */
12059     char* name_start = (RExC_parse += 2);
12060     U32 num = 0;
12061     SV *sv_dat = reg_scan_name(pRExC_state,
12062      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
12063     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
12064     if (RExC_parse == name_start || *RExC_parse != ch)
12065      /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12066      vFAIL2("Sequence %.3s... not terminated",parse_start);
12067
12068     if (!SIZE_ONLY) {
12069      num = add_data( pRExC_state, STR_WITH_LEN("S"));
12070      RExC_rxi->data->data[num]=(void*)sv_dat;
12071      SvREFCNT_inc_simple_void(sv_dat);
12072     }
12073
12074     RExC_sawback = 1;
12075     ret = reganode(pRExC_state,
12076        ((! FOLD)
12077         ? NREF
12078         : (ASCII_FOLD_RESTRICTED)
12079         ? NREFFA
12080         : (AT_LEAST_UNI_SEMANTICS)
12081          ? NREFFU
12082          : (LOC)
12083          ? NREFFL
12084          : NREFF),
12085         num);
12086     *flagp |= HASWIDTH;
12087
12088     /* override incorrect value set in reganode MJD */
12089     Set_Node_Offset(ret, parse_start+1);
12090     Set_Node_Cur_Length(ret, parse_start);
12091     nextchar(pRExC_state);
12092
12093    }
12094    break;
12095   }
12096   case 'g':
12097   case '1': case '2': case '3': case '4':
12098   case '5': case '6': case '7': case '8': case '9':
12099    {
12100     I32 num;
12101     bool hasbrace = 0;
12102
12103     if (*RExC_parse == 'g') {
12104      bool isrel = 0;
12105
12106      RExC_parse++;
12107      if (*RExC_parse == '{') {
12108       RExC_parse++;
12109       hasbrace = 1;
12110      }
12111      if (*RExC_parse == '-') {
12112       RExC_parse++;
12113       isrel = 1;
12114      }
12115      if (hasbrace && !isDIGIT(*RExC_parse)) {
12116       if (isrel) RExC_parse--;
12117       RExC_parse -= 2;
12118       goto parse_named_seq;
12119      }
12120
12121      num = S_backref_value(RExC_parse);
12122      if (num == 0)
12123       vFAIL("Reference to invalid group 0");
12124      else if (num == I32_MAX) {
12125       if (isDIGIT(*RExC_parse))
12126        vFAIL("Reference to nonexistent group");
12127       else
12128        vFAIL("Unterminated \\g... pattern");
12129      }
12130
12131      if (isrel) {
12132       num = RExC_npar - num;
12133       if (num < 1)
12134        vFAIL("Reference to nonexistent or unclosed group");
12135      }
12136     }
12137     else {
12138      num = S_backref_value(RExC_parse);
12139      /* bare \NNN might be backref or octal - if it is larger
12140      * than or equal RExC_npar then it is assumed to be an
12141      * octal escape. Note RExC_npar is +1 from the actual
12142      * number of parens. */
12143      /* Note we do NOT check if num == I32_MAX here, as that is
12144      * handled by the RExC_npar check */
12145
12146      if (
12147       /* any numeric escape < 10 is always a backref */
12148       num > 9
12149       /* any numeric escape < RExC_npar is a backref */
12150       && num >= RExC_npar
12151       /* cannot be an octal escape if it starts with 8 */
12152       && *RExC_parse != '8'
12153       /* cannot be an octal escape it it starts with 9 */
12154       && *RExC_parse != '9'
12155      )
12156      {
12157       /* Probably not a backref, instead likely to be an
12158       * octal character escape, e.g. \35 or \777.
12159       * The above logic should make it obvious why using
12160       * octal escapes in patterns is problematic. - Yves */
12161       goto defchar;
12162      }
12163     }
12164
12165     /* At this point RExC_parse points at a numeric escape like
12166     * \12 or \88 or something similar, which we should NOT treat
12167     * as an octal escape. It may or may not be a valid backref
12168     * escape. For instance \88888888 is unlikely to be a valid
12169     * backref. */
12170     {
12171 #ifdef RE_TRACK_PATTERN_OFFSETS
12172      char * const parse_start = RExC_parse - 1; /* MJD */
12173 #endif
12174      while (isDIGIT(*RExC_parse))
12175       RExC_parse++;
12176      if (hasbrace) {
12177       if (*RExC_parse != '}')
12178        vFAIL("Unterminated \\g{...} pattern");
12179       RExC_parse++;
12180      }
12181      if (!SIZE_ONLY) {
12182       if (num > (I32)RExC_rx->nparens)
12183        vFAIL("Reference to nonexistent group");
12184      }
12185      RExC_sawback = 1;
12186      ret = reganode(pRExC_state,
12187         ((! FOLD)
12188          ? REF
12189          : (ASCII_FOLD_RESTRICTED)
12190          ? REFFA
12191          : (AT_LEAST_UNI_SEMANTICS)
12192           ? REFFU
12193           : (LOC)
12194           ? REFFL
12195           : REFF),
12196          num);
12197      *flagp |= HASWIDTH;
12198
12199      /* override incorrect value set in reganode MJD */
12200      Set_Node_Offset(ret, parse_start+1);
12201      Set_Node_Cur_Length(ret, parse_start);
12202      RExC_parse--;
12203      nextchar(pRExC_state);
12204     }
12205    }
12206    break;
12207   case '\0':
12208    if (RExC_parse >= RExC_end)
12209     FAIL("Trailing \\");
12210    /* FALLTHROUGH */
12211   default:
12212    /* Do not generate "unrecognized" warnings here, we fall
12213    back into the quick-grab loop below */
12214    parse_start--;
12215    goto defchar;
12216   }
12217   break;
12218
12219  case '#':
12220   if (RExC_flags & RXf_PMf_EXTENDED) {
12221    RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12222    if (RExC_parse < RExC_end)
12223     goto tryagain;
12224   }
12225   /* FALLTHROUGH */
12226
12227  default:
12228
12229    parse_start = RExC_parse - 1;
12230
12231    RExC_parse++;
12232
12233   defchar: {
12234    STRLEN len = 0;
12235    UV ender = 0;
12236    char *p;
12237    char *s;
12238 #define MAX_NODE_STRING_SIZE 127
12239    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12240    char *s0;
12241    U8 upper_parse = MAX_NODE_STRING_SIZE;
12242    U8 node_type = compute_EXACTish(pRExC_state);
12243    bool next_is_quantifier;
12244    char * oldp = NULL;
12245
12246    /* We can convert EXACTF nodes to EXACTFU if they contain only
12247    * characters that match identically regardless of the target
12248    * string's UTF8ness.  The reason to do this is that EXACTF is not
12249    * trie-able, EXACTFU is.
12250    *
12251    * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12252    * contain only above-Latin1 characters (hence must be in UTF8),
12253    * which don't participate in folds with Latin1-range characters,
12254    * as the latter's folds aren't known until runtime.  (We don't
12255    * need to figure this out until pass 2) */
12256    bool maybe_exactfu = PASS2
12257        && (node_type == EXACTF || node_type == EXACTFL);
12258
12259    /* If a folding node contains only code points that don't
12260    * participate in folds, it can be changed into an EXACT node,
12261    * which allows the optimizer more things to look for */
12262    bool maybe_exact;
12263
12264    ret = reg_node(pRExC_state, node_type);
12265
12266    /* In pass1, folded, we use a temporary buffer instead of the
12267    * actual node, as the node doesn't exist yet */
12268    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12269
12270    s0 = s;
12271
12272   reparse:
12273
12274    /* We do the EXACTFish to EXACT node only if folding.  (And we
12275    * don't need to figure this out until pass 2) */
12276    maybe_exact = FOLD && PASS2;
12277
12278    /* XXX The node can hold up to 255 bytes, yet this only goes to
12279    * 127.  I (khw) do not know why.  Keeping it somewhat less than
12280    * 255 allows us to not have to worry about overflow due to
12281    * converting to utf8 and fold expansion, but that value is
12282    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
12283    * split up by this limit into a single one using the real max of
12284    * 255.  Even at 127, this breaks under rare circumstances.  If
12285    * folding, we do not want to split a node at a character that is a
12286    * non-final in a multi-char fold, as an input string could just
12287    * happen to want to match across the node boundary.  The join
12288    * would solve that problem if the join actually happens.  But a
12289    * series of more than two nodes in a row each of 127 would cause
12290    * the first join to succeed to get to 254, but then there wouldn't
12291    * be room for the next one, which could at be one of those split
12292    * multi-char folds.  I don't know of any fool-proof solution.  One
12293    * could back off to end with only a code point that isn't such a
12294    * non-final, but it is possible for there not to be any in the
12295    * entire node. */
12296    for (p = RExC_parse - 1;
12297     len < upper_parse && p < RExC_end;
12298     len++)
12299    {
12300     oldp = p;
12301
12302     if (RExC_flags & RXf_PMf_EXTENDED)
12303      p = regpatws(pRExC_state, p,
12304           TRUE); /* means recognize comments */
12305     switch ((U8)*p) {
12306     case '^':
12307     case '$':
12308     case '.':
12309     case '[':
12310     case '(':
12311     case ')':
12312     case '|':
12313      goto loopdone;
12314     case '\\':
12315      /* Literal Escapes Switch
12316
12317      This switch is meant to handle escape sequences that
12318      resolve to a literal character.
12319
12320      Every escape sequence that represents something
12321      else, like an assertion or a char class, is handled
12322      in the switch marked 'Special Escapes' above in this
12323      routine, but also has an entry here as anything that
12324      isn't explicitly mentioned here will be treated as
12325      an unescaped equivalent literal.
12326      */
12327
12328      switch ((U8)*++p) {
12329      /* These are all the special escapes. */
12330      case 'A':             /* Start assertion */
12331      case 'b': case 'B':   /* Word-boundary assertion*/
12332      case 'C':             /* Single char !DANGEROUS! */
12333      case 'd': case 'D':   /* digit class */
12334      case 'g': case 'G':   /* generic-backref, pos assertion */
12335      case 'h': case 'H':   /* HORIZWS */
12336      case 'k': case 'K':   /* named backref, keep marker */
12337      case 'p': case 'P':   /* Unicode property */
12338        case 'R':   /* LNBREAK */
12339      case 's': case 'S':   /* space class */
12340      case 'v': case 'V':   /* VERTWS */
12341      case 'w': case 'W':   /* word class */
12342      case 'X':             /* eXtended Unicode "combining
12343            character sequence" */
12344      case 'z': case 'Z':   /* End of line/string assertion */
12345       --p;
12346       goto loopdone;
12347
12348      /* Anything after here is an escape that resolves to a
12349      literal. (Except digits, which may or may not)
12350      */
12351      case 'n':
12352       ender = '\n';
12353       p++;
12354       break;
12355      case 'N': /* Handle a single-code point named character. */
12356       RExC_parse = p + 1;
12357       if (! grok_bslash_N(pRExC_state,
12358            NULL,   /* Fail if evaluates to
12359              anything other than a
12360              single code point */
12361            &ender, /* The returned single code
12362              point */
12363            NULL,   /* Don't need a count of
12364              how many code points */
12365            flagp,
12366            depth)
12367       ) {
12368        if (*flagp & RESTART_UTF8)
12369         FAIL("panic: grok_bslash_N set RESTART_UTF8");
12370
12371        /* Here, it wasn't a single code point.  Go close
12372        * up this EXACTish node.  The switch() prior to
12373        * this switch handles the other cases */
12374        RExC_parse = p = oldp;
12375        goto loopdone;
12376       }
12377       p = RExC_parse;
12378       if (ender > 0xff) {
12379        REQUIRE_UTF8;
12380       }
12381       break;
12382      case 'r':
12383       ender = '\r';
12384       p++;
12385       break;
12386      case 't':
12387       ender = '\t';
12388       p++;
12389       break;
12390      case 'f':
12391       ender = '\f';
12392       p++;
12393       break;
12394      case 'e':
12395       ender = ESC_NATIVE;
12396       p++;
12397       break;
12398      case 'a':
12399       ender = '\a';
12400       p++;
12401       break;
12402      case 'o':
12403       {
12404        UV result;
12405        const char* error_msg;
12406
12407        bool valid = grok_bslash_o(&p,
12408              &result,
12409              &error_msg,
12410              PASS2, /* out warnings */
12411              (bool) RExC_strict,
12412              TRUE, /* Output warnings
12413                 for non-
12414                 portables */
12415              UTF);
12416        if (! valid) {
12417         RExC_parse = p; /* going to die anyway; point
12418             to exact spot of failure */
12419         vFAIL(error_msg);
12420        }
12421        ender = result;
12422        if (IN_ENCODING && ender < 0x100) {
12423         goto recode_encoding;
12424        }
12425        if (ender > 0xff) {
12426         REQUIRE_UTF8;
12427        }
12428        break;
12429       }
12430      case 'x':
12431       {
12432        UV result = UV_MAX; /* initialize to erroneous
12433             value */
12434        const char* error_msg;
12435
12436        bool valid = grok_bslash_x(&p,
12437              &result,
12438              &error_msg,
12439              PASS2, /* out warnings */
12440              (bool) RExC_strict,
12441              TRUE, /* Silence warnings
12442                 for non-
12443                 portables */
12444              UTF);
12445        if (! valid) {
12446         RExC_parse = p; /* going to die anyway; point
12447             to exact spot of failure */
12448         vFAIL(error_msg);
12449        }
12450        ender = result;
12451
12452        if (ender < 0x100) {
12453 #ifdef EBCDIC
12454         if (RExC_recode_x_to_native) {
12455          ender = LATIN1_TO_NATIVE(ender);
12456         }
12457         else
12458 #endif
12459         if (IN_ENCODING) {
12460          goto recode_encoding;
12461         }
12462        }
12463        else {
12464         REQUIRE_UTF8;
12465        }
12466        break;
12467       }
12468      case 'c':
12469       p++;
12470       ender = grok_bslash_c(*p++, PASS2);
12471       break;
12472      case '8': case '9': /* must be a backreference */
12473       --p;
12474       /* we have an escape like \8 which cannot be an octal escape
12475       * so we exit the loop, and let the outer loop handle this
12476       * escape which may or may not be a legitimate backref. */
12477       goto loopdone;
12478      case '1': case '2': case '3':case '4':
12479      case '5': case '6': case '7':
12480       /* When we parse backslash escapes there is ambiguity
12481       * between backreferences and octal escapes. Any escape
12482       * from \1 - \9 is a backreference, any multi-digit
12483       * escape which does not start with 0 and which when
12484       * evaluated as decimal could refer to an already
12485       * parsed capture buffer is a back reference. Anything
12486       * else is octal.
12487       *
12488       * Note this implies that \118 could be interpreted as
12489       * 118 OR as "\11" . "8" depending on whether there
12490       * were 118 capture buffers defined already in the
12491       * pattern.  */
12492
12493       /* NOTE, RExC_npar is 1 more than the actual number of
12494       * parens we have seen so far, hence the < RExC_npar below. */
12495
12496       if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12497       {  /* Not to be treated as an octal constant, go
12498         find backref */
12499        --p;
12500        goto loopdone;
12501       }
12502       /* FALLTHROUGH */
12503      case '0':
12504       {
12505        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12506        STRLEN numlen = 3;
12507        ender = grok_oct(p, &numlen, &flags, NULL);
12508        if (ender > 0xff) {
12509         REQUIRE_UTF8;
12510        }
12511        p += numlen;
12512        if (PASS2   /* like \08, \178 */
12513         && numlen < 3
12514         && p < RExC_end
12515         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12516        {
12517         reg_warn_non_literal_string(
12518           p + 1,
12519           form_short_octal_warning(p, numlen));
12520        }
12521       }
12522       if (IN_ENCODING && ender < 0x100)
12523        goto recode_encoding;
12524       break;
12525      recode_encoding:
12526       if (! RExC_override_recoding) {
12527        SV* enc = _get_encoding();
12528        ender = reg_recode((const char)(U8)ender, &enc);
12529        if (!enc && PASS2)
12530         ckWARNreg(p, "Invalid escape in the specified encoding");
12531        REQUIRE_UTF8;
12532       }
12533       break;
12534      case '\0':
12535       if (p >= RExC_end)
12536        FAIL("Trailing \\");
12537       /* FALLTHROUGH */
12538      default:
12539       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12540        /* Include any { following the alpha to emphasize
12541        * that it could be part of an escape at some point
12542        * in the future */
12543        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12544        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12545       }
12546       goto normal_default;
12547      } /* End of switch on '\' */
12548      break;
12549     case '{':
12550      /* Currently we don't warn when the lbrace is at the start
12551      * of a construct.  This catches it in the middle of a
12552      * literal string, or when its the first thing after
12553      * something like "\b" */
12554      if (! SIZE_ONLY
12555       && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12556      {
12557       ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12558      }
12559      /*FALLTHROUGH*/
12560     default:    /* A literal character */
12561     normal_default:
12562      if (UTF8_IS_START(*p) && UTF) {
12563       STRLEN numlen;
12564       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12565            &numlen, UTF8_ALLOW_DEFAULT);
12566       p += numlen;
12567      }
12568      else
12569       ender = (U8) *p++;
12570      break;
12571     } /* End of switch on the literal */
12572
12573     /* Here, have looked at the literal character and <ender>
12574     * contains its ordinal, <p> points to the character after it
12575     */
12576
12577     if ( RExC_flags & RXf_PMf_EXTENDED)
12578      p = regpatws(pRExC_state, p,
12579           TRUE); /* means recognize comments */
12580
12581     /* If the next thing is a quantifier, it applies to this
12582     * character only, which means that this character has to be in
12583     * its own node and can't just be appended to the string in an
12584     * existing node, so if there are already other characters in
12585     * the node, close the node with just them, and set up to do
12586     * this character again next time through, when it will be the
12587     * only thing in its new node */
12588     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12589     {
12590      p = oldp;
12591      goto loopdone;
12592     }
12593
12594     if (! FOLD) {  /* The simple case, just append the literal */
12595
12596      /* In the sizing pass, we need only the size of the
12597      * character we are appending, hence we can delay getting
12598      * its representation until PASS2. */
12599      if (SIZE_ONLY) {
12600       if (UTF) {
12601        const STRLEN unilen = UNISKIP(ender);
12602        s += unilen;
12603
12604        /* We have to subtract 1 just below (and again in
12605        * the corresponding PASS2 code) because the loop
12606        * increments <len> each time, as all but this path
12607        * (and one other) through it add a single byte to
12608        * the EXACTish node.  But these paths would change
12609        * len to be the correct final value, so cancel out
12610        * the increment that follows */
12611        len += unilen - 1;
12612       }
12613       else {
12614        s++;
12615       }
12616      } else { /* PASS2 */
12617      not_fold_common:
12618       if (UTF) {
12619        U8 * new_s = uvchr_to_utf8((U8*)s, ender);
12620        len += (char *) new_s - s - 1;
12621        s = (char *) new_s;
12622       }
12623       else {
12624        *(s++) = (char) ender;
12625       }
12626      }
12627     }
12628     else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
12629
12630      /* Here are folding under /l, and the code point is
12631      * problematic.  First, we know we can't simplify things */
12632      maybe_exact = FALSE;
12633      maybe_exactfu = FALSE;
12634
12635      /* A problematic code point in this context means that its
12636      * fold isn't known until runtime, so we can't fold it now.
12637      * (The non-problematic code points are the above-Latin1
12638      * ones that fold to also all above-Latin1.  Their folds
12639      * don't vary no matter what the locale is.) But here we
12640      * have characters whose fold depends on the locale.
12641      * Unlike the non-folding case above, we have to keep track
12642      * of these in the sizing pass, so that we can make sure we
12643      * don't split too-long nodes in the middle of a potential
12644      * multi-char fold.  And unlike the regular fold case
12645      * handled in the else clauses below, we don't actually
12646      * fold and don't have special cases to consider.  What we
12647      * do for both passes is the PASS2 code for non-folding */
12648      goto not_fold_common;
12649     }
12650     else /* A regular FOLD code point */
12651      if (! ( UTF
12652       /* See comments for join_exact() as to why we fold this
12653       * non-UTF at compile time */
12654       || (node_type == EXACTFU
12655        && ender == LATIN_SMALL_LETTER_SHARP_S)))
12656     {
12657      /* Here, are folding and are not UTF-8 encoded; therefore
12658      * the character must be in the range 0-255, and is not /l
12659      * (Not /l because we already handled these under /l in
12660      * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12661      if (IS_IN_SOME_FOLD_L1(ender)) {
12662       maybe_exact = FALSE;
12663
12664       /* See if the character's fold differs between /d and
12665       * /u.  This includes the multi-char fold SHARP S to
12666       * 'ss' */
12667       if (maybe_exactfu
12668        && (PL_fold[ender] != PL_fold_latin1[ender]
12669         || ender == LATIN_SMALL_LETTER_SHARP_S
12670         || (len > 0
12671         && isALPHA_FOLD_EQ(ender, 's')
12672         && isALPHA_FOLD_EQ(*(s-1), 's'))))
12673       {
12674        maybe_exactfu = FALSE;
12675       }
12676      }
12677
12678      /* Even when folding, we store just the input character, as
12679      * we have an array that finds its fold quickly */
12680      *(s++) = (char) ender;
12681     }
12682     else {  /* FOLD and UTF */
12683      /* Unlike the non-fold case, we do actually have to
12684      * calculate the results here in pass 1.  This is for two
12685      * reasons, the folded length may be longer than the
12686      * unfolded, and we have to calculate how many EXACTish
12687      * nodes it will take; and we may run out of room in a node
12688      * in the middle of a potential multi-char fold, and have
12689      * to back off accordingly.  */
12690
12691      UV folded;
12692      if (isASCII_uni(ender)) {
12693       folded = toFOLD(ender);
12694       *(s)++ = (U8) folded;
12695      }
12696      else {
12697       STRLEN foldlen;
12698
12699       folded = _to_uni_fold_flags(
12700          ender,
12701          (U8 *) s,
12702          &foldlen,
12703          FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12704               ? FOLD_FLAGS_NOMIX_ASCII
12705               : 0));
12706       s += foldlen;
12707
12708       /* The loop increments <len> each time, as all but this
12709       * path (and one other) through it add a single byte to
12710       * the EXACTish node.  But this one has changed len to
12711       * be the correct final value, so subtract one to
12712       * cancel out the increment that follows */
12713       len += foldlen - 1;
12714      }
12715      /* If this node only contains non-folding code points so
12716      * far, see if this new one is also non-folding */
12717      if (maybe_exact) {
12718       if (folded != ender) {
12719        maybe_exact = FALSE;
12720       }
12721       else {
12722        /* Here the fold is the original; we have to check
12723        * further to see if anything folds to it */
12724        if (_invlist_contains_cp(PL_utf8_foldable,
12725               ender))
12726        {
12727         maybe_exact = FALSE;
12728        }
12729       }
12730      }
12731      ender = folded;
12732     }
12733
12734     if (next_is_quantifier) {
12735
12736      /* Here, the next input is a quantifier, and to get here,
12737      * the current character is the only one in the node.
12738      * Also, here <len> doesn't include the final byte for this
12739      * character */
12740      len++;
12741      goto loopdone;
12742     }
12743
12744    } /* End of loop through literal characters */
12745
12746    /* Here we have either exhausted the input or ran out of room in
12747    * the node.  (If we encountered a character that can't be in the
12748    * node, transfer is made directly to <loopdone>, and so we
12749    * wouldn't have fallen off the end of the loop.)  In the latter
12750    * case, we artificially have to split the node into two, because
12751    * we just don't have enough space to hold everything.  This
12752    * creates a problem if the final character participates in a
12753    * multi-character fold in the non-final position, as a match that
12754    * should have occurred won't, due to the way nodes are matched,
12755    * and our artificial boundary.  So back off until we find a non-
12756    * problematic character -- one that isn't at the beginning or
12757    * middle of such a fold.  (Either it doesn't participate in any
12758    * folds, or appears only in the final position of all the folds it
12759    * does participate in.)  A better solution with far fewer false
12760    * positives, and that would fill the nodes more completely, would
12761    * be to actually have available all the multi-character folds to
12762    * test against, and to back-off only far enough to be sure that
12763    * this node isn't ending with a partial one.  <upper_parse> is set
12764    * further below (if we need to reparse the node) to include just
12765    * up through that final non-problematic character that this code
12766    * identifies, so when it is set to less than the full node, we can
12767    * skip the rest of this */
12768    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12769
12770     const STRLEN full_len = len;
12771
12772     assert(len >= MAX_NODE_STRING_SIZE);
12773
12774     /* Here, <s> points to the final byte of the final character.
12775     * Look backwards through the string until find a non-
12776     * problematic character */
12777
12778     if (! UTF) {
12779
12780      /* This has no multi-char folds to non-UTF characters */
12781      if (ASCII_FOLD_RESTRICTED) {
12782       goto loopdone;
12783      }
12784
12785      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12786      len = s - s0 + 1;
12787     }
12788     else {
12789      if (!  PL_NonL1NonFinalFold) {
12790       PL_NonL1NonFinalFold = _new_invlist_C_array(
12791           NonL1_Perl_Non_Final_Folds_invlist);
12792      }
12793
12794      /* Point to the first byte of the final character */
12795      s = (char *) utf8_hop((U8 *) s, -1);
12796
12797      while (s >= s0) {   /* Search backwards until find
12798           non-problematic char */
12799       if (UTF8_IS_INVARIANT(*s)) {
12800
12801        /* There are no ascii characters that participate
12802        * in multi-char folds under /aa.  In EBCDIC, the
12803        * non-ascii invariants are all control characters,
12804        * so don't ever participate in any folds. */
12805        if (ASCII_FOLD_RESTRICTED
12806         || ! IS_NON_FINAL_FOLD(*s))
12807        {
12808         break;
12809        }
12810       }
12811       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12812        if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12813                 *s, *(s+1))))
12814        {
12815         break;
12816        }
12817       }
12818       else if (! _invlist_contains_cp(
12819           PL_NonL1NonFinalFold,
12820           valid_utf8_to_uvchr((U8 *) s, NULL)))
12821       {
12822        break;
12823       }
12824
12825       /* Here, the current character is problematic in that
12826       * it does occur in the non-final position of some
12827       * fold, so try the character before it, but have to
12828       * special case the very first byte in the string, so
12829       * we don't read outside the string */
12830       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12831      } /* End of loop backwards through the string */
12832
12833      /* If there were only problematic characters in the string,
12834      * <s> will point to before s0, in which case the length
12835      * should be 0, otherwise include the length of the
12836      * non-problematic character just found */
12837      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12838     }
12839
12840     /* Here, have found the final character, if any, that is
12841     * non-problematic as far as ending the node without splitting
12842     * it across a potential multi-char fold.  <len> contains the
12843     * number of bytes in the node up-to and including that
12844     * character, or is 0 if there is no such character, meaning
12845     * the whole node contains only problematic characters.  In
12846     * this case, give up and just take the node as-is.  We can't
12847     * do any better */
12848     if (len == 0) {
12849      len = full_len;
12850
12851      /* If the node ends in an 's' we make sure it stays EXACTF,
12852      * as if it turns into an EXACTFU, it could later get
12853      * joined with another 's' that would then wrongly match
12854      * the sharp s */
12855      if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12856      {
12857       maybe_exactfu = FALSE;
12858      }
12859     } else {
12860
12861      /* Here, the node does contain some characters that aren't
12862      * problematic.  If one such is the final character in the
12863      * node, we are done */
12864      if (len == full_len) {
12865       goto loopdone;
12866      }
12867      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12868
12869       /* If the final character is problematic, but the
12870       * penultimate is not, back-off that last character to
12871       * later start a new node with it */
12872       p = oldp;
12873       goto loopdone;
12874      }
12875
12876      /* Here, the final non-problematic character is earlier
12877      * in the input than the penultimate character.  What we do
12878      * is reparse from the beginning, going up only as far as
12879      * this final ok one, thus guaranteeing that the node ends
12880      * in an acceptable character.  The reason we reparse is
12881      * that we know how far in the character is, but we don't
12882      * know how to correlate its position with the input parse.
12883      * An alternate implementation would be to build that
12884      * correlation as we go along during the original parse,
12885      * but that would entail extra work for every node, whereas
12886      * this code gets executed only when the string is too
12887      * large for the node, and the final two characters are
12888      * problematic, an infrequent occurrence.  Yet another
12889      * possible strategy would be to save the tail of the
12890      * string, and the next time regatom is called, initialize
12891      * with that.  The problem with this is that unless you
12892      * back off one more character, you won't be guaranteed
12893      * regatom will get called again, unless regbranch,
12894      * regpiece ... are also changed.  If you do back off that
12895      * extra character, so that there is input guaranteed to
12896      * force calling regatom, you can't handle the case where
12897      * just the first character in the node is acceptable.  I
12898      * (khw) decided to try this method which doesn't have that
12899      * pitfall; if performance issues are found, we can do a
12900      * combination of the current approach plus that one */
12901      upper_parse = len;
12902      len = 0;
12903      s = s0;
12904      goto reparse;
12905     }
12906    }   /* End of verifying node ends with an appropriate char */
12907
12908   loopdone:   /* Jumped to when encounters something that shouldn't be
12909       in the node */
12910
12911    /* I (khw) don't know if you can get here with zero length, but the
12912    * old code handled this situation by creating a zero-length EXACT
12913    * node.  Might as well be NOTHING instead */
12914    if (len == 0) {
12915     OP(ret) = NOTHING;
12916    }
12917    else {
12918     if (FOLD) {
12919      /* If 'maybe_exact' is still set here, means there are no
12920      * code points in the node that participate in folds;
12921      * similarly for 'maybe_exactfu' and code points that match
12922      * differently depending on UTF8ness of the target string
12923      * (for /u), or depending on locale for /l */
12924      if (maybe_exact) {
12925       OP(ret) = (LOC)
12926         ? EXACTL
12927         : EXACT;
12928      }
12929      else if (maybe_exactfu) {
12930       OP(ret) = (LOC)
12931         ? EXACTFLU8
12932         : EXACTFU;
12933      }
12934     }
12935     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12936           FALSE /* Don't look to see if could
12937              be turned into an EXACT
12938              node, as we have already
12939              computed that */
12940           );
12941    }
12942
12943    RExC_parse = p - 1;
12944    Set_Node_Cur_Length(ret, parse_start);
12945    nextchar(pRExC_state);
12946    {
12947     /* len is STRLEN which is unsigned, need to copy to signed */
12948     IV iv = len;
12949     if (iv < 0)
12950      vFAIL("Internal disaster");
12951    }
12952
12953   } /* End of label 'defchar:' */
12954   break;
12955  } /* End of giant switch on input character */
12956
12957  return(ret);
12958 }
12959
12960 STATIC char *
12961 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12962 {
12963  /* Returns the next non-pattern-white space, non-comment character (the
12964  * latter only if 'recognize_comment is true) in the string p, which is
12965  * ended by RExC_end.  See also reg_skipcomment */
12966  const char *e = RExC_end;
12967
12968  PERL_ARGS_ASSERT_REGPATWS;
12969
12970  while (p < e) {
12971   STRLEN len;
12972   if ((len = is_PATWS_safe(p, e, UTF))) {
12973    p += len;
12974   }
12975   else if (recognize_comment && *p == '#') {
12976    p = reg_skipcomment(pRExC_state, p);
12977   }
12978   else
12979    break;
12980  }
12981  return p;
12982 }
12983
12984 STATIC void
12985 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12986 {
12987  /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12988  * sets up the bitmap and any flags, removing those code points from the
12989  * inversion list, setting it to NULL should it become completely empty */
12990
12991  PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12992  assert(PL_regkind[OP(node)] == ANYOF);
12993
12994  ANYOF_BITMAP_ZERO(node);
12995  if (*invlist_ptr) {
12996
12997   /* This gets set if we actually need to modify things */
12998   bool change_invlist = FALSE;
12999
13000   UV start, end;
13001
13002   /* Start looking through *invlist_ptr */
13003   invlist_iterinit(*invlist_ptr);
13004   while (invlist_iternext(*invlist_ptr, &start, &end)) {
13005    UV high;
13006    int i;
13007
13008    if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
13009     ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
13010    }
13011    else if (end >= NUM_ANYOF_CODE_POINTS) {
13012     ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
13013    }
13014
13015    /* Quit if are above what we should change */
13016    if (start >= NUM_ANYOF_CODE_POINTS) {
13017     break;
13018    }
13019
13020    change_invlist = TRUE;
13021
13022    /* Set all the bits in the range, up to the max that we are doing */
13023    high = (end < NUM_ANYOF_CODE_POINTS - 1)
13024     ? end
13025     : NUM_ANYOF_CODE_POINTS - 1;
13026    for (i = start; i <= (int) high; i++) {
13027     if (! ANYOF_BITMAP_TEST(node, i)) {
13028      ANYOF_BITMAP_SET(node, i);
13029     }
13030    }
13031   }
13032   invlist_iterfinish(*invlist_ptr);
13033
13034   /* Done with loop; remove any code points that are in the bitmap from
13035   * *invlist_ptr; similarly for code points above the bitmap if we have
13036   * a flag to match all of them anyways */
13037   if (change_invlist) {
13038    _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
13039   }
13040   if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
13041    _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
13042   }
13043
13044   /* If have completely emptied it, remove it completely */
13045   if (_invlist_len(*invlist_ptr) == 0) {
13046    SvREFCNT_dec_NN(*invlist_ptr);
13047    *invlist_ptr = NULL;
13048   }
13049  }
13050 }
13051
13052 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
13053    Character classes ([:foo:]) can also be negated ([:^foo:]).
13054    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
13055    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
13056    but trigger failures because they are currently unimplemented. */
13057
13058 #define POSIXCC_DONE(c)   ((c) == ':')
13059 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
13060 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
13061
13062 PERL_STATIC_INLINE I32
13063 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
13064 {
13065  I32 namedclass = OOB_NAMEDCLASS;
13066
13067  PERL_ARGS_ASSERT_REGPPOSIXCC;
13068
13069  if (value == '[' && RExC_parse + 1 < RExC_end &&
13070   /* I smell either [: or [= or [. -- POSIX has been here, right? */
13071   POSIXCC(UCHARAT(RExC_parse)))
13072  {
13073   const char c = UCHARAT(RExC_parse);
13074   char* const s = RExC_parse++;
13075
13076   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
13077    RExC_parse++;
13078   if (RExC_parse == RExC_end) {
13079    if (strict) {
13080
13081     /* Try to give a better location for the error (than the end of
13082     * the string) by looking for the matching ']' */
13083     RExC_parse = s;
13084     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
13085      RExC_parse++;
13086     }
13087     vFAIL2("Unmatched '%c' in POSIX class", c);
13088    }
13089    /* Grandfather lone [:, [=, [. */
13090    RExC_parse = s;
13091   }
13092   else {
13093    const char* const t = RExC_parse++; /* skip over the c */
13094    assert(*t == c);
13095
13096    if (UCHARAT(RExC_parse) == ']') {
13097     const char *posixcc = s + 1;
13098     RExC_parse++; /* skip over the ending ] */
13099
13100     if (*s == ':') {
13101      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
13102      const I32 skip = t - posixcc;
13103
13104      /* Initially switch on the length of the name.  */
13105      switch (skip) {
13106      case 4:
13107       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
13108               this is the Perl \w
13109               */
13110        namedclass = ANYOF_WORDCHAR;
13111       break;
13112      case 5:
13113       /* Names all of length 5.  */
13114       /* alnum alpha ascii blank cntrl digit graph lower
13115       print punct space upper  */
13116       /* Offset 4 gives the best switch position.  */
13117       switch (posixcc[4]) {
13118       case 'a':
13119        if (memEQ(posixcc, "alph", 4)) /* alpha */
13120         namedclass = ANYOF_ALPHA;
13121        break;
13122       case 'e':
13123        if (memEQ(posixcc, "spac", 4)) /* space */
13124         namedclass = ANYOF_SPACE;
13125        break;
13126       case 'h':
13127        if (memEQ(posixcc, "grap", 4)) /* graph */
13128         namedclass = ANYOF_GRAPH;
13129        break;
13130       case 'i':
13131        if (memEQ(posixcc, "asci", 4)) /* ascii */
13132         namedclass = ANYOF_ASCII;
13133        break;
13134       case 'k':
13135        if (memEQ(posixcc, "blan", 4)) /* blank */
13136         namedclass = ANYOF_BLANK;
13137        break;
13138       case 'l':
13139        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
13140         namedclass = ANYOF_CNTRL;
13141        break;
13142       case 'm':
13143        if (memEQ(posixcc, "alnu", 4)) /* alnum */
13144         namedclass = ANYOF_ALPHANUMERIC;
13145        break;
13146       case 'r':
13147        if (memEQ(posixcc, "lowe", 4)) /* lower */
13148         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
13149        else if (memEQ(posixcc, "uppe", 4)) /* upper */
13150         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
13151        break;
13152       case 't':
13153        if (memEQ(posixcc, "digi", 4)) /* digit */
13154         namedclass = ANYOF_DIGIT;
13155        else if (memEQ(posixcc, "prin", 4)) /* print */
13156         namedclass = ANYOF_PRINT;
13157        else if (memEQ(posixcc, "punc", 4)) /* punct */
13158         namedclass = ANYOF_PUNCT;
13159        break;
13160       }
13161       break;
13162      case 6:
13163       if (memEQ(posixcc, "xdigit", 6))
13164        namedclass = ANYOF_XDIGIT;
13165       break;
13166      }
13167
13168      if (namedclass == OOB_NAMEDCLASS)
13169       vFAIL2utf8f(
13170        "POSIX class [:%"UTF8f":] unknown",
13171        UTF8fARG(UTF, t - s - 1, s + 1));
13172
13173      /* The #defines are structured so each complement is +1 to
13174      * the normal one */
13175      if (complement) {
13176       namedclass++;
13177      }
13178      assert (posixcc[skip] == ':');
13179      assert (posixcc[skip+1] == ']');
13180     } else if (!SIZE_ONLY) {
13181      /* [[=foo=]] and [[.foo.]] are still future. */
13182
13183      /* adjust RExC_parse so the warning shows after
13184      the class closes */
13185      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
13186       RExC_parse++;
13187      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
13188     }
13189    } else {
13190     /* Maternal grandfather:
13191     * "[:" ending in ":" but not in ":]" */
13192     if (strict) {
13193      vFAIL("Unmatched '[' in POSIX class");
13194     }
13195
13196     /* Grandfather lone [:, [=, [. */
13197     RExC_parse = s;
13198    }
13199   }
13200  }
13201
13202  return namedclass;
13203 }
13204
13205 STATIC bool
13206 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
13207 {
13208  /* This applies some heuristics at the current parse position (which should
13209  * be at a '[') to see if what follows might be intended to be a [:posix:]
13210  * class.  It returns true if it really is a posix class, of course, but it
13211  * also can return true if it thinks that what was intended was a posix
13212  * class that didn't quite make it.
13213  *
13214  * It will return true for
13215  *      [:alphanumerics:
13216  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
13217  *                         ')' indicating the end of the (?[
13218  *      [:any garbage including %^&$ punctuation:]
13219  *
13220  * This is designed to be called only from S_handle_regex_sets; it could be
13221  * easily adapted to be called from the spot at the beginning of regclass()
13222  * that checks to see in a normal bracketed class if the surrounding []
13223  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
13224  * change long-standing behavior, so I (khw) didn't do that */
13225  char* p = RExC_parse + 1;
13226  char first_char = *p;
13227
13228  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13229
13230  assert(*(p - 1) == '[');
13231
13232  if (! POSIXCC(first_char)) {
13233   return FALSE;
13234  }
13235
13236  p++;
13237  while (p < RExC_end && isWORDCHAR(*p)) p++;
13238
13239  if (p >= RExC_end) {
13240   return FALSE;
13241  }
13242
13243  if (p - RExC_parse > 2    /* Got at least 1 word character */
13244   && (*p == first_char
13245    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13246  {
13247   return TRUE;
13248  }
13249
13250  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13251
13252  return (p
13253    && p - RExC_parse > 2 /* [:] evaluates to colon;
13254          [::] is a bad posix class. */
13255    && first_char == *(p - 1));
13256 }
13257
13258 STATIC unsigned  int
13259 S_regex_set_precedence(const U8 my_operator) {
13260
13261  /* Returns the precedence in the (?[...]) construct of the input operator,
13262  * specified by its character representation.  The precedence follows
13263  * general Perl rules, but it extends this so that ')' and ']' have (low)
13264  * precedence even though they aren't really operators */
13265
13266  switch (my_operator) {
13267   case '!':
13268    return 5;
13269   case '&':
13270    return 4;
13271   case '^':
13272   case '|':
13273   case '+':
13274   case '-':
13275    return 3;
13276   case ')':
13277    return 2;
13278   case ']':
13279    return 1;
13280  }
13281
13282  NOT_REACHED; /* NOTREACHED */
13283  return 0;   /* Silence compiler warning */
13284 }
13285
13286 STATIC regnode *
13287 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13288      I32 *flagp, U32 depth,
13289      char * const oregcomp_parse)
13290 {
13291  /* Handle the (?[...]) construct to do set operations */
13292
13293  U8 curchar;                     /* Current character being parsed */
13294  UV start, end;             /* End points of code point ranges */
13295  SV* final = NULL;               /* The end result inversion list */
13296  SV* result_string;              /* 'final' stringified */
13297  AV* stack;                      /* stack of operators and operands not yet
13298          resolved */
13299  AV* fence_stack = NULL;         /* A stack containing the positions in
13300          'stack' of where the undealt-with left
13301          parens would be if they were actually
13302          put there */
13303  IV fence = 0;                   /* Position of where most recent undealt-
13304          with left paren in stack is; -1 if none.
13305          */
13306  STRLEN len;                     /* Temporary */
13307  regnode* node;                  /* Temporary, and final regnode returned by
13308          this function */
13309  const bool save_fold = FOLD;    /* Temporary */
13310  char *save_end, *save_parse;    /* Temporaries */
13311
13312  GET_RE_DEBUG_FLAGS_DECL;
13313
13314  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13315
13316  if (LOC) {  /* XXX could make valid in UTF-8 locales */
13317   vFAIL("(?[...]) not valid in locale");
13318  }
13319  RExC_uni_semantics = 1;     /* The use of this operator implies /u.  This
13320         is required so that the compile time values
13321         are valid in all runtime cases */
13322
13323  /* This will return only an ANYOF regnode, or (unlikely) something smaller
13324  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
13325  * call regclass to handle '[]' so as to not have to reinvent its parsing
13326  * rules here (throwing away the size it computes each time).  And, we exit
13327  * upon an unescaped ']' that isn't one ending a regclass.  To do both
13328  * these things, we need to realize that something preceded by a backslash
13329  * is escaped, so we have to keep track of backslashes */
13330  if (SIZE_ONLY) {
13331   UV depth = 0; /* how many nested (?[...]) constructs */
13332
13333   while (RExC_parse < RExC_end) {
13334    SV* current = NULL;
13335    RExC_parse = regpatws(pRExC_state, RExC_parse,
13336           TRUE); /* means recognize comments */
13337    switch (*RExC_parse) {
13338     case '?':
13339      if (RExC_parse[1] == '[') depth++, RExC_parse++;
13340      /* FALLTHROUGH */
13341     default:
13342      break;
13343     case '\\':
13344      /* Skip the next byte (which could cause us to end up in
13345      * the middle of a UTF-8 character, but since none of those
13346      * are confusable with anything we currently handle in this
13347      * switch (invariants all), it's safe.  We'll just hit the
13348      * default: case next time and keep on incrementing until
13349      * we find one of the invariants we do handle. */
13350      RExC_parse++;
13351      break;
13352     case '[':
13353     {
13354      /* If this looks like it is a [:posix:] class, leave the
13355      * parse pointer at the '[' to fool regclass() into
13356      * thinking it is part of a '[[:posix:]]'.  That function
13357      * will use strict checking to force a syntax error if it
13358      * doesn't work out to a legitimate class */
13359      bool is_posix_class
13360          = could_it_be_a_POSIX_class(pRExC_state);
13361      if (! is_posix_class) {
13362       RExC_parse++;
13363      }
13364
13365      /* regclass() can only return RESTART_UTF8 if multi-char
13366      folds are allowed.  */
13367      if (!regclass(pRExC_state, flagp,depth+1,
13368         is_posix_class, /* parse the whole char
13369              class only if not a
13370              posix class */
13371         FALSE, /* don't allow multi-char folds */
13372         TRUE, /* silence non-portable warnings. */
13373         TRUE, /* strict */
13374         &current
13375         ))
13376       FAIL2("panic: regclass returned NULL to handle_sets, "
13377        "flags=%#"UVxf"", (UV) *flagp);
13378
13379      /* function call leaves parse pointing to the ']', except
13380      * if we faked it */
13381      if (is_posix_class) {
13382       RExC_parse--;
13383      }
13384
13385      SvREFCNT_dec(current);   /* In case it returned something */
13386      break;
13387     }
13388
13389     case ']':
13390      if (depth--) break;
13391      RExC_parse++;
13392      if (RExC_parse < RExC_end
13393       && *RExC_parse == ')')
13394      {
13395       node = reganode(pRExC_state, ANYOF, 0);
13396       RExC_size += ANYOF_SKIP;
13397       nextchar(pRExC_state);
13398       Set_Node_Length(node,
13399         RExC_parse - oregcomp_parse + 1); /* MJD */
13400       return node;
13401      }
13402      goto no_close;
13403    }
13404    RExC_parse++;
13405   }
13406
13407  no_close:
13408   FAIL("Syntax error in (?[...])");
13409  }
13410
13411  /* Pass 2 only after this. */
13412  Perl_ck_warner_d(aTHX_
13413   packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13414   "The regex_sets feature is experimental" REPORT_LOCATION,
13415    UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13416    UTF8fARG(UTF,
13417      RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13418      RExC_precomp + (RExC_parse - RExC_precomp)));
13419
13420  /* Everything in this construct is a metacharacter.  Operands begin with
13421  * either a '\' (for an escape sequence), or a '[' for a bracketed
13422  * character class.  Any other character should be an operator, or
13423  * parenthesis for grouping.  Both types of operands are handled by calling
13424  * regclass() to parse them.  It is called with a parameter to indicate to
13425  * return the computed inversion list.  The parsing here is implemented via
13426  * a stack.  Each entry on the stack is a single character representing one
13427  * of the operators; or else a pointer to an operand inversion list. */
13428
13429 #define IS_OPERAND(a)  (! SvIOK(a))
13430
13431  /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
13432  * to luke-a-shave-itch (or -itz), but people who didn't want to bother
13433  * with prounouncing it called it Reverse Polish instead, but now that YOU
13434  * know how to prounounce it you can use the correct term, thus giving due
13435  * credit to the person who invented it, and impressing your geek friends.
13436  * Wikipedia says that the pronounciation of "Ł" has been changing so that
13437  * it is now more like an English initial W (as in wonk) than an L.)
13438  *
13439  * This means that, for example, 'a | b & c' is stored on the stack as
13440  *
13441  * c  [4]
13442  * b  [3]
13443  * &  [2]
13444  * a  [1]
13445  * |  [0]
13446  *
13447  * where the numbers in brackets give the stack [array] element number.
13448  * In this implementation, parentheses are not stored on the stack.
13449  * Instead a '(' creates a "fence" so that the part of the stack below the
13450  * fence is invisible except to the corresponding ')' (this allows us to
13451  * replace testing for parens, by using instead subtraction of the fence
13452  * position).  As new operands are processed they are pushed onto the stack
13453  * (except as noted in the next paragraph).  New operators of higher
13454  * precedence than the current final one are inserted on the stack before
13455  * the lhs operand (so that when the rhs is pushed next, everything will be
13456  * in the correct positions shown above.  When an operator of equal or
13457  * lower precedence is encountered in parsing, all the stacked operations
13458  * of equal or higher precedence are evaluated, leaving the result as the
13459  * top entry on the stack.  This makes higher precedence operations
13460  * evaluate before lower precedence ones, and causes operations of equal
13461  * precedence to left associate.
13462  *
13463  * The only unary operator '!' is immediately pushed onto the stack when
13464  * encountered.  When an operand is encountered, if the top of the stack is
13465  * a '!", the complement is immediately performed, and the '!' popped.  The
13466  * resulting value is treated as a new operand, and the logic in the
13467  * previous paragraph is executed.  Thus in the expression
13468  *      [a] + ! [b]
13469  * the stack looks like
13470  *
13471  * !
13472  * a
13473  * +
13474  *
13475  * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
13476  * becomes
13477  *
13478  * !b
13479  * a
13480  * +
13481  *
13482  * A ')' is treated as an operator with lower precedence than all the
13483  * aforementioned ones, which causes all operations on the stack above the
13484  * corresponding '(' to be evaluated down to a single resultant operand.
13485  * Then the fence for the '(' is removed, and the operand goes through the
13486  * algorithm above, without the fence.
13487  *
13488  * A separate stack is kept of the fence positions, so that the position of
13489  * the latest so-far unbalanced '(' is at the top of it.
13490  *
13491  * The ']' ending the construct is treated as the lowest operator of all,
13492  * so that everything gets evaluated down to a single operand, which is the
13493  * result */
13494
13495  sv_2mortal((SV *)(stack = newAV()));
13496  sv_2mortal((SV *)(fence_stack = newAV()));
13497
13498  while (RExC_parse < RExC_end) {
13499   I32 top_index;              /* Index of top-most element in 'stack' */
13500   SV** top_ptr;               /* Pointer to top 'stack' element */
13501   SV* current = NULL;         /* To contain the current inversion list
13502          operand */
13503   SV* only_to_avoid_leaks;
13504
13505   /* Skip white space */
13506   RExC_parse = regpatws(pRExC_state, RExC_parse,
13507     TRUE /* means recognize comments */ );
13508   if (RExC_parse >= RExC_end) {
13509    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13510   }
13511
13512   curchar = UCHARAT(RExC_parse);
13513
13514 redo_curchar:
13515
13516   top_index = av_tindex(stack);
13517
13518   switch (curchar) {
13519    SV** stacked_ptr;       /* Ptr to something already on 'stack' */
13520    char stacked_operator;  /* The topmost operator on the 'stack'. */
13521    SV* lhs;                /* Operand to the left of the operator */
13522    SV* rhs;                /* Operand to the right of the operator */
13523    SV* fence_ptr;          /* Pointer to top element of the fence
13524          stack */
13525
13526    case '(':
13527
13528     if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
13529     {
13530      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13531      * This happens when we have some thing like
13532      *
13533      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13534      *   ...
13535      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
13536      *
13537      * Here we would be handling the interpolated
13538      * '$thai_or_lao'.  We handle this by a recursive call to
13539      * ourselves which returns the inversion list the
13540      * interpolated expression evaluates to.  We use the flags
13541      * from the interpolated pattern. */
13542      U32 save_flags = RExC_flags;
13543      const char * save_parse;
13544
13545      RExC_parse += 2;        /* Skip past the '(?' */
13546      save_parse = RExC_parse;
13547
13548      /* Parse any flags for the '(?' */
13549      parse_lparen_question_flags(pRExC_state);
13550
13551      if (RExC_parse == save_parse  /* Makes sure there was at
13552              least one flag (or else
13553              this embedding wasn't
13554              compiled) */
13555       || RExC_parse >= RExC_end - 4
13556       || UCHARAT(RExC_parse) != ':'
13557       || UCHARAT(++RExC_parse) != '('
13558       || UCHARAT(++RExC_parse) != '?'
13559       || UCHARAT(++RExC_parse) != '[')
13560      {
13561
13562       /* In combination with the above, this moves the
13563       * pointer to the point just after the first erroneous
13564       * character (or if there are no flags, to where they
13565       * should have been) */
13566       if (RExC_parse >= RExC_end - 4) {
13567        RExC_parse = RExC_end;
13568       }
13569       else if (RExC_parse != save_parse) {
13570        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13571       }
13572       vFAIL("Expecting '(?flags:(?[...'");
13573      }
13574
13575      /* Recurse, with the meat of the embedded expression */
13576      RExC_parse++;
13577      (void) handle_regex_sets(pRExC_state, &current, flagp,
13578              depth+1, oregcomp_parse);
13579
13580      /* Here, 'current' contains the embedded expression's
13581      * inversion list, and RExC_parse points to the trailing
13582      * ']'; the next character should be the ')' */
13583      RExC_parse++;
13584      assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13585
13586      /* Then the ')' matching the original '(' handled by this
13587      * case: statement */
13588      RExC_parse++;
13589      assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13590
13591      RExC_parse++;
13592      RExC_flags = save_flags;
13593      goto handle_operand;
13594     }
13595
13596     /* A regular '('.  Look behind for illegal syntax */
13597     if (top_index - fence >= 0) {
13598      /* If the top entry on the stack is an operator, it had
13599      * better be a '!', otherwise the entry below the top
13600      * operand should be an operator */
13601      if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
13602       || (! IS_OPERAND(*top_ptr) && SvUV(*top_ptr) != '!')
13603       || top_index - fence < 1
13604       || ! (stacked_ptr = av_fetch(stack,
13605              top_index - 1,
13606              FALSE))
13607       || IS_OPERAND(*stacked_ptr))
13608      {
13609       RExC_parse++;
13610       vFAIL("Unexpected '(' with no preceding operator");
13611      }
13612     }
13613
13614     /* Stack the position of this undealt-with left paren */
13615     fence = top_index + 1;
13616     av_push(fence_stack, newSViv(fence));
13617     break;
13618
13619    case '\\':
13620     /* regclass() can only return RESTART_UTF8 if multi-char
13621     folds are allowed.  */
13622     if (!regclass(pRExC_state, flagp,depth+1,
13623        TRUE, /* means parse just the next thing */
13624        FALSE, /* don't allow multi-char folds */
13625        FALSE, /* don't silence non-portable warnings.  */
13626        TRUE,  /* strict */
13627        &current))
13628     {
13629      FAIL2("panic: regclass returned NULL to handle_sets, "
13630       "flags=%#"UVxf"", (UV) *flagp);
13631     }
13632
13633     /* regclass() will return with parsing just the \ sequence,
13634     * leaving the parse pointer at the next thing to parse */
13635     RExC_parse--;
13636     goto handle_operand;
13637
13638    case '[':   /* Is a bracketed character class */
13639    {
13640     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13641
13642     if (! is_posix_class) {
13643      RExC_parse++;
13644     }
13645
13646     /* regclass() can only return RESTART_UTF8 if multi-char
13647     folds are allowed.  */
13648     if(!regclass(pRExC_state, flagp,depth+1,
13649        is_posix_class, /* parse the whole char class
13650             only if not a posix class */
13651        FALSE, /* don't allow multi-char folds */
13652        FALSE, /* don't silence non-portable warnings.  */
13653        TRUE,   /* strict */
13654        &current
13655        ))
13656     {
13657      FAIL2("panic: regclass returned NULL to handle_sets, "
13658       "flags=%#"UVxf"", (UV) *flagp);
13659     }
13660
13661     /* function call leaves parse pointing to the ']', except if we
13662     * faked it */
13663     if (is_posix_class) {
13664      RExC_parse--;
13665     }
13666
13667     goto handle_operand;
13668    }
13669
13670    case ']':
13671     if (top_index >= 1) {
13672      goto join_operators;
13673     }
13674
13675     /* Only a single operand on the stack: are done */
13676     goto done;
13677
13678    case ')':
13679     if (av_tindex(fence_stack) < 0) {
13680      RExC_parse++;
13681      vFAIL("Unexpected ')'");
13682     }
13683
13684     /* If at least two thing on the stack, treat this as an
13685     * operator */
13686     if (top_index - fence >= 1) {
13687      goto join_operators;
13688     }
13689
13690     /* Here only a single thing on the fenced stack, and there is a
13691     * fence.  Get rid of it */
13692     fence_ptr = av_pop(fence_stack);
13693     assert(fence_ptr);
13694     fence = SvIV(fence_ptr) - 1;
13695     SvREFCNT_dec_NN(fence_ptr);
13696     fence_ptr = NULL;
13697
13698     if (fence < 0) {
13699      fence = 0;
13700     }
13701
13702     /* Having gotten rid of the fence, we pop the operand at the
13703     * stack top and process it as a newly encountered operand */
13704     current = av_pop(stack);
13705     assert(IS_OPERAND(current));
13706     goto handle_operand;
13707
13708    case '&':
13709    case '|':
13710    case '+':
13711    case '-':
13712    case '^':
13713
13714     /* These binary operators should have a left operand already
13715     * parsed */
13716     if (   top_index - fence < 0
13717      || top_index - fence == 1
13718      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13719      || ! IS_OPERAND(*top_ptr))
13720     {
13721      goto unexpected_binary;
13722     }
13723
13724     /* If only the one operand is on the part of the stack visible
13725     * to us, we just place this operator in the proper position */
13726     if (top_index - fence < 2) {
13727
13728      /* Place the operator before the operand */
13729
13730      SV* lhs = av_pop(stack);
13731      av_push(stack, newSVuv(curchar));
13732      av_push(stack, lhs);
13733      break;
13734     }
13735
13736     /* But if there is something else on the stack, we need to
13737     * process it before this new operator if and only if the
13738     * stacked operation has equal or higher precedence than the
13739     * new one */
13740
13741    join_operators:
13742
13743     /* The operator on the stack is supposed to be below both its
13744     * operands */
13745     if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
13746      || IS_OPERAND(*stacked_ptr))
13747     {
13748      /* But if not, it's legal and indicates we are completely
13749      * done if and only if we're currently processing a ']',
13750      * which should be the final thing in the expression */
13751      if (curchar == ']') {
13752       goto done;
13753      }
13754
13755     unexpected_binary:
13756      RExC_parse++;
13757      vFAIL2("Unexpected binary operator '%c' with no "
13758       "preceding operand", curchar);
13759     }
13760     stacked_operator = (char) SvUV(*stacked_ptr);
13761
13762     if (regex_set_precedence(curchar)
13763      > regex_set_precedence(stacked_operator))
13764     {
13765      /* Here, the new operator has higher precedence than the
13766      * stacked one.  This means we need to add the new one to
13767      * the stack to await its rhs operand (and maybe more
13768      * stuff).  We put it before the lhs operand, leaving
13769      * untouched the stacked operator and everything below it
13770      * */
13771      lhs = av_pop(stack);
13772      assert(IS_OPERAND(lhs));
13773
13774      av_push(stack, newSVuv(curchar));
13775      av_push(stack, lhs);
13776      break;
13777     }
13778
13779     /* Here, the new operator has equal or lower precedence than
13780     * what's already there.  This means the operation already
13781     * there should be performed now, before the new one. */
13782     rhs = av_pop(stack);
13783     lhs = av_pop(stack);
13784
13785     assert(IS_OPERAND(rhs));
13786     assert(IS_OPERAND(lhs));
13787
13788     switch (stacked_operator) {
13789      case '&':
13790       _invlist_intersection(lhs, rhs, &rhs);
13791       break;
13792
13793      case '|':
13794      case '+':
13795       _invlist_union(lhs, rhs, &rhs);
13796       break;
13797
13798      case '-':
13799       _invlist_subtract(lhs, rhs, &rhs);
13800       break;
13801
13802      case '^':   /* The union minus the intersection */
13803      {
13804       SV* i = NULL;
13805       SV* u = NULL;
13806       SV* element;
13807
13808       _invlist_union(lhs, rhs, &u);
13809       _invlist_intersection(lhs, rhs, &i);
13810       /* _invlist_subtract will overwrite rhs
13811        without freeing what it already contains */
13812       element = rhs;
13813       _invlist_subtract(u, i, &rhs);
13814       SvREFCNT_dec_NN(i);
13815       SvREFCNT_dec_NN(u);
13816       SvREFCNT_dec_NN(element);
13817       break;
13818      }
13819     }
13820     SvREFCNT_dec(lhs);
13821
13822     /* Here, the higher precedence operation has been done, and the
13823     * result is in 'rhs'.  We overwrite the stacked operator with
13824     * the result.  Then we redo this code to either push the new
13825     * operator onto the stack or perform any higher precedence
13826     * stacked operation */
13827     only_to_avoid_leaks = av_pop(stack);
13828     SvREFCNT_dec(only_to_avoid_leaks);
13829     av_push(stack, rhs);
13830     goto redo_curchar;
13831
13832    case '!':   /* Highest priority, right associative, so just push
13833       onto stack */
13834     av_push(stack, newSVuv(curchar));
13835     break;
13836
13837    default:
13838     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13839     vFAIL("Unexpected character");
13840
13841   handle_operand:
13842
13843    /* Here 'current' is the operand.  If something is already on the
13844    * stack, we have to check if it is a !. */
13845    top_index = av_tindex(stack);   /* Code above may have altered the
13846            * stack in the time since we
13847            * earlier set 'top_index'. */
13848    if (top_index - fence >= 0) {
13849     /* If the top entry on the stack is an operator, it had better
13850     * be a '!', otherwise the entry below the top operand should
13851     * be an operator */
13852     top_ptr = av_fetch(stack, top_index, FALSE);
13853     assert(top_ptr);
13854     if (! IS_OPERAND(*top_ptr)) {
13855
13856      /* The only permissible operator at the top of the stack is
13857      * '!', which is applied immediately to this operand. */
13858      curchar = (char) SvUV(*top_ptr);
13859      if (curchar != '!') {
13860       SvREFCNT_dec(current);
13861       vFAIL2("Unexpected binary operator '%c' with no "
13862         "preceding operand", curchar);
13863      }
13864
13865      _invlist_invert(current);
13866
13867      only_to_avoid_leaks = av_pop(stack);
13868      SvREFCNT_dec(only_to_avoid_leaks);
13869      top_index = av_tindex(stack);
13870
13871      /* And we redo with the inverted operand.  This allows
13872      * handling multiple ! in a row */
13873      goto handle_operand;
13874     }
13875       /* Single operand is ok only for the non-binary ')'
13876       * operator */
13877     else if ((top_index - fence == 0 && curchar != ')')
13878       || (top_index - fence > 0
13879        && (! (stacked_ptr = av_fetch(stack,
13880               top_index - 1,
13881               FALSE))
13882         || IS_OPERAND(*stacked_ptr))))
13883     {
13884      SvREFCNT_dec(current);
13885      vFAIL("Operand with no preceding operator");
13886     }
13887    }
13888
13889    /* Here there was nothing on the stack or the top element was
13890    * another operand.  Just add this new one */
13891    av_push(stack, current);
13892
13893   } /* End of switch on next parse token */
13894
13895   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13896  } /* End of loop parsing through the construct */
13897
13898   done:
13899  if (av_tindex(fence_stack) >= 0) {
13900   vFAIL("Unmatched (");
13901  }
13902
13903  if (av_tindex(stack) < 0   /* Was empty */
13904   || ((final = av_pop(stack)) == NULL)
13905   || ! IS_OPERAND(final)
13906   || av_tindex(stack) >= 0)  /* More left on stack */
13907  {
13908   SvREFCNT_dec(final);
13909   vFAIL("Incomplete expression within '(?[ ])'");
13910  }
13911
13912  /* Here, 'final' is the resultant inversion list from evaluating the
13913  * expression.  Return it if so requested */
13914  if (return_invlist) {
13915   *return_invlist = final;
13916   return END;
13917  }
13918
13919  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13920  * expecting a string of ranges and individual code points */
13921  invlist_iterinit(final);
13922  result_string = newSVpvs("");
13923  while (invlist_iternext(final, &start, &end)) {
13924   if (start == end) {
13925    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13926   }
13927   else {
13928    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13929              start,          end);
13930   }
13931  }
13932
13933  /* About to generate an ANYOF (or similar) node from the inversion list we
13934  * have calculated */
13935  save_parse = RExC_parse;
13936  RExC_parse = SvPV(result_string, len);
13937  save_end = RExC_end;
13938  RExC_end = RExC_parse + len;
13939
13940  /* We turn off folding around the call, as the class we have constructed
13941  * already has all folding taken into consideration, and we don't want
13942  * regclass() to add to that */
13943  RExC_flags &= ~RXf_PMf_FOLD;
13944  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13945  */
13946  node = regclass(pRExC_state, flagp,depth+1,
13947      FALSE, /* means parse the whole char class */
13948      FALSE, /* don't allow multi-char folds */
13949      TRUE, /* silence non-portable warnings.  The above may very
13950        well have generated non-portable code points, but
13951        they're valid on this machine */
13952      FALSE, /* similarly, no need for strict */
13953      NULL
13954     );
13955  if (!node)
13956   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13957      PTR2UV(flagp));
13958  if (save_fold) {
13959   RExC_flags |= RXf_PMf_FOLD;
13960  }
13961  RExC_parse = save_parse + 1;
13962  RExC_end = save_end;
13963  SvREFCNT_dec_NN(final);
13964  SvREFCNT_dec_NN(result_string);
13965
13966  nextchar(pRExC_state);
13967  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13968  return node;
13969 }
13970 #undef IS_OPERAND
13971
13972 STATIC void
13973 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13974 {
13975  /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13976  * innocent-looking character class, like /[ks]/i won't have to go out to
13977  * disk to find the possible matches.
13978  *
13979  * This should be called only for a Latin1-range code points, cp, which is
13980  * known to be involved in a simple fold with other code points above
13981  * Latin1.  It would give false results if /aa has been specified.
13982  * Multi-char folds are outside the scope of this, and must be handled
13983  * specially.
13984  *
13985  * XXX It would be better to generate these via regen, in case a new
13986  * version of the Unicode standard adds new mappings, though that is not
13987  * really likely, and may be caught by the default: case of the switch
13988  * below. */
13989
13990  PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13991
13992  assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13993
13994  switch (cp) {
13995   case 'k':
13996   case 'K':
13997   *invlist =
13998    add_cp_to_invlist(*invlist, KELVIN_SIGN);
13999    break;
14000   case 's':
14001   case 'S':
14002   *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
14003    break;
14004   case MICRO_SIGN:
14005   *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
14006   *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
14007    break;
14008   case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14009   case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14010   *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
14011    break;
14012   case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14013   *invlist = add_cp_to_invlist(*invlist,
14014           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14015    break;
14016   case LATIN_SMALL_LETTER_SHARP_S:
14017   *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
14018    break;
14019   default:
14020    /* Use deprecated warning to increase the chances of this being
14021    * output */
14022    if (PASS2) {
14023     ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
14024    }
14025    break;
14026  }
14027 }
14028
14029 STATIC AV *
14030 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
14031 {
14032  /* This adds the string scalar <multi_string> to the array
14033  * <multi_char_matches>.  <multi_string> is known to have exactly
14034  * <cp_count> code points in it.  This is used when constructing a
14035  * bracketed character class and we find something that needs to match more
14036  * than a single character.
14037  *
14038  * <multi_char_matches> is actually an array of arrays.  Each top-level
14039  * element is an array that contains all the strings known so far that are
14040  * the same length.  And that length (in number of code points) is the same
14041  * as the index of the top-level array.  Hence, the [2] element is an
14042  * array, each element thereof is a string containing TWO code points;
14043  * while element [3] is for strings of THREE characters, and so on.  Since
14044  * this is for multi-char strings there can never be a [0] nor [1] element.
14045  *
14046  * When we rewrite the character class below, we will do so such that the
14047  * longest strings are written first, so that it prefers the longest
14048  * matching strings first.  This is done even if it turns out that any
14049  * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
14050  * Christiansen has agreed that this is ok.  This makes the test for the
14051  * ligature 'ffi' come before the test for 'ff', for example */
14052
14053  AV* this_array;
14054  AV** this_array_ptr;
14055
14056  PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
14057
14058  if (! multi_char_matches) {
14059   multi_char_matches = newAV();
14060  }
14061
14062  if (av_exists(multi_char_matches, cp_count)) {
14063   this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
14064   this_array = *this_array_ptr;
14065  }
14066  else {
14067   this_array = newAV();
14068   av_store(multi_char_matches, cp_count,
14069     (SV*) this_array);
14070  }
14071  av_push(this_array, multi_string);
14072
14073  return multi_char_matches;
14074 }
14075
14076 /* The names of properties whose definitions are not known at compile time are
14077  * stored in this SV, after a constant heading.  So if the length has been
14078  * changed since initialization, then there is a run-time definition. */
14079 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
14080           (SvCUR(listsv) != initial_listsv_len)
14081
14082 STATIC regnode *
14083 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
14084     const bool stop_at_1,  /* Just parse the next thing, don't
14085           look for a full character class */
14086     bool allow_multi_folds,
14087     const bool silence_non_portable,   /* Don't output warnings
14088              about too large
14089              characters */
14090     const bool strict,
14091     SV** ret_invlist  /* Return an inversion list, not a node */
14092   )
14093 {
14094  /* parse a bracketed class specification.  Most of these will produce an
14095  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
14096  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
14097  * under /i with multi-character folds: it will be rewritten following the
14098  * paradigm of this example, where the <multi-fold>s are characters which
14099  * fold to multiple character sequences:
14100  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
14101  * gets effectively rewritten as:
14102  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
14103  * reg() gets called (recursively) on the rewritten version, and this
14104  * function will return what it constructs.  (Actually the <multi-fold>s
14105  * aren't physically removed from the [abcdefghi], it's just that they are
14106  * ignored in the recursion by means of a flag:
14107  * <RExC_in_multi_char_class>.)
14108  *
14109  * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
14110  * characters, with the corresponding bit set if that character is in the
14111  * list.  For characters above this, a range list or swash is used.  There
14112  * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
14113  * determinable at compile time
14114  *
14115  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
14116  * to be restarted.  This can only happen if ret_invlist is non-NULL.
14117  */
14118
14119  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
14120  IV range = 0;
14121  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
14122  regnode *ret;
14123  STRLEN numlen;
14124  IV namedclass = OOB_NAMEDCLASS;
14125  char *rangebegin = NULL;
14126  bool need_class = 0;
14127  SV *listsv = NULL;
14128  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
14129          than just initialized.  */
14130  SV* properties = NULL;    /* Code points that match \p{} \P{} */
14131  SV* posixes = NULL;     /* Code points that match classes like [:word:],
14132        extended beyond the Latin1 range.  These have to
14133        be kept separate from other code points for much
14134        of this function because their handling  is
14135        different under /i, and for most classes under
14136        /d as well */
14137  SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
14138        separate for a while from the non-complemented
14139        versions because of complications with /d
14140        matching */
14141  SV* simple_posixes = NULL; /* But under some conditions, the classes can be
14142         treated more simply than the general case,
14143         leading to less compilation and execution
14144         work */
14145  UV element_count = 0;   /* Number of distinct elements in the class.
14146        Optimizations may be possible if this is tiny */
14147  AV * multi_char_matches = NULL; /* Code points that fold to more than one
14148          character; used under /i */
14149  UV n;
14150  char * stop_ptr = RExC_end;    /* where to stop parsing */
14151  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
14152             space? */
14153
14154  /* Unicode properties are stored in a swash; this holds the current one
14155  * being parsed.  If this swash is the only above-latin1 component of the
14156  * character class, an optimization is to pass it directly on to the
14157  * execution engine.  Otherwise, it is set to NULL to indicate that there
14158  * are other things in the class that have to be dealt with at execution
14159  * time */
14160  SV* swash = NULL;  /* Code points that match \p{} \P{} */
14161
14162  /* Set if a component of this character class is user-defined; just passed
14163  * on to the engine */
14164  bool has_user_defined_property = FALSE;
14165
14166  /* inversion list of code points this node matches only when the target
14167  * string is in UTF-8.  (Because is under /d) */
14168  SV* depends_list = NULL;
14169
14170  /* Inversion list of code points this node matches regardless of things
14171  * like locale, folding, utf8ness of the target string */
14172  SV* cp_list = NULL;
14173
14174  /* Like cp_list, but code points on this list need to be checked for things
14175  * that fold to/from them under /i */
14176  SV* cp_foldable_list = NULL;
14177
14178  /* Like cp_list, but code points on this list are valid only when the
14179  * runtime locale is UTF-8 */
14180  SV* only_utf8_locale_list = NULL;
14181
14182  /* In a range, if one of the endpoints is non-character-set portable,
14183  * meaning that it hard-codes a code point that may mean a different
14184  * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
14185  * mnemonic '\t' which each mean the same character no matter which
14186  * character set the platform is on. */
14187  unsigned int non_portable_endpoint = 0;
14188
14189  /* Is the range unicode? which means on a platform that isn't 1-1 native
14190  * to Unicode (i.e. non-ASCII), each code point in it should be considered
14191  * to be a Unicode value.  */
14192  bool unicode_range = FALSE;
14193  bool invert = FALSE;    /* Is this class to be complemented */
14194
14195  bool warn_super = ALWAYS_WARN_SUPER;
14196
14197  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
14198   case we need to change the emitted regop to an EXACT. */
14199  const char * orig_parse = RExC_parse;
14200  const SSize_t orig_size = RExC_size;
14201  bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
14202  GET_RE_DEBUG_FLAGS_DECL;
14203
14204  PERL_ARGS_ASSERT_REGCLASS;
14205 #ifndef DEBUGGING
14206  PERL_UNUSED_ARG(depth);
14207 #endif
14208
14209  DEBUG_PARSE("clas");
14210
14211  /* Assume we are going to generate an ANYOF node. */
14212  ret = reganode(pRExC_state,
14213     (LOC)
14214      ? ANYOFL
14215      : ANYOF,
14216     0);
14217
14218  if (SIZE_ONLY) {
14219   RExC_size += ANYOF_SKIP;
14220   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
14221  }
14222  else {
14223   ANYOF_FLAGS(ret) = 0;
14224
14225   RExC_emit += ANYOF_SKIP;
14226   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
14227   initial_listsv_len = SvCUR(listsv);
14228   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
14229  }
14230
14231  if (skip_white) {
14232   RExC_parse = regpatws(pRExC_state, RExC_parse,
14233        FALSE /* means don't recognize comments */ );
14234  }
14235
14236  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
14237   RExC_parse++;
14238   invert = TRUE;
14239   allow_multi_folds = FALSE;
14240   MARK_NAUGHTY(1);
14241   if (skip_white) {
14242    RExC_parse = regpatws(pRExC_state, RExC_parse,
14243         FALSE /* means don't recognize comments */ );
14244   }
14245  }
14246
14247  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
14248  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
14249   const char *s = RExC_parse;
14250   const char  c = *s++;
14251
14252   if (*s == '^') {
14253    s++;
14254   }
14255   while (isWORDCHAR(*s))
14256    s++;
14257   if (*s && c == *s && s[1] == ']') {
14258    SAVEFREESV(RExC_rx_sv);
14259    ckWARN3reg(s+2,
14260      "POSIX syntax [%c %c] belongs inside character classes",
14261      c, c);
14262    (void)ReREFCNT_inc(RExC_rx_sv);
14263   }
14264  }
14265
14266  /* If the caller wants us to just parse a single element, accomplish this
14267  * by faking the loop ending condition */
14268  if (stop_at_1 && RExC_end > RExC_parse) {
14269   stop_ptr = RExC_parse + 1;
14270  }
14271
14272  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
14273  if (UCHARAT(RExC_parse) == ']')
14274   goto charclassloop;
14275
14276  while (1) {
14277   if  (RExC_parse >= stop_ptr) {
14278    break;
14279   }
14280
14281   if (skip_white) {
14282    RExC_parse = regpatws(pRExC_state, RExC_parse,
14283         FALSE /* means don't recognize comments */ );
14284   }
14285
14286   if  (UCHARAT(RExC_parse) == ']') {
14287    break;
14288   }
14289
14290  charclassloop:
14291
14292   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
14293   save_value = value;
14294   save_prevvalue = prevvalue;
14295
14296   if (!range) {
14297    rangebegin = RExC_parse;
14298    element_count++;
14299    non_portable_endpoint = 0;
14300   }
14301   if (UTF) {
14302    value = utf8n_to_uvchr((U8*)RExC_parse,
14303         RExC_end - RExC_parse,
14304         &numlen, UTF8_ALLOW_DEFAULT);
14305    RExC_parse += numlen;
14306   }
14307   else
14308    value = UCHARAT(RExC_parse++);
14309
14310   if (value == '['
14311    && RExC_parse < RExC_end
14312    && POSIXCC(UCHARAT(RExC_parse)))
14313   {
14314    namedclass = regpposixcc(pRExC_state, value, strict);
14315   }
14316   else if (value == '\\') {
14317    /* Is a backslash; get the code point of the char after it */
14318    if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
14319     value = utf8n_to_uvchr((U8*)RExC_parse,
14320         RExC_end - RExC_parse,
14321         &numlen, UTF8_ALLOW_DEFAULT);
14322     RExC_parse += numlen;
14323    }
14324    else
14325     value = UCHARAT(RExC_parse++);
14326
14327    /* Some compilers cannot handle switching on 64-bit integer
14328    * values, therefore value cannot be an UV.  Yes, this will
14329    * be a problem later if we want switch on Unicode.
14330    * A similar issue a little bit later when switching on
14331    * namedclass. --jhi */
14332
14333    /* If the \ is escaping white space when white space is being
14334    * skipped, it means that that white space is wanted literally, and
14335    * is already in 'value'.  Otherwise, need to translate the escape
14336    * into what it signifies. */
14337    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
14338
14339    case 'w': namedclass = ANYOF_WORDCHAR; break;
14340    case 'W': namedclass = ANYOF_NWORDCHAR; break;
14341    case 's': namedclass = ANYOF_SPACE; break;
14342    case 'S': namedclass = ANYOF_NSPACE; break;
14343    case 'd': namedclass = ANYOF_DIGIT; break;
14344    case 'D': namedclass = ANYOF_NDIGIT; break;
14345    case 'v': namedclass = ANYOF_VERTWS; break;
14346    case 'V': namedclass = ANYOF_NVERTWS; break;
14347    case 'h': namedclass = ANYOF_HORIZWS; break;
14348    case 'H': namedclass = ANYOF_NHORIZWS; break;
14349    case 'N':  /* Handle \N{NAME} in class */
14350     {
14351      const char * const backslash_N_beg = RExC_parse - 2;
14352      int cp_count;
14353
14354      if (! grok_bslash_N(pRExC_state,
14355           NULL,      /* No regnode */
14356           &value,    /* Yes single value */
14357           &cp_count, /* Multiple code pt count */
14358           flagp,
14359           depth)
14360      ) {
14361
14362       if (*flagp & RESTART_UTF8)
14363        FAIL("panic: grok_bslash_N set RESTART_UTF8");
14364
14365       if (cp_count < 0) {
14366        vFAIL("\\N in a character class must be a named character: \\N{...}");
14367       }
14368       else if (cp_count == 0) {
14369        if (strict) {
14370         RExC_parse++;   /* Position after the "}" */
14371         vFAIL("Zero length \\N{}");
14372        }
14373        else if (PASS2) {
14374         ckWARNreg(RExC_parse,
14375           "Ignoring zero length \\N{} in character class");
14376        }
14377       }
14378       else { /* cp_count > 1 */
14379        if (! RExC_in_multi_char_class) {
14380         if (invert || range || *RExC_parse == '-') {
14381          if (strict) {
14382           RExC_parse--;
14383           vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
14384          }
14385          else if (PASS2) {
14386           ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
14387          }
14388          break; /* <value> contains the first code
14389            point. Drop out of the switch to
14390            process it */
14391         }
14392         else {
14393          SV * multi_char_N = newSVpvn(backslash_N_beg,
14394             RExC_parse - backslash_N_beg);
14395          multi_char_matches
14396           = add_multi_match(multi_char_matches,
14397               multi_char_N,
14398               cp_count);
14399         }
14400        }
14401       } /* End of cp_count != 1 */
14402
14403       /* This element should not be processed further in this
14404       * class */
14405       element_count--;
14406       value = save_value;
14407       prevvalue = save_prevvalue;
14408       continue;   /* Back to top of loop to get next char */
14409      }
14410
14411      /* Here, is a single code point, and <value> contains it */
14412      unicode_range = TRUE;   /* \N{} are Unicode */
14413     }
14414     break;
14415    case 'p':
14416    case 'P':
14417     {
14418     char *e;
14419
14420     /* We will handle any undefined properties ourselves */
14421     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
14422          /* And we actually would prefer to get
14423           * the straight inversion list of the
14424           * swash, since we will be accessing it
14425           * anyway, to save a little time */
14426          |_CORE_SWASH_INIT_ACCEPT_INVLIST;
14427
14428     if (RExC_parse >= RExC_end)
14429      vFAIL2("Empty \\%c{}", (U8)value);
14430     if (*RExC_parse == '{') {
14431      const U8 c = (U8)value;
14432      e = strchr(RExC_parse++, '}');
14433      if (!e)
14434       vFAIL2("Missing right brace on \\%c{}", c);
14435      while (isSPACE(*RExC_parse))
14436       RExC_parse++;
14437      if (e == RExC_parse)
14438       vFAIL2("Empty \\%c{}", c);
14439      n = e - RExC_parse;
14440      while (isSPACE(*(RExC_parse + n - 1)))
14441       n--;
14442     }
14443     else {
14444      e = RExC_parse;
14445      n = 1;
14446     }
14447     if (!SIZE_ONLY) {
14448      SV* invlist;
14449      char* name;
14450
14451      if (UCHARAT(RExC_parse) == '^') {
14452       RExC_parse++;
14453       n--;
14454       /* toggle.  (The rhs xor gets the single bit that
14455       * differs between P and p; the other xor inverts just
14456       * that bit) */
14457       value ^= 'P' ^ 'p';
14458
14459       while (isSPACE(*RExC_parse)) {
14460        RExC_parse++;
14461        n--;
14462       }
14463      }
14464      /* Try to get the definition of the property into
14465      * <invlist>.  If /i is in effect, the effective property
14466      * will have its name be <__NAME_i>.  The design is
14467      * discussed in commit
14468      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14469      name = savepv(Perl_form(aTHX_
14470           "%s%.*s%s\n",
14471           (FOLD) ? "__" : "",
14472           (int)n,
14473           RExC_parse,
14474           (FOLD) ? "_i" : ""
14475         ));
14476
14477      /* Look up the property name, and get its swash and
14478      * inversion list, if the property is found  */
14479      if (swash) {
14480       SvREFCNT_dec_NN(swash);
14481      }
14482      swash = _core_swash_init("utf8", name, &PL_sv_undef,
14483            1, /* binary */
14484            0, /* not tr/// */
14485            NULL, /* No inversion list */
14486            &swash_init_flags
14487            );
14488      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14489       HV* curpkg = (IN_PERL_COMPILETIME)
14490          ? PL_curstash
14491          : CopSTASH(PL_curcop);
14492       if (swash) {
14493        SvREFCNT_dec_NN(swash);
14494        swash = NULL;
14495       }
14496
14497       /* Here didn't find it.  It could be a user-defined
14498       * property that will be available at run-time.  If we
14499       * accept only compile-time properties, is an error;
14500       * otherwise add it to the list for run-time look up */
14501       if (ret_invlist) {
14502        RExC_parse = e + 1;
14503        vFAIL2utf8f(
14504         "Property '%"UTF8f"' is unknown",
14505         UTF8fARG(UTF, n, name));
14506       }
14507
14508       /* If the property name doesn't already have a package
14509       * name, add the current one to it so that it can be
14510       * referred to outside it. [perl #121777] */
14511       if (curpkg && ! instr(name, "::")) {
14512        char* pkgname = HvNAME(curpkg);
14513        if (strNE(pkgname, "main")) {
14514         char* full_name = Perl_form(aTHX_
14515                "%s::%s",
14516                pkgname,
14517                name);
14518         n = strlen(full_name);
14519         Safefree(name);
14520         name = savepvn(full_name, n);
14521        }
14522       }
14523       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14524           (value == 'p' ? '+' : '!'),
14525           UTF8fARG(UTF, n, name));
14526       has_user_defined_property = TRUE;
14527
14528       /* We don't know yet, so have to assume that the
14529       * property could match something in the Latin1 range,
14530       * hence something that isn't utf8.  Note that this
14531       * would cause things in <depends_list> to match
14532       * inappropriately, except that any \p{}, including
14533       * this one forces Unicode semantics, which means there
14534       * is no <depends_list> */
14535       ANYOF_FLAGS(ret)
14536          |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14537      }
14538      else {
14539
14540       /* Here, did get the swash and its inversion list.  If
14541       * the swash is from a user-defined property, then this
14542       * whole character class should be regarded as such */
14543       if (swash_init_flags
14544        & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14545       {
14546        has_user_defined_property = TRUE;
14547       }
14548       else if
14549        /* We warn on matching an above-Unicode code point
14550        * if the match would return true, except don't
14551        * warn for \p{All}, which has exactly one element
14552        * = 0 */
14553        (_invlist_contains_cp(invlist, 0x110000)
14554         && (! (_invlist_len(invlist) == 1
14555          && *invlist_array(invlist) == 0)))
14556       {
14557        warn_super = TRUE;
14558       }
14559
14560
14561       /* Invert if asking for the complement */
14562       if (value == 'P') {
14563        _invlist_union_complement_2nd(properties,
14564               invlist,
14565               &properties);
14566
14567        /* The swash can't be used as-is, because we've
14568        * inverted things; delay removing it to here after
14569        * have copied its invlist above */
14570        SvREFCNT_dec_NN(swash);
14571        swash = NULL;
14572       }
14573       else {
14574        _invlist_union(properties, invlist, &properties);
14575       }
14576      }
14577      Safefree(name);
14578     }
14579     RExC_parse = e + 1;
14580     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
14581             named */
14582
14583     /* \p means they want Unicode semantics */
14584     RExC_uni_semantics = 1;
14585     }
14586     break;
14587    case 'n': value = '\n';   break;
14588    case 'r': value = '\r';   break;
14589    case 't': value = '\t';   break;
14590    case 'f': value = '\f';   break;
14591    case 'b': value = '\b';   break;
14592    case 'e': value = ESC_NATIVE;             break;
14593    case 'a': value = '\a';                   break;
14594    case 'o':
14595     RExC_parse--; /* function expects to be pointed at the 'o' */
14596     {
14597      const char* error_msg;
14598      bool valid = grok_bslash_o(&RExC_parse,
14599            &value,
14600            &error_msg,
14601            PASS2,   /* warnings only in
14602               pass 2 */
14603            strict,
14604            silence_non_portable,
14605            UTF);
14606      if (! valid) {
14607       vFAIL(error_msg);
14608      }
14609     }
14610     non_portable_endpoint++;
14611     if (IN_ENCODING && value < 0x100) {
14612      goto recode_encoding;
14613     }
14614     break;
14615    case 'x':
14616     RExC_parse--; /* function expects to be pointed at the 'x' */
14617     {
14618      const char* error_msg;
14619      bool valid = grok_bslash_x(&RExC_parse,
14620            &value,
14621            &error_msg,
14622            PASS2, /* Output warnings */
14623            strict,
14624            silence_non_portable,
14625            UTF);
14626      if (! valid) {
14627       vFAIL(error_msg);
14628      }
14629     }
14630     non_portable_endpoint++;
14631     if (IN_ENCODING && value < 0x100)
14632      goto recode_encoding;
14633     break;
14634    case 'c':
14635     value = grok_bslash_c(*RExC_parse++, PASS2);
14636     non_portable_endpoint++;
14637     break;
14638    case '0': case '1': case '2': case '3': case '4':
14639    case '5': case '6': case '7':
14640     {
14641      /* Take 1-3 octal digits */
14642      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14643      numlen = (strict) ? 4 : 3;
14644      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14645      RExC_parse += numlen;
14646      if (numlen != 3) {
14647       if (strict) {
14648        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14649        vFAIL("Need exactly 3 octal digits");
14650       }
14651       else if (! SIZE_ONLY /* like \08, \178 */
14652         && numlen < 3
14653         && RExC_parse < RExC_end
14654         && isDIGIT(*RExC_parse)
14655         && ckWARN(WARN_REGEXP))
14656       {
14657        SAVEFREESV(RExC_rx_sv);
14658        reg_warn_non_literal_string(
14659         RExC_parse + 1,
14660         form_short_octal_warning(RExC_parse, numlen));
14661        (void)ReREFCNT_inc(RExC_rx_sv);
14662       }
14663      }
14664      non_portable_endpoint++;
14665      if (IN_ENCODING && value < 0x100)
14666       goto recode_encoding;
14667      break;
14668     }
14669    recode_encoding:
14670     if (! RExC_override_recoding) {
14671      SV* enc = _get_encoding();
14672      value = reg_recode((const char)(U8)value, &enc);
14673      if (!enc) {
14674       if (strict) {
14675        vFAIL("Invalid escape in the specified encoding");
14676       }
14677       else if (PASS2) {
14678        ckWARNreg(RExC_parse,
14679         "Invalid escape in the specified encoding");
14680       }
14681      }
14682      break;
14683     }
14684    default:
14685     /* Allow \_ to not give an error */
14686     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14687      if (strict) {
14688       vFAIL2("Unrecognized escape \\%c in character class",
14689        (int)value);
14690      }
14691      else {
14692       SAVEFREESV(RExC_rx_sv);
14693       ckWARN2reg(RExC_parse,
14694        "Unrecognized escape \\%c in character class passed through",
14695        (int)value);
14696       (void)ReREFCNT_inc(RExC_rx_sv);
14697      }
14698     }
14699     break;
14700    }   /* End of switch on char following backslash */
14701   } /* end of handling backslash escape sequences */
14702
14703   /* Here, we have the current token in 'value' */
14704
14705   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14706    U8 classnum;
14707
14708    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
14709    * literal, as is the character that began the false range, i.e.
14710    * the 'a' in the examples */
14711    if (range) {
14712     if (!SIZE_ONLY) {
14713      const int w = (RExC_parse >= rangebegin)
14714         ? RExC_parse - rangebegin
14715         : 0;
14716      if (strict) {
14717       vFAIL2utf8f(
14718        "False [] range \"%"UTF8f"\"",
14719        UTF8fARG(UTF, w, rangebegin));
14720      }
14721      else {
14722       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14723       ckWARN2reg(RExC_parse,
14724        "False [] range \"%"UTF8f"\"",
14725        UTF8fARG(UTF, w, rangebegin));
14726       (void)ReREFCNT_inc(RExC_rx_sv);
14727       cp_list = add_cp_to_invlist(cp_list, '-');
14728       cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14729                prevvalue);
14730      }
14731     }
14732
14733     range = 0; /* this was not a true range */
14734     element_count += 2; /* So counts for three values */
14735    }
14736
14737    classnum = namedclass_to_classnum(namedclass);
14738
14739    if (LOC && namedclass < ANYOF_POSIXL_MAX
14740 #ifndef HAS_ISASCII
14741     && classnum != _CC_ASCII
14742 #endif
14743    ) {
14744     /* What the Posix classes (like \w, [:space:]) match in locale
14745     * isn't knowable under locale until actual match time.  Room
14746     * must be reserved (one time per outer bracketed class) to
14747     * store such classes.  The space will contain a bit for each
14748     * named class that is to be matched against.  This isn't
14749     * needed for \p{} and pseudo-classes, as they are not affected
14750     * by locale, and hence are dealt with separately */
14751     if (! need_class) {
14752      need_class = 1;
14753      if (SIZE_ONLY) {
14754       RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14755      }
14756      else {
14757       RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14758      }
14759      ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14760      ANYOF_POSIXL_ZERO(ret);
14761     }
14762
14763     /* Coverity thinks it is possible for this to be negative; both
14764     * jhi and khw think it's not, but be safer */
14765     assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14766      || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14767
14768     /* See if it already matches the complement of this POSIX
14769     * class */
14770     if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14771      && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14772                ? -1
14773                : 1)))
14774     {
14775      posixl_matches_all = TRUE;
14776      break;  /* No need to continue.  Since it matches both
14777        e.g., \w and \W, it matches everything, and the
14778        bracketed class can be optimized into qr/./s */
14779     }
14780
14781     /* Add this class to those that should be checked at runtime */
14782     ANYOF_POSIXL_SET(ret, namedclass);
14783
14784     /* The above-Latin1 characters are not subject to locale rules.
14785     * Just add them, in the second pass, to the
14786     * unconditionally-matched list */
14787     if (! SIZE_ONLY) {
14788      SV* scratch_list = NULL;
14789
14790      /* Get the list of the above-Latin1 code points this
14791      * matches */
14792      _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14793           PL_XPosix_ptrs[classnum],
14794
14795           /* Odd numbers are complements, like
14796           * NDIGIT, NASCII, ... */
14797           namedclass % 2 != 0,
14798           &scratch_list);
14799      /* Checking if 'cp_list' is NULL first saves an extra
14800      * clone.  Its reference count will be decremented at the
14801      * next union, etc, or if this is the only instance, at the
14802      * end of the routine */
14803      if (! cp_list) {
14804       cp_list = scratch_list;
14805      }
14806      else {
14807       _invlist_union(cp_list, scratch_list, &cp_list);
14808       SvREFCNT_dec_NN(scratch_list);
14809      }
14810      continue;   /* Go get next character */
14811     }
14812    }
14813    else if (! SIZE_ONLY) {
14814
14815     /* Here, not in pass1 (in that pass we skip calculating the
14816     * contents of this class), and is /l, or is a POSIX class for
14817     * which /l doesn't matter (or is a Unicode property, which is
14818     * skipped here). */
14819     if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
14820      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14821
14822       /* Here, should be \h, \H, \v, or \V.  None of /d, /i
14823       * nor /l make a difference in what these match,
14824       * therefore we just add what they match to cp_list. */
14825       if (classnum != _CC_VERTSPACE) {
14826        assert(   namedclass == ANYOF_HORIZWS
14827         || namedclass == ANYOF_NHORIZWS);
14828
14829        /* It turns out that \h is just a synonym for
14830        * XPosixBlank */
14831        classnum = _CC_BLANK;
14832       }
14833
14834       _invlist_union_maybe_complement_2nd(
14835         cp_list,
14836         PL_XPosix_ptrs[classnum],
14837         namedclass % 2 != 0,    /* Complement if odd
14838               (NHORIZWS, NVERTWS)
14839               */
14840         &cp_list);
14841      }
14842     }
14843     else if (UNI_SEMANTICS
14844       || classnum == _CC_ASCII
14845       || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
14846             || classnum == _CC_XDIGIT)))
14847     {
14848      /* We usually have to worry about /d and /a affecting what
14849      * POSIX classes match, with special code needed for /d
14850      * because we won't know until runtime what all matches.
14851      * But there is no extra work needed under /u, and
14852      * [:ascii:] is unaffected by /a and /d; and :digit: and
14853      * :xdigit: don't have runtime differences under /d.  So we
14854      * can special case these, and avoid some extra work below,
14855      * and at runtime. */
14856      _invlist_union_maybe_complement_2nd(
14857              simple_posixes,
14858              PL_XPosix_ptrs[classnum],
14859              namedclass % 2 != 0,
14860              &simple_posixes);
14861     }
14862     else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
14863       complement and use nposixes */
14864      SV** posixes_ptr = namedclass % 2 == 0
14865          ? &posixes
14866          : &nposixes;
14867      _invlist_union_maybe_complement_2nd(
14868              *posixes_ptr,
14869              PL_XPosix_ptrs[classnum],
14870              namedclass % 2 != 0,
14871              posixes_ptr);
14872     }
14873    }
14874   } /* end of namedclass \blah */
14875
14876   if (skip_white) {
14877    RExC_parse = regpatws(pRExC_state, RExC_parse,
14878         FALSE /* means don't recognize comments */ );
14879   }
14880
14881   /* If 'range' is set, 'value' is the ending of a range--check its
14882   * validity.  (If value isn't a single code point in the case of a
14883   * range, we should have figured that out above in the code that
14884   * catches false ranges).  Later, we will handle each individual code
14885   * point in the range.  If 'range' isn't set, this could be the
14886   * beginning of a range, so check for that by looking ahead to see if
14887   * the next real character to be processed is the range indicator--the
14888   * minus sign */
14889
14890   if (range) {
14891 #ifdef EBCDIC
14892    /* For unicode ranges, we have to test that the Unicode as opposed
14893    * to the native values are not decreasing.  (Above 255, there is
14894    * no difference between native and Unicode) */
14895    if (unicode_range && prevvalue < 255 && value < 255) {
14896     if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
14897      goto backwards_range;
14898     }
14899    }
14900    else
14901 #endif
14902    if (prevvalue > value) /* b-a */ {
14903     int w;
14904 #ifdef EBCDIC
14905    backwards_range:
14906 #endif
14907     w = RExC_parse - rangebegin;
14908     vFAIL2utf8f(
14909      "Invalid [] range \"%"UTF8f"\"",
14910      UTF8fARG(UTF, w, rangebegin));
14911     NOT_REACHED; /* NOTREACHED */
14912    }
14913   }
14914   else {
14915    prevvalue = value; /* save the beginning of the potential range */
14916    if (! stop_at_1     /* Can't be a range if parsing just one thing */
14917     && *RExC_parse == '-')
14918    {
14919     char* next_char_ptr = RExC_parse + 1;
14920     if (skip_white) {   /* Get the next real char after the '-' */
14921      next_char_ptr = regpatws(pRExC_state,
14922            RExC_parse + 1,
14923            FALSE); /* means don't recognize
14924               comments */
14925     }
14926
14927     /* If the '-' is at the end of the class (just before the ']',
14928     * it is a literal minus; otherwise it is a range */
14929     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14930      RExC_parse = next_char_ptr;
14931
14932      /* a bad range like \w-, [:word:]- ? */
14933      if (namedclass > OOB_NAMEDCLASS) {
14934       if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14935        const int w = RExC_parse >= rangebegin
14936           ?  RExC_parse - rangebegin
14937           : 0;
14938        if (strict) {
14939         vFAIL4("False [] range \"%*.*s\"",
14940          w, w, rangebegin);
14941        }
14942        else if (PASS2) {
14943         vWARN4(RExC_parse,
14944          "False [] range \"%*.*s\"",
14945          w, w, rangebegin);
14946        }
14947       }
14948       if (!SIZE_ONLY) {
14949        cp_list = add_cp_to_invlist(cp_list, '-');
14950       }
14951       element_count++;
14952      } else
14953       range = 1; /* yeah, it's a range! */
14954      continue; /* but do it the next time */
14955     }
14956    }
14957   }
14958
14959   if (namedclass > OOB_NAMEDCLASS) {
14960    continue;
14961   }
14962
14963   /* Here, we have a single value this time through the loop, and
14964   * <prevvalue> is the beginning of the range, if any; or <value> if
14965   * not. */
14966
14967   /* non-Latin1 code point implies unicode semantics.  Must be set in
14968   * pass1 so is there for the whole of pass 2 */
14969   if (value > 255) {
14970    RExC_uni_semantics = 1;
14971   }
14972
14973   /* Ready to process either the single value, or the completed range.
14974   * For single-valued non-inverted ranges, we consider the possibility
14975   * of multi-char folds.  (We made a conscious decision to not do this
14976   * for the other cases because it can often lead to non-intuitive
14977   * results.  For example, you have the peculiar case that:
14978   *  "s s" =~ /^[^\xDF]+$/i => Y
14979   *  "ss"  =~ /^[^\xDF]+$/i => N
14980   *
14981   * See [perl #89750] */
14982   if (FOLD && allow_multi_folds && value == prevvalue) {
14983    if (value == LATIN_SMALL_LETTER_SHARP_S
14984     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14985               value)))
14986    {
14987     /* Here <value> is indeed a multi-char fold.  Get what it is */
14988
14989     U8 foldbuf[UTF8_MAXBYTES_CASE];
14990     STRLEN foldlen;
14991
14992     UV folded = _to_uni_fold_flags(
14993         value,
14994         foldbuf,
14995         &foldlen,
14996         FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14997             ? FOLD_FLAGS_NOMIX_ASCII
14998             : 0)
14999         );
15000
15001     /* Here, <folded> should be the first character of the
15002     * multi-char fold of <value>, with <foldbuf> containing the
15003     * whole thing.  But, if this fold is not allowed (because of
15004     * the flags), <fold> will be the same as <value>, and should
15005     * be processed like any other character, so skip the special
15006     * handling */
15007     if (folded != value) {
15008
15009      /* Skip if we are recursed, currently parsing the class
15010      * again.  Otherwise add this character to the list of
15011      * multi-char folds. */
15012      if (! RExC_in_multi_char_class) {
15013       STRLEN cp_count = utf8_length(foldbuf,
15014              foldbuf + foldlen);
15015       SV* multi_fold = sv_2mortal(newSVpvs(""));
15016
15017       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
15018
15019       multi_char_matches
15020           = add_multi_match(multi_char_matches,
15021               multi_fold,
15022               cp_count);
15023
15024      }
15025
15026      /* This element should not be processed further in this
15027      * class */
15028      element_count--;
15029      value = save_value;
15030      prevvalue = save_prevvalue;
15031      continue;
15032     }
15033    }
15034   }
15035
15036   if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
15037    if (range) {
15038
15039     /* If the range starts above 255, everything is portable and
15040     * likely to be so for any forseeable character set, so don't
15041     * warn. */
15042     if (unicode_range && non_portable_endpoint && prevvalue < 256) {
15043      vWARN(RExC_parse, "Both or neither range ends should be Unicode");
15044     }
15045     else if (prevvalue != value) {
15046
15047      /* Under strict, ranges that stop and/or end in an ASCII
15048      * printable should have each end point be a portable value
15049      * for it (preferably like 'A', but we don't warn if it is
15050      * a (portable) Unicode name or code point), and the range
15051      * must be be all digits or all letters of the same case.
15052      * Otherwise, the range is non-portable and unclear as to
15053      * what it contains */
15054      if ((isPRINT_A(prevvalue) || isPRINT_A(value))
15055       && (non_portable_endpoint
15056        || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
15057         || (isLOWER_A(prevvalue) && isLOWER_A(value))
15058         || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
15059      {
15060       vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
15061      }
15062      else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
15063
15064       /* But the nature of Unicode and languages mean we
15065       * can't do the same checks for above-ASCII ranges,
15066       * except in the case of digit ones.  These should
15067       * contain only digits from the same group of 10.  The
15068       * ASCII case is handled just above.  0x660 is the
15069       * first digit character beyond ASCII.  Hence here, the
15070       * range could be a range of digits.  Find out.  */
15071       IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15072               prevvalue);
15073       IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15074               value);
15075
15076       /* If the range start and final points are in the same
15077       * inversion list element, it means that either both
15078       * are not digits, or both are digits in a consecutive
15079       * sequence of digits.  (So far, Unicode has kept all
15080       * such sequences as distinct groups of 10, but assert
15081       * to make sure).  If the end points are not in the
15082       * same element, neither should be a digit. */
15083       if (index_start == index_final) {
15084        assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
15085        || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15086        - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15087        == 10)
15088        /* But actually Unicode did have one group of 11
15089         * 'digits' in 5.2, so in case we are operating
15090         * on that version, let that pass */
15091        || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15092        - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15093         == 11
15094        && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15095         == 0x19D0)
15096        );
15097       }
15098       else if ((index_start >= 0
15099         && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
15100         || (index_final >= 0
15101          && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
15102       {
15103        vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
15104       }
15105      }
15106     }
15107    }
15108    if ((! range || prevvalue == value) && non_portable_endpoint) {
15109     if (isPRINT_A(value)) {
15110      char literal[3];
15111      unsigned d = 0;
15112      if (isBACKSLASHED_PUNCT(value)) {
15113       literal[d++] = '\\';
15114      }
15115      literal[d++] = (char) value;
15116      literal[d++] = '\0';
15117
15118      vWARN4(RExC_parse,
15119       "\"%.*s\" is more clearly written simply as \"%s\"",
15120       (int) (RExC_parse - rangebegin),
15121       rangebegin,
15122       literal
15123       );
15124     }
15125     else if isMNEMONIC_CNTRL(value) {
15126      vWARN4(RExC_parse,
15127       "\"%.*s\" is more clearly written simply as \"%s\"",
15128       (int) (RExC_parse - rangebegin),
15129       rangebegin,
15130       cntrl_to_mnemonic((char) value)
15131       );
15132     }
15133    }
15134   }
15135
15136   /* Deal with this element of the class */
15137   if (! SIZE_ONLY) {
15138
15139 #ifndef EBCDIC
15140    cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15141              prevvalue, value);
15142 #else
15143    /* On non-ASCII platforms, for ranges that span all of 0..255, and
15144    * ones that don't require special handling, we can just add the
15145    * range like we do for ASCII platforms */
15146    if ((UNLIKELY(prevvalue == 0) && value >= 255)
15147     || ! (prevvalue < 256
15148      && (unicode_range
15149       || (! non_portable_endpoint
15150        && ((isLOWER_A(prevvalue) && isLOWER_A(value))
15151         || (isUPPER_A(prevvalue)
15152          && isUPPER_A(value)))))))
15153    {
15154     cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15155               prevvalue, value);
15156    }
15157    else {
15158     /* Here, requires special handling.  This can be because it is
15159     * a range whose code points are considered to be Unicode, and
15160     * so must be individually translated into native, or because
15161     * its a subrange of 'A-Z' or 'a-z' which each aren't
15162     * contiguous in EBCDIC, but we have defined them to include
15163     * only the "expected" upper or lower case ASCII alphabetics.
15164     * Subranges above 255 are the same in native and Unicode, so
15165     * can be added as a range */
15166     U8 start = NATIVE_TO_LATIN1(prevvalue);
15167     unsigned j;
15168     U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
15169     for (j = start; j <= end; j++) {
15170      cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
15171     }
15172     if (value > 255) {
15173      cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15174                256, value);
15175     }
15176    }
15177 #endif
15178   }
15179
15180   range = 0; /* this range (if it was one) is done now */
15181  } /* End of loop through all the text within the brackets */
15182
15183  /* If anything in the class expands to more than one character, we have to
15184  * deal with them by building up a substitute parse string, and recursively
15185  * calling reg() on it, instead of proceeding */
15186  if (multi_char_matches) {
15187   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
15188   I32 cp_count;
15189   STRLEN len;
15190   char *save_end = RExC_end;
15191   char *save_parse = RExC_parse;
15192   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
15193          a "|" */
15194   I32 reg_flags;
15195
15196   assert(! invert);
15197 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
15198   because too confusing */
15199   if (invert) {
15200    sv_catpv(substitute_parse, "(?:");
15201   }
15202 #endif
15203
15204   /* Look at the longest folds first */
15205   for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
15206
15207    if (av_exists(multi_char_matches, cp_count)) {
15208     AV** this_array_ptr;
15209     SV* this_sequence;
15210
15211     this_array_ptr = (AV**) av_fetch(multi_char_matches,
15212             cp_count, FALSE);
15213     while ((this_sequence = av_pop(*this_array_ptr)) !=
15214                 &PL_sv_undef)
15215     {
15216      if (! first_time) {
15217       sv_catpv(substitute_parse, "|");
15218      }
15219      first_time = FALSE;
15220
15221      sv_catpv(substitute_parse, SvPVX(this_sequence));
15222     }
15223    }
15224   }
15225
15226   /* If the character class contains anything else besides these
15227   * multi-character folds, have to include it in recursive parsing */
15228   if (element_count) {
15229    sv_catpv(substitute_parse, "|[");
15230    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
15231    sv_catpv(substitute_parse, "]");
15232   }
15233
15234   sv_catpv(substitute_parse, ")");
15235 #if 0
15236   if (invert) {
15237    /* This is a way to get the parse to skip forward a whole named
15238    * sequence instead of matching the 2nd character when it fails the
15239    * first */
15240    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
15241   }
15242 #endif
15243
15244   RExC_parse = SvPV(substitute_parse, len);
15245   RExC_end = RExC_parse + len;
15246   RExC_in_multi_char_class = 1;
15247   RExC_override_recoding = 1;
15248   RExC_emit = (regnode *)orig_emit;
15249
15250   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
15251
15252   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
15253
15254   RExC_parse = save_parse;
15255   RExC_end = save_end;
15256   RExC_in_multi_char_class = 0;
15257   RExC_override_recoding = 0;
15258   SvREFCNT_dec_NN(multi_char_matches);
15259   return ret;
15260  }
15261
15262  /* Here, we've gone through the entire class and dealt with multi-char
15263  * folds.  We are now in a position that we can do some checks to see if we
15264  * can optimize this ANYOF node into a simpler one, even in Pass 1.
15265  * Currently we only do two checks:
15266  * 1) is in the unlikely event that the user has specified both, eg. \w and
15267  *    \W under /l, then the class matches everything.  (This optimization
15268  *    is done only to make the optimizer code run later work.)
15269  * 2) if the character class contains only a single element (including a
15270  *    single range), we see if there is an equivalent node for it.
15271  * Other checks are possible */
15272  if (! ret_invlist   /* Can't optimize if returning the constructed
15273       inversion list */
15274   && (UNLIKELY(posixl_matches_all) || element_count == 1))
15275  {
15276   U8 op = END;
15277   U8 arg = 0;
15278
15279   if (UNLIKELY(posixl_matches_all)) {
15280    op = SANY;
15281   }
15282   else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
15283             \w or [:digit:] or \p{foo}
15284             */
15285
15286    /* All named classes are mapped into POSIXish nodes, with its FLAG
15287    * argument giving which class it is */
15288    switch ((I32)namedclass) {
15289     case ANYOF_UNIPROP:
15290      break;
15291
15292     /* These don't depend on the charset modifiers.  They always
15293     * match under /u rules */
15294     case ANYOF_NHORIZWS:
15295     case ANYOF_HORIZWS:
15296      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
15297      /* FALLTHROUGH */
15298
15299     case ANYOF_NVERTWS:
15300     case ANYOF_VERTWS:
15301      op = POSIXU;
15302      goto join_posix;
15303
15304     /* The actual POSIXish node for all the rest depends on the
15305     * charset modifier.  The ones in the first set depend only on
15306     * ASCII or, if available on this platform, also locale */
15307     case ANYOF_ASCII:
15308     case ANYOF_NASCII:
15309 #ifdef HAS_ISASCII
15310      op = (LOC) ? POSIXL : POSIXA;
15311 #else
15312      op = POSIXA;
15313 #endif
15314      goto join_posix;
15315
15316     /* The following don't have any matches in the upper Latin1
15317     * range, hence /d is equivalent to /u for them.  Making it /u
15318     * saves some branches at runtime */
15319     case ANYOF_DIGIT:
15320     case ANYOF_NDIGIT:
15321     case ANYOF_XDIGIT:
15322     case ANYOF_NXDIGIT:
15323      if (! DEPENDS_SEMANTICS) {
15324       goto treat_as_default;
15325      }
15326
15327      op = POSIXU;
15328      goto join_posix;
15329
15330     /* The following change to CASED under /i */
15331     case ANYOF_LOWER:
15332     case ANYOF_NLOWER:
15333     case ANYOF_UPPER:
15334     case ANYOF_NUPPER:
15335      if (FOLD) {
15336       namedclass = ANYOF_CASED + (namedclass % 2);
15337      }
15338      /* FALLTHROUGH */
15339
15340     /* The rest have more possibilities depending on the charset.
15341     * We take advantage of the enum ordering of the charset
15342     * modifiers to get the exact node type, */
15343     default:
15344     treat_as_default:
15345      op = POSIXD + get_regex_charset(RExC_flags);
15346      if (op > POSIXA) { /* /aa is same as /a */
15347       op = POSIXA;
15348      }
15349
15350     join_posix:
15351      /* The odd numbered ones are the complements of the
15352      * next-lower even number one */
15353      if (namedclass % 2 == 1) {
15354       invert = ! invert;
15355       namedclass--;
15356      }
15357      arg = namedclass_to_classnum(namedclass);
15358      break;
15359    }
15360   }
15361   else if (value == prevvalue) {
15362
15363    /* Here, the class consists of just a single code point */
15364
15365    if (invert) {
15366     if (! LOC && value == '\n') {
15367      op = REG_ANY; /* Optimize [^\n] */
15368      *flagp |= HASWIDTH|SIMPLE;
15369      MARK_NAUGHTY(1);
15370     }
15371    }
15372    else if (value < 256 || UTF) {
15373
15374     /* Optimize a single value into an EXACTish node, but not if it
15375     * would require converting the pattern to UTF-8. */
15376     op = compute_EXACTish(pRExC_state);
15377    }
15378   } /* Otherwise is a range */
15379   else if (! LOC) {   /* locale could vary these */
15380    if (prevvalue == '0') {
15381     if (value == '9') {
15382      arg = _CC_DIGIT;
15383      op = POSIXA;
15384     }
15385    }
15386    else if (! FOLD || ASCII_FOLD_RESTRICTED) {
15387     /* We can optimize A-Z or a-z, but not if they could match
15388     * something like the KELVIN SIGN under /i. */
15389     if (prevvalue == 'A') {
15390      if (value == 'Z'
15391 #ifdef EBCDIC
15392       && ! non_portable_endpoint
15393 #endif
15394      ) {
15395       arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
15396       op = POSIXA;
15397      }
15398     }
15399     else if (prevvalue == 'a') {
15400      if (value == 'z'
15401 #ifdef EBCDIC
15402       && ! non_portable_endpoint
15403 #endif
15404      ) {
15405       arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
15406       op = POSIXA;
15407      }
15408     }
15409    }
15410   }
15411
15412   /* Here, we have changed <op> away from its initial value iff we found
15413   * an optimization */
15414   if (op != END) {
15415
15416    /* Throw away this ANYOF regnode, and emit the calculated one,
15417    * which should correspond to the beginning, not current, state of
15418    * the parse */
15419    const char * cur_parse = RExC_parse;
15420    RExC_parse = (char *)orig_parse;
15421    if ( SIZE_ONLY) {
15422     if (! LOC) {
15423
15424      /* To get locale nodes to not use the full ANYOF size would
15425      * require moving the code above that writes the portions
15426      * of it that aren't in other nodes to after this point.
15427      * e.g.  ANYOF_POSIXL_SET */
15428      RExC_size = orig_size;
15429     }
15430    }
15431    else {
15432     RExC_emit = (regnode *)orig_emit;
15433     if (PL_regkind[op] == POSIXD) {
15434      if (op == POSIXL) {
15435       RExC_contains_locale = 1;
15436      }
15437      if (invert) {
15438       op += NPOSIXD - POSIXD;
15439      }
15440     }
15441    }
15442
15443    ret = reg_node(pRExC_state, op);
15444
15445    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
15446     if (! SIZE_ONLY) {
15447      FLAGS(ret) = arg;
15448     }
15449     *flagp |= HASWIDTH|SIMPLE;
15450    }
15451    else if (PL_regkind[op] == EXACT) {
15452     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15453           TRUE /* downgradable to EXACT */
15454           );
15455    }
15456
15457    RExC_parse = (char *) cur_parse;
15458
15459    SvREFCNT_dec(posixes);
15460    SvREFCNT_dec(nposixes);
15461    SvREFCNT_dec(simple_posixes);
15462    SvREFCNT_dec(cp_list);
15463    SvREFCNT_dec(cp_foldable_list);
15464    return ret;
15465   }
15466  }
15467
15468  if (SIZE_ONLY)
15469   return ret;
15470  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
15471
15472  /* If folding, we calculate all characters that could fold to or from the
15473  * ones already on the list */
15474  if (cp_foldable_list) {
15475   if (FOLD) {
15476    UV start, end; /* End points of code point ranges */
15477
15478    SV* fold_intersection = NULL;
15479    SV** use_list;
15480
15481    /* Our calculated list will be for Unicode rules.  For locale
15482    * matching, we have to keep a separate list that is consulted at
15483    * runtime only when the locale indicates Unicode rules.  For
15484    * non-locale, we just use to the general list */
15485    if (LOC) {
15486     use_list = &only_utf8_locale_list;
15487    }
15488    else {
15489     use_list = &cp_list;
15490    }
15491
15492    /* Only the characters in this class that participate in folds need
15493    * be checked.  Get the intersection of this class and all the
15494    * possible characters that are foldable.  This can quickly narrow
15495    * down a large class */
15496    _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
15497         &fold_intersection);
15498
15499    /* The folds for all the Latin1 characters are hard-coded into this
15500    * program, but we have to go out to disk to get the others. */
15501    if (invlist_highest(cp_foldable_list) >= 256) {
15502
15503     /* This is a hash that for a particular fold gives all
15504     * characters that are involved in it */
15505     if (! PL_utf8_foldclosures) {
15506      _load_PL_utf8_foldclosures();
15507     }
15508    }
15509
15510    /* Now look at the foldable characters in this class individually */
15511    invlist_iterinit(fold_intersection);
15512    while (invlist_iternext(fold_intersection, &start, &end)) {
15513     UV j;
15514
15515     /* Look at every character in the range */
15516     for (j = start; j <= end; j++) {
15517      U8 foldbuf[UTF8_MAXBYTES_CASE+1];
15518      STRLEN foldlen;
15519      SV** listp;
15520
15521      if (j < 256) {
15522
15523       if (IS_IN_SOME_FOLD_L1(j)) {
15524
15525        /* ASCII is always matched; non-ASCII is matched
15526        * only under Unicode rules (which could happen
15527        * under /l if the locale is a UTF-8 one */
15528        if (isASCII(j) || ! DEPENDS_SEMANTICS) {
15529         *use_list = add_cp_to_invlist(*use_list,
15530                PL_fold_latin1[j]);
15531        }
15532        else {
15533         depends_list =
15534         add_cp_to_invlist(depends_list,
15535             PL_fold_latin1[j]);
15536        }
15537       }
15538
15539       if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
15540        && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
15541       {
15542        add_above_Latin1_folds(pRExC_state,
15543             (U8) j,
15544             use_list);
15545       }
15546       continue;
15547      }
15548
15549      /* Here is an above Latin1 character.  We don't have the
15550      * rules hard-coded for it.  First, get its fold.  This is
15551      * the simple fold, as the multi-character folds have been
15552      * handled earlier and separated out */
15553      _to_uni_fold_flags(j, foldbuf, &foldlen,
15554               (ASCII_FOLD_RESTRICTED)
15555               ? FOLD_FLAGS_NOMIX_ASCII
15556               : 0);
15557
15558      /* Single character fold of above Latin1.  Add everything in
15559      * its fold closure to the list that this node should match.
15560      * The fold closures data structure is a hash with the keys
15561      * being the UTF-8 of every character that is folded to, like
15562      * 'k', and the values each an array of all code points that
15563      * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
15564      * Multi-character folds are not included */
15565      if ((listp = hv_fetch(PL_utf8_foldclosures,
15566           (char *) foldbuf, foldlen, FALSE)))
15567      {
15568       AV* list = (AV*) *listp;
15569       IV k;
15570       for (k = 0; k <= av_tindex(list); k++) {
15571        SV** c_p = av_fetch(list, k, FALSE);
15572        UV c;
15573        assert(c_p);
15574
15575        c = SvUV(*c_p);
15576
15577        /* /aa doesn't allow folds between ASCII and non- */
15578        if ((ASCII_FOLD_RESTRICTED
15579         && (isASCII(c) != isASCII(j))))
15580        {
15581         continue;
15582        }
15583
15584        /* Folds under /l which cross the 255/256 boundary
15585        * are added to a separate list.  (These are valid
15586        * only when the locale is UTF-8.) */
15587        if (c < 256 && LOC) {
15588         *use_list = add_cp_to_invlist(*use_list, c);
15589         continue;
15590        }
15591
15592        if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15593        {
15594         cp_list = add_cp_to_invlist(cp_list, c);
15595        }
15596        else {
15597         /* Similarly folds involving non-ascii Latin1
15598         * characters under /d are added to their list */
15599         depends_list = add_cp_to_invlist(depends_list,
15600                 c);
15601        }
15602       }
15603      }
15604     }
15605    }
15606    SvREFCNT_dec_NN(fold_intersection);
15607   }
15608
15609   /* Now that we have finished adding all the folds, there is no reason
15610   * to keep the foldable list separate */
15611   _invlist_union(cp_list, cp_foldable_list, &cp_list);
15612   SvREFCNT_dec_NN(cp_foldable_list);
15613  }
15614
15615  /* And combine the result (if any) with any inversion list from posix
15616  * classes.  The lists are kept separate up to now because we don't want to
15617  * fold the classes (folding of those is automatically handled by the swash
15618  * fetching code) */
15619  if (simple_posixes) {
15620   _invlist_union(cp_list, simple_posixes, &cp_list);
15621   SvREFCNT_dec_NN(simple_posixes);
15622  }
15623  if (posixes || nposixes) {
15624   if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15625    /* Under /a and /aa, nothing above ASCII matches these */
15626    _invlist_intersection(posixes,
15627         PL_XPosix_ptrs[_CC_ASCII],
15628         &posixes);
15629   }
15630   if (nposixes) {
15631    if (DEPENDS_SEMANTICS) {
15632     /* Under /d, everything in the upper half of the Latin1 range
15633     * matches these complements */
15634     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15635    }
15636    else if (AT_LEAST_ASCII_RESTRICTED) {
15637     /* Under /a and /aa, everything above ASCII matches these
15638     * complements */
15639     _invlist_union_complement_2nd(nposixes,
15640            PL_XPosix_ptrs[_CC_ASCII],
15641            &nposixes);
15642    }
15643    if (posixes) {
15644     _invlist_union(posixes, nposixes, &posixes);
15645     SvREFCNT_dec_NN(nposixes);
15646    }
15647    else {
15648     posixes = nposixes;
15649    }
15650   }
15651   if (! DEPENDS_SEMANTICS) {
15652    if (cp_list) {
15653     _invlist_union(cp_list, posixes, &cp_list);
15654     SvREFCNT_dec_NN(posixes);
15655    }
15656    else {
15657     cp_list = posixes;
15658    }
15659   }
15660   else {
15661    /* Under /d, we put into a separate list the Latin1 things that
15662    * match only when the target string is utf8 */
15663    SV* nonascii_but_latin1_properties = NULL;
15664    _invlist_intersection(posixes, PL_UpperLatin1,
15665         &nonascii_but_latin1_properties);
15666    _invlist_subtract(posixes, nonascii_but_latin1_properties,
15667        &posixes);
15668    if (cp_list) {
15669     _invlist_union(cp_list, posixes, &cp_list);
15670     SvREFCNT_dec_NN(posixes);
15671    }
15672    else {
15673     cp_list = posixes;
15674    }
15675
15676    if (depends_list) {
15677     _invlist_union(depends_list, nonascii_but_latin1_properties,
15678        &depends_list);
15679     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15680    }
15681    else {
15682     depends_list = nonascii_but_latin1_properties;
15683    }
15684   }
15685  }
15686
15687  /* And combine the result (if any) with any inversion list from properties.
15688  * The lists are kept separate up to now so that we can distinguish the two
15689  * in regards to matching above-Unicode.  A run-time warning is generated
15690  * if a Unicode property is matched against a non-Unicode code point. But,
15691  * we allow user-defined properties to match anything, without any warning,
15692  * and we also suppress the warning if there is a portion of the character
15693  * class that isn't a Unicode property, and which matches above Unicode, \W
15694  * or [\x{110000}] for example.
15695  * (Note that in this case, unlike the Posix one above, there is no
15696  * <depends_list>, because having a Unicode property forces Unicode
15697  * semantics */
15698  if (properties) {
15699   if (cp_list) {
15700
15701    /* If it matters to the final outcome, see if a non-property
15702    * component of the class matches above Unicode.  If so, the
15703    * warning gets suppressed.  This is true even if just a single
15704    * such code point is specified, as though not strictly correct if
15705    * another such code point is matched against, the fact that they
15706    * are using above-Unicode code points indicates they should know
15707    * the issues involved */
15708    if (warn_super) {
15709     warn_super = ! (invert
15710        ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15711    }
15712
15713    _invlist_union(properties, cp_list, &cp_list);
15714    SvREFCNT_dec_NN(properties);
15715   }
15716   else {
15717    cp_list = properties;
15718   }
15719
15720   if (warn_super) {
15721    ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15722   }
15723  }
15724
15725  /* Here, we have calculated what code points should be in the character
15726  * class.
15727  *
15728  * Now we can see about various optimizations.  Fold calculation (which we
15729  * did above) needs to take place before inversion.  Otherwise /[^k]/i
15730  * would invert to include K, which under /i would match k, which it
15731  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
15732  * folded until runtime */
15733
15734  /* If we didn't do folding, it's because some information isn't available
15735  * until runtime; set the run-time fold flag for these.  (We don't have to
15736  * worry about properties folding, as that is taken care of by the swash
15737  * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
15738  * locales, or the class matches at least one 0-255 range code point */
15739  if (LOC && FOLD) {
15740   if (only_utf8_locale_list) {
15741    ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15742   }
15743   else if (cp_list) { /* Look to see if there a 0-255 code point is in
15744        the list */
15745    UV start, end;
15746    invlist_iterinit(cp_list);
15747    if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15748     ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15749    }
15750    invlist_iterfinish(cp_list);
15751   }
15752  }
15753
15754  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15755  * at compile time.  Besides not inverting folded locale now, we can't
15756  * invert if there are things such as \w, which aren't known until runtime
15757  * */
15758  if (cp_list
15759   && invert
15760   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15761   && ! depends_list
15762   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15763  {
15764   _invlist_invert(cp_list);
15765
15766   /* Any swash can't be used as-is, because we've inverted things */
15767   if (swash) {
15768    SvREFCNT_dec_NN(swash);
15769    swash = NULL;
15770   }
15771
15772   /* Clear the invert flag since have just done it here */
15773   invert = FALSE;
15774  }
15775
15776  if (ret_invlist) {
15777   assert(cp_list);
15778
15779   *ret_invlist = cp_list;
15780   SvREFCNT_dec(swash);
15781
15782   /* Discard the generated node */
15783   if (SIZE_ONLY) {
15784    RExC_size = orig_size;
15785   }
15786   else {
15787    RExC_emit = orig_emit;
15788   }
15789   return orig_emit;
15790  }
15791
15792  /* Some character classes are equivalent to other nodes.  Such nodes take
15793  * up less room and generally fewer operations to execute than ANYOF nodes.
15794  * Above, we checked for and optimized into some such equivalents for
15795  * certain common classes that are easy to test.  Getting to this point in
15796  * the code means that the class didn't get optimized there.  Since this
15797  * code is only executed in Pass 2, it is too late to save space--it has
15798  * been allocated in Pass 1, and currently isn't given back.  But turning
15799  * things into an EXACTish node can allow the optimizer to join it to any
15800  * adjacent such nodes.  And if the class is equivalent to things like /./,
15801  * expensive run-time swashes can be avoided.  Now that we have more
15802  * complete information, we can find things necessarily missed by the
15803  * earlier code.  I (khw) am not sure how much to look for here.  It would
15804  * be easy, but perhaps too slow, to check any candidates against all the
15805  * node types they could possibly match using _invlistEQ(). */
15806
15807  if (cp_list
15808   && ! invert
15809   && ! depends_list
15810   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15811   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15812
15813   /* We don't optimize if we are supposed to make sure all non-Unicode
15814    * code points raise a warning, as only ANYOF nodes have this check.
15815    * */
15816   && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15817  {
15818   UV start, end;
15819   U8 op = END;  /* The optimzation node-type */
15820   const char * cur_parse= RExC_parse;
15821
15822   invlist_iterinit(cp_list);
15823   if (! invlist_iternext(cp_list, &start, &end)) {
15824
15825    /* Here, the list is empty.  This happens, for example, when a
15826    * Unicode property is the only thing in the character class, and
15827    * it doesn't match anything.  (perluniprops.pod notes such
15828    * properties) */
15829    op = OPFAIL;
15830    *flagp |= HASWIDTH|SIMPLE;
15831   }
15832   else if (start == end) {    /* The range is a single code point */
15833    if (! invlist_iternext(cp_list, &start, &end)
15834
15835      /* Don't do this optimization if it would require changing
15836      * the pattern to UTF-8 */
15837     && (start < 256 || UTF))
15838    {
15839     /* Here, the list contains a single code point.  Can optimize
15840     * into an EXACTish node */
15841
15842     value = start;
15843
15844     if (! FOLD) {
15845      op = (LOC)
15846       ? EXACTL
15847       : EXACT;
15848     }
15849     else if (LOC) {
15850
15851      /* A locale node under folding with one code point can be
15852      * an EXACTFL, as its fold won't be calculated until
15853      * runtime */
15854      op = EXACTFL;
15855     }
15856     else {
15857
15858      /* Here, we are generally folding, but there is only one
15859      * code point to match.  If we have to, we use an EXACT
15860      * node, but it would be better for joining with adjacent
15861      * nodes in the optimization pass if we used the same
15862      * EXACTFish node that any such are likely to be.  We can
15863      * do this iff the code point doesn't participate in any
15864      * folds.  For example, an EXACTF of a colon is the same as
15865      * an EXACT one, since nothing folds to or from a colon. */
15866      if (value < 256) {
15867       if (IS_IN_SOME_FOLD_L1(value)) {
15868        op = EXACT;
15869       }
15870      }
15871      else {
15872       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15873        op = EXACT;
15874       }
15875      }
15876
15877      /* If we haven't found the node type, above, it means we
15878      * can use the prevailing one */
15879      if (op == END) {
15880       op = compute_EXACTish(pRExC_state);
15881      }
15882     }
15883    }
15884   }
15885   else if (start == 0) {
15886    if (end == UV_MAX) {
15887     op = SANY;
15888     *flagp |= HASWIDTH|SIMPLE;
15889     MARK_NAUGHTY(1);
15890    }
15891    else if (end == '\n' - 1
15892      && invlist_iternext(cp_list, &start, &end)
15893      && start == '\n' + 1 && end == UV_MAX)
15894    {
15895     op = REG_ANY;
15896     *flagp |= HASWIDTH|SIMPLE;
15897     MARK_NAUGHTY(1);
15898    }
15899   }
15900   invlist_iterfinish(cp_list);
15901
15902   if (op != END) {
15903    RExC_parse = (char *)orig_parse;
15904    RExC_emit = (regnode *)orig_emit;
15905
15906    ret = reg_node(pRExC_state, op);
15907
15908    RExC_parse = (char *)cur_parse;
15909
15910    if (PL_regkind[op] == EXACT) {
15911     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15912           TRUE /* downgradable to EXACT */
15913           );
15914    }
15915
15916    SvREFCNT_dec_NN(cp_list);
15917    return ret;
15918   }
15919  }
15920
15921  /* Here, <cp_list> contains all the code points we can determine at
15922  * compile time that match under all conditions.  Go through it, and
15923  * for things that belong in the bitmap, put them there, and delete from
15924  * <cp_list>.  While we are at it, see if everything above 255 is in the
15925  * list, and if so, set a flag to speed up execution */
15926
15927  populate_ANYOF_from_invlist(ret, &cp_list);
15928
15929  if (invert) {
15930   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15931  }
15932
15933  /* Here, the bitmap has been populated with all the Latin1 code points that
15934  * always match.  Can now add to the overall list those that match only
15935  * when the target string is UTF-8 (<depends_list>). */
15936  if (depends_list) {
15937   if (cp_list) {
15938    _invlist_union(cp_list, depends_list, &cp_list);
15939    SvREFCNT_dec_NN(depends_list);
15940   }
15941   else {
15942    cp_list = depends_list;
15943   }
15944   ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15945  }
15946
15947  /* If there is a swash and more than one element, we can't use the swash in
15948  * the optimization below. */
15949  if (swash && element_count > 1) {
15950   SvREFCNT_dec_NN(swash);
15951   swash = NULL;
15952  }
15953
15954  /* Note that the optimization of using 'swash' if it is the only thing in
15955  * the class doesn't have us change swash at all, so it can include things
15956  * that are also in the bitmap; otherwise we have purposely deleted that
15957  * duplicate information */
15958  set_ANYOF_arg(pRExC_state, ret, cp_list,
15959     (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15960     ? listsv : NULL,
15961     only_utf8_locale_list,
15962     swash, has_user_defined_property);
15963
15964  *flagp |= HASWIDTH|SIMPLE;
15965
15966  if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15967   RExC_contains_locale = 1;
15968  }
15969
15970  return ret;
15971 }
15972
15973 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15974
15975 STATIC void
15976 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15977     regnode* const node,
15978     SV* const cp_list,
15979     SV* const runtime_defns,
15980     SV* const only_utf8_locale_list,
15981     SV* const swash,
15982     const bool has_user_defined_property)
15983 {
15984  /* Sets the arg field of an ANYOF-type node 'node', using information about
15985  * the node passed-in.  If there is nothing outside the node's bitmap, the
15986  * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
15987  * the count returned by add_data(), having allocated and stored an array,
15988  * av, that that count references, as follows:
15989  *  av[0] stores the character class description in its textual form.
15990  *        This is used later (regexec.c:Perl_regclass_swash()) to
15991  *        initialize the appropriate swash, and is also useful for dumping
15992  *        the regnode.  This is set to &PL_sv_undef if the textual
15993  *        description is not needed at run-time (as happens if the other
15994  *        elements completely define the class)
15995  *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15996  *        computed from av[0].  But if no further computation need be done,
15997  *        the swash is stored here now (and av[0] is &PL_sv_undef).
15998  *  av[2] stores the inversion list of code points that match only if the
15999  *        current locale is UTF-8
16000  *  av[3] stores the cp_list inversion list for use in addition or instead
16001  *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
16002  *        (Otherwise everything needed is already in av[0] and av[1])
16003  *  av[4] is set if any component of the class is from a user-defined
16004  *        property; used only if av[3] exists */
16005
16006  UV n;
16007
16008  PERL_ARGS_ASSERT_SET_ANYOF_ARG;
16009
16010  if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
16011   assert(! (ANYOF_FLAGS(node)
16012     & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16013      |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
16014   ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
16015  }
16016  else {
16017   AV * const av = newAV();
16018   SV *rv;
16019
16020   assert(ANYOF_FLAGS(node)
16021    & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16022     |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
16023
16024   av_store(av, 0, (runtime_defns)
16025       ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
16026   if (swash) {
16027    assert(cp_list);
16028    av_store(av, 1, swash);
16029    SvREFCNT_dec_NN(cp_list);
16030   }
16031   else {
16032    av_store(av, 1, &PL_sv_undef);
16033    if (cp_list) {
16034     av_store(av, 3, cp_list);
16035     av_store(av, 4, newSVuv(has_user_defined_property));
16036    }
16037   }
16038
16039   if (only_utf8_locale_list) {
16040    av_store(av, 2, only_utf8_locale_list);
16041   }
16042   else {
16043    av_store(av, 2, &PL_sv_undef);
16044   }
16045
16046   rv = newRV_noinc(MUTABLE_SV(av));
16047   n = add_data(pRExC_state, STR_WITH_LEN("s"));
16048   RExC_rxi->data->data[n] = (void*)rv;
16049   ARG_SET(node, n);
16050  }
16051 }
16052
16053 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
16054 SV *
16055 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
16056           const regnode* node,
16057           bool doinit,
16058           SV** listsvp,
16059           SV** only_utf8_locale_ptr,
16060           SV*  exclude_list)
16061
16062 {
16063  /* For internal core use only.
16064  * Returns the swash for the input 'node' in the regex 'prog'.
16065  * If <doinit> is 'true', will attempt to create the swash if not already
16066  *   done.
16067  * If <listsvp> is non-null, will return the printable contents of the
16068  *    swash.  This can be used to get debugging information even before the
16069  *    swash exists, by calling this function with 'doinit' set to false, in
16070  *    which case the components that will be used to eventually create the
16071  *    swash are returned  (in a printable form).
16072  * If <exclude_list> is not NULL, it is an inversion list of things to
16073  *    exclude from what's returned in <listsvp>.
16074  * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
16075  * that, in spite of this function's name, the swash it returns may include
16076  * the bitmap data as well */
16077
16078  SV *sw  = NULL;
16079  SV *si  = NULL;         /* Input swash initialization string */
16080  SV*  invlist = NULL;
16081
16082  RXi_GET_DECL(prog,progi);
16083  const struct reg_data * const data = prog ? progi->data : NULL;
16084
16085  PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
16086
16087  assert(ANYOF_FLAGS(node)
16088   & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16089   |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
16090
16091  if (data && data->count) {
16092   const U32 n = ARG(node);
16093
16094   if (data->what[n] == 's') {
16095    SV * const rv = MUTABLE_SV(data->data[n]);
16096    AV * const av = MUTABLE_AV(SvRV(rv));
16097    SV **const ary = AvARRAY(av);
16098    U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
16099
16100    si = *ary; /* ary[0] = the string to initialize the swash with */
16101
16102    /* Elements 3 and 4 are either both present or both absent. [3] is
16103    * any inversion list generated at compile time; [4] indicates if
16104    * that inversion list has any user-defined properties in it. */
16105    if (av_tindex(av) >= 2) {
16106     if (only_utf8_locale_ptr
16107      && ary[2]
16108      && ary[2] != &PL_sv_undef)
16109     {
16110      *only_utf8_locale_ptr = ary[2];
16111     }
16112     else {
16113      assert(only_utf8_locale_ptr);
16114      *only_utf8_locale_ptr = NULL;
16115     }
16116
16117     if (av_tindex(av) >= 3) {
16118      invlist = ary[3];
16119      if (SvUV(ary[4])) {
16120       swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
16121      }
16122     }
16123     else {
16124      invlist = NULL;
16125     }
16126    }
16127
16128    /* Element [1] is reserved for the set-up swash.  If already there,
16129    * return it; if not, create it and store it there */
16130    if (ary[1] && SvROK(ary[1])) {
16131     sw = ary[1];
16132    }
16133    else if (doinit && ((si && si != &PL_sv_undef)
16134         || (invlist && invlist != &PL_sv_undef))) {
16135     assert(si);
16136     sw = _core_swash_init("utf8", /* the utf8 package */
16137          "", /* nameless */
16138          si,
16139          1, /* binary */
16140          0, /* not from tr/// */
16141          invlist,
16142          &swash_init_flags);
16143     (void)av_store(av, 1, sw);
16144    }
16145   }
16146  }
16147
16148  /* If requested, return a printable version of what this swash matches */
16149  if (listsvp) {
16150   SV* matches_string = newSVpvs("");
16151
16152   /* The swash should be used, if possible, to get the data, as it
16153   * contains the resolved data.  But this function can be called at
16154   * compile-time, before everything gets resolved, in which case we
16155   * return the currently best available information, which is the string
16156   * that will eventually be used to do that resolving, 'si' */
16157   if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
16158    && (si && si != &PL_sv_undef))
16159   {
16160    sv_catsv(matches_string, si);
16161   }
16162
16163   /* Add the inversion list to whatever we have.  This may have come from
16164   * the swash, or from an input parameter */
16165   if (invlist) {
16166    if (exclude_list) {
16167     SV* clone = invlist_clone(invlist);
16168     _invlist_subtract(clone, exclude_list, &clone);
16169     sv_catsv(matches_string, _invlist_contents(clone));
16170     SvREFCNT_dec_NN(clone);
16171    }
16172    else {
16173     sv_catsv(matches_string, _invlist_contents(invlist));
16174    }
16175   }
16176   *listsvp = matches_string;
16177  }
16178
16179  return sw;
16180 }
16181 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
16182
16183 /* reg_skipcomment()
16184
16185    Absorbs an /x style # comment from the input stream,
16186    returning a pointer to the first character beyond the comment, or if the
16187    comment terminates the pattern without anything following it, this returns
16188    one past the final character of the pattern (in other words, RExC_end) and
16189    sets the REG_RUN_ON_COMMENT_SEEN flag.
16190
16191    Note it's the callers responsibility to ensure that we are
16192    actually in /x mode
16193
16194 */
16195
16196 PERL_STATIC_INLINE char*
16197 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
16198 {
16199  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
16200
16201  assert(*p == '#');
16202
16203  while (p < RExC_end) {
16204   if (*(++p) == '\n') {
16205    return p+1;
16206   }
16207  }
16208
16209  /* we ran off the end of the pattern without ending the comment, so we have
16210  * to add an \n when wrapping */
16211  RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
16212  return p;
16213 }
16214
16215 /* nextchar()
16216
16217    Advances the parse position, and optionally absorbs
16218    "whitespace" from the inputstream.
16219
16220    Without /x "whitespace" means (?#...) style comments only,
16221    with /x this means (?#...) and # comments and whitespace proper.
16222
16223    Returns the RExC_parse point from BEFORE the scan occurs.
16224
16225    This is the /x friendly way of saying RExC_parse++.
16226 */
16227
16228 STATIC char*
16229 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
16230 {
16231  char* const retval = RExC_parse++;
16232
16233  PERL_ARGS_ASSERT_NEXTCHAR;
16234
16235  for (;;) {
16236   if (RExC_end - RExC_parse >= 3
16237    && *RExC_parse == '('
16238    && RExC_parse[1] == '?'
16239    && RExC_parse[2] == '#')
16240   {
16241    while (*RExC_parse != ')') {
16242     if (RExC_parse == RExC_end)
16243      FAIL("Sequence (?#... not terminated");
16244     RExC_parse++;
16245    }
16246    RExC_parse++;
16247    continue;
16248   }
16249   if (RExC_flags & RXf_PMf_EXTENDED) {
16250    char * p = regpatws(pRExC_state, RExC_parse,
16251           TRUE); /* means recognize comments */
16252    if (p != RExC_parse) {
16253     RExC_parse = p;
16254     continue;
16255    }
16256   }
16257   return retval;
16258  }
16259 }
16260
16261 STATIC regnode *
16262 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
16263 {
16264  /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
16265  * space.  In pass1, it aligns and increments RExC_size; in pass2,
16266  * RExC_emit */
16267
16268  regnode * const ret = RExC_emit;
16269  GET_RE_DEBUG_FLAGS_DECL;
16270
16271  PERL_ARGS_ASSERT_REGNODE_GUTS;
16272
16273  assert(extra_size >= regarglen[op]);
16274
16275  if (SIZE_ONLY) {
16276   SIZE_ALIGN(RExC_size);
16277   RExC_size += 1 + extra_size;
16278   return(ret);
16279  }
16280  if (RExC_emit >= RExC_emit_bound)
16281   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
16282     op, (void*)RExC_emit, (void*)RExC_emit_bound);
16283
16284  NODE_ALIGN_FILL(ret);
16285 #ifndef RE_TRACK_PATTERN_OFFSETS
16286  PERL_UNUSED_ARG(name);
16287 #else
16288  if (RExC_offsets) {         /* MJD */
16289   MJD_OFFSET_DEBUG(
16290    ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
16291    name, __LINE__,
16292    PL_reg_name[op],
16293    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
16294     ? "Overwriting end of array!\n" : "OK",
16295    (UV)(RExC_emit - RExC_emit_start),
16296    (UV)(RExC_parse - RExC_start),
16297    (UV)RExC_offsets[0]));
16298   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
16299  }
16300 #endif
16301  return(ret);
16302 }
16303
16304 /*
16305 - reg_node - emit a node
16306 */
16307 STATIC regnode *   /* Location. */
16308 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
16309 {
16310  regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
16311
16312  PERL_ARGS_ASSERT_REG_NODE;
16313
16314  assert(regarglen[op] == 0);
16315
16316  if (PASS2) {
16317   regnode *ptr = ret;
16318   FILL_ADVANCE_NODE(ptr, op);
16319  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
16320   RExC_emit = ptr;
16321  }
16322  return(ret);
16323 }
16324
16325 /*
16326 - reganode - emit a node with an argument
16327 */
16328 STATIC regnode *   /* Location. */
16329 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
16330 {
16331  regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
16332
16333  PERL_ARGS_ASSERT_REGANODE;
16334
16335  assert(regarglen[op] == 1);
16336
16337  if (PASS2) {
16338   regnode *ptr = ret;
16339   FILL_ADVANCE_NODE_ARG(ptr, op, arg);
16340  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
16341   RExC_emit = ptr;
16342  }
16343  return(ret);
16344 }
16345
16346 STATIC regnode *
16347 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
16348 {
16349  /* emit a node with U32 and I32 arguments */
16350
16351  regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
16352
16353  PERL_ARGS_ASSERT_REG2LANODE;
16354
16355  assert(regarglen[op] == 2);
16356
16357  if (PASS2) {
16358   regnode *ptr = ret;
16359   FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
16360   RExC_emit = ptr;
16361  }
16362  return(ret);
16363 }
16364
16365 /*
16366 - reginsert - insert an operator in front of already-emitted operand
16367 *
16368 * Means relocating the operand.
16369 */
16370 STATIC void
16371 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
16372 {
16373  regnode *src;
16374  regnode *dst;
16375  regnode *place;
16376  const int offset = regarglen[(U8)op];
16377  const int size = NODE_STEP_REGNODE + offset;
16378  GET_RE_DEBUG_FLAGS_DECL;
16379
16380  PERL_ARGS_ASSERT_REGINSERT;
16381  PERL_UNUSED_CONTEXT;
16382  PERL_UNUSED_ARG(depth);
16383 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
16384  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
16385  if (SIZE_ONLY) {
16386   RExC_size += size;
16387   return;
16388  }
16389
16390  src = RExC_emit;
16391  RExC_emit += size;
16392  dst = RExC_emit;
16393  if (RExC_open_parens) {
16394   int paren;
16395   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
16396   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
16397    if ( RExC_open_parens[paren] >= opnd ) {
16398     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
16399     RExC_open_parens[paren] += size;
16400    } else {
16401     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
16402    }
16403    if ( RExC_close_parens[paren] >= opnd ) {
16404     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
16405     RExC_close_parens[paren] += size;
16406    } else {
16407     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
16408    }
16409   }
16410  }
16411
16412  while (src > opnd) {
16413   StructCopy(--src, --dst, regnode);
16414 #ifdef RE_TRACK_PATTERN_OFFSETS
16415   if (RExC_offsets) {     /* MJD 20010112 */
16416    MJD_OFFSET_DEBUG(
16417     ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
16418     "reg_insert",
16419     __LINE__,
16420     PL_reg_name[op],
16421     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
16422      ? "Overwriting end of array!\n" : "OK",
16423     (UV)(src - RExC_emit_start),
16424     (UV)(dst - RExC_emit_start),
16425     (UV)RExC_offsets[0]));
16426    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
16427    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
16428   }
16429 #endif
16430  }
16431
16432
16433  place = opnd;  /* Op node, where operand used to be. */
16434 #ifdef RE_TRACK_PATTERN_OFFSETS
16435  if (RExC_offsets) {         /* MJD */
16436   MJD_OFFSET_DEBUG(
16437    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
16438    "reginsert",
16439    __LINE__,
16440    PL_reg_name[op],
16441    (UV)(place - RExC_emit_start) > RExC_offsets[0]
16442    ? "Overwriting end of array!\n" : "OK",
16443    (UV)(place - RExC_emit_start),
16444    (UV)(RExC_parse - RExC_start),
16445    (UV)RExC_offsets[0]));
16446   Set_Node_Offset(place, RExC_parse);
16447   Set_Node_Length(place, 1);
16448  }
16449 #endif
16450  src = NEXTOPER(place);
16451  FILL_ADVANCE_NODE(place, op);
16452  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
16453  Zero(src, offset, regnode);
16454 }
16455
16456 /*
16457 - regtail - set the next-pointer at the end of a node chain of p to val.
16458 - SEE ALSO: regtail_study
16459 */
16460 /* TODO: All three parms should be const */
16461 STATIC void
16462 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16463     const regnode *val,U32 depth)
16464 {
16465  regnode *scan;
16466  GET_RE_DEBUG_FLAGS_DECL;
16467
16468  PERL_ARGS_ASSERT_REGTAIL;
16469 #ifndef DEBUGGING
16470  PERL_UNUSED_ARG(depth);
16471 #endif
16472
16473  if (SIZE_ONLY)
16474   return;
16475
16476  /* Find last node. */
16477  scan = p;
16478  for (;;) {
16479   regnode * const temp = regnext(scan);
16480   DEBUG_PARSE_r({
16481    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
16482    regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16483    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
16484     SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
16485      (temp == NULL ? "->" : ""),
16486      (temp == NULL ? PL_reg_name[OP(val)] : "")
16487    );
16488   });
16489   if (temp == NULL)
16490    break;
16491   scan = temp;
16492  }
16493
16494  if (reg_off_by_arg[OP(scan)]) {
16495   ARG_SET(scan, val - scan);
16496  }
16497  else {
16498   NEXT_OFF(scan) = val - scan;
16499  }
16500 }
16501
16502 #ifdef DEBUGGING
16503 /*
16504 - regtail_study - set the next-pointer at the end of a node chain of p to val.
16505 - Look for optimizable sequences at the same time.
16506 - currently only looks for EXACT chains.
16507
16508 This is experimental code. The idea is to use this routine to perform
16509 in place optimizations on branches and groups as they are constructed,
16510 with the long term intention of removing optimization from study_chunk so
16511 that it is purely analytical.
16512
16513 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
16514 to control which is which.
16515
16516 */
16517 /* TODO: All four parms should be const */
16518
16519 STATIC U8
16520 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16521      const regnode *val,U32 depth)
16522 {
16523  regnode *scan;
16524  U8 exact = PSEUDO;
16525 #ifdef EXPERIMENTAL_INPLACESCAN
16526  I32 min = 0;
16527 #endif
16528  GET_RE_DEBUG_FLAGS_DECL;
16529
16530  PERL_ARGS_ASSERT_REGTAIL_STUDY;
16531
16532
16533  if (SIZE_ONLY)
16534   return exact;
16535
16536  /* Find last node. */
16537
16538  scan = p;
16539  for (;;) {
16540   regnode * const temp = regnext(scan);
16541 #ifdef EXPERIMENTAL_INPLACESCAN
16542   if (PL_regkind[OP(scan)] == EXACT) {
16543    bool unfolded_multi_char; /* Unexamined in this routine */
16544    if (join_exact(pRExC_state, scan, &min,
16545       &unfolded_multi_char, 1, val, depth+1))
16546     return EXACT;
16547   }
16548 #endif
16549   if ( exact ) {
16550    switch (OP(scan)) {
16551     case EXACT:
16552     case EXACTL:
16553     case EXACTF:
16554     case EXACTFA_NO_TRIE:
16555     case EXACTFA:
16556     case EXACTFU:
16557     case EXACTFLU8:
16558     case EXACTFU_SS:
16559     case EXACTFL:
16560       if( exact == PSEUDO )
16561        exact= OP(scan);
16562       else if ( exact != OP(scan) )
16563        exact= 0;
16564     case NOTHING:
16565      break;
16566     default:
16567      exact= 0;
16568    }
16569   }
16570   DEBUG_PARSE_r({
16571    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
16572    regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16573    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
16574     SvPV_nolen_const(RExC_mysv),
16575     REG_NODE_NUM(scan),
16576     PL_reg_name[exact]);
16577   });
16578   if (temp == NULL)
16579    break;
16580   scan = temp;
16581  }
16582  DEBUG_PARSE_r({
16583   DEBUG_PARSE_MSG("");
16584   regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
16585   PerlIO_printf(Perl_debug_log,
16586      "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
16587      SvPV_nolen_const(RExC_mysv),
16588      (IV)REG_NODE_NUM(val),
16589      (IV)(val - scan)
16590   );
16591  });
16592  if (reg_off_by_arg[OP(scan)]) {
16593   ARG_SET(scan, val - scan);
16594  }
16595  else {
16596   NEXT_OFF(scan) = val - scan;
16597  }
16598
16599  return exact;
16600 }
16601 #endif
16602
16603 /*
16604  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
16605  */
16606 #ifdef DEBUGGING
16607
16608 static void
16609 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
16610 {
16611  int bit;
16612  int set=0;
16613
16614  ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16615
16616  for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
16617   if (flags & (1<<bit)) {
16618    if (!set++ && lead)
16619     PerlIO_printf(Perl_debug_log, "%s",lead);
16620    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16621   }
16622  }
16623  if (lead)  {
16624   if (set)
16625    PerlIO_printf(Perl_debug_log, "\n");
16626   else
16627    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16628  }
16629 }
16630
16631 static void
16632 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16633 {
16634  int bit;
16635  int set=0;
16636  regex_charset cs;
16637
16638  ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16639
16640  for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16641   if (flags & (1<<bit)) {
16642    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
16643     continue;
16644    }
16645    if (!set++ && lead)
16646     PerlIO_printf(Perl_debug_log, "%s",lead);
16647    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16648   }
16649  }
16650  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16651    if (!set++ && lead) {
16652     PerlIO_printf(Perl_debug_log, "%s",lead);
16653    }
16654    switch (cs) {
16655     case REGEX_UNICODE_CHARSET:
16656      PerlIO_printf(Perl_debug_log, "UNICODE");
16657      break;
16658     case REGEX_LOCALE_CHARSET:
16659      PerlIO_printf(Perl_debug_log, "LOCALE");
16660      break;
16661     case REGEX_ASCII_RESTRICTED_CHARSET:
16662      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16663      break;
16664     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16665      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16666      break;
16667     default:
16668      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16669      break;
16670    }
16671  }
16672  if (lead)  {
16673   if (set)
16674    PerlIO_printf(Perl_debug_log, "\n");
16675   else
16676    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16677  }
16678 }
16679 #endif
16680
16681 void
16682 Perl_regdump(pTHX_ const regexp *r)
16683 {
16684 #ifdef DEBUGGING
16685  SV * const sv = sv_newmortal();
16686  SV *dsv= sv_newmortal();
16687  RXi_GET_DECL(r,ri);
16688  GET_RE_DEBUG_FLAGS_DECL;
16689
16690  PERL_ARGS_ASSERT_REGDUMP;
16691
16692  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16693
16694  /* Header fields of interest. */
16695  if (r->anchored_substr) {
16696   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16697    RE_SV_DUMPLEN(r->anchored_substr), 30);
16698   PerlIO_printf(Perl_debug_log,
16699      "anchored %s%s at %"IVdf" ",
16700      s, RE_SV_TAIL(r->anchored_substr),
16701      (IV)r->anchored_offset);
16702  } else if (r->anchored_utf8) {
16703   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16704    RE_SV_DUMPLEN(r->anchored_utf8), 30);
16705   PerlIO_printf(Perl_debug_log,
16706      "anchored utf8 %s%s at %"IVdf" ",
16707      s, RE_SV_TAIL(r->anchored_utf8),
16708      (IV)r->anchored_offset);
16709  }
16710  if (r->float_substr) {
16711   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16712    RE_SV_DUMPLEN(r->float_substr), 30);
16713   PerlIO_printf(Perl_debug_log,
16714      "floating %s%s at %"IVdf"..%"UVuf" ",
16715      s, RE_SV_TAIL(r->float_substr),
16716      (IV)r->float_min_offset, (UV)r->float_max_offset);
16717  } else if (r->float_utf8) {
16718   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16719    RE_SV_DUMPLEN(r->float_utf8), 30);
16720   PerlIO_printf(Perl_debug_log,
16721      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16722      s, RE_SV_TAIL(r->float_utf8),
16723      (IV)r->float_min_offset, (UV)r->float_max_offset);
16724  }
16725  if (r->check_substr || r->check_utf8)
16726   PerlIO_printf(Perl_debug_log,
16727      (const char *)
16728      (r->check_substr == r->float_substr
16729      && r->check_utf8 == r->float_utf8
16730      ? "(checking floating" : "(checking anchored"));
16731  if (r->intflags & PREGf_NOSCAN)
16732   PerlIO_printf(Perl_debug_log, " noscan");
16733  if (r->extflags & RXf_CHECK_ALL)
16734   PerlIO_printf(Perl_debug_log, " isall");
16735  if (r->check_substr || r->check_utf8)
16736   PerlIO_printf(Perl_debug_log, ") ");
16737
16738  if (ri->regstclass) {
16739   regprop(r, sv, ri->regstclass, NULL, NULL);
16740   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16741  }
16742  if (r->intflags & PREGf_ANCH) {
16743   PerlIO_printf(Perl_debug_log, "anchored");
16744   if (r->intflags & PREGf_ANCH_MBOL)
16745    PerlIO_printf(Perl_debug_log, "(MBOL)");
16746   if (r->intflags & PREGf_ANCH_SBOL)
16747    PerlIO_printf(Perl_debug_log, "(SBOL)");
16748   if (r->intflags & PREGf_ANCH_GPOS)
16749    PerlIO_printf(Perl_debug_log, "(GPOS)");
16750   (void)PerlIO_putc(Perl_debug_log, ' ');
16751  }
16752  if (r->intflags & PREGf_GPOS_SEEN)
16753   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16754  if (r->intflags & PREGf_SKIP)
16755   PerlIO_printf(Perl_debug_log, "plus ");
16756  if (r->intflags & PREGf_IMPLICIT)
16757   PerlIO_printf(Perl_debug_log, "implicit ");
16758  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16759  if (r->extflags & RXf_EVAL_SEEN)
16760   PerlIO_printf(Perl_debug_log, "with eval ");
16761  PerlIO_printf(Perl_debug_log, "\n");
16762  DEBUG_FLAGS_r({
16763   regdump_extflags("r->extflags: ",r->extflags);
16764   regdump_intflags("r->intflags: ",r->intflags);
16765  });
16766 #else
16767  PERL_ARGS_ASSERT_REGDUMP;
16768  PERL_UNUSED_CONTEXT;
16769  PERL_UNUSED_ARG(r);
16770 #endif /* DEBUGGING */
16771 }
16772
16773 /*
16774 - regprop - printable representation of opcode, with run time support
16775 */
16776
16777 void
16778 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16779 {
16780 #ifdef DEBUGGING
16781  int k;
16782
16783  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16784  static const char * const anyofs[] = {
16785 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16786  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
16787  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
16788  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
16789  || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
16790   #error Need to adjust order of anyofs[]
16791 #endif
16792   "\\w",
16793   "\\W",
16794   "\\d",
16795   "\\D",
16796   "[:alpha:]",
16797   "[:^alpha:]",
16798   "[:lower:]",
16799   "[:^lower:]",
16800   "[:upper:]",
16801   "[:^upper:]",
16802   "[:punct:]",
16803   "[:^punct:]",
16804   "[:print:]",
16805   "[:^print:]",
16806   "[:alnum:]",
16807   "[:^alnum:]",
16808   "[:graph:]",
16809   "[:^graph:]",
16810   "[:cased:]",
16811   "[:^cased:]",
16812   "\\s",
16813   "\\S",
16814   "[:blank:]",
16815   "[:^blank:]",
16816   "[:xdigit:]",
16817   "[:^xdigit:]",
16818   "[:cntrl:]",
16819   "[:^cntrl:]",
16820   "[:ascii:]",
16821   "[:^ascii:]",
16822   "\\v",
16823   "\\V"
16824  };
16825  RXi_GET_DECL(prog,progi);
16826  GET_RE_DEBUG_FLAGS_DECL;
16827
16828  PERL_ARGS_ASSERT_REGPROP;
16829
16830  sv_setpvn(sv, "", 0);
16831
16832  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
16833   /* It would be nice to FAIL() here, but this may be called from
16834   regexec.c, and it would be hard to supply pRExC_state. */
16835   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16836            (int)OP(o), (int)REGNODE_MAX);
16837  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16838
16839  k = PL_regkind[OP(o)];
16840
16841  if (k == EXACT) {
16842   sv_catpvs(sv, " ");
16843   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16844   * is a crude hack but it may be the best for now since
16845   * we have no flag "this EXACTish node was UTF-8"
16846   * --jhi */
16847   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16848     PERL_PV_ESCAPE_UNI_DETECT |
16849     PERL_PV_ESCAPE_NONASCII   |
16850     PERL_PV_PRETTY_ELLIPSES   |
16851     PERL_PV_PRETTY_LTGT       |
16852     PERL_PV_PRETTY_NOCLEAR
16853     );
16854  } else if (k == TRIE) {
16855   /* print the details of the trie in dumpuntil instead, as
16856   * progi->data isn't available here */
16857   const char op = OP(o);
16858   const U32 n = ARG(o);
16859   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16860    (reg_ac_data *)progi->data->data[n] :
16861    NULL;
16862   const reg_trie_data * const trie
16863    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16864
16865   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16866   DEBUG_TRIE_COMPILE_r(
16867   Perl_sv_catpvf(aTHX_ sv,
16868    "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16869    (UV)trie->startstate,
16870    (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16871    (UV)trie->wordcount,
16872    (UV)trie->minlen,
16873    (UV)trie->maxlen,
16874    (UV)TRIE_CHARCOUNT(trie),
16875    (UV)trie->uniquecharcount
16876   );
16877   );
16878   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16879    sv_catpvs(sv, "[");
16880    (void) put_charclass_bitmap_innards(sv,
16881             (IS_ANYOF_TRIE(op))
16882             ? ANYOF_BITMAP(o)
16883             : TRIE_BITMAP(trie),
16884             NULL);
16885    sv_catpvs(sv, "]");
16886   }
16887
16888  } else if (k == CURLY) {
16889   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16890    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16891   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16892  }
16893  else if (k == WHILEM && o->flags)   /* Ordinal/of */
16894   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16895  else if (k == REF || k == OPEN || k == CLOSE
16896    || k == GROUPP || OP(o)==ACCEPT)
16897  {
16898   AV *name_list= NULL;
16899   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
16900   if ( RXp_PAREN_NAMES(prog) ) {
16901    name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16902   } else if ( pRExC_state ) {
16903    name_list= RExC_paren_name_list;
16904   }
16905   if (name_list) {
16906    if ( k != REF || (OP(o) < NREF)) {
16907     SV **name= av_fetch(name_list, ARG(o), 0 );
16908     if (name)
16909      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16910    }
16911    else {
16912     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16913     I32 *nums=(I32*)SvPVX(sv_dat);
16914     SV **name= av_fetch(name_list, nums[0], 0 );
16915     I32 n;
16916     if (name) {
16917      for ( n=0; n<SvIVX(sv_dat); n++ ) {
16918       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16919          (n ? "," : ""), (IV)nums[n]);
16920      }
16921      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16922     }
16923    }
16924   }
16925   if ( k == REF && reginfo) {
16926    U32 n = ARG(o);  /* which paren pair */
16927    I32 ln = prog->offs[n].start;
16928    if (prog->lastparen < n || ln == -1)
16929     Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16930    else if (ln == prog->offs[n].end)
16931     Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16932    else {
16933     const char *s = reginfo->strbeg + ln;
16934     Perl_sv_catpvf(aTHX_ sv, ": ");
16935     Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16936      PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16937    }
16938   }
16939  } else if (k == GOSUB) {
16940   AV *name_list= NULL;
16941   if ( RXp_PAREN_NAMES(prog) ) {
16942    name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16943   } else if ( pRExC_state ) {
16944    name_list= RExC_paren_name_list;
16945   }
16946
16947   /* Paren and offset */
16948   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16949   if (name_list) {
16950    SV **name= av_fetch(name_list, ARG(o), 0 );
16951    if (name)
16952     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16953   }
16954  }
16955  else if (k == VERB) {
16956   if (!o->flags)
16957    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16958       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16959  } else if (k == LOGICAL)
16960   /* 2: embedded, otherwise 1 */
16961   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16962  else if (k == ANYOF) {
16963   const U8 flags = ANYOF_FLAGS(o);
16964   int do_sep = 0;
16965   SV* bitmap_invlist;  /* Will hold what the bit map contains */
16966
16967
16968   if (OP(o) == ANYOFL)
16969    sv_catpvs(sv, "{loc}");
16970   if (flags & ANYOF_LOC_FOLD)
16971    sv_catpvs(sv, "{i}");
16972   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16973   if (flags & ANYOF_INVERT)
16974    sv_catpvs(sv, "^");
16975
16976   /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16977   * */
16978   do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16979                &bitmap_invlist);
16980
16981   /* output any special charclass tests (used entirely under use
16982   * locale) * */
16983   if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16984    int i;
16985    for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16986     if (ANYOF_POSIXL_TEST(o,i)) {
16987      sv_catpv(sv, anyofs[i]);
16988      do_sep = 1;
16989     }
16990    }
16991   }
16992
16993   if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16994      |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16995      |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16996      |ANYOF_LOC_FOLD)))
16997   {
16998    if (do_sep) {
16999     Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
17000     if (flags & ANYOF_INVERT)
17001      /*make sure the invert info is in each */
17002      sv_catpvs(sv, "^");
17003    }
17004
17005    if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
17006     sv_catpvs(sv, "{non-utf8-latin1-all}");
17007    }
17008
17009    if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
17010     sv_catpvs(sv, "{above_bitmap_all}");
17011
17012    if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
17013     SV *lv; /* Set if there is something outside the bit map. */
17014     bool byte_output = FALSE;   /* If something has been output */
17015     SV *only_utf8_locale;
17016
17017     /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'
17018     * is used to guarantee that nothing in the bitmap gets
17019     * returned */
17020     (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
17021              &lv, &only_utf8_locale,
17022              bitmap_invlist);
17023     if (lv && lv != &PL_sv_undef) {
17024      char *s = savesvpv(lv);
17025      char * const origs = s;
17026
17027      while (*s && *s != '\n')
17028       s++;
17029
17030      if (*s == '\n') {
17031       const char * const t = ++s;
17032
17033       if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
17034        sv_catpvs(sv, "{outside bitmap}");
17035       }
17036       else {
17037        sv_catpvs(sv, "{utf8}");
17038       }
17039
17040       if (byte_output) {
17041        sv_catpvs(sv, " ");
17042       }
17043
17044       while (*s) {
17045        if (*s == '\n') {
17046
17047         /* Truncate very long output */
17048         if (s - origs > 256) {
17049          Perl_sv_catpvf(aTHX_ sv,
17050             "%.*s...",
17051             (int) (s - origs - 1),
17052             t);
17053          goto out_dump;
17054         }
17055         *s = ' ';
17056        }
17057        else if (*s == '\t') {
17058         *s = '-';
17059        }
17060        s++;
17061       }
17062       if (s[-1] == ' ')
17063        s[-1] = 0;
17064
17065       sv_catpv(sv, t);
17066      }
17067
17068     out_dump:
17069
17070      Safefree(origs);
17071      SvREFCNT_dec_NN(lv);
17072     }
17073
17074     if ((flags & ANYOF_LOC_FOLD)
17075      && only_utf8_locale
17076      && only_utf8_locale != &PL_sv_undef)
17077     {
17078      UV start, end;
17079      int max_entries = 256;
17080
17081      sv_catpvs(sv, "{utf8 locale}");
17082      invlist_iterinit(only_utf8_locale);
17083      while (invlist_iternext(only_utf8_locale,
17084            &start, &end)) {
17085       put_range(sv, start, end, FALSE);
17086       max_entries --;
17087       if (max_entries < 0) {
17088        sv_catpvs(sv, "...");
17089        break;
17090       }
17091      }
17092      invlist_iterfinish(only_utf8_locale);
17093     }
17094    }
17095   }
17096   SvREFCNT_dec(bitmap_invlist);
17097
17098
17099   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
17100  }
17101  else if (k == POSIXD || k == NPOSIXD) {
17102   U8 index = FLAGS(o) * 2;
17103   if (index < C_ARRAY_LENGTH(anyofs)) {
17104    if (*anyofs[index] != '[')  {
17105     sv_catpv(sv, "[");
17106    }
17107    sv_catpv(sv, anyofs[index]);
17108    if (*anyofs[index] != '[')  {
17109     sv_catpv(sv, "]");
17110    }
17111   }
17112   else {
17113    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
17114   }
17115  }
17116  else if (k == BOUND || k == NBOUND) {
17117   /* Must be synced with order of 'bound_type' in regcomp.h */
17118   const char * const bounds[] = {
17119    "",      /* Traditional */
17120    "{gcb}",
17121    "{sb}",
17122    "{wb}"
17123   };
17124   sv_catpv(sv, bounds[FLAGS(o)]);
17125  }
17126  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
17127   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
17128  else if (OP(o) == SBOL)
17129   Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
17130 #else
17131  PERL_UNUSED_CONTEXT;
17132  PERL_UNUSED_ARG(sv);
17133  PERL_UNUSED_ARG(o);
17134  PERL_UNUSED_ARG(prog);
17135  PERL_UNUSED_ARG(reginfo);
17136  PERL_UNUSED_ARG(pRExC_state);
17137 #endif /* DEBUGGING */
17138 }
17139
17140
17141
17142 SV *
17143 Perl_re_intuit_string(pTHX_ REGEXP * const r)
17144 {    /* Assume that RE_INTUIT is set */
17145  struct regexp *const prog = ReANY(r);
17146  GET_RE_DEBUG_FLAGS_DECL;
17147
17148  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
17149  PERL_UNUSED_CONTEXT;
17150
17151  DEBUG_COMPILE_r(
17152   {
17153    const char * const s = SvPV_nolen_const(RX_UTF8(r)
17154      ? prog->check_utf8 : prog->check_substr);
17155
17156    if (!PL_colorset) reginitcolors();
17157    PerlIO_printf(Perl_debug_log,
17158      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
17159      PL_colors[4],
17160      RX_UTF8(r) ? "utf8 " : "",
17161      PL_colors[5],PL_colors[0],
17162      s,
17163      PL_colors[1],
17164      (strlen(s) > 60 ? "..." : ""));
17165   } );
17166
17167  /* use UTF8 check substring if regexp pattern itself is in UTF8 */
17168  return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
17169 }
17170
17171 /*
17172    pregfree()
17173
17174    handles refcounting and freeing the perl core regexp structure. When
17175    it is necessary to actually free the structure the first thing it
17176    does is call the 'free' method of the regexp_engine associated to
17177    the regexp, allowing the handling of the void *pprivate; member
17178    first. (This routine is not overridable by extensions, which is why
17179    the extensions free is called first.)
17180
17181    See regdupe and regdupe_internal if you change anything here.
17182 */
17183 #ifndef PERL_IN_XSUB_RE
17184 void
17185 Perl_pregfree(pTHX_ REGEXP *r)
17186 {
17187  SvREFCNT_dec(r);
17188 }
17189
17190 void
17191 Perl_pregfree2(pTHX_ REGEXP *rx)
17192 {
17193  struct regexp *const r = ReANY(rx);
17194  GET_RE_DEBUG_FLAGS_DECL;
17195
17196  PERL_ARGS_ASSERT_PREGFREE2;
17197
17198  if (r->mother_re) {
17199   ReREFCNT_dec(r->mother_re);
17200  } else {
17201   CALLREGFREE_PVT(rx); /* free the private data */
17202   SvREFCNT_dec(RXp_PAREN_NAMES(r));
17203   Safefree(r->xpv_len_u.xpvlenu_pv);
17204  }
17205  if (r->substrs) {
17206   SvREFCNT_dec(r->anchored_substr);
17207   SvREFCNT_dec(r->anchored_utf8);
17208   SvREFCNT_dec(r->float_substr);
17209   SvREFCNT_dec(r->float_utf8);
17210   Safefree(r->substrs);
17211  }
17212  RX_MATCH_COPY_FREE(rx);
17213 #ifdef PERL_ANY_COW
17214  SvREFCNT_dec(r->saved_copy);
17215 #endif
17216  Safefree(r->offs);
17217  SvREFCNT_dec(r->qr_anoncv);
17218  rx->sv_u.svu_rx = 0;
17219 }
17220
17221 /*  reg_temp_copy()
17222
17223  This is a hacky workaround to the structural issue of match results
17224  being stored in the regexp structure which is in turn stored in
17225  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
17226  could be PL_curpm in multiple contexts, and could require multiple
17227  result sets being associated with the pattern simultaneously, such
17228  as when doing a recursive match with (??{$qr})
17229
17230  The solution is to make a lightweight copy of the regexp structure
17231  when a qr// is returned from the code executed by (??{$qr}) this
17232  lightweight copy doesn't actually own any of its data except for
17233  the starp/end and the actual regexp structure itself.
17234
17235 */
17236
17237
17238 REGEXP *
17239 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
17240 {
17241  struct regexp *ret;
17242  struct regexp *const r = ReANY(rx);
17243  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
17244
17245  PERL_ARGS_ASSERT_REG_TEMP_COPY;
17246
17247  if (!ret_x)
17248   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
17249  else {
17250   SvOK_off((SV *)ret_x);
17251   if (islv) {
17252    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
17253    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
17254    made both spots point to the same regexp body.) */
17255    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
17256    assert(!SvPVX(ret_x));
17257    ret_x->sv_u.svu_rx = temp->sv_any;
17258    temp->sv_any = NULL;
17259    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
17260    SvREFCNT_dec_NN(temp);
17261    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
17262    ing below will not set it. */
17263    SvCUR_set(ret_x, SvCUR(rx));
17264   }
17265  }
17266  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
17267  sv_force_normal(sv) is called.  */
17268  SvFAKE_on(ret_x);
17269  ret = ReANY(ret_x);
17270
17271  SvFLAGS(ret_x) |= SvUTF8(rx);
17272  /* We share the same string buffer as the original regexp, on which we
17273  hold a reference count, incremented when mother_re is set below.
17274  The string pointer is copied here, being part of the regexp struct.
17275  */
17276  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
17277   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
17278  if (r->offs) {
17279   const I32 npar = r->nparens+1;
17280   Newx(ret->offs, npar, regexp_paren_pair);
17281   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17282  }
17283  if (r->substrs) {
17284   Newx(ret->substrs, 1, struct reg_substr_data);
17285   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17286
17287   SvREFCNT_inc_void(ret->anchored_substr);
17288   SvREFCNT_inc_void(ret->anchored_utf8);
17289   SvREFCNT_inc_void(ret->float_substr);
17290   SvREFCNT_inc_void(ret->float_utf8);
17291
17292   /* check_substr and check_utf8, if non-NULL, point to either their
17293   anchored or float namesakes, and don't hold a second reference.  */
17294  }
17295  RX_MATCH_COPIED_off(ret_x);
17296 #ifdef PERL_ANY_COW
17297  ret->saved_copy = NULL;
17298 #endif
17299  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
17300  SvREFCNT_inc_void(ret->qr_anoncv);
17301
17302  return ret_x;
17303 }
17304 #endif
17305
17306 /* regfree_internal()
17307
17308    Free the private data in a regexp. This is overloadable by
17309    extensions. Perl takes care of the regexp structure in pregfree(),
17310    this covers the *pprivate pointer which technically perl doesn't
17311    know about, however of course we have to handle the
17312    regexp_internal structure when no extension is in use.
17313
17314    Note this is called before freeing anything in the regexp
17315    structure.
17316  */
17317
17318 void
17319 Perl_regfree_internal(pTHX_ REGEXP * const rx)
17320 {
17321  struct regexp *const r = ReANY(rx);
17322  RXi_GET_DECL(r,ri);
17323  GET_RE_DEBUG_FLAGS_DECL;
17324
17325  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
17326
17327  DEBUG_COMPILE_r({
17328   if (!PL_colorset)
17329    reginitcolors();
17330   {
17331    SV *dsv= sv_newmortal();
17332    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
17333     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
17334    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
17335     PL_colors[4],PL_colors[5],s);
17336   }
17337  });
17338 #ifdef RE_TRACK_PATTERN_OFFSETS
17339  if (ri->u.offsets)
17340   Safefree(ri->u.offsets);             /* 20010421 MJD */
17341 #endif
17342  if (ri->code_blocks) {
17343   int n;
17344   for (n = 0; n < ri->num_code_blocks; n++)
17345    SvREFCNT_dec(ri->code_blocks[n].src_regex);
17346   Safefree(ri->code_blocks);
17347  }
17348
17349  if (ri->data) {
17350   int n = ri->data->count;
17351
17352   while (--n >= 0) {
17353   /* If you add a ->what type here, update the comment in regcomp.h */
17354    switch (ri->data->what[n]) {
17355    case 'a':
17356    case 'r':
17357    case 's':
17358    case 'S':
17359    case 'u':
17360     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
17361     break;
17362    case 'f':
17363     Safefree(ri->data->data[n]);
17364     break;
17365    case 'l':
17366    case 'L':
17367     break;
17368    case 'T':
17369     { /* Aho Corasick add-on structure for a trie node.
17370      Used in stclass optimization only */
17371      U32 refcount;
17372      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
17373 #ifdef USE_ITHREADS
17374      dVAR;
17375 #endif
17376      OP_REFCNT_LOCK;
17377      refcount = --aho->refcount;
17378      OP_REFCNT_UNLOCK;
17379      if ( !refcount ) {
17380       PerlMemShared_free(aho->states);
17381       PerlMemShared_free(aho->fail);
17382       /* do this last!!!! */
17383       PerlMemShared_free(ri->data->data[n]);
17384       /* we should only ever get called once, so
17385       * assert as much, and also guard the free
17386       * which /might/ happen twice. At the least
17387       * it will make code anlyzers happy and it
17388       * doesn't cost much. - Yves */
17389       assert(ri->regstclass);
17390       if (ri->regstclass) {
17391        PerlMemShared_free(ri->regstclass);
17392        ri->regstclass = 0;
17393       }
17394      }
17395     }
17396     break;
17397    case 't':
17398     {
17399      /* trie structure. */
17400      U32 refcount;
17401      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
17402 #ifdef USE_ITHREADS
17403      dVAR;
17404 #endif
17405      OP_REFCNT_LOCK;
17406      refcount = --trie->refcount;
17407      OP_REFCNT_UNLOCK;
17408      if ( !refcount ) {
17409       PerlMemShared_free(trie->charmap);
17410       PerlMemShared_free(trie->states);
17411       PerlMemShared_free(trie->trans);
17412       if (trie->bitmap)
17413        PerlMemShared_free(trie->bitmap);
17414       if (trie->jump)
17415        PerlMemShared_free(trie->jump);
17416       PerlMemShared_free(trie->wordinfo);
17417       /* do this last!!!! */
17418       PerlMemShared_free(ri->data->data[n]);
17419      }
17420     }
17421     break;
17422    default:
17423     Perl_croak(aTHX_ "panic: regfree data code '%c'",
17424              ri->data->what[n]);
17425    }
17426   }
17427   Safefree(ri->data->what);
17428   Safefree(ri->data);
17429  }
17430
17431  Safefree(ri);
17432 }
17433
17434 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
17435 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
17436 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
17437
17438 /*
17439    re_dup - duplicate a regexp.
17440
17441    This routine is expected to clone a given regexp structure. It is only
17442    compiled under USE_ITHREADS.
17443
17444    After all of the core data stored in struct regexp is duplicated
17445    the regexp_engine.dupe method is used to copy any private data
17446    stored in the *pprivate pointer. This allows extensions to handle
17447    any duplication it needs to do.
17448
17449    See pregfree() and regfree_internal() if you change anything here.
17450 */
17451 #if defined(USE_ITHREADS)
17452 #ifndef PERL_IN_XSUB_RE
17453 void
17454 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
17455 {
17456  dVAR;
17457  I32 npar;
17458  const struct regexp *r = ReANY(sstr);
17459  struct regexp *ret = ReANY(dstr);
17460
17461  PERL_ARGS_ASSERT_RE_DUP_GUTS;
17462
17463  npar = r->nparens+1;
17464  Newx(ret->offs, npar, regexp_paren_pair);
17465  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17466
17467  if (ret->substrs) {
17468   /* Do it this way to avoid reading from *r after the StructCopy().
17469   That way, if any of the sv_dup_inc()s dislodge *r from the L1
17470   cache, it doesn't matter.  */
17471   const bool anchored = r->check_substr
17472    ? r->check_substr == r->anchored_substr
17473    : r->check_utf8 == r->anchored_utf8;
17474   Newx(ret->substrs, 1, struct reg_substr_data);
17475   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17476
17477   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
17478   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
17479   ret->float_substr = sv_dup_inc(ret->float_substr, param);
17480   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
17481
17482   /* check_substr and check_utf8, if non-NULL, point to either their
17483   anchored or float namesakes, and don't hold a second reference.  */
17484
17485   if (ret->check_substr) {
17486    if (anchored) {
17487     assert(r->check_utf8 == r->anchored_utf8);
17488     ret->check_substr = ret->anchored_substr;
17489     ret->check_utf8 = ret->anchored_utf8;
17490    } else {
17491     assert(r->check_substr == r->float_substr);
17492     assert(r->check_utf8 == r->float_utf8);
17493     ret->check_substr = ret->float_substr;
17494     ret->check_utf8 = ret->float_utf8;
17495    }
17496   } else if (ret->check_utf8) {
17497    if (anchored) {
17498     ret->check_utf8 = ret->anchored_utf8;
17499    } else {
17500     ret->check_utf8 = ret->float_utf8;
17501    }
17502   }
17503  }
17504
17505  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
17506  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
17507
17508  if (ret->pprivate)
17509   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
17510
17511  if (RX_MATCH_COPIED(dstr))
17512   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
17513  else
17514   ret->subbeg = NULL;
17515 #ifdef PERL_ANY_COW
17516  ret->saved_copy = NULL;
17517 #endif
17518
17519  /* Whether mother_re be set or no, we need to copy the string.  We
17520  cannot refrain from copying it when the storage points directly to
17521  our mother regexp, because that's
17522    1: a buffer in a different thread
17523    2: something we no longer hold a reference on
17524    so we need to copy it locally.  */
17525  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
17526  ret->mother_re   = NULL;
17527 }
17528 #endif /* PERL_IN_XSUB_RE */
17529
17530 /*
17531    regdupe_internal()
17532
17533    This is the internal complement to regdupe() which is used to copy
17534    the structure pointed to by the *pprivate pointer in the regexp.
17535    This is the core version of the extension overridable cloning hook.
17536    The regexp structure being duplicated will be copied by perl prior
17537    to this and will be provided as the regexp *r argument, however
17538    with the /old/ structures pprivate pointer value. Thus this routine
17539    may override any copying normally done by perl.
17540
17541    It returns a pointer to the new regexp_internal structure.
17542 */
17543
17544 void *
17545 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
17546 {
17547  dVAR;
17548  struct regexp *const r = ReANY(rx);
17549  regexp_internal *reti;
17550  int len;
17551  RXi_GET_DECL(r,ri);
17552
17553  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
17554
17555  len = ProgLen(ri);
17556
17557  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
17558   char, regexp_internal);
17559  Copy(ri->program, reti->program, len+1, regnode);
17560
17561  reti->num_code_blocks = ri->num_code_blocks;
17562  if (ri->code_blocks) {
17563   int n;
17564   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
17565     struct reg_code_block);
17566   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
17567     struct reg_code_block);
17568   for (n = 0; n < ri->num_code_blocks; n++)
17569    reti->code_blocks[n].src_regex = (REGEXP*)
17570      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
17571  }
17572  else
17573   reti->code_blocks = NULL;
17574
17575  reti->regstclass = NULL;
17576
17577  if (ri->data) {
17578   struct reg_data *d;
17579   const int count = ri->data->count;
17580   int i;
17581
17582   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
17583     char, struct reg_data);
17584   Newx(d->what, count, U8);
17585
17586   d->count = count;
17587   for (i = 0; i < count; i++) {
17588    d->what[i] = ri->data->what[i];
17589    switch (d->what[i]) {
17590     /* see also regcomp.h and regfree_internal() */
17591    case 'a': /* actually an AV, but the dup function is identical.  */
17592    case 'r':
17593    case 's':
17594    case 'S':
17595    case 'u': /* actually an HV, but the dup function is identical.  */
17596     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
17597     break;
17598    case 'f':
17599     /* This is cheating. */
17600     Newx(d->data[i], 1, regnode_ssc);
17601     StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
17602     reti->regstclass = (regnode*)d->data[i];
17603     break;
17604    case 'T':
17605     /* Trie stclasses are readonly and can thus be shared
17606     * without duplication. We free the stclass in pregfree
17607     * when the corresponding reg_ac_data struct is freed.
17608     */
17609     reti->regstclass= ri->regstclass;
17610     /* FALLTHROUGH */
17611    case 't':
17612     OP_REFCNT_LOCK;
17613     ((reg_trie_data*)ri->data->data[i])->refcount++;
17614     OP_REFCNT_UNLOCK;
17615     /* FALLTHROUGH */
17616    case 'l':
17617    case 'L':
17618     d->data[i] = ri->data->data[i];
17619     break;
17620    default:
17621     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
17622               ri->data->what[i]);
17623    }
17624   }
17625
17626   reti->data = d;
17627  }
17628  else
17629   reti->data = NULL;
17630
17631  reti->name_list_idx = ri->name_list_idx;
17632
17633 #ifdef RE_TRACK_PATTERN_OFFSETS
17634  if (ri->u.offsets) {
17635   Newx(reti->u.offsets, 2*len+1, U32);
17636   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17637  }
17638 #else
17639  SetProgLen(reti,len);
17640 #endif
17641
17642  return (void*)reti;
17643 }
17644
17645 #endif    /* USE_ITHREADS */
17646
17647 #ifndef PERL_IN_XSUB_RE
17648
17649 /*
17650  - regnext - dig the "next" pointer out of a node
17651  */
17652 regnode *
17653 Perl_regnext(pTHX_ regnode *p)
17654 {
17655  I32 offset;
17656
17657  if (!p)
17658   return(NULL);
17659
17660  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
17661   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17662             (int)OP(p), (int)REGNODE_MAX);
17663  }
17664
17665  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17666  if (offset == 0)
17667   return(NULL);
17668
17669  return(p+offset);
17670 }
17671 #endif
17672
17673 STATIC void
17674 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17675 {
17676  va_list args;
17677  STRLEN l1 = strlen(pat1);
17678  STRLEN l2 = strlen(pat2);
17679  char buf[512];
17680  SV *msv;
17681  const char *message;
17682
17683  PERL_ARGS_ASSERT_RE_CROAK2;
17684
17685  if (l1 > 510)
17686   l1 = 510;
17687  if (l1 + l2 > 510)
17688   l2 = 510 - l1;
17689  Copy(pat1, buf, l1 , char);
17690  Copy(pat2, buf + l1, l2 , char);
17691  buf[l1 + l2] = '\n';
17692  buf[l1 + l2 + 1] = '\0';
17693  va_start(args, pat2);
17694  msv = vmess(buf, &args);
17695  va_end(args);
17696  message = SvPV_const(msv,l1);
17697  if (l1 > 512)
17698   l1 = 512;
17699  Copy(message, buf, l1 , char);
17700  /* l1-1 to avoid \n */
17701  Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17702 }
17703
17704 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
17705
17706 #ifndef PERL_IN_XSUB_RE
17707 void
17708 Perl_save_re_context(pTHX)
17709 {
17710  I32 nparens = -1;
17711  I32 i;
17712
17713  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
17714
17715  if (PL_curpm) {
17716   const REGEXP * const rx = PM_GETRE(PL_curpm);
17717   if (rx)
17718    nparens = RX_NPARENS(rx);
17719  }
17720
17721  /* RT #124109. This is a complete hack; in the SWASHNEW case we know
17722  * that PL_curpm will be null, but that utf8.pm and the modules it
17723  * loads will only use $1..$3.
17724  * The t/porting/re_context.t test file checks this assumption.
17725  */
17726  if (nparens == -1)
17727   nparens = 3;
17728
17729  for (i = 1; i <= nparens; i++) {
17730   char digits[TYPE_CHARS(long)];
17731   const STRLEN len = my_snprintf(digits, sizeof(digits),
17732          "%lu", (long)i);
17733   GV *const *const gvp
17734    = (GV**)hv_fetch(PL_defstash, digits, len, 0);
17735
17736   if (gvp) {
17737    GV * const gv = *gvp;
17738    if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
17739     save_scalar(gv);
17740   }
17741  }
17742 }
17743 #endif
17744
17745 #ifdef DEBUGGING
17746
17747 STATIC void
17748 S_put_code_point(pTHX_ SV *sv, UV c)
17749 {
17750  PERL_ARGS_ASSERT_PUT_CODE_POINT;
17751
17752  if (c > 255) {
17753   Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17754  }
17755  else if (isPRINT(c)) {
17756   const char string = (char) c;
17757   if (isBACKSLASHED_PUNCT(c))
17758    sv_catpvs(sv, "\\");
17759   sv_catpvn(sv, &string, 1);
17760  }
17761  else {
17762   const char * const mnemonic = cntrl_to_mnemonic((char) c);
17763   if (mnemonic) {
17764    Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17765   }
17766   else {
17767    Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17768   }
17769  }
17770 }
17771
17772 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17773
17774 STATIC void
17775 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17776 {
17777  /* Appends to 'sv' a displayable version of the range of code points from
17778  * 'start' to 'end'.  It assumes that only ASCII printables are displayable
17779  * as-is (though some of these will be escaped by put_code_point()). */
17780
17781  const unsigned int min_range_count = 3;
17782
17783  assert(start <= end);
17784
17785  PERL_ARGS_ASSERT_PUT_RANGE;
17786
17787  while (start <= end) {
17788   UV this_end;
17789   const char * format;
17790
17791   if (end - start < min_range_count) {
17792
17793    /* Individual chars in short ranges */
17794    for (; start <= end; start++) {
17795     put_code_point(sv, start);
17796    }
17797    break;
17798   }
17799
17800   /* If permitted by the input options, and there is a possibility that
17801   * this range contains a printable literal, look to see if there is
17802   * one.  */
17803   if (allow_literals && start <= MAX_PRINT_A) {
17804
17805    /* If the range begin isn't an ASCII printable, effectively split
17806    * the range into two parts:
17807    *  1) the portion before the first such printable,
17808    *  2) the rest
17809    * and output them separately. */
17810    if (! isPRINT_A(start)) {
17811     UV temp_end = start + 1;
17812
17813     /* There is no point looking beyond the final possible
17814     * printable, in MAX_PRINT_A */
17815     UV max = MIN(end, MAX_PRINT_A);
17816
17817     while (temp_end <= max && ! isPRINT_A(temp_end)) {
17818      temp_end++;
17819     }
17820
17821     /* Here, temp_end points to one beyond the first printable if
17822     * found, or to one beyond 'max' if not.  If none found, make
17823     * sure that we use the entire range */
17824     if (temp_end > MAX_PRINT_A) {
17825      temp_end = end + 1;
17826     }
17827
17828     /* Output the first part of the split range, the part that
17829     * doesn't have printables, with no looking for literals
17830     * (otherwise we would infinitely recurse) */
17831     put_range(sv, start, temp_end - 1, FALSE);
17832
17833     /* The 2nd part of the range (if any) starts here. */
17834     start = temp_end;
17835
17836     /* We continue instead of dropping down because even if the 2nd
17837     * part is non-empty, it could be so short that we want to
17838     * output it specially, as tested for at the top of this loop.
17839     * */
17840     continue;
17841    }
17842
17843    /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
17844    * output a sub-range of just the digits or letters, then process
17845    * the remaining portion as usual. */
17846    if (isALPHANUMERIC_A(start)) {
17847     UV mask = (isDIGIT_A(start))
17848       ? _CC_DIGIT
17849        : isUPPER_A(start)
17850        ? _CC_UPPER
17851        : _CC_LOWER;
17852     UV temp_end = start + 1;
17853
17854     /* Find the end of the sub-range that includes just the
17855     * characters in the same class as the first character in it */
17856     while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17857      temp_end++;
17858     }
17859     temp_end--;
17860
17861     /* For short ranges, don't duplicate the code above to output
17862     * them; just call recursively */
17863     if (temp_end - start < min_range_count) {
17864      put_range(sv, start, temp_end, FALSE);
17865     }
17866     else {  /* Output as a range */
17867      put_code_point(sv, start);
17868      sv_catpvs(sv, "-");
17869      put_code_point(sv, temp_end);
17870     }
17871     start = temp_end + 1;
17872     continue;
17873    }
17874
17875    /* We output any other printables as individual characters */
17876    if (isPUNCT_A(start) || isSPACE_A(start)) {
17877     while (start <= end && (isPUNCT_A(start)
17878           || isSPACE_A(start)))
17879     {
17880      put_code_point(sv, start);
17881      start++;
17882     }
17883     continue;
17884    }
17885   } /* End of looking for literals */
17886
17887   /* Here is not to output as a literal.  Some control characters have
17888   * mnemonic names.  Split off any of those at the beginning and end of
17889   * the range to print mnemonically.  It isn't possible for many of
17890   * these to be in a row, so this won't overwhelm with output */
17891   while (isMNEMONIC_CNTRL(start) && start <= end) {
17892    put_code_point(sv, start);
17893    start++;
17894   }
17895   if (start < end && isMNEMONIC_CNTRL(end)) {
17896
17897    /* Here, the final character in the range has a mnemonic name.
17898    * Work backwards from the end to find the final non-mnemonic */
17899    UV temp_end = end - 1;
17900    while (isMNEMONIC_CNTRL(temp_end)) {
17901     temp_end--;
17902    }
17903
17904    /* And separately output the range that doesn't have mnemonics */
17905    put_range(sv, start, temp_end, FALSE);
17906
17907    /* Then output the mnemonic trailing controls */
17908    start = temp_end + 1;
17909    while (start <= end) {
17910     put_code_point(sv, start);
17911     start++;
17912    }
17913    break;
17914   }
17915
17916   /* As a final resort, output the range or subrange as hex. */
17917
17918   this_end = (end < NUM_ANYOF_CODE_POINTS)
17919      ? end
17920      : NUM_ANYOF_CODE_POINTS - 1;
17921   format = (this_end < 256)
17922     ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17923     : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17924   GCC_DIAG_IGNORE(-Wformat-nonliteral);
17925   Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17926   GCC_DIAG_RESTORE;
17927   break;
17928  }
17929 }
17930
17931 STATIC bool
17932 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17933 {
17934  /* Appends to 'sv' a displayable version of the innards of the bracketed
17935  * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
17936  * output anything, and bitmap_invlist, if not NULL, will point to an
17937  * inversion list of what is in the bit map */
17938
17939  int i;
17940  UV start, end;
17941  unsigned int punct_count = 0;
17942  SV* invlist = NULL;
17943  SV** invlist_ptr;   /* Temporary, in case bitmap_invlist is NULL */
17944  bool allow_literals = TRUE;
17945
17946  PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17947
17948  invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17949
17950  /* Worst case is exactly every-other code point is in the list */
17951  *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17952
17953  /* Convert the bit map to an inversion list, keeping track of how many
17954  * ASCII puncts are set, including an extra amount for the backslashed
17955  * ones.  */
17956  for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17957   if (BITMAP_TEST(bitmap, i)) {
17958    *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17959    if (isPUNCT_A(i)) {
17960     punct_count++;
17961     if isBACKSLASHED_PUNCT(i) {
17962      punct_count++;
17963     }
17964    }
17965   }
17966  }
17967
17968  /* Nothing to output */
17969  if (_invlist_len(*invlist_ptr) == 0) {
17970   SvREFCNT_dec(invlist);
17971   return FALSE;
17972  }
17973
17974  /* Generally, it is more readable if printable characters are output as
17975  * literals, but if a range (nearly) spans all of them, it's best to output
17976  * it as a single range.  This code will use a single range if all but 2
17977  * printables are in it */
17978  invlist_iterinit(*invlist_ptr);
17979  while (invlist_iternext(*invlist_ptr, &start, &end)) {
17980
17981   /* If range starts beyond final printable, it doesn't have any in it */
17982   if (start > MAX_PRINT_A) {
17983    break;
17984   }
17985
17986   /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
17987   * all but two, the range must start and end no later than 2 from
17988   * either end */
17989   if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17990    if (end > MAX_PRINT_A) {
17991     end = MAX_PRINT_A;
17992    }
17993    if (start < ' ') {
17994     start = ' ';
17995    }
17996    if (end - start >= MAX_PRINT_A - ' ' - 2) {
17997     allow_literals = FALSE;
17998    }
17999    break;
18000   }
18001  }
18002  invlist_iterfinish(*invlist_ptr);
18003
18004  /* The legibility of the output depends mostly on how many punctuation
18005  * characters are output.  There are 32 possible ASCII ones, and some have
18006  * an additional backslash, bringing it to currently 36, so if any more
18007  * than 18 are to be output, we can instead output it as its complement,
18008  * yielding fewer puncts, and making it more legible.  But give some weight
18009  * to the fact that outputting it as a complement is less legible than a
18010  * straight output, so don't complement unless we are somewhat over the 18
18011  * mark */
18012  if (allow_literals && punct_count > 22) {
18013   sv_catpvs(sv, "^");
18014
18015   /* Add everything remaining to the list, so when we invert it just
18016   * below, it will be excluded */
18017   _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
18018   _invlist_invert(*invlist_ptr);
18019  }
18020
18021  /* Here we have figured things out.  Output each range */
18022  invlist_iterinit(*invlist_ptr);
18023  while (invlist_iternext(*invlist_ptr, &start, &end)) {
18024   if (start >= NUM_ANYOF_CODE_POINTS) {
18025    break;
18026   }
18027   put_range(sv, start, end, allow_literals);
18028  }
18029  invlist_iterfinish(*invlist_ptr);
18030
18031  return TRUE;
18032 }
18033
18034 #define CLEAR_OPTSTART \
18035  if (optstart) STMT_START {                                               \
18036   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
18037        " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
18038   optstart=NULL;                                                       \
18039  } STMT_END
18040
18041 #define DUMPUNTIL(b,e)                                                       \
18042      CLEAR_OPTSTART;                                          \
18043      node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
18044
18045 STATIC const regnode *
18046 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
18047    const regnode *last, const regnode *plast,
18048    SV* sv, I32 indent, U32 depth)
18049 {
18050  U8 op = PSEUDO; /* Arbitrary non-END op. */
18051  const regnode *next;
18052  const regnode *optstart= NULL;
18053
18054  RXi_GET_DECL(r,ri);
18055  GET_RE_DEBUG_FLAGS_DECL;
18056
18057  PERL_ARGS_ASSERT_DUMPUNTIL;
18058
18059 #ifdef DEBUG_DUMPUNTIL
18060  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
18061   last ? last-start : 0,plast ? plast-start : 0);
18062 #endif
18063
18064  if (plast && plast < last)
18065   last= plast;
18066
18067  while (PL_regkind[op] != END && (!last || node < last)) {
18068   assert(node);
18069   /* While that wasn't END last time... */
18070   NODE_ALIGN(node);
18071   op = OP(node);
18072   if (op == CLOSE || op == WHILEM)
18073    indent--;
18074   next = regnext((regnode *)node);
18075
18076   /* Where, what. */
18077   if (OP(node) == OPTIMIZED) {
18078    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
18079     optstart = node;
18080    else
18081     goto after_print;
18082   } else
18083    CLEAR_OPTSTART;
18084
18085   regprop(r, sv, node, NULL, NULL);
18086   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
18087      (int)(2*indent + 1), "", SvPVX_const(sv));
18088
18089   if (OP(node) != OPTIMIZED) {
18090    if (next == NULL)  /* Next ptr. */
18091     PerlIO_printf(Perl_debug_log, " (0)");
18092    else if (PL_regkind[(U8)op] == BRANCH
18093      && PL_regkind[OP(next)] != BRANCH )
18094     PerlIO_printf(Perl_debug_log, " (FAIL)");
18095    else
18096     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
18097    (void)PerlIO_putc(Perl_debug_log, '\n');
18098   }
18099
18100  after_print:
18101   if (PL_regkind[(U8)op] == BRANCHJ) {
18102    assert(next);
18103    {
18104     const regnode *nnode = (OP(next) == LONGJMP
18105          ? regnext((regnode *)next)
18106          : next);
18107     if (last && nnode > last)
18108      nnode = last;
18109     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
18110    }
18111   }
18112   else if (PL_regkind[(U8)op] == BRANCH) {
18113    assert(next);
18114    DUMPUNTIL(NEXTOPER(node), next);
18115   }
18116   else if ( PL_regkind[(U8)op]  == TRIE ) {
18117    const regnode *this_trie = node;
18118    const char op = OP(node);
18119    const U32 n = ARG(node);
18120    const reg_ac_data * const ac = op>=AHOCORASICK ?
18121    (reg_ac_data *)ri->data->data[n] :
18122    NULL;
18123    const reg_trie_data * const trie =
18124     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
18125 #ifdef DEBUGGING
18126    AV *const trie_words
18127       = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
18128 #endif
18129    const regnode *nextbranch= NULL;
18130    I32 word_idx;
18131    sv_setpvs(sv, "");
18132    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
18133     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
18134
18135     PerlIO_printf(Perl_debug_log, "%*s%s ",
18136     (int)(2*(indent+3)), "",
18137      elem_ptr
18138      ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
18139         SvCUR(*elem_ptr), 60,
18140         PL_colors[0], PL_colors[1],
18141         (SvUTF8(*elem_ptr)
18142         ? PERL_PV_ESCAPE_UNI
18143         : 0)
18144         | PERL_PV_PRETTY_ELLIPSES
18145         | PERL_PV_PRETTY_LTGT
18146        )
18147      : "???"
18148     );
18149     if (trie->jump) {
18150      U16 dist= trie->jump[word_idx+1];
18151      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
18152        (UV)((dist ? this_trie + dist : next) - start));
18153      if (dist) {
18154       if (!nextbranch)
18155        nextbranch= this_trie + trie->jump[0];
18156       DUMPUNTIL(this_trie + dist, nextbranch);
18157      }
18158      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
18159       nextbranch= regnext((regnode *)nextbranch);
18160     } else {
18161      PerlIO_printf(Perl_debug_log, "\n");
18162     }
18163    }
18164    if (last && next > last)
18165     node= last;
18166    else
18167     node= next;
18168   }
18169   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
18170    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
18171      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
18172   }
18173   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
18174    assert(next);
18175    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
18176   }
18177   else if ( op == PLUS || op == STAR) {
18178    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
18179   }
18180   else if (PL_regkind[(U8)op] == ANYOF) {
18181    /* arglen 1 + class block */
18182    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
18183       ? ANYOF_POSIXL_SKIP
18184       : ANYOF_SKIP);
18185    node = NEXTOPER(node);
18186   }
18187   else if (PL_regkind[(U8)op] == EXACT) {
18188    /* Literal string, where present. */
18189    node += NODE_SZ_STR(node) - 1;
18190    node = NEXTOPER(node);
18191   }
18192   else {
18193    node = NEXTOPER(node);
18194    node += regarglen[(U8)op];
18195   }
18196   if (op == CURLYX || op == OPEN)
18197    indent++;
18198  }
18199  CLEAR_OPTSTART;
18200 #ifdef DEBUG_DUMPUNTIL
18201  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
18202 #endif
18203  return node;
18204 }
18205
18206 #endif /* DEBUGGING */
18207
18208 /*
18209  * ex: set ts=8 sts=4 sw=4 et:
18210  */