src/5022000/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #undef PERL_IN_XSUB_RE
  76 #define PERL_IN_XSUB_RE 1
  77 #include "perl.h"
  78 #undef PERL_IN_XSUB_RE
  79
  80 #ifndef PERL_IN_XSUB_RE
  81 #include "re_defs.h"
  82 #endif
  83
  84 #define REG_COMP_C
  85 #ifdef PERL_IN_XSUB_RE
  86 #  include "re_comp.h"
  87 EXTERN_C const struct regexp_engine my_reg_engine;
  88 #else
  89 #  include "regcomp.h"
  90 #endif
  91
  92 #include "dquote_static.c"
  93 #include "inline_invlist.c"
  94 #include "unicode_constants.h"
  95
  96 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  99  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 100 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 101 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 102
 103 #ifndef STATIC
 104 #define STATIC static
 105 #endif
 106
 107 #ifndef MIN
 108 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 109 #endif
 110
 111 /* this is a chain of data about sub patterns we are processing that
 112    need to be handled separately/specially in study_chunk. Its so
 113    we can simulate recursion without losing state.  */
 114 struct scan_frame;
 115 typedef struct scan_frame {
 116  regnode *last_regnode;      /* last node to process in this frame */
 117  regnode *next_regnode;      /* next node to process when last is reached */
 118  U32 prev_recursed_depth;
 119  I32 stopparen;              /* what stopparen do we use */
 120  U32 is_top_frame;           /* what flags do we use? */
 121
 122  struct scan_frame *this_prev_frame; /* this previous frame */
 123  struct scan_frame *prev_frame;      /* previous frame */
 124  struct scan_frame *next_frame;      /* next frame */
 125 } scan_frame;
 126
 127 /* Certain characters are output as a sequence with the first being a
 128  * backslash. */
 129 #define isBACKSLASHED_PUNCT(c)                                              \
 130      ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
 131
 132
 133 struct RExC_state_t {
 134  U32  flags;   /* RXf_* are we folding, multilining? */
 135  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 136  char *precomp;  /* uncompiled string. */
 137  REGEXP *rx_sv;   /* The SV that is the regexp. */
 138  regexp *rx;                    /* perl core regexp structure */
 139  regexp_internal *rxi;           /* internal data for regexp object
 140           pprivate field */
 141  char *start;   /* Start of input for compile */
 142  char *end;   /* End of input for compile */
 143  char *parse;   /* Input-scan pointer. */
 144  SSize_t whilem_seen;  /* number of WHILEM in this expr */
 145  regnode *emit_start;  /* Start of emitted-code area */
 146  regnode *emit_bound;  /* First regnode outside of the
 147           allocated space */
 148  regnode *emit;   /* Code-emit pointer; if = &emit_dummy,
 149           implies compiling, so don't emit */
 150  regnode_ssc emit_dummy;  /* placeholder for emit to point to;
 151           large enough for the largest
 152           non-EXACTish node, so can use it as
 153           scratch in pass1 */
 154  I32  naughty;  /* How bad is this pattern? */
 155  I32  sawback;  /* Did we see \1, ...? */
 156  U32  seen;
 157  SSize_t size;   /* Code size. */
 158  I32                npar;            /* Capture buffer count, (OPEN) plus
 159           one. ("par" 0 is the whole
 160           pattern)*/
 161  I32  nestroot;  /* root parens we are in - used by
 162           accept */
 163  I32  extralen;
 164  I32  seen_zerolen;
 165  regnode **open_parens;  /* pointers to open parens */
 166  regnode **close_parens;  /* pointers to close parens */
 167  regnode *opend;   /* END node in program */
 168  I32  utf8;  /* whether the pattern is utf8 or not */
 169  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 170         /* XXX use this for future optimisation of case
 171         * where pattern must be upgraded to utf8. */
 172  I32  uni_semantics; /* If a d charset modifier should use unicode
 173         rules, even if the pattern is not in
 174         utf8 */
 175  HV  *paren_names;  /* Paren names */
 176
 177  regnode **recurse;  /* Recurse regops */
 178  I32  recurse_count;  /* Number of recurse regops */
 179  U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 180           through */
 181  U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 182  I32  in_lookbehind;
 183  I32  contains_locale;
 184  I32  contains_i;
 185  I32  override_recoding;
 186 #ifdef EBCDIC
 187  I32  recode_x_to_native;
 188 #endif
 189  I32  in_multi_char_class;
 190  struct reg_code_block *code_blocks; /* positions of literal (?{})
 191            within pattern */
 192  int  num_code_blocks; /* size of code_blocks[] */
 193  int  code_index;  /* next code_blocks[] slot */
 194  SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 195  scan_frame *frame_head;
 196  scan_frame *frame_last;
 197  U32         frame_count;
 198  U32         strict;
 199 #ifdef ADD_TO_REGEXEC
 200  char  *starttry;  /* -Dr: where regtry was called. */
 201 #define RExC_starttry (pRExC_state->starttry)
 202 #endif
 203  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 204 #ifdef DEBUGGING
 205  const char  *lastparse;
 206  I32         lastnum;
 207  AV          *paren_name_list;       /* idx -> name */
 208  U32         study_chunk_recursed_count;
 209  SV          *mysv1;
 210  SV          *mysv2;
 211 #define RExC_lastparse (pRExC_state->lastparse)
 212 #define RExC_lastnum (pRExC_state->lastnum)
 213 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 214 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 215 #define RExC_mysv (pRExC_state->mysv1)
 216 #define RExC_mysv1 (pRExC_state->mysv1)
 217 #define RExC_mysv2 (pRExC_state->mysv2)
 218
 219 #endif
 220 };
 221
 222 #define RExC_flags (pRExC_state->flags)
 223 #define RExC_pm_flags (pRExC_state->pm_flags)
 224 #define RExC_precomp (pRExC_state->precomp)
 225 #define RExC_rx_sv (pRExC_state->rx_sv)
 226 #define RExC_rx  (pRExC_state->rx)
 227 #define RExC_rxi (pRExC_state->rxi)
 228 #define RExC_start (pRExC_state->start)
 229 #define RExC_end (pRExC_state->end)
 230 #define RExC_parse (pRExC_state->parse)
 231 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 232 #ifdef RE_TRACK_PATTERN_OFFSETS
 233 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
 234               others */
 235 #endif
 236 #define RExC_emit (pRExC_state->emit)
 237 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 238 #define RExC_emit_start (pRExC_state->emit_start)
 239 #define RExC_emit_bound (pRExC_state->emit_bound)
 240 #define RExC_sawback (pRExC_state->sawback)
 241 #define RExC_seen (pRExC_state->seen)
 242 #define RExC_size (pRExC_state->size)
 243 #define RExC_maxlen        (pRExC_state->maxlen)
 244 #define RExC_npar (pRExC_state->npar)
 245 #define RExC_nestroot   (pRExC_state->nestroot)
 246 #define RExC_extralen (pRExC_state->extralen)
 247 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 248 #define RExC_utf8 (pRExC_state->utf8)
 249 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 250 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 251 #define RExC_open_parens (pRExC_state->open_parens)
 252 #define RExC_close_parens (pRExC_state->close_parens)
 253 #define RExC_opend (pRExC_state->opend)
 254 #define RExC_paren_names (pRExC_state->paren_names)
 255 #define RExC_recurse (pRExC_state->recurse)
 256 #define RExC_recurse_count (pRExC_state->recurse_count)
 257 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 258 #define RExC_study_chunk_recursed_bytes  \
 259         (pRExC_state->study_chunk_recursed_bytes)
 260 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 261 #define RExC_contains_locale (pRExC_state->contains_locale)
 262 #define RExC_contains_i (pRExC_state->contains_i)
 263 #define RExC_override_recoding (pRExC_state->override_recoding)
 264 #ifdef EBCDIC
 265 #   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 266 #endif
 267 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 268 #define RExC_frame_head (pRExC_state->frame_head)
 269 #define RExC_frame_last (pRExC_state->frame_last)
 270 #define RExC_frame_count (pRExC_state->frame_count)
 271 #define RExC_strict (pRExC_state->strict)
 272
 273 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 274  * a flag to disable back-off on the fixed/floating substrings - if it's
 275  * a high complexity pattern we assume the benefit of avoiding a full match
 276  * is worth the cost of checking for the substrings even if they rarely help.
 277  */
 278 #define RExC_naughty (pRExC_state->naughty)
 279 #define TOO_NAUGHTY (10)
 280 #define MARK_NAUGHTY(add) \
 281  if (RExC_naughty < TOO_NAUGHTY) \
 282   RExC_naughty += (add)
 283 #define MARK_NAUGHTY_EXP(exp, add) \
 284  if (RExC_naughty < TOO_NAUGHTY) \
 285   RExC_naughty += RExC_naughty / (exp) + (add)
 286
 287 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 288 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 289   ((*s) == '{' && regcurly(s)))
 290
 291 /*
 292  * Flags to be passed up and down.
 293  */
 294 #define WORST  0 /* Worst case. */
 295 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 296
 297 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 298  * character.  (There needs to be a case: in the switch statement in regexec.c
 299  * for any node marked SIMPLE.)  Note that this is not the same thing as
 300  * REGNODE_SIMPLE */
 301 #define SIMPLE  0x02
 302 #define SPSTART  0x04 /* Starts with * or + */
 303 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 304 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 305 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 306
 307 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 308
 309 /* whether trie related optimizations are enabled */
 310 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 311 #define TRIE_STUDY_OPT
 312 #define FULL_TRIE_STUDY
 313 #define TRIE_STCLASS
 314 #endif
 315
 316
 317
 318 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 319 #define PBITVAL(paren) (1 << ((paren) & 7))
 320 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 321 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 322 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 323
 324 #define REQUIRE_UTF8 STMT_START {                                       \
 325          if (!UTF) {                           \
 326           *flagp = RESTART_UTF8;            \
 327           return NULL;                      \
 328          }                                     \
 329       } STMT_END
 330
 331 /* This converts the named class defined in regcomp.h to its equivalent class
 332  * number defined in handy.h. */
 333 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 334 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 335
 336 #define _invlist_union_complement_2nd(a, b, output) \
 337       _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 338 #define _invlist_intersection_complement_2nd(a, b, output) \
 339     _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 340
 341 /* About scan_data_t.
 342
 343   During optimisation we recurse through the regexp program performing
 344   various inplace (keyhole style) optimisations. In addition study_chunk
 345   and scan_commit populate this data structure with information about
 346   what strings MUST appear in the pattern. We look for the longest
 347   string that must appear at a fixed location, and we look for the
 348   longest string that may appear at a floating location. So for instance
 349   in the pattern:
 350
 351  /FOO[xX]A.*B[xX]BAR/
 352
 353   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 354   strings (because they follow a .* construct). study_chunk will identify
 355   both FOO and BAR as being the longest fixed and floating strings respectively.
 356
 357   The strings can be composites, for instance
 358
 359  /(f)(o)(o)/
 360
 361   will result in a composite fixed substring 'foo'.
 362
 363   For each string some basic information is maintained:
 364
 365   - offset or min_offset
 366  This is the position the string must appear at, or not before.
 367  It also implicitly (when combined with minlenp) tells us how many
 368  characters must match before the string we are searching for.
 369  Likewise when combined with minlenp and the length of the string it
 370  tells us how many characters must appear after the string we have
 371  found.
 372
 373   - max_offset
 374  Only used for floating strings. This is the rightmost point that
 375  the string can appear at. If set to SSize_t_MAX it indicates that the
 376  string can occur infinitely far to the right.
 377
 378   - minlenp
 379  A pointer to the minimum number of characters of the pattern that the
 380  string was found inside. This is important as in the case of positive
 381  lookahead or positive lookbehind we can have multiple patterns
 382  involved. Consider
 383
 384  /(?=FOO).*F/
 385
 386  The minimum length of the pattern overall is 3, the minimum length
 387  of the lookahead part is 3, but the minimum length of the part that
 388  will actually match is 1. So 'FOO's minimum length is 3, but the
 389  minimum length for the F is 1. This is important as the minimum length
 390  is used to determine offsets in front of and behind the string being
 391  looked for.  Since strings can be composites this is the length of the
 392  pattern at the time it was committed with a scan_commit. Note that
 393  the length is calculated by study_chunk, so that the minimum lengths
 394  are not known until the full pattern has been compiled, thus the
 395  pointer to the value.
 396
 397   - lookbehind
 398
 399  In the case of lookbehind the string being searched for can be
 400  offset past the start point of the final matching string.
 401  If this value was just blithely removed from the min_offset it would
 402  invalidate some of the calculations for how many chars must match
 403  before or after (as they are derived from min_offset and minlen and
 404  the length of the string being searched for).
 405  When the final pattern is compiled and the data is moved from the
 406  scan_data_t structure into the regexp structure the information
 407  about lookbehind is factored in, with the information that would
 408  have been lost precalculated in the end_shift field for the
 409  associated string.
 410
 411   The fields pos_min and pos_delta are used to store the minimum offset
 412   and the delta to the maximum offset at the current point in the pattern.
 413
 414 */
 415
 416 typedef struct scan_data_t {
 417  /*I32 len_min;      unused */
 418  /*I32 len_delta;    unused */
 419  SSize_t pos_min;
 420  SSize_t pos_delta;
 421  SV *last_found;
 422  SSize_t last_end;     /* min value, <0 unless valid. */
 423  SSize_t last_start_min;
 424  SSize_t last_start_max;
 425  SV **longest;     /* Either &l_fixed, or &l_float. */
 426  SV *longest_fixed;      /* longest fixed string found in pattern */
 427  SSize_t offset_fixed;   /* offset where it starts */
 428  SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 429  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 430  SV *longest_float;      /* longest floating string found in pattern */
 431  SSize_t offset_float_min; /* earliest point in string it can appear */
 432  SSize_t offset_float_max; /* latest point in string it can appear */
 433  SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 434  SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 435  I32 flags;
 436  I32 whilem_c;
 437  SSize_t *last_closep;
 438  regnode_ssc *start_class;
 439 } scan_data_t;
 440
 441 /*
 442  * Forward declarations for pregcomp()'s friends.
 443  */
 444
 445 static const scan_data_t zero_scan_data =
 446   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 447
 448 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 449 #define SF_BEFORE_SEOL  0x0001
 450 #define SF_BEFORE_MEOL  0x0002
 451 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 452 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 453
 454 #define SF_FIX_SHIFT_EOL (+2)
 455 #define SF_FL_SHIFT_EOL  (+4)
 456
 457 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 458 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 459
 460 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 461 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 462 #define SF_IS_INF  0x0040
 463 #define SF_HAS_PAR  0x0080
 464 #define SF_IN_PAR  0x0100
 465 #define SF_HAS_EVAL  0x0200
 466 #define SCF_DO_SUBSTR  0x0400
 467 #define SCF_DO_STCLASS_AND 0x0800
 468 #define SCF_DO_STCLASS_OR 0x1000
 469 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 470 #define SCF_WHILEM_VISITED_POS 0x2000
 471
 472 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 473 #define SCF_SEEN_ACCEPT         0x8000
 474 #define SCF_TRIE_DOING_RESTUDY 0x10000
 475 #define SCF_IN_DEFINE          0x20000
 476
 477
 478
 479
 480 #define UTF cBOOL(RExC_utf8)
 481
 482 /* The enums for all these are ordered so things work out correctly */
 483 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 484 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 485              == REGEX_DEPENDS_CHARSET)
 486 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 487 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 488              >= REGEX_UNICODE_CHARSET)
 489 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 490            == REGEX_ASCII_RESTRICTED_CHARSET)
 491 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 492            >= REGEX_ASCII_RESTRICTED_CHARSET)
 493 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 494           == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 495
 496 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 497
 498 /* For programs that want to be strictly Unicode compatible by dying if any
 499  * attempt is made to match a non-Unicode code point against a Unicode
 500  * property.  */
 501 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 502
 503 #define OOB_NAMEDCLASS  -1
 504
 505 /* There is no code point that is out-of-bounds, so this is problematic.  But
 506  * its only current use is to initialize a variable that is always set before
 507  * looked at. */
 508 #define OOB_UNICODE  0xDEADBEEF
 509
 510 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 511 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 512
 513
 514 /* length of regex to show in messages that don't mark a position within */
 515 #define RegexLengthToShowInErrorMessages 127
 516
 517 /*
 518  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 519  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 520  * op/pragma/warn/regcomp.
 521  */
 522 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 523 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 524
 525 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 526       " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 527
 528 #define REPORT_LOCATION_ARGS(offset)            \
 529     UTF8fARG(UTF, offset, RExC_precomp), \
 530     UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 531
 532 /* Used to point after bad bytes for an error message, but avoid skipping
 533  * past a nul byte. */
 534 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
 535
 536 /*
 537  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 538  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 539  * "...".
 540  */
 541 #define _FAIL(code) STMT_START {     \
 542  const char *ellipses = "";      \
 543  IV len = RExC_end - RExC_precomp;     \
 544                   \
 545  if (!SIZE_ONLY)       \
 546   SAVEFREESV(RExC_rx_sv);      \
 547  if (len > RegexLengthToShowInErrorMessages) {   \
 548   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 549   len = RegexLengthToShowInErrorMessages - 10;   \
 550   ellipses = "...";      \
 551  }         \
 552  code;                                                               \
 553 } STMT_END
 554
 555 #define FAIL(msg) _FAIL(       \
 556  Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",     \
 557    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 558
 559 #define FAIL2(msg,arg) _FAIL(       \
 560  Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",     \
 561    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 562
 563 /*
 564  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 565  */
 566 #define Simple_vFAIL(m) STMT_START {     \
 567  const IV offset =                                                   \
 568   (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
 569  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 570    m, REPORT_LOCATION_ARGS(offset)); \
 571 } STMT_END
 572
 573 /*
 574  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 575  */
 576 #define vFAIL(m) STMT_START {    \
 577  if (!SIZE_ONLY)     \
 578   SAVEFREESV(RExC_rx_sv);    \
 579  Simple_vFAIL(m);     \
 580 } STMT_END
 581
 582 /*
 583  * Like Simple_vFAIL(), but accepts two arguments.
 584  */
 585 #define Simple_vFAIL2(m,a1) STMT_START {   \
 586  const IV offset = RExC_parse - RExC_precomp;   \
 587  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,   \
 588      REPORT_LOCATION_ARGS(offset)); \
 589 } STMT_END
 590
 591 /*
 592  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 593  */
 594 #define vFAIL2(m,a1) STMT_START {   \
 595  if (!SIZE_ONLY)     \
 596   SAVEFREESV(RExC_rx_sv);    \
 597  Simple_vFAIL2(m, a1);    \
 598 } STMT_END
 599
 600
 601 /*
 602  * Like Simple_vFAIL(), but accepts three arguments.
 603  */
 604 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 605  const IV offset = RExC_parse - RExC_precomp;  \
 606  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 607    REPORT_LOCATION_ARGS(offset)); \
 608 } STMT_END
 609
 610 /*
 611  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 612  */
 613 #define vFAIL3(m,a1,a2) STMT_START {   \
 614  if (!SIZE_ONLY)     \
 615   SAVEFREESV(RExC_rx_sv);    \
 616  Simple_vFAIL3(m, a1, a2);    \
 617 } STMT_END
 618
 619 /*
 620  * Like Simple_vFAIL(), but accepts four arguments.
 621  */
 622 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 623  const IV offset = RExC_parse - RExC_precomp;  \
 624  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,  \
 625    REPORT_LOCATION_ARGS(offset)); \
 626 } STMT_END
 627
 628 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 629  if (!SIZE_ONLY)     \
 630   SAVEFREESV(RExC_rx_sv);    \
 631  Simple_vFAIL4(m, a1, a2, a3);   \
 632 } STMT_END
 633
 634 /* A specialized version of vFAIL2 that works with UTF8f */
 635 #define vFAIL2utf8f(m, a1) STMT_START { \
 636  const IV offset = RExC_parse - RExC_precomp;   \
 637  if (!SIZE_ONLY)                                \
 638   SAVEFREESV(RExC_rx_sv);                    \
 639  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 640    REPORT_LOCATION_ARGS(offset));         \
 641 } STMT_END
 642
 643 /* These have asserts in them because of [perl #122671] Many warnings in
 644  * regcomp.c can occur twice.  If they get output in pass1 and later in that
 645  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
 646  * would get output again.  So they should be output in pass2, and these
 647  * asserts make sure new warnings follow that paradigm. */
 648
 649 /* m is not necessarily a "literal string", in this macro */
 650 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 651  const IV offset = loc - RExC_precomp;                               \
 652  __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 653    m, REPORT_LOCATION_ARGS(offset));       \
 654 } STMT_END
 655
 656 #define ckWARNreg(loc,m) STMT_START {     \
 657  const IV offset = loc - RExC_precomp;    \
 658  __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 659    REPORT_LOCATION_ARGS(offset));  \
 660 } STMT_END
 661
 662 #define vWARN(loc, m) STMT_START {            \
 663  const IV offset = loc - RExC_precomp;    \
 664  __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 665    REPORT_LOCATION_ARGS(offset));         \
 666 } STMT_END
 667
 668 #define vWARN_dep(loc, m) STMT_START {            \
 669  const IV offset = loc - RExC_precomp;    \
 670  __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 671    REPORT_LOCATION_ARGS(offset));         \
 672 } STMT_END
 673
 674 #define ckWARNdep(loc,m) STMT_START {            \
 675  const IV offset = loc - RExC_precomp;    \
 676  __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 677    m REPORT_LOCATION,      \
 678    REPORT_LOCATION_ARGS(offset));  \
 679 } STMT_END
 680
 681 #define ckWARNregdep(loc,m) STMT_START {    \
 682  const IV offset = loc - RExC_precomp;    \
 683  __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 684    m REPORT_LOCATION,      \
 685    REPORT_LOCATION_ARGS(offset));  \
 686 } STMT_END
 687
 688 #define ckWARN2reg_d(loc,m, a1) STMT_START {    \
 689  const IV offset = loc - RExC_precomp;    \
 690  __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),   \
 691    m REPORT_LOCATION,      \
 692    a1, REPORT_LOCATION_ARGS(offset)); \
 693 } STMT_END
 694
 695 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 696  const IV offset = loc - RExC_precomp;    \
 697  __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 698    a1, REPORT_LOCATION_ARGS(offset)); \
 699 } STMT_END
 700
 701 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 702  const IV offset = loc - RExC_precomp;    \
 703  __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 704    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 705 } STMT_END
 706
 707 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 708  const IV offset = loc - RExC_precomp;    \
 709  __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 710    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 711 } STMT_END
 712
 713 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 714  const IV offset = loc - RExC_precomp;    \
 715  __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 716    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 717 } STMT_END
 718
 719 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 720  const IV offset = loc - RExC_precomp;    \
 721  __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 722    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 723 } STMT_END
 724
 725 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 726  const IV offset = loc - RExC_precomp;    \
 727  __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 728    a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 729 } STMT_END
 730
 731 /* Macros for recording node offsets.   20001227 mjd@plover.com
 732  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 733  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 734  * Element 0 holds the number n.
 735  * Position is 1 indexed.
 736  */
 737 #ifndef RE_TRACK_PATTERN_OFFSETS
 738 #define Set_Node_Offset_To_R(node,byte)
 739 #define Set_Node_Offset(node,byte)
 740 #define Set_Cur_Node_Offset
 741 #define Set_Node_Length_To_R(node,len)
 742 #define Set_Node_Length(node,len)
 743 #define Set_Node_Cur_Length(node,start)
 744 #define Node_Offset(n)
 745 #define Node_Length(n)
 746 #define Set_Node_Offset_Length(node,offset,len)
 747 #define ProgLen(ri) ri->u.proglen
 748 #define SetProgLen(ri,x) ri->u.proglen = x
 749 #else
 750 #define ProgLen(ri) ri->u.offsets[0]
 751 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 752 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 753  if (! SIZE_ONLY) {       \
 754   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 755      __LINE__, (int)(node), (int)(byte)));  \
 756   if((node) < 0) {      \
 757    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 758           (int)(node));                  \
 759   } else {       \
 760    RExC_offsets[2*(node)-1] = (byte);    \
 761   }        \
 762  }         \
 763 } STMT_END
 764
 765 #define Set_Node_Offset(node,byte) \
 766  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 767 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 768
 769 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 770  if (! SIZE_ONLY) {       \
 771   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 772     __LINE__, (int)(node), (int)(len)));   \
 773   if((node) < 0) {      \
 774    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 775           (int)(node));                  \
 776   } else {       \
 777    RExC_offsets[2*(node)] = (len);    \
 778   }        \
 779  }         \
 780 } STMT_END
 781
 782 #define Set_Node_Length(node,len) \
 783  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 784 #define Set_Node_Cur_Length(node, start)                \
 785  Set_Node_Length(node, RExC_parse - start)
 786
 787 /* Get offsets and lengths */
 788 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 789 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 790
 791 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 792  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 793  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 794 } STMT_END
 795 #endif
 796
 797 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 798 #define EXPERIMENTAL_INPLACESCAN
 799 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 800
 801 #define DEBUG_RExC_seen() \
 802   DEBUG_OPTIMISE_MORE_r({                                             \
 803    PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 804                    \
 805    if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 806     PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 807                    \
 808    if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 809     PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 810                    \
 811    if (RExC_seen & REG_GPOS_SEEN)                                  \
 812     PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 813                    \
 814    if (RExC_seen & REG_CANY_SEEN)                                  \
 815     PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 816                    \
 817    if (RExC_seen & REG_RECURSE_SEEN)                               \
 818     PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 819                    \
 820    if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 821     PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 822                    \
 823    if (RExC_seen & REG_VERBARG_SEEN)                               \
 824     PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 825                    \
 826    if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 827     PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 828                    \
 829    if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 830     PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 831                    \
 832    if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 833     PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 834                    \
 835    if (RExC_seen & REG_GOSTART_SEEN)                               \
 836     PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 837                    \
 838    if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 839     PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 840                    \
 841    PerlIO_printf(Perl_debug_log,"\n");                             \
 842   });
 843
 844 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
 845   if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
 846
 847 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str)                    \
 848  if ( ( flags ) ) {                                                      \
 849   PerlIO_printf(Perl_debug_log, "%s", open_str);                      \
 850   DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL);                     \
 851   DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL);                     \
 852   DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF);                             \
 853   DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR);                            \
 854   DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR);                             \
 855   DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL);                           \
 856   DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR);                         \
 857   DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND);                    \
 858   DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR);                     \
 859   DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS);                        \
 860   DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS);                \
 861   DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY);                      \
 862   DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT);                       \
 863   DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY);                \
 864   DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE);                         \
 865   PerlIO_printf(Perl_debug_log, "%s", close_str);                     \
 866  }
 867
 868
 869 #define DEBUG_STUDYDATA(str,data,depth)                              \
 870 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 871  PerlIO_printf(Perl_debug_log,                                    \
 872   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 873   " Flags: 0x%"UVXf,                                           \
 874   (int)(depth)*2, "",                                          \
 875   (IV)((data)->pos_min),                                       \
 876   (IV)((data)->pos_delta),                                     \
 877   (UV)((data)->flags)                                          \
 878  );                                                               \
 879  DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]");                 \
 880  PerlIO_printf(Perl_debug_log,                                    \
 881   " Whilem_c: %"IVdf" Lcp: %"IVdf" %s",                        \
 882   (IV)((data)->whilem_c),                                      \
 883   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 884   is_inf ? "INF " : ""                                         \
 885  );                                                               \
 886  if ((data)->last_found)                                          \
 887   PerlIO_printf(Perl_debug_log,                                \
 888    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 889    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 890    SvPVX_const((data)->last_found),                         \
 891    (IV)((data)->last_end),                                  \
 892    (IV)((data)->last_start_min),                            \
 893    (IV)((data)->last_start_max),                            \
 894    ((data)->longest &&                                      \
 895    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 896    SvPVX_const((data)->longest_fixed),                      \
 897    (IV)((data)->offset_fixed),                              \
 898    ((data)->longest &&                                      \
 899    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 900    SvPVX_const((data)->longest_float),                      \
 901    (IV)((data)->offset_float_min),                          \
 902    (IV)((data)->offset_float_max)                           \
 903   );                                                           \
 904  PerlIO_printf(Perl_debug_log,"\n");                              \
 905 });
 906
 907 /* is c a control character for which we have a mnemonic? */
 908 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 909
 910 STATIC const char *
 911 S_cntrl_to_mnemonic(const U8 c)
 912 {
 913  /* Returns the mnemonic string that represents character 'c', if one
 914  * exists; NULL otherwise.  The only ones that exist for the purposes of
 915  * this routine are a few control characters */
 916
 917  switch (c) {
 918   case '\a':       return "\\a";
 919   case '\b':       return "\\b";
 920   case ESC_NATIVE: return "\\e";
 921   case '\f':       return "\\f";
 922   case '\n':       return "\\n";
 923   case '\r':       return "\\r";
 924   case '\t':       return "\\t";
 925  }
 926
 927  return NULL;
 928 }
 929
 930 /* Mark that we cannot extend a found fixed substring at this point.
 931    Update the longest found anchored substring and the longest found
 932    floating substrings if needed. */
 933
 934 STATIC void
 935 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 936      SSize_t *minlenp, int is_inf)
 937 {
 938  const STRLEN l = CHR_SVLEN(data->last_found);
 939  const STRLEN old_l = CHR_SVLEN(*data->longest);
 940  GET_RE_DEBUG_FLAGS_DECL;
 941
 942  PERL_ARGS_ASSERT_SCAN_COMMIT;
 943
 944  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 945   SvSetMagicSV(*data->longest, data->last_found);
 946   if (*data->longest == data->longest_fixed) {
 947    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 948    if (data->flags & SF_BEFORE_EOL)
 949     data->flags
 950      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 951    else
 952     data->flags &= ~SF_FIX_BEFORE_EOL;
 953    data->minlen_fixed=minlenp;
 954    data->lookbehind_fixed=0;
 955   }
 956   else { /* *data->longest == data->longest_float */
 957    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 958    data->offset_float_max = (l
 959       ? data->last_start_max
 960       : (data->pos_delta > SSize_t_MAX - data->pos_min
 961           ? SSize_t_MAX
 962           : data->pos_min + data->pos_delta));
 963    if (is_inf
 964     || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 965     data->offset_float_max = SSize_t_MAX;
 966    if (data->flags & SF_BEFORE_EOL)
 967     data->flags
 968      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 969    else
 970     data->flags &= ~SF_FL_BEFORE_EOL;
 971    data->minlen_float=minlenp;
 972    data->lookbehind_float=0;
 973   }
 974  }
 975  SvCUR_set(data->last_found, 0);
 976  {
 977   SV * const sv = data->last_found;
 978   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 979    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 980    if (mg)
 981     mg->mg_len = 0;
 982   }
 983  }
 984  data->last_end = -1;
 985  data->flags &= ~SF_BEFORE_EOL;
 986  DEBUG_STUDYDATA("commit: ",data,0);
 987 }
 988
 989 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 990  * list that describes which code points it matches */
 991
 992 STATIC void
 993 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 994 {
 995  /* Set the SSC 'ssc' to match an empty string or any code point */
 996
 997  PERL_ARGS_ASSERT_SSC_ANYTHING;
 998
 999  assert(is_ANYOF_SYNTHETIC(ssc));
1000
1001  ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
1002  _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
1003  ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1004 }
1005
1006 STATIC int
1007 S_ssc_is_anything(const regnode_ssc *ssc)
1008 {
1009  /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1010  * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1011  * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1012  * in any way, so there's no point in using it */
1013
1014  UV start, end;
1015  bool ret;
1016
1017  PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1018
1019  assert(is_ANYOF_SYNTHETIC(ssc));
1020
1021  if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1022   return FALSE;
1023  }
1024
1025  /* See if the list consists solely of the range 0 - Infinity */
1026  invlist_iterinit(ssc->invlist);
1027  ret = invlist_iternext(ssc->invlist, &start, &end)
1028   && start == 0
1029   && end == UV_MAX;
1030
1031  invlist_iterfinish(ssc->invlist);
1032
1033  if (ret) {
1034   return TRUE;
1035  }
1036
1037  /* If e.g., both \w and \W are set, matches everything */
1038  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1039   int i;
1040   for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1041    if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1042     return TRUE;
1043    }
1044   }
1045  }
1046
1047  return FALSE;
1048 }
1049
1050 STATIC void
1051 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1052 {
1053  /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1054  * string, any code point, or any posix class under locale */
1055
1056  PERL_ARGS_ASSERT_SSC_INIT;
1057
1058  Zero(ssc, 1, regnode_ssc);
1059  set_ANYOF_SYNTHETIC(ssc);
1060  ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1061  ssc_anything(ssc);
1062
1063  /* If any portion of the regex is to operate under locale rules that aren't
1064  * fully known at compile time, initialization includes it.  The reason
1065  * this isn't done for all regexes is that the optimizer was written under
1066  * the assumption that locale was all-or-nothing.  Given the complexity and
1067  * lack of documentation in the optimizer, and that there are inadequate
1068  * test cases for locale, many parts of it may not work properly, it is
1069  * safest to avoid locale unless necessary. */
1070  if (RExC_contains_locale) {
1071   ANYOF_POSIXL_SETALL(ssc);
1072  }
1073  else {
1074   ANYOF_POSIXL_ZERO(ssc);
1075  }
1076 }
1077
1078 STATIC int
1079 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1080       const regnode_ssc *ssc)
1081 {
1082  /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1083  * to the list of code points matched, and locale posix classes; hence does
1084  * not check its flags) */
1085
1086  UV start, end;
1087  bool ret;
1088
1089  PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1090
1091  assert(is_ANYOF_SYNTHETIC(ssc));
1092
1093  invlist_iterinit(ssc->invlist);
1094  ret = invlist_iternext(ssc->invlist, &start, &end)
1095   && start == 0
1096   && end == UV_MAX;
1097
1098  invlist_iterfinish(ssc->invlist);
1099
1100  if (! ret) {
1101   return FALSE;
1102  }
1103
1104  if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1105   return FALSE;
1106  }
1107
1108  return TRUE;
1109 }
1110
1111 STATIC SV*
1112 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1113        const regnode_charclass* const node)
1114 {
1115  /* Returns a mortal inversion list defining which code points are matched
1116  * by 'node', which is of type ANYOF.  Handles complementing the result if
1117  * appropriate.  If some code points aren't knowable at this time, the
1118  * returned list must, and will, contain every code point that is a
1119  * possibility. */
1120
1121  SV* invlist = sv_2mortal(_new_invlist(0));
1122  SV* only_utf8_locale_invlist = NULL;
1123  unsigned int i;
1124  const U32 n = ARG(node);
1125  bool new_node_has_latin1 = FALSE;
1126
1127  PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1128
1129  /* Look at the data structure created by S_set_ANYOF_arg() */
1130  if (n != ANYOF_ONLY_HAS_BITMAP) {
1131   SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1132   AV * const av = MUTABLE_AV(SvRV(rv));
1133   SV **const ary = AvARRAY(av);
1134   assert(RExC_rxi->data->what[n] == 's');
1135
1136   if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1137    invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1138   }
1139   else if (ary[0] && ary[0] != &PL_sv_undef) {
1140
1141    /* Here, no compile-time swash, and there are things that won't be
1142    * known until runtime -- we have to assume it could be anything */
1143    return _add_range_to_invlist(invlist, 0, UV_MAX);
1144   }
1145   else if (ary[3] && ary[3] != &PL_sv_undef) {
1146
1147    /* Here no compile-time swash, and no run-time only data.  Use the
1148    * node's inversion list */
1149    invlist = sv_2mortal(invlist_clone(ary[3]));
1150   }
1151
1152   /* Get the code points valid only under UTF-8 locales */
1153   if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1154    && ary[2] && ary[2] != &PL_sv_undef)
1155   {
1156    only_utf8_locale_invlist = ary[2];
1157   }
1158  }
1159
1160  /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1161  * code points, and an inversion list for the others, but if there are code
1162  * points that should match only conditionally on the target string being
1163  * UTF-8, those are placed in the inversion list, and not the bitmap.
1164  * Since there are circumstances under which they could match, they are
1165  * included in the SSC.  But if the ANYOF node is to be inverted, we have
1166  * to exclude them here, so that when we invert below, the end result
1167  * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1168  * have to do this here before we add the unconditionally matched code
1169  * points */
1170  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1171   _invlist_intersection_complement_2nd(invlist,
1172            PL_UpperLatin1,
1173            &invlist);
1174  }
1175
1176  /* Add in the points from the bit map */
1177  for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1178   if (ANYOF_BITMAP_TEST(node, i)) {
1179    invlist = add_cp_to_invlist(invlist, i);
1180    new_node_has_latin1 = TRUE;
1181   }
1182  }
1183
1184  /* If this can match all upper Latin1 code points, have to add them
1185  * as well */
1186  if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1187   _invlist_union(invlist, PL_UpperLatin1, &invlist);
1188  }
1189
1190  /* Similarly for these */
1191  if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1192   _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1193  }
1194
1195  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1196   _invlist_invert(invlist);
1197  }
1198  else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1199
1200   /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1201   * locale.  We can skip this if there are no 0-255 at all. */
1202   _invlist_union(invlist, PL_Latin1, &invlist);
1203  }
1204
1205  /* Similarly add the UTF-8 locale possible matches.  These have to be
1206  * deferred until after the non-UTF-8 locale ones are taken care of just
1207  * above, or it leads to wrong results under ANYOF_INVERT */
1208  if (only_utf8_locale_invlist) {
1209   _invlist_union_maybe_complement_2nd(invlist,
1210            only_utf8_locale_invlist,
1211            ANYOF_FLAGS(node) & ANYOF_INVERT,
1212            &invlist);
1213  }
1214
1215  return invlist;
1216 }
1217
1218 /* These two functions currently do the exact same thing */
1219 #define ssc_init_zero  ssc_init
1220
1221 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1222 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1223
1224 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1225  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1226  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1227
1228 STATIC void
1229 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1230     const regnode_charclass *and_with)
1231 {
1232  /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1233  * another SSC or a regular ANYOF class.  Can create false positives. */
1234
1235  SV* anded_cp_list;
1236  U8  anded_flags;
1237
1238  PERL_ARGS_ASSERT_SSC_AND;
1239
1240  assert(is_ANYOF_SYNTHETIC(ssc));
1241
1242  /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1243  * the code point inversion list and just the relevant flags */
1244  if (is_ANYOF_SYNTHETIC(and_with)) {
1245   anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1246   anded_flags = ANYOF_FLAGS(and_with);
1247
1248   /* XXX This is a kludge around what appears to be deficiencies in the
1249   * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1250   * there are paths through the optimizer where it doesn't get weeded
1251   * out when it should.  And if we don't make some extra provision for
1252   * it like the code just below, it doesn't get added when it should.
1253   * This solution is to add it only when AND'ing, which is here, and
1254   * only when what is being AND'ed is the pristine, original node
1255   * matching anything.  Thus it is like adding it to ssc_anything() but
1256   * only when the result is to be AND'ed.  Probably the same solution
1257   * could be adopted for the same problem we have with /l matching,
1258   * which is solved differently in S_ssc_init(), and that would lead to
1259   * fewer false positives than that solution has.  But if this solution
1260   * creates bugs, the consequences are only that a warning isn't raised
1261   * that should be; while the consequences for having /l bugs is
1262   * incorrect matches */
1263   if (ssc_is_anything((regnode_ssc *)and_with)) {
1264    anded_flags |= ANYOF_WARN_SUPER;
1265   }
1266  }
1267  else {
1268   anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1269   anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1270  }
1271
1272  ANYOF_FLAGS(ssc) &= anded_flags;
1273
1274  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1275  * C2 is the list of code points in 'and-with'; P2, its posix classes.
1276  * 'and_with' may be inverted.  When not inverted, we have the situation of
1277  * computing:
1278  *  (C1 | P1) & (C2 | P2)
1279  *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1280  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1281  *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1282  *                    <=  ((C1 & C2) | P1 | P2)
1283  * Alternatively, the last few steps could be:
1284  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1285  *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1286  *                    <=  (C1 | C2 | (P1 & P2))
1287  * We favor the second approach if either P1 or P2 is non-empty.  This is
1288  * because these components are a barrier to doing optimizations, as what
1289  * they match cannot be known until the moment of matching as they are
1290  * dependent on the current locale, 'AND"ing them likely will reduce or
1291  * eliminate them.
1292  * But we can do better if we know that C1,P1 are in their initial state (a
1293  * frequent occurrence), each matching everything:
1294  *  (<everything>) & (C2 | P2) =  C2 | P2
1295  * Similarly, if C2,P2 are in their initial state (again a frequent
1296  * occurrence), the result is a no-op
1297  *  (C1 | P1) & (<everything>) =  C1 | P1
1298  *
1299  * Inverted, we have
1300  *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1301  *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1302  *                         <=  (C1 & ~C2) | (P1 & ~P2)
1303  * */
1304
1305  if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1306   && ! is_ANYOF_SYNTHETIC(and_with))
1307  {
1308   unsigned int i;
1309
1310   ssc_intersection(ssc,
1311       anded_cp_list,
1312       FALSE /* Has already been inverted */
1313       );
1314
1315   /* If either P1 or P2 is empty, the intersection will be also; can skip
1316   * the loop */
1317   if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1318    ANYOF_POSIXL_ZERO(ssc);
1319   }
1320   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1321
1322    /* Note that the Posix class component P from 'and_with' actually
1323    * looks like:
1324    *      P = Pa | Pb | ... | Pn
1325    * where each component is one posix class, such as in [\w\s].
1326    * Thus
1327    *      ~P = ~(Pa | Pb | ... | Pn)
1328    *         = ~Pa & ~Pb & ... & ~Pn
1329    *        <= ~Pa | ~Pb | ... | ~Pn
1330    * The last is something we can easily calculate, but unfortunately
1331    * is likely to have many false positives.  We could do better
1332    * in some (but certainly not all) instances if two classes in
1333    * P have known relationships.  For example
1334    *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1335    * So
1336    *      :lower: & :print: = :lower:
1337    * And similarly for classes that must be disjoint.  For example,
1338    * since \s and \w can have no elements in common based on rules in
1339    * the POSIX standard,
1340    *      \w & ^\S = nothing
1341    * Unfortunately, some vendor locales do not meet the Posix
1342    * standard, in particular almost everything by Microsoft.
1343    * The loop below just changes e.g., \w into \W and vice versa */
1344
1345    regnode_charclass_posixl temp;
1346    int add = 1;    /* To calculate the index of the complement */
1347
1348    ANYOF_POSIXL_ZERO(&temp);
1349    for (i = 0; i < ANYOF_MAX; i++) {
1350     assert(i % 2 != 0
1351      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1352      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1353
1354     if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1355      ANYOF_POSIXL_SET(&temp, i + add);
1356     }
1357     add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1358    }
1359    ANYOF_POSIXL_AND(&temp, ssc);
1360
1361   } /* else ssc already has no posixes */
1362  } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1363   in its initial state */
1364  else if (! is_ANYOF_SYNTHETIC(and_with)
1365    || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1366  {
1367   /* But if 'ssc' is in its initial state, the result is just 'and_with';
1368   * copy it over 'ssc' */
1369   if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1370    if (is_ANYOF_SYNTHETIC(and_with)) {
1371     StructCopy(and_with, ssc, regnode_ssc);
1372    }
1373    else {
1374     ssc->invlist = anded_cp_list;
1375     ANYOF_POSIXL_ZERO(ssc);
1376     if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1377      ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1378     }
1379    }
1380   }
1381   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1382     || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1383   {
1384    /* One or the other of P1, P2 is non-empty. */
1385    if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1386     ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1387    }
1388    ssc_union(ssc, anded_cp_list, FALSE);
1389   }
1390   else { /* P1 = P2 = empty */
1391    ssc_intersection(ssc, anded_cp_list, FALSE);
1392   }
1393  }
1394 }
1395
1396 STATIC void
1397 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1398    const regnode_charclass *or_with)
1399 {
1400  /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1401  * another SSC or a regular ANYOF class.  Can create false positives if
1402  * 'or_with' is to be inverted. */
1403
1404  SV* ored_cp_list;
1405  U8 ored_flags;
1406
1407  PERL_ARGS_ASSERT_SSC_OR;
1408
1409  assert(is_ANYOF_SYNTHETIC(ssc));
1410
1411  /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1412  * the code point inversion list and just the relevant flags */
1413  if (is_ANYOF_SYNTHETIC(or_with)) {
1414   ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1415   ored_flags = ANYOF_FLAGS(or_with);
1416  }
1417  else {
1418   ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1419   ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1420  }
1421
1422  ANYOF_FLAGS(ssc) |= ored_flags;
1423
1424  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1425  * C2 is the list of code points in 'or-with'; P2, its posix classes.
1426  * 'or_with' may be inverted.  When not inverted, we have the simple
1427  * situation of computing:
1428  *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1429  * If P1|P2 yields a situation with both a class and its complement are
1430  * set, like having both \w and \W, this matches all code points, and we
1431  * can delete these from the P component of the ssc going forward.  XXX We
1432  * might be able to delete all the P components, but I (khw) am not certain
1433  * about this, and it is better to be safe.
1434  *
1435  * Inverted, we have
1436  *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1437  *                         <=  (C1 | P1) | ~C2
1438  *                         <=  (C1 | ~C2) | P1
1439  * (which results in actually simpler code than the non-inverted case)
1440  * */
1441
1442  if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1443   && ! is_ANYOF_SYNTHETIC(or_with))
1444  {
1445   /* We ignore P2, leaving P1 going forward */
1446  }   /* else  Not inverted */
1447  else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1448   ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1449   if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1450    unsigned int i;
1451    for (i = 0; i < ANYOF_MAX; i += 2) {
1452     if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1453     {
1454      ssc_match_all_cp(ssc);
1455      ANYOF_POSIXL_CLEAR(ssc, i);
1456      ANYOF_POSIXL_CLEAR(ssc, i+1);
1457     }
1458    }
1459   }
1460  }
1461
1462  ssc_union(ssc,
1463    ored_cp_list,
1464    FALSE /* Already has been inverted */
1465    );
1466 }
1467
1468 PERL_STATIC_INLINE void
1469 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1470 {
1471  PERL_ARGS_ASSERT_SSC_UNION;
1472
1473  assert(is_ANYOF_SYNTHETIC(ssc));
1474
1475  _invlist_union_maybe_complement_2nd(ssc->invlist,
1476           invlist,
1477           invert2nd,
1478           &ssc->invlist);
1479 }
1480
1481 PERL_STATIC_INLINE void
1482 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1483       SV* const invlist,
1484       const bool invert2nd)
1485 {
1486  PERL_ARGS_ASSERT_SSC_INTERSECTION;
1487
1488  assert(is_ANYOF_SYNTHETIC(ssc));
1489
1490  _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1491            invlist,
1492            invert2nd,
1493            &ssc->invlist);
1494 }
1495
1496 PERL_STATIC_INLINE void
1497 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1498 {
1499  PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1500
1501  assert(is_ANYOF_SYNTHETIC(ssc));
1502
1503  ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1504 }
1505
1506 PERL_STATIC_INLINE void
1507 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1508 {
1509  /* AND just the single code point 'cp' into the SSC 'ssc' */
1510
1511  SV* cp_list = _new_invlist(2);
1512
1513  PERL_ARGS_ASSERT_SSC_CP_AND;
1514
1515  assert(is_ANYOF_SYNTHETIC(ssc));
1516
1517  cp_list = add_cp_to_invlist(cp_list, cp);
1518  ssc_intersection(ssc, cp_list,
1519      FALSE /* Not inverted */
1520      );
1521  SvREFCNT_dec_NN(cp_list);
1522 }
1523
1524 PERL_STATIC_INLINE void
1525 S_ssc_clear_locale(regnode_ssc *ssc)
1526 {
1527  /* Set the SSC 'ssc' to not match any locale things */
1528  PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1529
1530  assert(is_ANYOF_SYNTHETIC(ssc));
1531
1532  ANYOF_POSIXL_ZERO(ssc);
1533  ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1534 }
1535
1536 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1537
1538 STATIC bool
1539 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1540 {
1541  /* The synthetic start class is used to hopefully quickly winnow down
1542  * places where a pattern could start a match in the target string.  If it
1543  * doesn't really narrow things down that much, there isn't much point to
1544  * having the overhead of using it.  This function uses some very crude
1545  * heuristics to decide if to use the ssc or not.
1546  *
1547  * It returns TRUE if 'ssc' rules out more than half what it considers to
1548  * be the "likely" possible matches, but of course it doesn't know what the
1549  * actual things being matched are going to be; these are only guesses
1550  *
1551  * For /l matches, it assumes that the only likely matches are going to be
1552  *      in the 0-255 range, uniformly distributed, so half of that is 127
1553  * For /a and /d matches, it assumes that the likely matches will be just
1554  *      the ASCII range, so half of that is 63
1555  * For /u and there isn't anything matching above the Latin1 range, it
1556  *      assumes that that is the only range likely to be matched, and uses
1557  *      half that as the cut-off: 127.  If anything matches above Latin1,
1558  *      it assumes that all of Unicode could match (uniformly), except for
1559  *      non-Unicode code points and things in the General Category "Other"
1560  *      (unassigned, private use, surrogates, controls and formats).  This
1561  *      is a much large number. */
1562
1563  const U32 max_match = (LOC)
1564       ? 127
1565       : (! UNI_SEMANTICS)
1566        ? 63
1567        : (invlist_highest(ssc->invlist) < 256)
1568        ? 127
1569        : ((NON_OTHER_COUNT + 1) / 2) - 1;
1570  U32 count = 0;      /* Running total of number of code points matched by
1571       'ssc' */
1572  UV start, end;      /* Start and end points of current range in inversion
1573       list */
1574
1575  PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1576
1577  invlist_iterinit(ssc->invlist);
1578  while (invlist_iternext(ssc->invlist, &start, &end)) {
1579
1580   /* /u is the only thing that we expect to match above 255; so if not /u
1581   * and even if there are matches above 255, ignore them.  This catches
1582   * things like \d under /d which does match the digits above 255, but
1583   * since the pattern is /d, it is not likely to be expecting them */
1584   if (! UNI_SEMANTICS) {
1585    if (start > 255) {
1586     break;
1587    }
1588    end = MIN(end, 255);
1589   }
1590   count += end - start + 1;
1591   if (count > max_match) {
1592    invlist_iterfinish(ssc->invlist);
1593    return FALSE;
1594   }
1595  }
1596
1597  return TRUE;
1598 }
1599
1600
1601 STATIC void
1602 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1603 {
1604  /* The inversion list in the SSC is marked mortal; now we need a more
1605  * permanent copy, which is stored the same way that is done in a regular
1606  * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1607  * map */
1608
1609  SV* invlist = invlist_clone(ssc->invlist);
1610
1611  PERL_ARGS_ASSERT_SSC_FINALIZE;
1612
1613  assert(is_ANYOF_SYNTHETIC(ssc));
1614
1615  /* The code in this file assumes that all but these flags aren't relevant
1616  * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1617  * by the time we reach here */
1618  assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1619
1620  populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1621
1622  set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1623         NULL, NULL, NULL, FALSE);
1624
1625  /* Make sure is clone-safe */
1626  ssc->invlist = NULL;
1627
1628  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1629   ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1630  }
1631
1632  assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1633 }
1634
1635 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1636 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1637 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1638 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1639        ? (TRIE_LIST_CUR( idx ) - 1)           \
1640        : 0 )
1641
1642
1643 #ifdef DEBUGGING
1644 /*
1645    dump_trie(trie,widecharmap,revcharmap)
1646    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1647    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1648
1649    These routines dump out a trie in a somewhat readable format.
1650    The _interim_ variants are used for debugging the interim
1651    tables that are used to generate the final compressed
1652    representation which is what dump_trie expects.
1653
1654    Part of the reason for their existence is to provide a form
1655    of documentation as to how the different representations function.
1656
1657 */
1658
1659 /*
1660   Dumps the final compressed table form of the trie to Perl_debug_log.
1661   Used for debugging make_trie().
1662 */
1663
1664 STATIC void
1665 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1666    AV *revcharmap, U32 depth)
1667 {
1668  U32 state;
1669  SV *sv=sv_newmortal();
1670  int colwidth= widecharmap ? 6 : 4;
1671  U16 word;
1672  GET_RE_DEBUG_FLAGS_DECL;
1673
1674  PERL_ARGS_ASSERT_DUMP_TRIE;
1675
1676  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1677   (int)depth * 2 + 2,"",
1678   "Match","Base","Ofs" );
1679
1680  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1681   SV ** const tmp = av_fetch( revcharmap, state, 0);
1682   if ( tmp ) {
1683    PerlIO_printf( Perl_debug_log, "%*s",
1684     colwidth,
1685     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1686        PL_colors[0], PL_colors[1],
1687        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1688        PERL_PV_ESCAPE_FIRSTCHAR
1689     )
1690    );
1691   }
1692  }
1693  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1694   (int)depth * 2 + 2,"");
1695
1696  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1697   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1698  PerlIO_printf( Perl_debug_log, "\n");
1699
1700  for( state = 1 ; state < trie->statecount ; state++ ) {
1701   const U32 base = trie->states[ state ].trans.base;
1702
1703   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1704          (int)depth * 2 + 2,"", (UV)state);
1705
1706   if ( trie->states[ state ].wordnum ) {
1707    PerlIO_printf( Perl_debug_log, " W%4X",
1708           trie->states[ state ].wordnum );
1709   } else {
1710    PerlIO_printf( Perl_debug_log, "%6s", "" );
1711   }
1712
1713   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1714
1715   if ( base ) {
1716    U32 ofs = 0;
1717
1718    while( ( base + ofs  < trie->uniquecharcount ) ||
1719     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1720      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1721                  != state))
1722      ofs++;
1723
1724    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1725
1726    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1727     if ( ( base + ofs >= trie->uniquecharcount )
1728       && ( base + ofs - trie->uniquecharcount
1729               < trie->lasttrans )
1730       && trie->trans[ base + ofs
1731          - trie->uniquecharcount ].check == state )
1732     {
1733     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1734      colwidth,
1735      (UV)trie->trans[ base + ofs
1736            - trie->uniquecharcount ].next );
1737     } else {
1738      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1739     }
1740    }
1741
1742    PerlIO_printf( Perl_debug_log, "]");
1743
1744   }
1745   PerlIO_printf( Perl_debug_log, "\n" );
1746  }
1747  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1748         (int)depth*2, "");
1749  for (word=1; word <= trie->wordcount; word++) {
1750   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1751    (int)word, (int)(trie->wordinfo[word].prev),
1752    (int)(trie->wordinfo[word].len));
1753  }
1754  PerlIO_printf(Perl_debug_log, "\n" );
1755 }
1756 /*
1757   Dumps a fully constructed but uncompressed trie in list form.
1758   List tries normally only are used for construction when the number of
1759   possible chars (trie->uniquecharcount) is very high.
1760   Used for debugging make_trie().
1761 */
1762 STATIC void
1763 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1764       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1765       U32 depth)
1766 {
1767  U32 state;
1768  SV *sv=sv_newmortal();
1769  int colwidth= widecharmap ? 6 : 4;
1770  GET_RE_DEBUG_FLAGS_DECL;
1771
1772  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1773
1774  /* print out the table precompression.  */
1775  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1776   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1777   "------:-----+-----------------\n" );
1778
1779  for( state=1 ; state < next_alloc ; state ++ ) {
1780   U16 charid;
1781
1782   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1783    (int)depth * 2 + 2,"", (UV)state  );
1784   if ( ! trie->states[ state ].wordnum ) {
1785    PerlIO_printf( Perl_debug_log, "%5s| ","");
1786   } else {
1787    PerlIO_printf( Perl_debug_log, "W%4x| ",
1788     trie->states[ state ].wordnum
1789    );
1790   }
1791   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1792    SV ** const tmp = av_fetch( revcharmap,
1793           TRIE_LIST_ITEM(state,charid).forid, 0);
1794    if ( tmp ) {
1795     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1796      colwidth,
1797      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1798        colwidth,
1799        PL_colors[0], PL_colors[1],
1800        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1801        | PERL_PV_ESCAPE_FIRSTCHAR
1802      ) ,
1803      TRIE_LIST_ITEM(state,charid).forid,
1804      (UV)TRIE_LIST_ITEM(state,charid).newstate
1805     );
1806     if (!(charid % 10))
1807      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1808       (int)((depth * 2) + 14), "");
1809    }
1810   }
1811   PerlIO_printf( Perl_debug_log, "\n");
1812  }
1813 }
1814
1815 /*
1816   Dumps a fully constructed but uncompressed trie in table form.
1817   This is the normal DFA style state transition table, with a few
1818   twists to facilitate compression later.
1819   Used for debugging make_trie().
1820 */
1821 STATIC void
1822 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1823       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1824       U32 depth)
1825 {
1826  U32 state;
1827  U16 charid;
1828  SV *sv=sv_newmortal();
1829  int colwidth= widecharmap ? 6 : 4;
1830  GET_RE_DEBUG_FLAGS_DECL;
1831
1832  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1833
1834  /*
1835  print out the table precompression so that we can do a visual check
1836  that they are identical.
1837  */
1838
1839  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1840
1841  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1842   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1843   if ( tmp ) {
1844    PerlIO_printf( Perl_debug_log, "%*s",
1845     colwidth,
1846     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1847        PL_colors[0], PL_colors[1],
1848        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1849        PERL_PV_ESCAPE_FIRSTCHAR
1850     )
1851    );
1852   }
1853  }
1854
1855  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1856
1857  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1858   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1859  }
1860
1861  PerlIO_printf( Perl_debug_log, "\n" );
1862
1863  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1864
1865   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1866    (int)depth * 2 + 2,"",
1867    (UV)TRIE_NODENUM( state ) );
1868
1869   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1870    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1871    if (v)
1872     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1873    else
1874     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1875   }
1876   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1877    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1878            (UV)trie->trans[ state ].check );
1879   } else {
1880    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1881            (UV)trie->trans[ state ].check,
1882    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1883   }
1884  }
1885 }
1886
1887 #endif
1888
1889
1890 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1891   startbranch: the first branch in the whole branch sequence
1892   first      : start branch of sequence of branch-exact nodes.
1893    May be the same as startbranch
1894   last       : Thing following the last branch.
1895    May be the same as tail.
1896   tail       : item following the branch sequence
1897   count      : words in the sequence
1898   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
1899   depth      : indent depth
1900
1901 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1902
1903 A trie is an N'ary tree where the branches are determined by digital
1904 decomposition of the key. IE, at the root node you look up the 1st character and
1905 follow that branch repeat until you find the end of the branches. Nodes can be
1906 marked as "accepting" meaning they represent a complete word. Eg:
1907
1908   /he|she|his|hers/
1909
1910 would convert into the following structure. Numbers represent states, letters
1911 following numbers represent valid transitions on the letter from that state, if
1912 the number is in square brackets it represents an accepting state, otherwise it
1913 will be in parenthesis.
1914
1915  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1916  |    |
1917  |   (2)
1918  |    |
1919  (1)   +-i->(6)-+-s->[7]
1920  |
1921  +-s->(3)-+-h->(4)-+-e->[5]
1922
1923  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1924
1925 This shows that when matching against the string 'hers' we will begin at state 1
1926 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1927 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1928 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1929 single traverse. We store a mapping from accepting to state to which word was
1930 matched, and then when we have multiple possibilities we try to complete the
1931 rest of the regex in the order in which they occurred in the alternation.
1932
1933 The only prior NFA like behaviour that would be changed by the TRIE support is
1934 the silent ignoring of duplicate alternations which are of the form:
1935
1936  / (DUPE|DUPE) X? (?{ ... }) Y /x
1937
1938 Thus EVAL blocks following a trie may be called a different number of times with
1939 and without the optimisation. With the optimisations dupes will be silently
1940 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1941 the following demonstrates:
1942
1943  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1944
1945 which prints out 'word' three times, but
1946
1947  'words'=~/(word|word|word)(?{ print $1 })S/
1948
1949 which doesnt print it out at all. This is due to other optimisations kicking in.
1950
1951 Example of what happens on a structural level:
1952
1953 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1954
1955    1: CURLYM[1] {1,32767}(18)
1956    5:   BRANCH(8)
1957    6:     EXACT <ac>(16)
1958    8:   BRANCH(11)
1959    9:     EXACT <ad>(16)
1960   11:   BRANCH(14)
1961   12:     EXACT <ab>(16)
1962   16:   SUCCEED(0)
1963   17:   NOTHING(18)
1964   18: END(0)
1965
1966 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1967 and should turn into:
1968
1969    1: CURLYM[1] {1,32767}(18)
1970    5:   TRIE(16)
1971   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1972   <ac>
1973   <ad>
1974   <ab>
1975   16:   SUCCEED(0)
1976   17:   NOTHING(18)
1977   18: END(0)
1978
1979 Cases where tail != last would be like /(?foo|bar)baz/:
1980
1981    1: BRANCH(4)
1982    2:   EXACT <foo>(8)
1983    4: BRANCH(7)
1984    5:   EXACT <bar>(8)
1985    7: TAIL(8)
1986    8: EXACT <baz>(10)
1987   10: END(0)
1988
1989 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1990 and would end up looking like:
1991
1992  1: TRIE(8)
1993  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1994   <foo>
1995   <bar>
1996    7: TAIL(8)
1997    8: EXACT <baz>(10)
1998   10: END(0)
1999
2000  d = uvchr_to_utf8_flags(d, uv, 0);
2001
2002 is the recommended Unicode-aware way of saying
2003
2004  *(d++) = uv;
2005 */
2006
2007 #define TRIE_STORE_REVCHAR(val)                                            \
2008  STMT_START {                                                           \
2009   if (UTF) {          \
2010    SV *zlopp = newSV(7); /* XXX: optimize me */                   \
2011    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
2012    unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2013    SvCUR_set(zlopp, kapow - flrbbbbb);       \
2014    SvPOK_on(zlopp);         \
2015    SvUTF8_on(zlopp);         \
2016    av_push(revcharmap, zlopp);        \
2017   } else {          \
2018    char ooooff = (char)val;                                           \
2019    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
2020   }           \
2021   } STMT_END
2022
2023 /* This gets the next character from the input, folding it if not already
2024  * folded. */
2025 #define TRIE_READ_CHAR STMT_START {                                           \
2026  wordlen++;                                                                \
2027  if ( UTF ) {                                                              \
2028   /* if it is UTF then it is either already folded, or does not need    \
2029   * folding */                                                         \
2030   uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2031  }                                                                         \
2032  else if (folder == PL_fold_latin1) {                                      \
2033   /* This folder implies Unicode rules, which in the range expressible  \
2034   *  by not UTF is the lower case, with the two exceptions, one of     \
2035   *  which should have been taken care of before calling this */       \
2036   assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2037   uvc = toLOWER_L1(*uc);                                                \
2038   if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2039   len = 1;                                                              \
2040  } else {                                                                  \
2041   /* raw data, will be folded later if needed */                        \
2042   uvc = (U32)*uc;                                                       \
2043   len = 1;                                                              \
2044  }                                                                         \
2045 } STMT_END
2046
2047
2048
2049 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2050  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2051   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
2052   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2053  }                                                           \
2054  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2055  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2056  TRIE_LIST_CUR( state )++;                                   \
2057 } STMT_END
2058
2059 #define TRIE_LIST_NEW(state) STMT_START {                       \
2060  Newxz( trie->states[ state ].trans.list,               \
2061   4, reg_trie_trans_le );                                 \
2062  TRIE_LIST_CUR( state ) = 1;                                \
2063  TRIE_LIST_LEN( state ) = 4;                                \
2064 } STMT_END
2065
2066 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2067  U16 dupe= trie->states[ state ].wordnum;                    \
2068  regnode * const noper_next = regnext( noper );              \
2069                 \
2070  DEBUG_r({                                                   \
2071   /* store the word for dumping */                        \
2072   SV* tmp;                                                \
2073   if (OP(noper) != NOTHING)                               \
2074    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2075   else                                                    \
2076    tmp = newSVpvn_utf8( "", 0, UTF );   \
2077   av_push( trie_words, tmp );                             \
2078  });                                                         \
2079                 \
2080  curword++;                                                  \
2081  trie->wordinfo[curword].prev   = 0;                         \
2082  trie->wordinfo[curword].len    = wordlen;                   \
2083  trie->wordinfo[curword].accept = state;                     \
2084                 \
2085  if ( noper_next < tail ) {                                  \
2086   if (!trie->jump)                                        \
2087    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2088             sizeof(U16) ); \
2089   trie->jump[curword] = (U16)(noper_next - convert);      \
2090   if (!jumper)                                            \
2091    jumper = noper_next;                                \
2092   if (!nextbranch)                                        \
2093    nextbranch= regnext(cur);                           \
2094  }                                                           \
2095                 \
2096  if ( dupe ) {                                               \
2097   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2098   /* chain, so that when the bits of chain are later    */\
2099   /* linked together, the dups appear in the chain      */\
2100   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2101   trie->wordinfo[dupe].prev = curword;                    \
2102  } else {                                                    \
2103   /* we haven't inserted this word yet.                */ \
2104   trie->states[ state ].wordnum = curword;                \
2105  }                                                           \
2106 } STMT_END
2107
2108
2109 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
2110  ( ( base + charid >=  ucharcount     \
2111   && base + charid < ubound     \
2112   && state == trie->trans[ base - ucharcount + charid ].check \
2113   && trie->trans[ base - ucharcount + charid ].next )  \
2114   ? trie->trans[ base - ucharcount + charid ].next  \
2115   : ( state==1 ? special : 0 )     \
2116  )
2117
2118 #define MADE_TRIE       1
2119 #define MADE_JUMP_TRIE  2
2120 #define MADE_EXACT_TRIE 4
2121
2122 STATIC I32
2123 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2124     regnode *first, regnode *last, regnode *tail,
2125     U32 word_count, U32 flags, U32 depth)
2126 {
2127  /* first pass, loop through and scan words */
2128  reg_trie_data *trie;
2129  HV *widecharmap = NULL;
2130  AV *revcharmap = newAV();
2131  regnode *cur;
2132  STRLEN len = 0;
2133  UV uvc = 0;
2134  U16 curword = 0;
2135  U32 next_alloc = 0;
2136  regnode *jumper = NULL;
2137  regnode *nextbranch = NULL;
2138  regnode *convert = NULL;
2139  U32 *prev_states; /* temp array mapping each state to previous one */
2140  /* we just use folder as a flag in utf8 */
2141  const U8 * folder = NULL;
2142
2143 #ifdef DEBUGGING
2144  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2145  AV *trie_words = NULL;
2146  /* along with revcharmap, this only used during construction but both are
2147  * useful during debugging so we store them in the struct when debugging.
2148  */
2149 #else
2150  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2151  STRLEN trie_charcount=0;
2152 #endif
2153  SV *re_trie_maxbuff;
2154  GET_RE_DEBUG_FLAGS_DECL;
2155
2156  PERL_ARGS_ASSERT_MAKE_TRIE;
2157 #ifndef DEBUGGING
2158  PERL_UNUSED_ARG(depth);
2159 #endif
2160
2161  switch (flags) {
2162   case EXACT: case EXACTL: break;
2163   case EXACTFA:
2164   case EXACTFU_SS:
2165   case EXACTFU:
2166   case EXACTFLU8: folder = PL_fold_latin1; break;
2167   case EXACTF:  folder = PL_fold; break;
2168   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2169  }
2170
2171  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2172  trie->refcount = 1;
2173  trie->startstate = 1;
2174  trie->wordcount = word_count;
2175  RExC_rxi->data->data[ data_slot ] = (void*)trie;
2176  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2177  if (flags == EXACT || flags == EXACTL)
2178   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2179  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2180      trie->wordcount+1, sizeof(reg_trie_wordinfo));
2181
2182  DEBUG_r({
2183   trie_words = newAV();
2184  });
2185
2186  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2187  assert(re_trie_maxbuff);
2188  if (!SvIOK(re_trie_maxbuff)) {
2189   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2190  }
2191  DEBUG_TRIE_COMPILE_r({
2192   PerlIO_printf( Perl_debug_log,
2193   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2194   (int)depth * 2 + 2, "",
2195   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2196   REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2197  });
2198
2199    /* Find the node we are going to overwrite */
2200  if ( first == startbranch && OP( last ) != BRANCH ) {
2201   /* whole branch chain */
2202   convert = first;
2203  } else {
2204   /* branch sub-chain */
2205   convert = NEXTOPER( first );
2206  }
2207
2208  /*  -- First loop and Setup --
2209
2210  We first traverse the branches and scan each word to determine if it
2211  contains widechars, and how many unique chars there are, this is
2212  important as we have to build a table with at least as many columns as we
2213  have unique chars.
2214
2215  We use an array of integers to represent the character codes 0..255
2216  (trie->charmap) and we use a an HV* to store Unicode characters. We use
2217  the native representation of the character value as the key and IV's for
2218  the coded index.
2219
2220  *TODO* If we keep track of how many times each character is used we can
2221  remap the columns so that the table compression later on is more
2222  efficient in terms of memory by ensuring the most common value is in the
2223  middle and the least common are on the outside.  IMO this would be better
2224  than a most to least common mapping as theres a decent chance the most
2225  common letter will share a node with the least common, meaning the node
2226  will not be compressible. With a middle is most common approach the worst
2227  case is when we have the least common nodes twice.
2228
2229  */
2230
2231  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2232   regnode *noper = NEXTOPER( cur );
2233   const U8 *uc = (U8*)STRING( noper );
2234   const U8 *e  = uc + STR_LEN( noper );
2235   int foldlen = 0;
2236   U32 wordlen      = 0;         /* required init */
2237   STRLEN minchars = 0;
2238   STRLEN maxchars = 0;
2239   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2240            bitmap?*/
2241
2242   if (OP(noper) == NOTHING) {
2243    regnode *noper_next= regnext(noper);
2244    if (noper_next != tail && OP(noper_next) == flags) {
2245     noper = noper_next;
2246     uc= (U8*)STRING(noper);
2247     e= uc + STR_LEN(noper);
2248     trie->minlen= STR_LEN(noper);
2249    } else {
2250     trie->minlen= 0;
2251     continue;
2252    }
2253   }
2254
2255   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2256    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2257           regardless of encoding */
2258    if (OP( noper ) == EXACTFU_SS) {
2259     /* false positives are ok, so just set this */
2260     TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2261    }
2262   }
2263   for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2264           branch */
2265    TRIE_CHARCOUNT(trie)++;
2266    TRIE_READ_CHAR;
2267
2268    /* TRIE_READ_CHAR returns the current character, or its fold if /i
2269    * is in effect.  Under /i, this character can match itself, or
2270    * anything that folds to it.  If not under /i, it can match just
2271    * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2272    * all fold to k, and all are single characters.   But some folds
2273    * expand to more than one character, so for example LATIN SMALL
2274    * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2275    * the string beginning at 'uc' is 'ffi', it could be matched by
2276    * three characters, or just by the one ligature character. (It
2277    * could also be matched by two characters: LATIN SMALL LIGATURE FF
2278    * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2279    * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2280    * match.)  The trie needs to know the minimum and maximum number
2281    * of characters that could match so that it can use size alone to
2282    * quickly reject many match attempts.  The max is simple: it is
2283    * the number of folded characters in this branch (since a fold is
2284    * never shorter than what folds to it. */
2285
2286    maxchars++;
2287
2288    /* And the min is equal to the max if not under /i (indicated by
2289    * 'folder' being NULL), or there are no multi-character folds.  If
2290    * there is a multi-character fold, the min is incremented just
2291    * once, for the character that folds to the sequence.  Each
2292    * character in the sequence needs to be added to the list below of
2293    * characters in the trie, but we count only the first towards the
2294    * min number of characters needed.  This is done through the
2295    * variable 'foldlen', which is returned by the macros that look
2296    * for these sequences as the number of bytes the sequence
2297    * occupies.  Each time through the loop, we decrement 'foldlen' by
2298    * how many bytes the current char occupies.  Only when it reaches
2299    * 0 do we increment 'minchars' or look for another multi-character
2300    * sequence. */
2301    if (folder == NULL) {
2302     minchars++;
2303    }
2304    else if (foldlen > 0) {
2305     foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2306    }
2307    else {
2308     minchars++;
2309
2310     /* See if *uc is the beginning of a multi-character fold.  If
2311     * so, we decrement the length remaining to look at, to account
2312     * for the current character this iteration.  (We can use 'uc'
2313     * instead of the fold returned by TRIE_READ_CHAR because for
2314     * non-UTF, the latin1_safe macro is smart enough to account
2315     * for all the unfolded characters, and because for UTF, the
2316     * string will already have been folded earlier in the
2317     * compilation process */
2318     if (UTF) {
2319      if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2320       foldlen -= UTF8SKIP(uc);
2321      }
2322     }
2323     else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2324      foldlen--;
2325     }
2326    }
2327
2328    /* The current character (and any potential folds) should be added
2329    * to the possible matching characters for this position in this
2330    * branch */
2331    if ( uvc < 256 ) {
2332     if ( folder ) {
2333      U8 folded= folder[ (U8) uvc ];
2334      if ( !trie->charmap[ folded ] ) {
2335       trie->charmap[ folded ]=( ++trie->uniquecharcount );
2336       TRIE_STORE_REVCHAR( folded );
2337      }
2338     }
2339     if ( !trie->charmap[ uvc ] ) {
2340      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2341      TRIE_STORE_REVCHAR( uvc );
2342     }
2343     if ( set_bit ) {
2344      /* store the codepoint in the bitmap, and its folded
2345      * equivalent. */
2346      TRIE_BITMAP_SET(trie, uvc);
2347
2348      /* store the folded codepoint */
2349      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2350
2351      if ( !UTF ) {
2352       /* store first byte of utf8 representation of
2353       variant codepoints */
2354       if (! UVCHR_IS_INVARIANT(uvc)) {
2355        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2356       }
2357      }
2358      set_bit = 0; /* We've done our bit :-) */
2359     }
2360    } else {
2361
2362     /* XXX We could come up with the list of code points that fold
2363     * to this using PL_utf8_foldclosures, except not for
2364     * multi-char folds, as there may be multiple combinations
2365     * there that could work, which needs to wait until runtime to
2366     * resolve (The comment about LIGATURE FFI above is such an
2367     * example */
2368
2369     SV** svpp;
2370     if ( !widecharmap )
2371      widecharmap = newHV();
2372
2373     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2374
2375     if ( !svpp )
2376      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2377
2378     if ( !SvTRUE( *svpp ) ) {
2379      sv_setiv( *svpp, ++trie->uniquecharcount );
2380      TRIE_STORE_REVCHAR(uvc);
2381     }
2382    }
2383   } /* end loop through characters in this branch of the trie */
2384
2385   /* We take the min and max for this branch and combine to find the min
2386   * and max for all branches processed so far */
2387   if( cur == first ) {
2388    trie->minlen = minchars;
2389    trie->maxlen = maxchars;
2390   } else if (minchars < trie->minlen) {
2391    trie->minlen = minchars;
2392   } else if (maxchars > trie->maxlen) {
2393    trie->maxlen = maxchars;
2394   }
2395  } /* end first pass */
2396  DEBUG_TRIE_COMPILE_r(
2397   PerlIO_printf( Perl_debug_log,
2398     "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2399     (int)depth * 2 + 2,"",
2400     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2401     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2402     (int)trie->minlen, (int)trie->maxlen )
2403  );
2404
2405  /*
2406   We now know what we are dealing with in terms of unique chars and
2407   string sizes so we can calculate how much memory a naive
2408   representation using a flat table  will take. If it's over a reasonable
2409   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2410   conservative but potentially much slower representation using an array
2411   of lists.
2412
2413   At the end we convert both representations into the same compressed
2414   form that will be used in regexec.c for matching with. The latter
2415   is a form that cannot be used to construct with but has memory
2416   properties similar to the list form and access properties similar
2417   to the table form making it both suitable for fast searches and
2418   small enough that its feasable to store for the duration of a program.
2419
2420   See the comment in the code where the compressed table is produced
2421   inplace from the flat tabe representation for an explanation of how
2422   the compression works.
2423
2424  */
2425
2426
2427  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2428  prev_states[1] = 0;
2429
2430  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2431              > SvIV(re_trie_maxbuff) )
2432  {
2433   /*
2434    Second Pass -- Array Of Lists Representation
2435
2436    Each state will be represented by a list of charid:state records
2437    (reg_trie_trans_le) the first such element holds the CUR and LEN
2438    points of the allocated array. (See defines above).
2439
2440    We build the initial structure using the lists, and then convert
2441    it into the compressed table form which allows faster lookups
2442    (but cant be modified once converted).
2443   */
2444
2445   STRLEN transcount = 1;
2446
2447   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2448    "%*sCompiling trie using list compiler\n",
2449    (int)depth * 2 + 2, ""));
2450
2451   trie->states = (reg_trie_state *)
2452    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2453         sizeof(reg_trie_state) );
2454   TRIE_LIST_NEW(1);
2455   next_alloc = 2;
2456
2457   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2458
2459    regnode *noper   = NEXTOPER( cur );
2460    U8 *uc           = (U8*)STRING( noper );
2461    const U8 *e      = uc + STR_LEN( noper );
2462    U32 state        = 1;         /* required init */
2463    U16 charid       = 0;         /* sanity init */
2464    U32 wordlen      = 0;         /* required init */
2465
2466    if (OP(noper) == NOTHING) {
2467     regnode *noper_next= regnext(noper);
2468     if (noper_next != tail && OP(noper_next) == flags) {
2469      noper = noper_next;
2470      uc= (U8*)STRING(noper);
2471      e= uc + STR_LEN(noper);
2472     }
2473    }
2474
2475    if (OP(noper) != NOTHING) {
2476     for ( ; uc < e ; uc += len ) {
2477
2478      TRIE_READ_CHAR;
2479
2480      if ( uvc < 256 ) {
2481       charid = trie->charmap[ uvc ];
2482      } else {
2483       SV** const svpp = hv_fetch( widecharmap,
2484              (char*)&uvc,
2485              sizeof( UV ),
2486              0);
2487       if ( !svpp ) {
2488        charid = 0;
2489       } else {
2490        charid=(U16)SvIV( *svpp );
2491       }
2492      }
2493      /* charid is now 0 if we dont know the char read, or
2494      * nonzero if we do */
2495      if ( charid ) {
2496
2497       U16 check;
2498       U32 newstate = 0;
2499
2500       charid--;
2501       if ( !trie->states[ state ].trans.list ) {
2502        TRIE_LIST_NEW( state );
2503       }
2504       for ( check = 1;
2505        check <= TRIE_LIST_USED( state );
2506        check++ )
2507       {
2508        if ( TRIE_LIST_ITEM( state, check ).forid
2509                  == charid )
2510        {
2511         newstate = TRIE_LIST_ITEM( state, check ).newstate;
2512         break;
2513        }
2514       }
2515       if ( ! newstate ) {
2516        newstate = next_alloc++;
2517        prev_states[newstate] = state;
2518        TRIE_LIST_PUSH( state, charid, newstate );
2519        transcount++;
2520       }
2521       state = newstate;
2522      } else {
2523       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2524      }
2525     }
2526    }
2527    TRIE_HANDLE_WORD(state);
2528
2529   } /* end second pass */
2530
2531   /* next alloc is the NEXT state to be allocated */
2532   trie->statecount = next_alloc;
2533   trie->states = (reg_trie_state *)
2534    PerlMemShared_realloc( trie->states,
2535         next_alloc
2536         * sizeof(reg_trie_state) );
2537
2538   /* and now dump it out before we compress it */
2539   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2540               revcharmap, next_alloc,
2541               depth+1)
2542   );
2543
2544   trie->trans = (reg_trie_trans *)
2545    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2546   {
2547    U32 state;
2548    U32 tp = 0;
2549    U32 zp = 0;
2550
2551
2552    for( state=1 ; state < next_alloc ; state ++ ) {
2553     U32 base=0;
2554
2555     /*
2556     DEBUG_TRIE_COMPILE_MORE_r(
2557      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2558     );
2559     */
2560
2561     if (trie->states[state].trans.list) {
2562      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2563      U16 maxid=minid;
2564      U16 idx;
2565
2566      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2567       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2568       if ( forid < minid ) {
2569        minid=forid;
2570       } else if ( forid > maxid ) {
2571        maxid=forid;
2572       }
2573      }
2574      if ( transcount < tp + maxid - minid + 1) {
2575       transcount *= 2;
2576       trie->trans = (reg_trie_trans *)
2577        PerlMemShared_realloc( trie->trans,
2578              transcount
2579              * sizeof(reg_trie_trans) );
2580       Zero( trie->trans + (transcount / 2),
2581        transcount / 2,
2582        reg_trie_trans );
2583      }
2584      base = trie->uniquecharcount + tp - minid;
2585      if ( maxid == minid ) {
2586       U32 set = 0;
2587       for ( ; zp < tp ; zp++ ) {
2588        if ( ! trie->trans[ zp ].next ) {
2589         base = trie->uniquecharcount + zp - minid;
2590         trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2591                 1).newstate;
2592         trie->trans[ zp ].check = state;
2593         set = 1;
2594         break;
2595        }
2596       }
2597       if ( !set ) {
2598        trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2599                 1).newstate;
2600        trie->trans[ tp ].check = state;
2601        tp++;
2602        zp = tp;
2603       }
2604      } else {
2605       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2606        const U32 tid = base
2607           - trie->uniquecharcount
2608           + TRIE_LIST_ITEM( state, idx ).forid;
2609        trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2610                 idx ).newstate;
2611        trie->trans[ tid ].check = state;
2612       }
2613       tp += ( maxid - minid + 1 );
2614      }
2615      Safefree(trie->states[ state ].trans.list);
2616     }
2617     /*
2618     DEBUG_TRIE_COMPILE_MORE_r(
2619      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2620     );
2621     */
2622     trie->states[ state ].trans.base=base;
2623    }
2624    trie->lasttrans = tp + 1;
2625   }
2626  } else {
2627   /*
2628   Second Pass -- Flat Table Representation.
2629
2630   we dont use the 0 slot of either trans[] or states[] so we add 1 to
2631   each.  We know that we will need Charcount+1 trans at most to store
2632   the data (one row per char at worst case) So we preallocate both
2633   structures assuming worst case.
2634
2635   We then construct the trie using only the .next slots of the entry
2636   structs.
2637
2638   We use the .check field of the first entry of the node temporarily
2639   to make compression both faster and easier by keeping track of how
2640   many non zero fields are in the node.
2641
2642   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2643   transition.
2644
2645   There are two terms at use here: state as a TRIE_NODEIDX() which is
2646   a number representing the first entry of the node, and state as a
2647   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2648   and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2649   if there are 2 entrys per node. eg:
2650
2651    A B       A B
2652   1. 2 4    1. 3 7
2653   2. 0 3    3. 0 5
2654   3. 0 0    5. 0 0
2655   4. 0 0    7. 0 0
2656
2657   The table is internally in the right hand, idx form. However as we
2658   also have to deal with the states array which is indexed by nodenum
2659   we have to use TRIE_NODENUM() to convert.
2660
2661   */
2662   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2663    "%*sCompiling trie using table compiler\n",
2664    (int)depth * 2 + 2, ""));
2665
2666   trie->trans = (reg_trie_trans *)
2667    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2668         * trie->uniquecharcount + 1,
2669         sizeof(reg_trie_trans) );
2670   trie->states = (reg_trie_state *)
2671    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2672         sizeof(reg_trie_state) );
2673   next_alloc = trie->uniquecharcount + 1;
2674
2675
2676   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2677
2678    regnode *noper   = NEXTOPER( cur );
2679    const U8 *uc     = (U8*)STRING( noper );
2680    const U8 *e      = uc + STR_LEN( noper );
2681
2682    U32 state        = 1;         /* required init */
2683
2684    U16 charid       = 0;         /* sanity init */
2685    U32 accept_state = 0;         /* sanity init */
2686
2687    U32 wordlen      = 0;         /* required init */
2688
2689    if (OP(noper) == NOTHING) {
2690     regnode *noper_next= regnext(noper);
2691     if (noper_next != tail && OP(noper_next) == flags) {
2692      noper = noper_next;
2693      uc= (U8*)STRING(noper);
2694      e= uc + STR_LEN(noper);
2695     }
2696    }
2697
2698    if ( OP(noper) != NOTHING ) {
2699     for ( ; uc < e ; uc += len ) {
2700
2701      TRIE_READ_CHAR;
2702
2703      if ( uvc < 256 ) {
2704       charid = trie->charmap[ uvc ];
2705      } else {
2706       SV* const * const svpp = hv_fetch( widecharmap,
2707               (char*)&uvc,
2708               sizeof( UV ),
2709               0);
2710       charid = svpp ? (U16)SvIV(*svpp) : 0;
2711      }
2712      if ( charid ) {
2713       charid--;
2714       if ( !trie->trans[ state + charid ].next ) {
2715        trie->trans[ state + charid ].next = next_alloc;
2716        trie->trans[ state ].check++;
2717        prev_states[TRIE_NODENUM(next_alloc)]
2718          = TRIE_NODENUM(state);
2719        next_alloc += trie->uniquecharcount;
2720       }
2721       state = trie->trans[ state + charid ].next;
2722      } else {
2723       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2724      }
2725      /* charid is now 0 if we dont know the char read, or
2726      * nonzero if we do */
2727     }
2728    }
2729    accept_state = TRIE_NODENUM( state );
2730    TRIE_HANDLE_WORD(accept_state);
2731
2732   } /* end second pass */
2733
2734   /* and now dump it out before we compress it */
2735   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2736               revcharmap,
2737               next_alloc, depth+1));
2738
2739   {
2740   /*
2741   * Inplace compress the table.*
2742
2743   For sparse data sets the table constructed by the trie algorithm will
2744   be mostly 0/FAIL transitions or to put it another way mostly empty.
2745   (Note that leaf nodes will not contain any transitions.)
2746
2747   This algorithm compresses the tables by eliminating most such
2748   transitions, at the cost of a modest bit of extra work during lookup:
2749
2750   - Each states[] entry contains a .base field which indicates the
2751   index in the state[] array wheres its transition data is stored.
2752
2753   - If .base is 0 there are no valid transitions from that node.
2754
2755   - If .base is nonzero then charid is added to it to find an entry in
2756   the trans array.
2757
2758   -If trans[states[state].base+charid].check!=state then the
2759   transition is taken to be a 0/Fail transition. Thus if there are fail
2760   transitions at the front of the node then the .base offset will point
2761   somewhere inside the previous nodes data (or maybe even into a node
2762   even earlier), but the .check field determines if the transition is
2763   valid.
2764
2765   XXX - wrong maybe?
2766   The following process inplace converts the table to the compressed
2767   table: We first do not compress the root node 1,and mark all its
2768   .check pointers as 1 and set its .base pointer as 1 as well. This
2769   allows us to do a DFA construction from the compressed table later,
2770   and ensures that any .base pointers we calculate later are greater
2771   than 0.
2772
2773   - We set 'pos' to indicate the first entry of the second node.
2774
2775   - We then iterate over the columns of the node, finding the first and
2776   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2777   and set the .check pointers accordingly, and advance pos
2778   appropriately and repreat for the next node. Note that when we copy
2779   the next pointers we have to convert them from the original
2780   NODEIDX form to NODENUM form as the former is not valid post
2781   compression.
2782
2783   - If a node has no transitions used we mark its base as 0 and do not
2784   advance the pos pointer.
2785
2786   - If a node only has one transition we use a second pointer into the
2787   structure to fill in allocated fail transitions from other states.
2788   This pointer is independent of the main pointer and scans forward
2789   looking for null transitions that are allocated to a state. When it
2790   finds one it writes the single transition into the "hole".  If the
2791   pointer doesnt find one the single transition is appended as normal.
2792
2793   - Once compressed we can Renew/realloc the structures to release the
2794   excess space.
2795
2796   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2797   specifically Fig 3.47 and the associated pseudocode.
2798
2799   demq
2800   */
2801   const U32 laststate = TRIE_NODENUM( next_alloc );
2802   U32 state, charid;
2803   U32 pos = 0, zp=0;
2804   trie->statecount = laststate;
2805
2806   for ( state = 1 ; state < laststate ; state++ ) {
2807    U8 flag = 0;
2808    const U32 stateidx = TRIE_NODEIDX( state );
2809    const U32 o_used = trie->trans[ stateidx ].check;
2810    U32 used = trie->trans[ stateidx ].check;
2811    trie->trans[ stateidx ].check = 0;
2812
2813    for ( charid = 0;
2814     used && charid < trie->uniquecharcount;
2815     charid++ )
2816    {
2817     if ( flag || trie->trans[ stateidx + charid ].next ) {
2818      if ( trie->trans[ stateidx + charid ].next ) {
2819       if (o_used == 1) {
2820        for ( ; zp < pos ; zp++ ) {
2821         if ( ! trie->trans[ zp ].next ) {
2822          break;
2823         }
2824        }
2825        trie->states[ state ].trans.base
2826              = zp
2827              + trie->uniquecharcount
2828              - charid ;
2829        trie->trans[ zp ].next
2830         = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2831                + charid ].next );
2832        trie->trans[ zp ].check = state;
2833        if ( ++zp > pos ) pos = zp;
2834        break;
2835       }
2836       used--;
2837      }
2838      if ( !flag ) {
2839       flag = 1;
2840       trie->states[ state ].trans.base
2841          = pos + trie->uniquecharcount - charid ;
2842      }
2843      trie->trans[ pos ].next
2844       = SAFE_TRIE_NODENUM(
2845          trie->trans[ stateidx + charid ].next );
2846      trie->trans[ pos ].check = state;
2847      pos++;
2848     }
2849    }
2850   }
2851   trie->lasttrans = pos + 1;
2852   trie->states = (reg_trie_state *)
2853    PerlMemShared_realloc( trie->states, laststate
2854         * sizeof(reg_trie_state) );
2855   DEBUG_TRIE_COMPILE_MORE_r(
2856    PerlIO_printf( Perl_debug_log,
2857     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2858     (int)depth * 2 + 2,"",
2859     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2860      + 1 ),
2861     (IV)next_alloc,
2862     (IV)pos,
2863     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2864    );
2865
2866   } /* end table compress */
2867  }
2868  DEBUG_TRIE_COMPILE_MORE_r(
2869    PerlIO_printf(Perl_debug_log,
2870     "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2871     (int)depth * 2 + 2, "",
2872     (UV)trie->statecount,
2873     (UV)trie->lasttrans)
2874  );
2875  /* resize the trans array to remove unused space */
2876  trie->trans = (reg_trie_trans *)
2877   PerlMemShared_realloc( trie->trans, trie->lasttrans
2878        * sizeof(reg_trie_trans) );
2879
2880  {   /* Modify the program and insert the new TRIE node */
2881   U8 nodetype =(U8)(flags & 0xFF);
2882   char *str=NULL;
2883
2884 #ifdef DEBUGGING
2885   regnode *optimize = NULL;
2886 #ifdef RE_TRACK_PATTERN_OFFSETS
2887
2888   U32 mjd_offset = 0;
2889   U32 mjd_nodelen = 0;
2890 #endif /* RE_TRACK_PATTERN_OFFSETS */
2891 #endif /* DEBUGGING */
2892   /*
2893   This means we convert either the first branch or the first Exact,
2894   depending on whether the thing following (in 'last') is a branch
2895   or not and whther first is the startbranch (ie is it a sub part of
2896   the alternation or is it the whole thing.)
2897   Assuming its a sub part we convert the EXACT otherwise we convert
2898   the whole branch sequence, including the first.
2899   */
2900   /* Find the node we are going to overwrite */
2901   if ( first != startbranch || OP( last ) == BRANCH ) {
2902    /* branch sub-chain */
2903    NEXT_OFF( first ) = (U16)(last - first);
2904 #ifdef RE_TRACK_PATTERN_OFFSETS
2905    DEBUG_r({
2906     mjd_offset= Node_Offset((convert));
2907     mjd_nodelen= Node_Length((convert));
2908    });
2909 #endif
2910    /* whole branch chain */
2911   }
2912 #ifdef RE_TRACK_PATTERN_OFFSETS
2913   else {
2914    DEBUG_r({
2915     const  regnode *nop = NEXTOPER( convert );
2916     mjd_offset= Node_Offset((nop));
2917     mjd_nodelen= Node_Length((nop));
2918    });
2919   }
2920   DEBUG_OPTIMISE_r(
2921    PerlIO_printf(Perl_debug_log,
2922     "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2923     (int)depth * 2 + 2, "",
2924     (UV)mjd_offset, (UV)mjd_nodelen)
2925   );
2926 #endif
2927   /* But first we check to see if there is a common prefix we can
2928   split out as an EXACT and put in front of the TRIE node.  */
2929   trie->startstate= 1;
2930   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2931    U32 state;
2932    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2933     U32 ofs = 0;
2934     I32 idx = -1;
2935     U32 count = 0;
2936     const U32 base = trie->states[ state ].trans.base;
2937
2938     if ( trie->states[state].wordnum )
2939       count = 1;
2940
2941     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2942      if ( ( base + ofs >= trie->uniquecharcount ) &&
2943       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2944       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2945      {
2946       if ( ++count > 1 ) {
2947        SV **tmp = av_fetch( revcharmap, ofs, 0);
2948        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2949        if ( state == 1 ) break;
2950        if ( count == 2 ) {
2951         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2952         DEBUG_OPTIMISE_r(
2953          PerlIO_printf(Perl_debug_log,
2954           "%*sNew Start State=%"UVuf" Class: [",
2955           (int)depth * 2 + 2, "",
2956           (UV)state));
2957         if (idx >= 0) {
2958          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2959          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2960
2961          TRIE_BITMAP_SET(trie,*ch);
2962          if ( folder )
2963           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2964          DEBUG_OPTIMISE_r(
2965           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2966          );
2967         }
2968        }
2969        TRIE_BITMAP_SET(trie,*ch);
2970        if ( folder )
2971         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2972        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2973       }
2974       idx = ofs;
2975      }
2976     }
2977     if ( count == 1 ) {
2978      SV **tmp = av_fetch( revcharmap, idx, 0);
2979      STRLEN len;
2980      char *ch = SvPV( *tmp, len );
2981      DEBUG_OPTIMISE_r({
2982       SV *sv=sv_newmortal();
2983       PerlIO_printf( Perl_debug_log,
2984        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2985        (int)depth * 2 + 2, "",
2986        (UV)state, (UV)idx,
2987        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2988         PL_colors[0], PL_colors[1],
2989         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2990         PERL_PV_ESCAPE_FIRSTCHAR
2991        )
2992       );
2993      });
2994      if ( state==1 ) {
2995       OP( convert ) = nodetype;
2996       str=STRING(convert);
2997       STR_LEN(convert)=0;
2998      }
2999      STR_LEN(convert) += len;
3000      while (len--)
3001       *str++ = *ch++;
3002     } else {
3003 #ifdef DEBUGGING
3004      if (state>1)
3005       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
3006 #endif
3007      break;
3008     }
3009    }
3010    trie->prefixlen = (state-1);
3011    if (str) {
3012     regnode *n = convert+NODE_SZ_STR(convert);
3013     NEXT_OFF(convert) = NODE_SZ_STR(convert);
3014     trie->startstate = state;
3015     trie->minlen -= (state - 1);
3016     trie->maxlen -= (state - 1);
3017 #ifdef DEBUGGING
3018    /* At least the UNICOS C compiler choked on this
3019     * being argument to DEBUG_r(), so let's just have
3020     * it right here. */
3021    if (
3022 #ifdef PERL_EXT_RE_BUILD
3023     1
3024 #else
3025     DEBUG_r_TEST
3026 #endif
3027     ) {
3028     regnode *fix = convert;
3029     U32 word = trie->wordcount;
3030     mjd_nodelen++;
3031     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3032     while( ++fix < n ) {
3033      Set_Node_Offset_Length(fix, 0, 0);
3034     }
3035     while (word--) {
3036      SV ** const tmp = av_fetch( trie_words, word, 0 );
3037      if (tmp) {
3038       if ( STR_LEN(convert) <= SvCUR(*tmp) )
3039        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3040       else
3041        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3042      }
3043     }
3044    }
3045 #endif
3046     if (trie->maxlen) {
3047      convert = n;
3048     } else {
3049      NEXT_OFF(convert) = (U16)(tail - convert);
3050      DEBUG_r(optimize= n);
3051     }
3052    }
3053   }
3054   if (!jumper)
3055    jumper = last;
3056   if ( trie->maxlen ) {
3057    NEXT_OFF( convert ) = (U16)(tail - convert);
3058    ARG_SET( convert, data_slot );
3059    /* Store the offset to the first unabsorbed branch in
3060    jump[0], which is otherwise unused by the jump logic.
3061    We use this when dumping a trie and during optimisation. */
3062    if (trie->jump)
3063     trie->jump[0] = (U16)(nextbranch - convert);
3064
3065    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3066    *   and there is a bitmap
3067    *   and the first "jump target" node we found leaves enough room
3068    * then convert the TRIE node into a TRIEC node, with the bitmap
3069    * embedded inline in the opcode - this is hypothetically faster.
3070    */
3071    if ( !trie->states[trie->startstate].wordnum
3072     && trie->bitmap
3073     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3074    {
3075     OP( convert ) = TRIEC;
3076     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3077     PerlMemShared_free(trie->bitmap);
3078     trie->bitmap= NULL;
3079    } else
3080     OP( convert ) = TRIE;
3081
3082    /* store the type in the flags */
3083    convert->flags = nodetype;
3084    DEBUG_r({
3085    optimize = convert
3086      + NODE_STEP_REGNODE
3087      + regarglen[ OP( convert ) ];
3088    });
3089    /* XXX We really should free up the resource in trie now,
3090     as we won't use them - (which resources?) dmq */
3091   }
3092   /* needed for dumping*/
3093   DEBUG_r(if (optimize) {
3094    regnode *opt = convert;
3095
3096    while ( ++opt < optimize) {
3097     Set_Node_Offset_Length(opt,0,0);
3098    }
3099    /*
3100     Try to clean up some of the debris left after the
3101     optimisation.
3102    */
3103    while( optimize < jumper ) {
3104     mjd_nodelen += Node_Length((optimize));
3105     OP( optimize ) = OPTIMIZED;
3106     Set_Node_Offset_Length(optimize,0,0);
3107     optimize++;
3108    }
3109    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3110   });
3111  } /* end node insert */
3112  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
3113
3114  /*  Finish populating the prev field of the wordinfo array.  Walk back
3115  *  from each accept state until we find another accept state, and if
3116  *  so, point the first word's .prev field at the second word. If the
3117  *  second already has a .prev field set, stop now. This will be the
3118  *  case either if we've already processed that word's accept state,
3119  *  or that state had multiple words, and the overspill words were
3120  *  already linked up earlier.
3121  */
3122  {
3123   U16 word;
3124   U32 state;
3125   U16 prev;
3126
3127   for (word=1; word <= trie->wordcount; word++) {
3128    prev = 0;
3129    if (trie->wordinfo[word].prev)
3130     continue;
3131    state = trie->wordinfo[word].accept;
3132    while (state) {
3133     state = prev_states[state];
3134     if (!state)
3135      break;
3136     prev = trie->states[state].wordnum;
3137     if (prev)
3138      break;
3139    }
3140    trie->wordinfo[word].prev = prev;
3141   }
3142   Safefree(prev_states);
3143  }
3144
3145
3146  /* and now dump out the compressed format */
3147  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3148
3149  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3150 #ifdef DEBUGGING
3151  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3152  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3153 #else
3154  SvREFCNT_dec_NN(revcharmap);
3155 #endif
3156  return trie->jump
3157   ? MADE_JUMP_TRIE
3158   : trie->startstate>1
3159    ? MADE_EXACT_TRIE
3160    : MADE_TRIE;
3161 }
3162
3163 STATIC regnode *
3164 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3165 {
3166 /* The Trie is constructed and compressed now so we can build a fail array if
3167  * it's needed
3168
3169    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3170    3.32 in the
3171    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3172    Ullman 1985/88
3173    ISBN 0-201-10088-6
3174
3175    We find the fail state for each state in the trie, this state is the longest
3176    proper suffix of the current state's 'word' that is also a proper prefix of
3177    another word in our trie. State 1 represents the word '' and is thus the
3178    default fail state. This allows the DFA not to have to restart after its
3179    tried and failed a word at a given point, it simply continues as though it
3180    had been matching the other word in the first place.
3181    Consider
3182  'abcdgu'=~/abcdefg|cdgu/
3183    When we get to 'd' we are still matching the first word, we would encounter
3184    'g' which would fail, which would bring us to the state representing 'd' in
3185    the second word where we would try 'g' and succeed, proceeding to match
3186    'cdgu'.
3187  */
3188  /* add a fail transition */
3189  const U32 trie_offset = ARG(source);
3190  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3191  U32 *q;
3192  const U32 ucharcount = trie->uniquecharcount;
3193  const U32 numstates = trie->statecount;
3194  const U32 ubound = trie->lasttrans + ucharcount;
3195  U32 q_read = 0;
3196  U32 q_write = 0;
3197  U32 charid;
3198  U32 base = trie->states[ 1 ].trans.base;
3199  U32 *fail;
3200  reg_ac_data *aho;
3201  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3202  regnode *stclass;
3203  GET_RE_DEBUG_FLAGS_DECL;
3204
3205  PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3206  PERL_UNUSED_CONTEXT;
3207 #ifndef DEBUGGING
3208  PERL_UNUSED_ARG(depth);
3209 #endif
3210
3211  if ( OP(source) == TRIE ) {
3212   struct regnode_1 *op = (struct regnode_1 *)
3213    PerlMemShared_calloc(1, sizeof(struct regnode_1));
3214   StructCopy(source,op,struct regnode_1);
3215   stclass = (regnode *)op;
3216  } else {
3217   struct regnode_charclass *op = (struct regnode_charclass *)
3218    PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3219   StructCopy(source,op,struct regnode_charclass);
3220   stclass = (regnode *)op;
3221  }
3222  OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3223
3224  ARG_SET( stclass, data_slot );
3225  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3226  RExC_rxi->data->data[ data_slot ] = (void*)aho;
3227  aho->trie=trie_offset;
3228  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3229  Copy( trie->states, aho->states, numstates, reg_trie_state );
3230  Newxz( q, numstates, U32);
3231  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3232  aho->refcount = 1;
3233  fail = aho->fail;
3234  /* initialize fail[0..1] to be 1 so that we always have
3235  a valid final fail state */
3236  fail[ 0 ] = fail[ 1 ] = 1;
3237
3238  for ( charid = 0; charid < ucharcount ; charid++ ) {
3239   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3240   if ( newstate ) {
3241    q[ q_write ] = newstate;
3242    /* set to point at the root */
3243    fail[ q[ q_write++ ] ]=1;
3244   }
3245  }
3246  while ( q_read < q_write) {
3247   const U32 cur = q[ q_read++ % numstates ];
3248   base = trie->states[ cur ].trans.base;
3249
3250   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3251    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3252    if (ch_state) {
3253     U32 fail_state = cur;
3254     U32 fail_base;
3255     do {
3256      fail_state = fail[ fail_state ];
3257      fail_base = aho->states[ fail_state ].trans.base;
3258     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3259
3260     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3261     fail[ ch_state ] = fail_state;
3262     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3263     {
3264       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3265     }
3266     q[ q_write++ % numstates] = ch_state;
3267    }
3268   }
3269  }
3270  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3271  when we fail in state 1, this allows us to use the
3272  charclass scan to find a valid start char. This is based on the principle
3273  that theres a good chance the string being searched contains lots of stuff
3274  that cant be a start char.
3275  */
3276  fail[ 0 ] = fail[ 1 ] = 0;
3277  DEBUG_TRIE_COMPILE_r({
3278   PerlIO_printf(Perl_debug_log,
3279      "%*sStclass Failtable (%"UVuf" states): 0",
3280      (int)(depth * 2), "", (UV)numstates
3281   );
3282   for( q_read=1; q_read<numstates; q_read++ ) {
3283    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3284   }
3285   PerlIO_printf(Perl_debug_log, "\n");
3286  });
3287  Safefree(q);
3288  /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3289  return stclass;
3290 }
3291
3292
3293 #define DEBUG_PEEP(str,scan,depth) \
3294  DEBUG_OPTIMISE_r({if (scan){ \
3295  regnode *Next = regnext(scan); \
3296  regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3297  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3298   (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3299   Next ? (REG_NODE_NUM(Next)) : 0 ); \
3300  DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3301  PerlIO_printf(Perl_debug_log, "\n"); \
3302    }});
3303
3304 /* The below joins as many adjacent EXACTish nodes as possible into a single
3305  * one.  The regop may be changed if the node(s) contain certain sequences that
3306  * require special handling.  The joining is only done if:
3307  * 1) there is room in the current conglomerated node to entirely contain the
3308  *    next one.
3309  * 2) they are the exact same node type
3310  *
3311  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3312  * these get optimized out
3313  *
3314  * If a node is to match under /i (folded), the number of characters it matches
3315  * can be different than its character length if it contains a multi-character
3316  * fold.  *min_subtract is set to the total delta number of characters of the
3317  * input nodes.
3318  *
3319  * And *unfolded_multi_char is set to indicate whether or not the node contains
3320  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3321  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3322  * SMALL LETTER SHARP S, as only if the target string being matched against
3323  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3324  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3325  * whose components are all above the Latin1 range are not run-time locale
3326  * dependent, and have already been folded by the time this function is
3327  * called.)
3328  *
3329  * This is as good a place as any to discuss the design of handling these
3330  * multi-character fold sequences.  It's been wrong in Perl for a very long
3331  * time.  There are three code points in Unicode whose multi-character folds
3332  * were long ago discovered to mess things up.  The previous designs for
3333  * dealing with these involved assigning a special node for them.  This
3334  * approach doesn't always work, as evidenced by this example:
3335  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3336  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3337  * would match just the \xDF, it won't be able to handle the case where a
3338  * successful match would have to cross the node's boundary.  The new approach
3339  * that hopefully generally solves the problem generates an EXACTFU_SS node
3340  * that is "sss" in this case.
3341  *
3342  * It turns out that there are problems with all multi-character folds, and not
3343  * just these three.  Now the code is general, for all such cases.  The
3344  * approach taken is:
3345  * 1)   This routine examines each EXACTFish node that could contain multi-
3346  *      character folded sequences.  Since a single character can fold into
3347  *      such a sequence, the minimum match length for this node is less than
3348  *      the number of characters in the node.  This routine returns in
3349  *      *min_subtract how many characters to subtract from the the actual
3350  *      length of the string to get a real minimum match length; it is 0 if
3351  *      there are no multi-char foldeds.  This delta is used by the caller to
3352  *      adjust the min length of the match, and the delta between min and max,
3353  *      so that the optimizer doesn't reject these possibilities based on size
3354  *      constraints.
3355  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3356  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3357  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3358  *      there is a possible fold length change.  That means that a regular
3359  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3360  *      with length changes, and so can be processed faster.  regexec.c takes
3361  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3362  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3363  *      known until runtime).  This saves effort in regex matching.  However,
3364  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3365  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3366  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3367  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3368  *      possibilities for the non-UTF8 patterns are quite simple, except for
3369  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3370  *      members of a fold-pair, and arrays are set up for all of them so that
3371  *      the other member of the pair can be found quickly.  Code elsewhere in
3372  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3373  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3374  *      described in the next item.
3375  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3376  *      validity of the fold won't be known until runtime, and so must remain
3377  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3378  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3379  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3380  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3381  *      The reason this is a problem is that the optimizer part of regexec.c
3382  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3383  *      that a character in the pattern corresponds to at most a single
3384  *      character in the target string.  (And I do mean character, and not byte
3385  *      here, unlike other parts of the documentation that have never been
3386  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3387  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3388  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3389  *      nodes, violate the assumption, and they are the only instances where it
3390  *      is violated.  I'm reluctant to try to change the assumption, as the
3391  *      code involved is impenetrable to me (khw), so instead the code here
3392  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3393  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3394  *      boolean indicating whether or not the node contains such a fold.  When
3395  *      it is true, the caller sets a flag that later causes the optimizer in
3396  *      this file to not set values for the floating and fixed string lengths,
3397  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3398  *      assumption.  Thus, there is no optimization based on string lengths for
3399  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3400  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3401  *      assumption is wrong only in these cases is that all other non-UTF-8
3402  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3403  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3404  *      EXACTF nodes because we don't know at compile time if it actually
3405  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3406  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3407  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3408  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3409  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3410  *      string would require the pattern to be forced into UTF-8, the overhead
3411  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3412  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3413  *      locale.)
3414  *
3415  *      Similarly, the code that generates tries doesn't currently handle
3416  *      not-already-folded multi-char folds, and it looks like a pain to change
3417  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3418  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3419  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3420  *      using /iaa matching will be doing so almost entirely with ASCII
3421  *      strings, so this should rarely be encountered in practice */
3422
3423 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3424  if (PL_regkind[OP(scan)] == EXACT) \
3425   join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3426
3427 STATIC U32
3428 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3429     UV *min_subtract, bool *unfolded_multi_char,
3430     U32 flags,regnode *val, U32 depth)
3431 {
3432  /* Merge several consecutive EXACTish nodes into one. */
3433  regnode *n = regnext(scan);
3434  U32 stringok = 1;
3435  regnode *next = scan + NODE_SZ_STR(scan);
3436  U32 merged = 0;
3437  U32 stopnow = 0;
3438 #ifdef DEBUGGING
3439  regnode *stop = scan;
3440  GET_RE_DEBUG_FLAGS_DECL;
3441 #else
3442  PERL_UNUSED_ARG(depth);
3443 #endif
3444
3445  PERL_ARGS_ASSERT_JOIN_EXACT;
3446 #ifndef EXPERIMENTAL_INPLACESCAN
3447  PERL_UNUSED_ARG(flags);
3448  PERL_UNUSED_ARG(val);
3449 #endif
3450  DEBUG_PEEP("join",scan,depth);
3451
3452  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3453  * EXACT ones that are mergeable to the current one. */
3454  while (n
3455   && (PL_regkind[OP(n)] == NOTHING
3456    || (stringok && OP(n) == OP(scan)))
3457   && NEXT_OFF(n)
3458   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3459  {
3460
3461   if (OP(n) == TAIL || n > next)
3462    stringok = 0;
3463   if (PL_regkind[OP(n)] == NOTHING) {
3464    DEBUG_PEEP("skip:",n,depth);
3465    NEXT_OFF(scan) += NEXT_OFF(n);
3466    next = n + NODE_STEP_REGNODE;
3467 #ifdef DEBUGGING
3468    if (stringok)
3469     stop = n;
3470 #endif
3471    n = regnext(n);
3472   }
3473   else if (stringok) {
3474    const unsigned int oldl = STR_LEN(scan);
3475    regnode * const nnext = regnext(n);
3476
3477    /* XXX I (khw) kind of doubt that this works on platforms (should
3478    * Perl ever run on one) where U8_MAX is above 255 because of lots
3479    * of other assumptions */
3480    /* Don't join if the sum can't fit into a single node */
3481    if (oldl + STR_LEN(n) > U8_MAX)
3482     break;
3483
3484    DEBUG_PEEP("merg",n,depth);
3485    merged++;
3486
3487    NEXT_OFF(scan) += NEXT_OFF(n);
3488    STR_LEN(scan) += STR_LEN(n);
3489    next = n + NODE_SZ_STR(n);
3490    /* Now we can overwrite *n : */
3491    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3492 #ifdef DEBUGGING
3493    stop = next - 1;
3494 #endif
3495    n = nnext;
3496    if (stopnow) break;
3497   }
3498
3499 #ifdef EXPERIMENTAL_INPLACESCAN
3500   if (flags && !NEXT_OFF(n)) {
3501    DEBUG_PEEP("atch", val, depth);
3502    if (reg_off_by_arg[OP(n)]) {
3503     ARG_SET(n, val - n);
3504    }
3505    else {
3506     NEXT_OFF(n) = val - n;
3507    }
3508    stopnow = 1;
3509   }
3510 #endif
3511  }
3512
3513  *min_subtract = 0;
3514  *unfolded_multi_char = FALSE;
3515
3516  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3517  * can now analyze for sequences of problematic code points.  (Prior to
3518  * this final joining, sequences could have been split over boundaries, and
3519  * hence missed).  The sequences only happen in folding, hence for any
3520  * non-EXACT EXACTish node */
3521  if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3522   U8* s0 = (U8*) STRING(scan);
3523   U8* s = s0;
3524   U8* s_end = s0 + STR_LEN(scan);
3525
3526   int total_count_delta = 0;  /* Total delta number of characters that
3527          multi-char folds expand to */
3528
3529   /* One pass is made over the node's string looking for all the
3530   * possibilities.  To avoid some tests in the loop, there are two main
3531   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3532   * non-UTF-8 */
3533   if (UTF) {
3534    U8* folded = NULL;
3535
3536    if (OP(scan) == EXACTFL) {
3537     U8 *d;
3538
3539     /* An EXACTFL node would already have been changed to another
3540     * node type unless there is at least one character in it that
3541     * is problematic; likely a character whose fold definition
3542     * won't be known until runtime, and so has yet to be folded.
3543     * For all but the UTF-8 locale, folds are 1-1 in length, but
3544     * to handle the UTF-8 case, we need to create a temporary
3545     * folded copy using UTF-8 locale rules in order to analyze it.
3546     * This is because our macros that look to see if a sequence is
3547     * a multi-char fold assume everything is folded (otherwise the
3548     * tests in those macros would be too complicated and slow).
3549     * Note that here, the non-problematic folds will have already
3550     * been done, so we can just copy such characters.  We actually
3551     * don't completely fold the EXACTFL string.  We skip the
3552     * unfolded multi-char folds, as that would just create work
3553     * below to figure out the size they already are */
3554
3555     Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3556     d = folded;
3557     while (s < s_end) {
3558      STRLEN s_len = UTF8SKIP(s);
3559      if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3560       Copy(s, d, s_len, U8);
3561       d += s_len;
3562      }
3563      else if (is_FOLDS_TO_MULTI_utf8(s)) {
3564       *unfolded_multi_char = TRUE;
3565       Copy(s, d, s_len, U8);
3566       d += s_len;
3567      }
3568      else if (isASCII(*s)) {
3569       *(d++) = toFOLD(*s);
3570      }
3571      else {
3572       STRLEN len;
3573       _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3574       d += len;
3575      }
3576      s += s_len;
3577     }
3578
3579     /* Point the remainder of the routine to look at our temporary
3580     * folded copy */
3581     s = folded;
3582     s_end = d;
3583    } /* End of creating folded copy of EXACTFL string */
3584
3585    /* Examine the string for a multi-character fold sequence.  UTF-8
3586    * patterns have all characters pre-folded by the time this code is
3587    * executed */
3588    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3589          length sequence we are looking for is 2 */
3590    {
3591     int count = 0;  /* How many characters in a multi-char fold */
3592     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3593     if (! len) {    /* Not a multi-char fold: get next char */
3594      s += UTF8SKIP(s);
3595      continue;
3596     }
3597
3598     /* Nodes with 'ss' require special handling, except for
3599     * EXACTFA-ish for which there is no multi-char fold to this */
3600     if (len == 2 && *s == 's' && *(s+1) == 's'
3601      && OP(scan) != EXACTFA
3602      && OP(scan) != EXACTFA_NO_TRIE)
3603     {
3604      count = 2;
3605      if (OP(scan) != EXACTFL) {
3606       OP(scan) = EXACTFU_SS;
3607      }
3608      s += 2;
3609     }
3610     else { /* Here is a generic multi-char fold. */
3611      U8* multi_end  = s + len;
3612
3613      /* Count how many characters are in it.  In the case of
3614      * /aa, no folds which contain ASCII code points are
3615      * allowed, so check for those, and skip if found. */
3616      if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3617       count = utf8_length(s, multi_end);
3618       s = multi_end;
3619      }
3620      else {
3621       while (s < multi_end) {
3622        if (isASCII(*s)) {
3623         s++;
3624         goto next_iteration;
3625        }
3626        else {
3627         s += UTF8SKIP(s);
3628        }
3629        count++;
3630       }
3631      }
3632     }
3633
3634     /* The delta is how long the sequence is minus 1 (1 is how long
3635     * the character that folds to the sequence is) */
3636     total_count_delta += count - 1;
3637    next_iteration: ;
3638    }
3639
3640    /* We created a temporary folded copy of the string in EXACTFL
3641    * nodes.  Therefore we need to be sure it doesn't go below zero,
3642    * as the real string could be shorter */
3643    if (OP(scan) == EXACTFL) {
3644     int total_chars = utf8_length((U8*) STRING(scan),
3645           (U8*) STRING(scan) + STR_LEN(scan));
3646     if (total_count_delta > total_chars) {
3647      total_count_delta = total_chars;
3648     }
3649    }
3650
3651    *min_subtract += total_count_delta;
3652    Safefree(folded);
3653   }
3654   else if (OP(scan) == EXACTFA) {
3655
3656    /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3657    * fold to the ASCII range (and there are no existing ones in the
3658    * upper latin1 range).  But, as outlined in the comments preceding
3659    * this function, we need to flag any occurrences of the sharp s.
3660    * This character forbids trie formation (because of added
3661    * complexity) */
3662    while (s < s_end) {
3663     if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3664      OP(scan) = EXACTFA_NO_TRIE;
3665      *unfolded_multi_char = TRUE;
3666      break;
3667     }
3668     s++;
3669     continue;
3670    }
3671   }
3672   else {
3673
3674    /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3675    * folds that are all Latin1.  As explained in the comments
3676    * preceding this function, we look also for the sharp s in EXACTF
3677    * and EXACTFL nodes; it can be in the final position.  Otherwise
3678    * we can stop looking 1 byte earlier because have to find at least
3679    * two characters for a multi-fold */
3680    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3681        ? s_end
3682        : s_end -1;
3683
3684    while (s < upper) {
3685     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3686     if (! len) {    /* Not a multi-char fold. */
3687      if (*s == LATIN_SMALL_LETTER_SHARP_S
3688       && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3689      {
3690       *unfolded_multi_char = TRUE;
3691      }
3692      s++;
3693      continue;
3694     }
3695
3696     if (len == 2
3697      && isALPHA_FOLD_EQ(*s, 's')
3698      && isALPHA_FOLD_EQ(*(s+1), 's'))
3699     {
3700
3701      /* EXACTF nodes need to know that the minimum length
3702      * changed so that a sharp s in the string can match this
3703      * ss in the pattern, but they remain EXACTF nodes, as they
3704      * won't match this unless the target string is is UTF-8,
3705      * which we don't know until runtime.  EXACTFL nodes can't
3706      * transform into EXACTFU nodes */
3707      if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3708       OP(scan) = EXACTFU_SS;
3709      }
3710     }
3711
3712     *min_subtract += len - 1;
3713     s += len;
3714    }
3715   }
3716  }
3717
3718 #ifdef DEBUGGING
3719  /* Allow dumping but overwriting the collection of skipped
3720  * ops and/or strings with fake optimized ops */
3721  n = scan + NODE_SZ_STR(scan);
3722  while (n <= stop) {
3723   OP(n) = OPTIMIZED;
3724   FLAGS(n) = 0;
3725   NEXT_OFF(n) = 0;
3726   n++;
3727  }
3728 #endif
3729  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3730  return stopnow;
3731 }
3732
3733 /* REx optimizer.  Converts nodes into quicker variants "in place".
3734    Finds fixed substrings.  */
3735
3736 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3737    to the position after last scanned or to NULL. */
3738
3739 #define INIT_AND_WITHP \
3740  assert(!and_withp); \
3741  Newx(and_withp,1, regnode_ssc); \
3742  SAVEFREEPV(and_withp)
3743
3744
3745 static void
3746 S_unwind_scan_frames(pTHX_ const void *p)
3747 {
3748  scan_frame *f= (scan_frame *)p;
3749  do {
3750   scan_frame *n= f->next_frame;
3751   Safefree(f);
3752   f= n;
3753  } while (f);
3754 }
3755
3756
3757 STATIC SSize_t
3758 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3759       SSize_t *minlenp, SSize_t *deltap,
3760       regnode *last,
3761       scan_data_t *data,
3762       I32 stopparen,
3763       U32 recursed_depth,
3764       regnode_ssc *and_withp,
3765       U32 flags, U32 depth)
3766       /* scanp: Start here (read-write). */
3767       /* deltap: Write maxlen-minlen here. */
3768       /* last: Stop before this one. */
3769       /* data: string data about the pattern */
3770       /* stopparen: treat close N as END */
3771       /* recursed: which subroutines have we recursed into */
3772       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3773 {
3774  /* There must be at least this number of characters to match */
3775  SSize_t min = 0;
3776  I32 pars = 0, code;
3777  regnode *scan = *scanp, *next;
3778  SSize_t delta = 0;
3779  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3780  int is_inf_internal = 0;  /* The studied chunk is infinite */
3781  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3782  scan_data_t data_fake;
3783  SV *re_trie_maxbuff = NULL;
3784  regnode *first_non_open = scan;
3785  SSize_t stopmin = SSize_t_MAX;
3786  scan_frame *frame = NULL;
3787  GET_RE_DEBUG_FLAGS_DECL;
3788
3789  PERL_ARGS_ASSERT_STUDY_CHUNK;
3790
3791
3792  if ( depth == 0 ) {
3793   while (first_non_open && OP(first_non_open) == OPEN)
3794    first_non_open=regnext(first_non_open);
3795  }
3796
3797
3798   fake_study_recurse:
3799  DEBUG_r(
3800   RExC_study_chunk_recursed_count++;
3801  );
3802  DEBUG_OPTIMISE_MORE_r(
3803  {
3804   PerlIO_printf(Perl_debug_log,
3805    "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3806    (int)(depth*2), "", (long)stopparen,
3807    (unsigned long)RExC_study_chunk_recursed_count,
3808    (unsigned long)depth, (unsigned long)recursed_depth,
3809    scan,
3810    last);
3811   if (recursed_depth) {
3812    U32 i;
3813    U32 j;
3814    for ( j = 0 ; j < recursed_depth ; j++ ) {
3815     for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3816      if (
3817       PAREN_TEST(RExC_study_chunk_recursed +
3818         ( j * RExC_study_chunk_recursed_bytes), i )
3819       && (
3820        !j ||
3821        !PAREN_TEST(RExC_study_chunk_recursed +
3822         (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3823       )
3824      ) {
3825       PerlIO_printf(Perl_debug_log," %d",(int)i);
3826       break;
3827      }
3828     }
3829     if ( j + 1 < recursed_depth ) {
3830      PerlIO_printf(Perl_debug_log, ",");
3831     }
3832    }
3833   }
3834   PerlIO_printf(Perl_debug_log,"\n");
3835  }
3836  );
3837  while ( scan && OP(scan) != END && scan < last ){
3838   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3839         node length to get a real minimum (because
3840         the folded version may be shorter) */
3841   bool unfolded_multi_char = FALSE;
3842   /* Peephole optimizer: */
3843   DEBUG_STUDYDATA("Peep:", data, depth);
3844   DEBUG_PEEP("Peep", scan, depth);
3845
3846
3847   /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3848   * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3849   * by a different invocation of reg() -- Yves
3850   */
3851   JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3852
3853   /* Follow the next-chain of the current node and optimize
3854   away all the NOTHINGs from it.  */
3855   if (OP(scan) != CURLYX) {
3856    const int max = (reg_off_by_arg[OP(scan)]
3857      ? I32_MAX
3858      /* I32 may be smaller than U16 on CRAYs! */
3859      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3860    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3861    int noff;
3862    regnode *n = scan;
3863
3864    /* Skip NOTHING and LONGJMP. */
3865    while ((n = regnext(n))
3866     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3867      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3868     && off + noff < max)
3869     off += noff;
3870    if (reg_off_by_arg[OP(scan)])
3871     ARG(scan) = off;
3872    else
3873     NEXT_OFF(scan) = off;
3874   }
3875
3876   /* The principal pseudo-switch.  Cannot be a switch, since we
3877   look into several different things.  */
3878   if ( OP(scan) == DEFINEP ) {
3879    SSize_t minlen = 0;
3880    SSize_t deltanext = 0;
3881    SSize_t fake_last_close = 0;
3882    I32 f = SCF_IN_DEFINE;
3883
3884    StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3885    scan = regnext(scan);
3886    assert( OP(scan) == IFTHEN );
3887    DEBUG_PEEP("expect IFTHEN", scan, depth);
3888
3889    data_fake.last_closep= &fake_last_close;
3890    minlen = *minlenp;
3891    next = regnext(scan);
3892    scan = NEXTOPER(NEXTOPER(scan));
3893    DEBUG_PEEP("scan", scan, depth);
3894    DEBUG_PEEP("next", next, depth);
3895
3896    /* we suppose the run is continuous, last=next...
3897    * NOTE we dont use the return here! */
3898    (void)study_chunk(pRExC_state, &scan, &minlen,
3899        &deltanext, next, &data_fake, stopparen,
3900        recursed_depth, NULL, f, depth+1);
3901
3902    scan = next;
3903   } else
3904   if (
3905    OP(scan) == BRANCH  ||
3906    OP(scan) == BRANCHJ ||
3907    OP(scan) == IFTHEN
3908   ) {
3909    next = regnext(scan);
3910    code = OP(scan);
3911
3912    /* The op(next)==code check below is to see if we
3913    * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3914    * IFTHEN is special as it might not appear in pairs.
3915    * Not sure whether BRANCH-BRANCHJ is possible, regardless
3916    * we dont handle it cleanly. */
3917    if (OP(next) == code || code == IFTHEN) {
3918     /* NOTE - There is similar code to this block below for
3919     * handling TRIE nodes on a re-study.  If you change stuff here
3920     * check there too. */
3921     SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3922     regnode_ssc accum;
3923     regnode * const startbranch=scan;
3924
3925     if (flags & SCF_DO_SUBSTR) {
3926      /* Cannot merge strings after this. */
3927      scan_commit(pRExC_state, data, minlenp, is_inf);
3928     }
3929
3930     if (flags & SCF_DO_STCLASS)
3931      ssc_init_zero(pRExC_state, &accum);
3932
3933     while (OP(scan) == code) {
3934      SSize_t deltanext, minnext, fake;
3935      I32 f = 0;
3936      regnode_ssc this_class;
3937
3938      DEBUG_PEEP("Branch", scan, depth);
3939
3940      num++;
3941      StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3942      if (data) {
3943       data_fake.whilem_c = data->whilem_c;
3944       data_fake.last_closep = data->last_closep;
3945      }
3946      else
3947       data_fake.last_closep = &fake;
3948
3949      data_fake.pos_delta = delta;
3950      next = regnext(scan);
3951
3952      scan = NEXTOPER(scan); /* everything */
3953      if (code != BRANCH)    /* everything but BRANCH */
3954       scan = NEXTOPER(scan);
3955
3956      if (flags & SCF_DO_STCLASS) {
3957       ssc_init(pRExC_state, &this_class);
3958       data_fake.start_class = &this_class;
3959       f = SCF_DO_STCLASS_AND;
3960      }
3961      if (flags & SCF_WHILEM_VISITED_POS)
3962       f |= SCF_WHILEM_VISITED_POS;
3963
3964      /* we suppose the run is continuous, last=next...*/
3965      minnext = study_chunk(pRExC_state, &scan, minlenp,
3966          &deltanext, next, &data_fake, stopparen,
3967          recursed_depth, NULL, f,depth+1);
3968
3969      if (min1 > minnext)
3970       min1 = minnext;
3971      if (deltanext == SSize_t_MAX) {
3972       is_inf = is_inf_internal = 1;
3973       max1 = SSize_t_MAX;
3974      } else if (max1 < minnext + deltanext)
3975       max1 = minnext + deltanext;
3976      scan = next;
3977      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3978       pars++;
3979      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3980       if ( stopmin > minnext)
3981        stopmin = min + min1;
3982       flags &= ~SCF_DO_SUBSTR;
3983       if (data)
3984        data->flags |= SCF_SEEN_ACCEPT;
3985      }
3986      if (data) {
3987       if (data_fake.flags & SF_HAS_EVAL)
3988        data->flags |= SF_HAS_EVAL;
3989       data->whilem_c = data_fake.whilem_c;
3990      }
3991      if (flags & SCF_DO_STCLASS)
3992       ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3993     }
3994     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3995      min1 = 0;
3996     if (flags & SCF_DO_SUBSTR) {
3997      data->pos_min += min1;
3998      if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3999       data->pos_delta = SSize_t_MAX;
4000      else
4001       data->pos_delta += max1 - min1;
4002      if (max1 != min1 || is_inf)
4003       data->longest = &(data->longest_float);
4004     }
4005     min += min1;
4006     if (delta == SSize_t_MAX
4007     || SSize_t_MAX - delta - (max1 - min1) < 0)
4008      delta = SSize_t_MAX;
4009     else
4010      delta += max1 - min1;
4011     if (flags & SCF_DO_STCLASS_OR) {
4012      ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4013      if (min1) {
4014       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4015       flags &= ~SCF_DO_STCLASS;
4016      }
4017     }
4018     else if (flags & SCF_DO_STCLASS_AND) {
4019      if (min1) {
4020       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4021       flags &= ~SCF_DO_STCLASS;
4022      }
4023      else {
4024       /* Switch to OR mode: cache the old value of
4025       * data->start_class */
4026       INIT_AND_WITHP;
4027       StructCopy(data->start_class, and_withp, regnode_ssc);
4028       flags &= ~SCF_DO_STCLASS_AND;
4029       StructCopy(&accum, data->start_class, regnode_ssc);
4030       flags |= SCF_DO_STCLASS_OR;
4031      }
4032     }
4033
4034     if (PERL_ENABLE_TRIE_OPTIMISATION &&
4035       OP( startbranch ) == BRANCH )
4036     {
4037     /* demq.
4038
4039     Assuming this was/is a branch we are dealing with: 'scan'
4040     now points at the item that follows the branch sequence,
4041     whatever it is. We now start at the beginning of the
4042     sequence and look for subsequences of
4043
4044     BRANCH->EXACT=>x1
4045     BRANCH->EXACT=>x2
4046     tail
4047
4048     which would be constructed from a pattern like
4049     /A|LIST|OF|WORDS/
4050
4051     If we can find such a subsequence we need to turn the first
4052     element into a trie and then add the subsequent branch exact
4053     strings to the trie.
4054
4055     We have two cases
4056
4057      1. patterns where the whole set of branches can be
4058       converted.
4059
4060      2. patterns where only a subset can be converted.
4061
4062     In case 1 we can replace the whole set with a single regop
4063     for the trie. In case 2 we need to keep the start and end
4064     branches so
4065
4066      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4067      becomes BRANCH TRIE; BRANCH X;
4068
4069     There is an additional case, that being where there is a
4070     common prefix, which gets split out into an EXACT like node
4071     preceding the TRIE node.
4072
4073     If x(1..n)==tail then we can do a simple trie, if not we make
4074     a "jump" trie, such that when we match the appropriate word
4075     we "jump" to the appropriate tail node. Essentially we turn
4076     a nested if into a case structure of sorts.
4077
4078     */
4079
4080      int made=0;
4081      if (!re_trie_maxbuff) {
4082       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4083       if (!SvIOK(re_trie_maxbuff))
4084        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4085      }
4086      if ( SvIV(re_trie_maxbuff)>=0  ) {
4087       regnode *cur;
4088       regnode *first = (regnode *)NULL;
4089       regnode *last = (regnode *)NULL;
4090       regnode *tail = scan;
4091       U8 trietype = 0;
4092       U32 count=0;
4093
4094       /* var tail is used because there may be a TAIL
4095       regop in the way. Ie, the exacts will point to the
4096       thing following the TAIL, but the last branch will
4097       point at the TAIL. So we advance tail. If we
4098       have nested (?:) we may have to move through several
4099       tails.
4100       */
4101
4102       while ( OP( tail ) == TAIL ) {
4103        /* this is the TAIL generated by (?:) */
4104        tail = regnext( tail );
4105       }
4106
4107
4108       DEBUG_TRIE_COMPILE_r({
4109        regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4110        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4111        (int)depth * 2 + 2, "",
4112        "Looking for TRIE'able sequences. Tail node is: ",
4113        SvPV_nolen_const( RExC_mysv )
4114        );
4115       });
4116
4117       /*
4118
4119        Step through the branches
4120         cur represents each branch,
4121         noper is the first thing to be matched as part
4122          of that branch
4123         noper_next is the regnext() of that node.
4124
4125        We normally handle a case like this
4126        /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4127        support building with NOJUMPTRIE, which restricts
4128        the trie logic to structures like /FOO|BAR/.
4129
4130        If noper is a trieable nodetype then the branch is
4131        a possible optimization target. If we are building
4132        under NOJUMPTRIE then we require that noper_next is
4133        the same as scan (our current position in the regex
4134        program).
4135
4136        Once we have two or more consecutive such branches
4137        we can create a trie of the EXACT's contents and
4138        stitch it in place into the program.
4139
4140        If the sequence represents all of the branches in
4141        the alternation we replace the entire thing with a
4142        single TRIE node.
4143
4144        Otherwise when it is a subsequence we need to
4145        stitch it in place and replace only the relevant
4146        branches. This means the first branch has to remain
4147        as it is used by the alternation logic, and its
4148        next pointer, and needs to be repointed at the item
4149        on the branch chain following the last branch we
4150        have optimized away.
4151
4152        This could be either a BRANCH, in which case the
4153        subsequence is internal, or it could be the item
4154        following the branch sequence in which case the
4155        subsequence is at the end (which does not
4156        necessarily mean the first node is the start of the
4157        alternation).
4158
4159        TRIE_TYPE(X) is a define which maps the optype to a
4160        trietype.
4161
4162         optype          |  trietype
4163         ----------------+-----------
4164         NOTHING         | NOTHING
4165         EXACT           | EXACT
4166         EXACTFU         | EXACTFU
4167         EXACTFU_SS      | EXACTFU
4168         EXACTFA         | EXACTFA
4169         EXACTL          | EXACTL
4170         EXACTFLU8       | EXACTFLU8
4171
4172
4173       */
4174 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4175      ? NOTHING                                            \
4176      : ( EXACT == (X) )                                   \
4177       ? EXACT                                            \
4178       : ( EXACTFU == (X) || EXACTFU_SS == (X) )          \
4179       ? EXACTFU                                        \
4180       : ( EXACTFA == (X) )                             \
4181        ? EXACTFA                                      \
4182        : ( EXACTL == (X) )                            \
4183        ? EXACTL                                     \
4184        : ( EXACTFLU8 == (X) )                        \
4185         ? EXACTFLU8                                 \
4186         : 0 )
4187
4188       /* dont use tail as the end marker for this traverse */
4189       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4190        regnode * const noper = NEXTOPER( cur );
4191        U8 noper_type = OP( noper );
4192        U8 noper_trietype = TRIE_TYPE( noper_type );
4193 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4194        regnode * const noper_next = regnext( noper );
4195        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4196        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4197 #endif
4198
4199        DEBUG_TRIE_COMPILE_r({
4200         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4201         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4202         (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4203
4204         regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4205         PerlIO_printf( Perl_debug_log, " -> %s",
4206          SvPV_nolen_const(RExC_mysv));
4207
4208         if ( noper_next ) {
4209         regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4210         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4211          SvPV_nolen_const(RExC_mysv));
4212         }
4213         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4214         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4215         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4216         );
4217        });
4218
4219        /* Is noper a trieable nodetype that can be merged
4220        * with the current trie (if there is one)? */
4221        if ( noper_trietype
4222         &&
4223         (
4224           ( noper_trietype == NOTHING)
4225           || ( trietype == NOTHING )
4226           || ( trietype == noper_trietype )
4227         )
4228 #ifdef NOJUMPTRIE
4229         && noper_next == tail
4230 #endif
4231         && count < U16_MAX)
4232        {
4233         /* Handle mergable triable node Either we are
4234         * the first node in a new trieable sequence,
4235         * in which case we do some bookkeeping,
4236         * otherwise we update the end pointer. */
4237         if ( !first ) {
4238          first = cur;
4239          if ( noper_trietype == NOTHING ) {
4240 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4241           regnode * const noper_next = regnext( noper );
4242           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4243           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4244 #endif
4245
4246           if ( noper_next_trietype ) {
4247            trietype = noper_next_trietype;
4248           } else if (noper_next_type)  {
4249            /* a NOTHING regop is 1 regop wide.
4250            * We need at least two for a trie
4251            * so we can't merge this in */
4252            first = NULL;
4253           }
4254          } else {
4255           trietype = noper_trietype;
4256          }
4257         } else {
4258          if ( trietype == NOTHING )
4259           trietype = noper_trietype;
4260          last = cur;
4261         }
4262         if (first)
4263          count++;
4264        } /* end handle mergable triable node */
4265        else {
4266         /* handle unmergable node -
4267         * noper may either be a triable node which can
4268         * not be tried together with the current trie,
4269         * or a non triable node */
4270         if ( last ) {
4271          /* If last is set and trietype is not
4272          * NOTHING then we have found at least two
4273          * triable branch sequences in a row of a
4274          * similar trietype so we can turn them
4275          * into a trie. If/when we allow NOTHING to
4276          * start a trie sequence this condition
4277          * will be required, and it isn't expensive
4278          * so we leave it in for now. */
4279          if ( trietype && trietype != NOTHING )
4280           make_trie( pRExC_state,
4281             startbranch, first, cur, tail,
4282             count, trietype, depth+1 );
4283          last = NULL; /* note: we clear/update
4284              first, trietype etc below,
4285              so we dont do it here */
4286         }
4287         if ( noper_trietype
4288 #ifdef NOJUMPTRIE
4289          && noper_next == tail
4290 #endif
4291         ){
4292          /* noper is triable, so we can start a new
4293          * trie sequence */
4294          count = 1;
4295          first = cur;
4296          trietype = noper_trietype;
4297         } else if (first) {
4298          /* if we already saw a first but the
4299          * current node is not triable then we have
4300          * to reset the first information. */
4301          count = 0;
4302          first = NULL;
4303          trietype = 0;
4304         }
4305        } /* end handle unmergable node */
4306       } /* loop over branches */
4307       DEBUG_TRIE_COMPILE_r({
4308        regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4309        PerlIO_printf( Perl_debug_log,
4310        "%*s- %s (%d) <SCAN FINISHED>\n",
4311        (int)depth * 2 + 2,
4312        "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4313
4314       });
4315       if ( last && trietype ) {
4316        if ( trietype != NOTHING ) {
4317         /* the last branch of the sequence was part of
4318         * a trie, so we have to construct it here
4319         * outside of the loop */
4320         made= make_trie( pRExC_state, startbranch,
4321             first, scan, tail, count,
4322             trietype, depth+1 );
4323 #ifdef TRIE_STUDY_OPT
4324         if ( ((made == MADE_EXACT_TRIE &&
4325          startbranch == first)
4326          || ( first_non_open == first )) &&
4327          depth==0 ) {
4328          flags |= SCF_TRIE_RESTUDY;
4329          if ( startbranch == first
4330           && scan == tail )
4331          {
4332           RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4333          }
4334         }
4335 #endif
4336        } else {
4337         /* at this point we know whatever we have is a
4338         * NOTHING sequence/branch AND if 'startbranch'
4339         * is 'first' then we can turn the whole thing
4340         * into a NOTHING
4341         */
4342         if ( startbranch == first ) {
4343          regnode *opt;
4344          /* the entire thing is a NOTHING sequence,
4345          * something like this: (?:|) So we can
4346          * turn it into a plain NOTHING op. */
4347          DEBUG_TRIE_COMPILE_r({
4348           regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4349           PerlIO_printf( Perl_debug_log,
4350           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4351           "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4352
4353          });
4354          OP(startbranch)= NOTHING;
4355          NEXT_OFF(startbranch)= tail - startbranch;
4356          for ( opt= startbranch + 1; opt < tail ; opt++ )
4357           OP(opt)= OPTIMIZED;
4358         }
4359        }
4360       } /* end if ( last) */
4361      } /* TRIE_MAXBUF is non zero */
4362
4363     } /* do trie */
4364
4365    }
4366    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4367     scan = NEXTOPER(NEXTOPER(scan));
4368    } else   /* single branch is optimized. */
4369     scan = NEXTOPER(scan);
4370    continue;
4371   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4372    I32 paren = 0;
4373    regnode *start = NULL;
4374    regnode *end = NULL;
4375    U32 my_recursed_depth= recursed_depth;
4376
4377
4378    if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4379     /* Do setup, note this code has side effects beyond
4380     * the rest of this block. Specifically setting
4381     * RExC_recurse[] must happen at least once during
4382     * study_chunk(). */
4383     if (OP(scan) == GOSUB) {
4384      paren = ARG(scan);
4385      RExC_recurse[ARG2L(scan)] = scan;
4386      start = RExC_open_parens[paren-1];
4387      end   = RExC_close_parens[paren-1];
4388     } else {
4389      start = RExC_rxi->program + 1;
4390      end   = RExC_opend;
4391     }
4392     /* NOTE we MUST always execute the above code, even
4393     * if we do nothing with a GOSUB/GOSTART */
4394     if (
4395      ( flags & SCF_IN_DEFINE )
4396      ||
4397      (
4398       (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4399       &&
4400       ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4401      )
4402     ) {
4403      /* no need to do anything here if we are in a define. */
4404      /* or we are after some kind of infinite construct
4405      * so we can skip recursing into this item.
4406      * Since it is infinite we will not change the maxlen
4407      * or delta, and if we miss something that might raise
4408      * the minlen it will merely pessimise a little.
4409      *
4410      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4411      * might result in a minlen of 1 and not of 4,
4412      * but this doesn't make us mismatch, just try a bit
4413      * harder than we should.
4414      * */
4415      scan= regnext(scan);
4416      continue;
4417     }
4418
4419     if (
4420      !recursed_depth
4421      ||
4422      !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4423     ) {
4424      /* it is quite possible that there are more efficient ways
4425      * to do this. We maintain a bitmap per level of recursion
4426      * of which patterns we have entered so we can detect if a
4427      * pattern creates a possible infinite loop. When we
4428      * recurse down a level we copy the previous levels bitmap
4429      * down. When we are at recursion level 0 we zero the top
4430      * level bitmap. It would be nice to implement a different
4431      * more efficient way of doing this. In particular the top
4432      * level bitmap may be unnecessary.
4433      */
4434      if (!recursed_depth) {
4435       Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4436      } else {
4437       Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4438        RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4439        RExC_study_chunk_recursed_bytes, U8);
4440      }
4441      /* we havent recursed into this paren yet, so recurse into it */
4442      DEBUG_STUDYDATA("set:", data,depth);
4443      PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4444      my_recursed_depth= recursed_depth + 1;
4445     } else {
4446      DEBUG_STUDYDATA("inf:", data,depth);
4447      /* some form of infinite recursion, assume infinite length
4448      * */
4449      if (flags & SCF_DO_SUBSTR) {
4450       scan_commit(pRExC_state, data, minlenp, is_inf);
4451       data->longest = &(data->longest_float);
4452      }
4453      is_inf = is_inf_internal = 1;
4454      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4455       ssc_anything(data->start_class);
4456      flags &= ~SCF_DO_STCLASS;
4457
4458      start= NULL; /* reset start so we dont recurse later on. */
4459     }
4460    } else {
4461     paren = stopparen;
4462     start = scan + 2;
4463     end = regnext(scan);
4464    }
4465    if (start) {
4466     scan_frame *newframe;
4467     assert(end);
4468     if (!RExC_frame_last) {
4469      Newxz(newframe, 1, scan_frame);
4470      SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4471      RExC_frame_head= newframe;
4472      RExC_frame_count++;
4473     } else if (!RExC_frame_last->next_frame) {
4474      Newxz(newframe,1,scan_frame);
4475      RExC_frame_last->next_frame= newframe;
4476      newframe->prev_frame= RExC_frame_last;
4477      RExC_frame_count++;
4478     } else {
4479      newframe= RExC_frame_last->next_frame;
4480     }
4481     RExC_frame_last= newframe;
4482
4483     newframe->next_regnode = regnext(scan);
4484     newframe->last_regnode = last;
4485     newframe->stopparen = stopparen;
4486     newframe->prev_recursed_depth = recursed_depth;
4487     newframe->this_prev_frame= frame;
4488
4489     DEBUG_STUDYDATA("frame-new:",data,depth);
4490     DEBUG_PEEP("fnew", scan, depth);
4491
4492     frame = newframe;
4493     scan =  start;
4494     stopparen = paren;
4495     last = end;
4496     depth = depth + 1;
4497     recursed_depth= my_recursed_depth;
4498
4499     continue;
4500    }
4501   }
4502   else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4503    SSize_t l = STR_LEN(scan);
4504    UV uc;
4505    if (UTF) {
4506     const U8 * const s = (U8*)STRING(scan);
4507     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4508     l = utf8_length(s, s + l);
4509    } else {
4510     uc = *((U8*)STRING(scan));
4511    }
4512    min += l;
4513    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4514     /* The code below prefers earlier match for fixed
4515     offset, later match for variable offset.  */
4516     if (data->last_end == -1) { /* Update the start info. */
4517      data->last_start_min = data->pos_min;
4518      data->last_start_max = is_inf
4519       ? SSize_t_MAX : data->pos_min + data->pos_delta;
4520     }
4521     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4522     if (UTF)
4523      SvUTF8_on(data->last_found);
4524     {
4525      SV * const sv = data->last_found;
4526      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4527       mg_find(sv, PERL_MAGIC_utf8) : NULL;
4528      if (mg && mg->mg_len >= 0)
4529       mg->mg_len += utf8_length((U8*)STRING(scan),
4530            (U8*)STRING(scan)+STR_LEN(scan));
4531     }
4532     data->last_end = data->pos_min + l;
4533     data->pos_min += l; /* As in the first entry. */
4534     data->flags &= ~SF_BEFORE_EOL;
4535    }
4536
4537    /* ANDing the code point leaves at most it, and not in locale, and
4538    * can't match null string */
4539    if (flags & SCF_DO_STCLASS_AND) {
4540     ssc_cp_and(data->start_class, uc);
4541     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4542     ssc_clear_locale(data->start_class);
4543    }
4544    else if (flags & SCF_DO_STCLASS_OR) {
4545     ssc_add_cp(data->start_class, uc);
4546     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4547
4548     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4549     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4550    }
4551    flags &= ~SCF_DO_STCLASS;
4552   }
4553   else if (PL_regkind[OP(scan)] == EXACT) {
4554    /* But OP != EXACT!, so is EXACTFish */
4555    SSize_t l = STR_LEN(scan);
4556    const U8 * s = (U8*)STRING(scan);
4557
4558    /* Search for fixed substrings supports EXACT only. */
4559    if (flags & SCF_DO_SUBSTR) {
4560     assert(data);
4561     scan_commit(pRExC_state, data, minlenp, is_inf);
4562    }
4563    if (UTF) {
4564     l = utf8_length(s, s + l);
4565    }
4566    if (unfolded_multi_char) {
4567     RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4568    }
4569    min += l - min_subtract;
4570    assert (min >= 0);
4571    delta += min_subtract;
4572    if (flags & SCF_DO_SUBSTR) {
4573     data->pos_min += l - min_subtract;
4574     if (data->pos_min < 0) {
4575      data->pos_min = 0;
4576     }
4577     data->pos_delta += min_subtract;
4578     if (min_subtract) {
4579      data->longest = &(data->longest_float);
4580     }
4581    }
4582
4583    if (flags & SCF_DO_STCLASS) {
4584     SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4585
4586     assert(EXACTF_invlist);
4587     if (flags & SCF_DO_STCLASS_AND) {
4588      if (OP(scan) != EXACTFL)
4589       ssc_clear_locale(data->start_class);
4590      ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4591      ANYOF_POSIXL_ZERO(data->start_class);
4592      ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4593     }
4594     else {  /* SCF_DO_STCLASS_OR */
4595      ssc_union(data->start_class, EXACTF_invlist, FALSE);
4596      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4597
4598      /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4599      ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4600     }
4601     flags &= ~SCF_DO_STCLASS;
4602     SvREFCNT_dec(EXACTF_invlist);
4603    }
4604   }
4605   else if (REGNODE_VARIES(OP(scan))) {
4606    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4607    I32 fl = 0, f = flags;
4608    regnode * const oscan = scan;
4609    regnode_ssc this_class;
4610    regnode_ssc *oclass = NULL;
4611    I32 next_is_eval = 0;
4612
4613    switch (PL_regkind[OP(scan)]) {
4614    case WHILEM:  /* End of (?:...)* . */
4615     scan = NEXTOPER(scan);
4616     goto finish;
4617    case PLUS:
4618     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4619      next = NEXTOPER(scan);
4620      if (OP(next) == EXACT
4621       || OP(next) == EXACTL
4622       || (flags & SCF_DO_STCLASS))
4623      {
4624       mincount = 1;
4625       maxcount = REG_INFTY;
4626       next = regnext(scan);
4627       scan = NEXTOPER(scan);
4628       goto do_curly;
4629      }
4630     }
4631     if (flags & SCF_DO_SUBSTR)
4632      data->pos_min++;
4633     min++;
4634     /* FALLTHROUGH */
4635    case STAR:
4636     if (flags & SCF_DO_STCLASS) {
4637      mincount = 0;
4638      maxcount = REG_INFTY;
4639      next = regnext(scan);
4640      scan = NEXTOPER(scan);
4641      goto do_curly;
4642     }
4643     if (flags & SCF_DO_SUBSTR) {
4644      scan_commit(pRExC_state, data, minlenp, is_inf);
4645      /* Cannot extend fixed substrings */
4646      data->longest = &(data->longest_float);
4647     }
4648     is_inf = is_inf_internal = 1;
4649     scan = regnext(scan);
4650     goto optimize_curly_tail;
4651    case CURLY:
4652     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4653      && (scan->flags == stopparen))
4654     {
4655      mincount = 1;
4656      maxcount = 1;
4657     } else {
4658      mincount = ARG1(scan);
4659      maxcount = ARG2(scan);
4660     }
4661     next = regnext(scan);
4662     if (OP(scan) == CURLYX) {
4663      I32 lp = (data ? *(data->last_closep) : 0);
4664      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4665     }
4666     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4667     next_is_eval = (OP(scan) == EVAL);
4668    do_curly:
4669     if (flags & SCF_DO_SUBSTR) {
4670      if (mincount == 0)
4671       scan_commit(pRExC_state, data, minlenp, is_inf);
4672      /* Cannot extend fixed substrings */
4673      pos_before = data->pos_min;
4674     }
4675     if (data) {
4676      fl = data->flags;
4677      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4678      if (is_inf)
4679       data->flags |= SF_IS_INF;
4680     }
4681     if (flags & SCF_DO_STCLASS) {
4682      ssc_init(pRExC_state, &this_class);
4683      oclass = data->start_class;
4684      data->start_class = &this_class;
4685      f |= SCF_DO_STCLASS_AND;
4686      f &= ~SCF_DO_STCLASS_OR;
4687     }
4688     /* Exclude from super-linear cache processing any {n,m}
4689     regops for which the combination of input pos and regex
4690     pos is not enough information to determine if a match
4691     will be possible.
4692
4693     For example, in the regex /foo(bar\s*){4,8}baz/ with the
4694     regex pos at the \s*, the prospects for a match depend not
4695     only on the input position but also on how many (bar\s*)
4696     repeats into the {4,8} we are. */
4697    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4698      f &= ~SCF_WHILEM_VISITED_POS;
4699
4700     /* This will finish on WHILEM, setting scan, or on NULL: */
4701     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4702         last, data, stopparen, recursed_depth, NULL,
4703         (mincount == 0
4704         ? (f & ~SCF_DO_SUBSTR)
4705         : f)
4706         ,depth+1);
4707
4708     if (flags & SCF_DO_STCLASS)
4709      data->start_class = oclass;
4710     if (mincount == 0 || minnext == 0) {
4711      if (flags & SCF_DO_STCLASS_OR) {
4712       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4713      }
4714      else if (flags & SCF_DO_STCLASS_AND) {
4715       /* Switch to OR mode: cache the old value of
4716       * data->start_class */
4717       INIT_AND_WITHP;
4718       StructCopy(data->start_class, and_withp, regnode_ssc);
4719       flags &= ~SCF_DO_STCLASS_AND;
4720       StructCopy(&this_class, data->start_class, regnode_ssc);
4721       flags |= SCF_DO_STCLASS_OR;
4722       ANYOF_FLAGS(data->start_class)
4723             |= SSC_MATCHES_EMPTY_STRING;
4724      }
4725     } else {  /* Non-zero len */
4726      if (flags & SCF_DO_STCLASS_OR) {
4727       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4728       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4729      }
4730      else if (flags & SCF_DO_STCLASS_AND)
4731       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4732      flags &= ~SCF_DO_STCLASS;
4733     }
4734     if (!scan)   /* It was not CURLYX, but CURLY. */
4735      scan = next;
4736     if (!(flags & SCF_TRIE_DOING_RESTUDY)
4737      /* ? quantifier ok, except for (?{ ... }) */
4738      && (next_is_eval || !(mincount == 0 && maxcount == 1))
4739      && (minnext == 0) && (deltanext == 0)
4740      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4741      && maxcount <= REG_INFTY/3) /* Complement check for big
4742             count */
4743     {
4744      /* Fatal warnings may leak the regexp without this: */
4745      SAVEFREESV(RExC_rx_sv);
4746      Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4747       "Quantifier unexpected on zero-length expression "
4748       "in regex m/%"UTF8f"/",
4749       UTF8fARG(UTF, RExC_end - RExC_precomp,
4750         RExC_precomp));
4751      (void)ReREFCNT_inc(RExC_rx_sv);
4752     }
4753
4754     min += minnext * mincount;
4755     is_inf_internal |= deltanext == SSize_t_MAX
4756       || (maxcount == REG_INFTY && minnext + deltanext > 0);
4757     is_inf |= is_inf_internal;
4758     if (is_inf) {
4759      delta = SSize_t_MAX;
4760     } else {
4761      delta += (minnext + deltanext) * maxcount
4762        - minnext * mincount;
4763     }
4764     /* Try powerful optimization CURLYX => CURLYN. */
4765     if (  OP(oscan) == CURLYX && data
4766      && data->flags & SF_IN_PAR
4767      && !(data->flags & SF_HAS_EVAL)
4768      && !deltanext && minnext == 1 ) {
4769      /* Try to optimize to CURLYN.  */
4770      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4771      regnode * const nxt1 = nxt;
4772 #ifdef DEBUGGING
4773      regnode *nxt2;
4774 #endif
4775
4776      /* Skip open. */
4777      nxt = regnext(nxt);
4778      if (!REGNODE_SIMPLE(OP(nxt))
4779       && !(PL_regkind[OP(nxt)] == EXACT
4780        && STR_LEN(nxt) == 1))
4781       goto nogo;
4782 #ifdef DEBUGGING
4783      nxt2 = nxt;
4784 #endif
4785      nxt = regnext(nxt);
4786      if (OP(nxt) != CLOSE)
4787       goto nogo;
4788      if (RExC_open_parens) {
4789       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4790       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4791      }
4792      /* Now we know that nxt2 is the only contents: */
4793      oscan->flags = (U8)ARG(nxt);
4794      OP(oscan) = CURLYN;
4795      OP(nxt1) = NOTHING; /* was OPEN. */
4796
4797 #ifdef DEBUGGING
4798      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4799      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4800      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4801      OP(nxt) = OPTIMIZED; /* was CLOSE. */
4802      OP(nxt + 1) = OPTIMIZED; /* was count. */
4803      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4804 #endif
4805     }
4806    nogo:
4807
4808     /* Try optimization CURLYX => CURLYM. */
4809     if (  OP(oscan) == CURLYX && data
4810      && !(data->flags & SF_HAS_PAR)
4811      && !(data->flags & SF_HAS_EVAL)
4812      && !deltanext /* atom is fixed width */
4813      && minnext != 0 /* CURLYM can't handle zero width */
4814
4815       /* Nor characters whose fold at run-time may be
4816       * multi-character */
4817      && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4818     ) {
4819      /* XXXX How to optimize if data == 0? */
4820      /* Optimize to a simpler form.  */
4821      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4822      regnode *nxt2;
4823
4824      OP(oscan) = CURLYM;
4825      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4826        && (OP(nxt2) != WHILEM))
4827       nxt = nxt2;
4828      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4829      /* Need to optimize away parenths. */
4830      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4831       /* Set the parenth number.  */
4832       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4833
4834       oscan->flags = (U8)ARG(nxt);
4835       if (RExC_open_parens) {
4836        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4837        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4838       }
4839       OP(nxt1) = OPTIMIZED; /* was OPEN. */
4840       OP(nxt) = OPTIMIZED; /* was CLOSE. */
4841
4842 #ifdef DEBUGGING
4843       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4844       OP(nxt + 1) = OPTIMIZED; /* was count. */
4845       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4846       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4847 #endif
4848 #if 0
4849       while ( nxt1 && (OP(nxt1) != WHILEM)) {
4850        regnode *nnxt = regnext(nxt1);
4851        if (nnxt == nxt) {
4852         if (reg_off_by_arg[OP(nxt1)])
4853          ARG_SET(nxt1, nxt2 - nxt1);
4854         else if (nxt2 - nxt1 < U16_MAX)
4855          NEXT_OFF(nxt1) = nxt2 - nxt1;
4856         else
4857          OP(nxt) = NOTHING; /* Cannot beautify */
4858        }
4859        nxt1 = nnxt;
4860       }
4861 #endif
4862       /* Optimize again: */
4863       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4864          NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4865      }
4866      else
4867       oscan->flags = 0;
4868     }
4869     else if ((OP(oscan) == CURLYX)
4870       && (flags & SCF_WHILEM_VISITED_POS)
4871       /* See the comment on a similar expression above.
4872        However, this time it's not a subexpression
4873        we care about, but the expression itself. */
4874       && (maxcount == REG_INFTY)
4875       && data && ++data->whilem_c < 16) {
4876      /* This stays as CURLYX, we can put the count/of pair. */
4877      /* Find WHILEM (as in regexec.c) */
4878      regnode *nxt = oscan + NEXT_OFF(oscan);
4879
4880      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4881       nxt += ARG(nxt);
4882      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4883       | (RExC_whilem_seen << 4)); /* On WHILEM */
4884     }
4885     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4886      pars++;
4887     if (flags & SCF_DO_SUBSTR) {
4888      SV *last_str = NULL;
4889      STRLEN last_chrs = 0;
4890      int counted = mincount != 0;
4891
4892      if (data->last_end > 0 && mincount != 0) { /* Ends with a
4893                 string. */
4894       SSize_t b = pos_before >= data->last_start_min
4895        ? pos_before : data->last_start_min;
4896       STRLEN l;
4897       const char * const s = SvPV_const(data->last_found, l);
4898       SSize_t old = b - data->last_start_min;
4899
4900       if (UTF)
4901        old = utf8_hop((U8*)s, old) - (U8*)s;
4902       l -= old;
4903       /* Get the added string: */
4904       last_str = newSVpvn_utf8(s  + old, l, UTF);
4905       last_chrs = UTF ? utf8_length((U8*)(s + old),
4906            (U8*)(s + old + l)) : l;
4907       if (deltanext == 0 && pos_before == b) {
4908        /* What was added is a constant string */
4909        if (mincount > 1) {
4910
4911         SvGROW(last_str, (mincount * l) + 1);
4912         repeatcpy(SvPVX(last_str) + l,
4913           SvPVX_const(last_str), l,
4914           mincount - 1);
4915         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4916         /* Add additional parts. */
4917         SvCUR_set(data->last_found,
4918           SvCUR(data->last_found) - l);
4919         sv_catsv(data->last_found, last_str);
4920         {
4921          SV * sv = data->last_found;
4922          MAGIC *mg =
4923           SvUTF8(sv) && SvMAGICAL(sv) ?
4924           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4925          if (mg && mg->mg_len >= 0)
4926           mg->mg_len += last_chrs * (mincount-1);
4927         }
4928         last_chrs *= mincount;
4929         data->last_end += l * (mincount - 1);
4930        }
4931       } else {
4932        /* start offset must point into the last copy */
4933        data->last_start_min += minnext * (mincount - 1);
4934        data->last_start_max =
4935        is_inf
4936        ? SSize_t_MAX
4937        : data->last_start_max +
4938         (maxcount - 1) * (minnext + data->pos_delta);
4939       }
4940      }
4941      /* It is counted once already... */
4942      data->pos_min += minnext * (mincount - counted);
4943 #if 0
4944 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4945        " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4946        " maxcount=%"UVuf" mincount=%"UVuf"\n",
4947  (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4948  (UV)mincount);
4949 if (deltanext != SSize_t_MAX)
4950 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4951  (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4952   - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4953 #endif
4954      if (deltanext == SSize_t_MAX
4955       || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4956       data->pos_delta = SSize_t_MAX;
4957      else
4958       data->pos_delta += - counted * deltanext +
4959       (minnext + deltanext) * maxcount - minnext * mincount;
4960      if (mincount != maxcount) {
4961       /* Cannot extend fixed substrings found inside
4962        the group.  */
4963       scan_commit(pRExC_state, data, minlenp, is_inf);
4964       if (mincount && last_str) {
4965        SV * const sv = data->last_found;
4966        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4967         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4968
4969        if (mg)
4970         mg->mg_len = -1;
4971        sv_setsv(sv, last_str);
4972        data->last_end = data->pos_min;
4973        data->last_start_min = data->pos_min - last_chrs;
4974        data->last_start_max = is_inf
4975         ? SSize_t_MAX
4976         : data->pos_min + data->pos_delta - last_chrs;
4977       }
4978       data->longest = &(data->longest_float);
4979      }
4980      SvREFCNT_dec(last_str);
4981     }
4982     if (data && (fl & SF_HAS_EVAL))
4983      data->flags |= SF_HAS_EVAL;
4984    optimize_curly_tail:
4985     if (OP(oscan) != CURLYX) {
4986      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4987       && NEXT_OFF(next))
4988       NEXT_OFF(oscan) += NEXT_OFF(next);
4989     }
4990     continue;
4991
4992    default:
4993 #ifdef DEBUGGING
4994     Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4995                  OP(scan));
4996 #endif
4997    case REF:
4998    case CLUMP:
4999     if (flags & SCF_DO_SUBSTR) {
5000      /* Cannot expect anything... */
5001      scan_commit(pRExC_state, data, minlenp, is_inf);
5002      data->longest = &(data->longest_float);
5003     }
5004     is_inf = is_inf_internal = 1;
5005     if (flags & SCF_DO_STCLASS_OR) {
5006      if (OP(scan) == CLUMP) {
5007       /* Actually is any start char, but very few code points
5008       * aren't start characters */
5009       ssc_match_all_cp(data->start_class);
5010      }
5011      else {
5012       ssc_anything(data->start_class);
5013      }
5014     }
5015     flags &= ~SCF_DO_STCLASS;
5016     break;
5017    }
5018   }
5019   else if (OP(scan) == LNBREAK) {
5020    if (flags & SCF_DO_STCLASS) {
5021      if (flags & SCF_DO_STCLASS_AND) {
5022      ssc_intersection(data->start_class,
5023          PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5024      ssc_clear_locale(data->start_class);
5025      ANYOF_FLAGS(data->start_class)
5026             &= ~SSC_MATCHES_EMPTY_STRING;
5027     }
5028     else if (flags & SCF_DO_STCLASS_OR) {
5029      ssc_union(data->start_class,
5030        PL_XPosix_ptrs[_CC_VERTSPACE],
5031        FALSE);
5032      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5033
5034      /* See commit msg for
5035      * 749e076fceedeb708a624933726e7989f2302f6a */
5036      ANYOF_FLAGS(data->start_class)
5037             &= ~SSC_MATCHES_EMPTY_STRING;
5038     }
5039     flags &= ~SCF_DO_STCLASS;
5040    }
5041    min++;
5042    if (delta != SSize_t_MAX)
5043     delta++;    /* Because of the 2 char string cr-lf */
5044    if (flags & SCF_DO_SUBSTR) {
5045     /* Cannot expect anything... */
5046     scan_commit(pRExC_state, data, minlenp, is_inf);
5047      data->pos_min += 1;
5048     data->pos_delta += 1;
5049     data->longest = &(data->longest_float);
5050     }
5051   }
5052   else if (REGNODE_SIMPLE(OP(scan))) {
5053
5054    if (flags & SCF_DO_SUBSTR) {
5055     scan_commit(pRExC_state, data, minlenp, is_inf);
5056     data->pos_min++;
5057    }
5058    min++;
5059    if (flags & SCF_DO_STCLASS) {
5060     bool invert = 0;
5061     SV* my_invlist = NULL;
5062     U8 namedclass;
5063
5064     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5065     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5066
5067     /* Some of the logic below assumes that switching
5068     locale on will only add false positives. */
5069     switch (OP(scan)) {
5070
5071     default:
5072 #ifdef DEBUGGING
5073     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5074                  OP(scan));
5075 #endif
5076     case CANY:
5077     case SANY:
5078      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5079       ssc_match_all_cp(data->start_class);
5080      break;
5081
5082     case REG_ANY:
5083      {
5084       SV* REG_ANY_invlist = _new_invlist(2);
5085       REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5086                '\n');
5087       if (flags & SCF_DO_STCLASS_OR) {
5088        ssc_union(data->start_class,
5089          REG_ANY_invlist,
5090          TRUE /* TRUE => invert, hence all but \n
5091            */
5092          );
5093       }
5094       else if (flags & SCF_DO_STCLASS_AND) {
5095        ssc_intersection(data->start_class,
5096            REG_ANY_invlist,
5097            TRUE  /* TRUE => invert */
5098            );
5099        ssc_clear_locale(data->start_class);
5100       }
5101       SvREFCNT_dec_NN(REG_ANY_invlist);
5102      }
5103      break;
5104
5105     case ANYOFL:
5106     case ANYOF:
5107      if (flags & SCF_DO_STCLASS_AND)
5108       ssc_and(pRExC_state, data->start_class,
5109         (regnode_charclass *) scan);
5110      else
5111       ssc_or(pRExC_state, data->start_class,
5112               (regnode_charclass *) scan);
5113      break;
5114
5115     case NPOSIXL:
5116      invert = 1;
5117      /* FALLTHROUGH */
5118
5119     case POSIXL:
5120      namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5121      if (flags & SCF_DO_STCLASS_AND) {
5122       bool was_there = cBOOL(
5123           ANYOF_POSIXL_TEST(data->start_class,
5124                 namedclass));
5125       ANYOF_POSIXL_ZERO(data->start_class);
5126       if (was_there) {    /* Do an AND */
5127        ANYOF_POSIXL_SET(data->start_class, namedclass);
5128       }
5129       /* No individual code points can now match */
5130       data->start_class->invlist
5131             = sv_2mortal(_new_invlist(0));
5132      }
5133      else {
5134       int complement = namedclass + ((invert) ? -1 : 1);
5135
5136       assert(flags & SCF_DO_STCLASS_OR);
5137
5138       /* If the complement of this class was already there,
5139       * the result is that they match all code points,
5140       * (\d + \D == everything).  Remove the classes from
5141       * future consideration.  Locale is not relevant in
5142       * this case */
5143       if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5144        ssc_match_all_cp(data->start_class);
5145        ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5146        ANYOF_POSIXL_CLEAR(data->start_class, complement);
5147       }
5148       else {  /* The usual case; just add this class to the
5149         existing set */
5150        ANYOF_POSIXL_SET(data->start_class, namedclass);
5151       }
5152      }
5153      break;
5154
5155     case NPOSIXA:   /* For these, we always know the exact set of
5156         what's matched */
5157      invert = 1;
5158      /* FALLTHROUGH */
5159     case POSIXA:
5160      if (FLAGS(scan) == _CC_ASCII) {
5161       my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5162      }
5163      else {
5164       _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5165            PL_XPosix_ptrs[_CC_ASCII],
5166            &my_invlist);
5167      }
5168      goto join_posix;
5169
5170     case NPOSIXD:
5171     case NPOSIXU:
5172      invert = 1;
5173      /* FALLTHROUGH */
5174     case POSIXD:
5175     case POSIXU:
5176      my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5177
5178      /* NPOSIXD matches all upper Latin1 code points unless the
5179      * target string being matched is UTF-8, which is
5180      * unknowable until match time.  Since we are going to
5181      * invert, we want to get rid of all of them so that the
5182      * inversion will match all */
5183      if (OP(scan) == NPOSIXD) {
5184       _invlist_subtract(my_invlist, PL_UpperLatin1,
5185           &my_invlist);
5186      }
5187
5188     join_posix:
5189
5190      if (flags & SCF_DO_STCLASS_AND) {
5191       ssc_intersection(data->start_class, my_invlist, invert);
5192       ssc_clear_locale(data->start_class);
5193      }
5194      else {
5195       assert(flags & SCF_DO_STCLASS_OR);
5196       ssc_union(data->start_class, my_invlist, invert);
5197      }
5198      SvREFCNT_dec(my_invlist);
5199     }
5200     if (flags & SCF_DO_STCLASS_OR)
5201      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5202     flags &= ~SCF_DO_STCLASS;
5203    }
5204   }
5205   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5206    data->flags |= (OP(scan) == MEOL
5207        ? SF_BEFORE_MEOL
5208        : SF_BEFORE_SEOL);
5209    scan_commit(pRExC_state, data, minlenp, is_inf);
5210
5211   }
5212   else if (  PL_regkind[OP(scan)] == BRANCHJ
5213     /* Lookbehind, or need to calculate parens/evals/stclass: */
5214     && (scan->flags || data || (flags & SCF_DO_STCLASS))
5215     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5216   {
5217    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5218     || OP(scan) == UNLESSM )
5219    {
5220     /* Negative Lookahead/lookbehind
5221     In this case we can't do fixed string optimisation.
5222     */
5223
5224     SSize_t deltanext, minnext, fake = 0;
5225     regnode *nscan;
5226     regnode_ssc intrnl;
5227     int f = 0;
5228
5229     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5230     if (data) {
5231      data_fake.whilem_c = data->whilem_c;
5232      data_fake.last_closep = data->last_closep;
5233     }
5234     else
5235      data_fake.last_closep = &fake;
5236     data_fake.pos_delta = delta;
5237     if ( flags & SCF_DO_STCLASS && !scan->flags
5238      && OP(scan) == IFMATCH ) { /* Lookahead */
5239      ssc_init(pRExC_state, &intrnl);
5240      data_fake.start_class = &intrnl;
5241      f |= SCF_DO_STCLASS_AND;
5242     }
5243     if (flags & SCF_WHILEM_VISITED_POS)
5244      f |= SCF_WHILEM_VISITED_POS;
5245     next = regnext(scan);
5246     nscan = NEXTOPER(NEXTOPER(scan));
5247     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5248          last, &data_fake, stopparen,
5249          recursed_depth, NULL, f, depth+1);
5250     if (scan->flags) {
5251      if (deltanext) {
5252       FAIL("Variable length lookbehind not implemented");
5253      }
5254      else if (minnext > (I32)U8_MAX) {
5255       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5256        (UV)U8_MAX);
5257      }
5258      scan->flags = (U8)minnext;
5259     }
5260     if (data) {
5261      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5262       pars++;
5263      if (data_fake.flags & SF_HAS_EVAL)
5264       data->flags |= SF_HAS_EVAL;
5265      data->whilem_c = data_fake.whilem_c;
5266     }
5267     if (f & SCF_DO_STCLASS_AND) {
5268      if (flags & SCF_DO_STCLASS_OR) {
5269       /* OR before, AND after: ideally we would recurse with
5270       * data_fake to get the AND applied by study of the
5271       * remainder of the pattern, and then derecurse;
5272       * *** HACK *** for now just treat as "no information".
5273       * See [perl #56690].
5274       */
5275       ssc_init(pRExC_state, data->start_class);
5276      }  else {
5277       /* AND before and after: combine and continue.  These
5278       * assertions are zero-length, so can match an EMPTY
5279       * string */
5280       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5281       ANYOF_FLAGS(data->start_class)
5282             |= SSC_MATCHES_EMPTY_STRING;
5283      }
5284     }
5285    }
5286 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5287    else {
5288     /* Positive Lookahead/lookbehind
5289     In this case we can do fixed string optimisation,
5290     but we must be careful about it. Note in the case of
5291     lookbehind the positions will be offset by the minimum
5292     length of the pattern, something we won't know about
5293     until after the recurse.
5294     */
5295     SSize_t deltanext, fake = 0;
5296     regnode *nscan;
5297     regnode_ssc intrnl;
5298     int f = 0;
5299     /* We use SAVEFREEPV so that when the full compile
5300      is finished perl will clean up the allocated
5301      minlens when it's all done. This way we don't
5302      have to worry about freeing them when we know
5303      they wont be used, which would be a pain.
5304     */
5305     SSize_t *minnextp;
5306     Newx( minnextp, 1, SSize_t );
5307     SAVEFREEPV(minnextp);
5308
5309     if (data) {
5310      StructCopy(data, &data_fake, scan_data_t);
5311      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5312       f |= SCF_DO_SUBSTR;
5313       if (scan->flags)
5314        scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5315       data_fake.last_found=newSVsv(data->last_found);
5316      }
5317     }
5318     else
5319      data_fake.last_closep = &fake;
5320     data_fake.flags = 0;
5321     data_fake.pos_delta = delta;
5322     if (is_inf)
5323      data_fake.flags |= SF_IS_INF;
5324     if ( flags & SCF_DO_STCLASS && !scan->flags
5325      && OP(scan) == IFMATCH ) { /* Lookahead */
5326      ssc_init(pRExC_state, &intrnl);
5327      data_fake.start_class = &intrnl;
5328      f |= SCF_DO_STCLASS_AND;
5329     }
5330     if (flags & SCF_WHILEM_VISITED_POS)
5331      f |= SCF_WHILEM_VISITED_POS;
5332     next = regnext(scan);
5333     nscan = NEXTOPER(NEXTOPER(scan));
5334
5335     *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5336           &deltanext, last, &data_fake,
5337           stopparen, recursed_depth, NULL,
5338           f,depth+1);
5339     if (scan->flags) {
5340      if (deltanext) {
5341       FAIL("Variable length lookbehind not implemented");
5342      }
5343      else if (*minnextp > (I32)U8_MAX) {
5344       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5345        (UV)U8_MAX);
5346      }
5347      scan->flags = (U8)*minnextp;
5348     }
5349
5350     *minnextp += min;
5351
5352     if (f & SCF_DO_STCLASS_AND) {
5353      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5354      ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5355     }
5356     if (data) {
5357      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5358       pars++;
5359      if (data_fake.flags & SF_HAS_EVAL)
5360       data->flags |= SF_HAS_EVAL;
5361      data->whilem_c = data_fake.whilem_c;
5362      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5363       if (RExC_rx->minlen<*minnextp)
5364        RExC_rx->minlen=*minnextp;
5365       scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5366       SvREFCNT_dec_NN(data_fake.last_found);
5367
5368       if ( data_fake.minlen_fixed != minlenp )
5369       {
5370        data->offset_fixed= data_fake.offset_fixed;
5371        data->minlen_fixed= data_fake.minlen_fixed;
5372        data->lookbehind_fixed+= scan->flags;
5373       }
5374       if ( data_fake.minlen_float != minlenp )
5375       {
5376        data->minlen_float= data_fake.minlen_float;
5377        data->offset_float_min=data_fake.offset_float_min;
5378        data->offset_float_max=data_fake.offset_float_max;
5379        data->lookbehind_float+= scan->flags;
5380       }
5381      }
5382     }
5383    }
5384 #endif
5385   }
5386   else if (OP(scan) == OPEN) {
5387    if (stopparen != (I32)ARG(scan))
5388     pars++;
5389   }
5390   else if (OP(scan) == CLOSE) {
5391    if (stopparen == (I32)ARG(scan)) {
5392     break;
5393    }
5394    if ((I32)ARG(scan) == is_par) {
5395     next = regnext(scan);
5396
5397     if ( next && (OP(next) != WHILEM) && next < last)
5398      is_par = 0;  /* Disable optimization */
5399    }
5400    if (data)
5401     *(data->last_closep) = ARG(scan);
5402   }
5403   else if (OP(scan) == EVAL) {
5404     if (data)
5405      data->flags |= SF_HAS_EVAL;
5406   }
5407   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5408    if (flags & SCF_DO_SUBSTR) {
5409     scan_commit(pRExC_state, data, minlenp, is_inf);
5410     flags &= ~SCF_DO_SUBSTR;
5411    }
5412    if (data && OP(scan)==ACCEPT) {
5413     data->flags |= SCF_SEEN_ACCEPT;
5414     if (stopmin > min)
5415      stopmin = min;
5416    }
5417   }
5418   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5419   {
5420     if (flags & SCF_DO_SUBSTR) {
5421      scan_commit(pRExC_state, data, minlenp, is_inf);
5422      data->longest = &(data->longest_float);
5423     }
5424     is_inf = is_inf_internal = 1;
5425     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5426      ssc_anything(data->start_class);
5427     flags &= ~SCF_DO_STCLASS;
5428   }
5429   else if (OP(scan) == GPOS) {
5430    if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5431     !(delta || is_inf || (data && data->pos_delta)))
5432    {
5433     if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5434      RExC_rx->intflags |= PREGf_ANCH_GPOS;
5435     if (RExC_rx->gofs < (STRLEN)min)
5436      RExC_rx->gofs = min;
5437    } else {
5438     RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5439     RExC_rx->gofs = 0;
5440    }
5441   }
5442 #ifdef TRIE_STUDY_OPT
5443 #ifdef FULL_TRIE_STUDY
5444   else if (PL_regkind[OP(scan)] == TRIE) {
5445    /* NOTE - There is similar code to this block above for handling
5446    BRANCH nodes on the initial study.  If you change stuff here
5447    check there too. */
5448    regnode *trie_node= scan;
5449    regnode *tail= regnext(scan);
5450    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5451    SSize_t max1 = 0, min1 = SSize_t_MAX;
5452    regnode_ssc accum;
5453
5454    if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5455     /* Cannot merge strings after this. */
5456     scan_commit(pRExC_state, data, minlenp, is_inf);
5457    }
5458    if (flags & SCF_DO_STCLASS)
5459     ssc_init_zero(pRExC_state, &accum);
5460
5461    if (!trie->jump) {
5462     min1= trie->minlen;
5463     max1= trie->maxlen;
5464    } else {
5465     const regnode *nextbranch= NULL;
5466     U32 word;
5467
5468     for ( word=1 ; word <= trie->wordcount ; word++)
5469     {
5470      SSize_t deltanext=0, minnext=0, f = 0, fake;
5471      regnode_ssc this_class;
5472
5473      StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5474      if (data) {
5475       data_fake.whilem_c = data->whilem_c;
5476       data_fake.last_closep = data->last_closep;
5477      }
5478      else
5479       data_fake.last_closep = &fake;
5480      data_fake.pos_delta = delta;
5481      if (flags & SCF_DO_STCLASS) {
5482       ssc_init(pRExC_state, &this_class);
5483       data_fake.start_class = &this_class;
5484       f = SCF_DO_STCLASS_AND;
5485      }
5486      if (flags & SCF_WHILEM_VISITED_POS)
5487       f |= SCF_WHILEM_VISITED_POS;
5488
5489      if (trie->jump[word]) {
5490       if (!nextbranch)
5491        nextbranch = trie_node + trie->jump[0];
5492       scan= trie_node + trie->jump[word];
5493       /* We go from the jump point to the branch that follows
5494       it. Note this means we need the vestigal unused
5495       branches even though they arent otherwise used. */
5496       minnext = study_chunk(pRExC_state, &scan, minlenp,
5497        &deltanext, (regnode *)nextbranch, &data_fake,
5498        stopparen, recursed_depth, NULL, f,depth+1);
5499      }
5500      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5501       nextbranch= regnext((regnode*)nextbranch);
5502
5503      if (min1 > (SSize_t)(minnext + trie->minlen))
5504       min1 = minnext + trie->minlen;
5505      if (deltanext == SSize_t_MAX) {
5506       is_inf = is_inf_internal = 1;
5507       max1 = SSize_t_MAX;
5508      } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5509       max1 = minnext + deltanext + trie->maxlen;
5510
5511      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5512       pars++;
5513      if (data_fake.flags & SCF_SEEN_ACCEPT) {
5514       if ( stopmin > min + min1)
5515        stopmin = min + min1;
5516       flags &= ~SCF_DO_SUBSTR;
5517       if (data)
5518        data->flags |= SCF_SEEN_ACCEPT;
5519      }
5520      if (data) {
5521       if (data_fake.flags & SF_HAS_EVAL)
5522        data->flags |= SF_HAS_EVAL;
5523       data->whilem_c = data_fake.whilem_c;
5524      }
5525      if (flags & SCF_DO_STCLASS)
5526       ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5527     }
5528    }
5529    if (flags & SCF_DO_SUBSTR) {
5530     data->pos_min += min1;
5531     data->pos_delta += max1 - min1;
5532     if (max1 != min1 || is_inf)
5533      data->longest = &(data->longest_float);
5534    }
5535    min += min1;
5536    if (delta != SSize_t_MAX)
5537     delta += max1 - min1;
5538    if (flags & SCF_DO_STCLASS_OR) {
5539     ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5540     if (min1) {
5541      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5542      flags &= ~SCF_DO_STCLASS;
5543     }
5544    }
5545    else if (flags & SCF_DO_STCLASS_AND) {
5546     if (min1) {
5547      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5548      flags &= ~SCF_DO_STCLASS;
5549     }
5550     else {
5551      /* Switch to OR mode: cache the old value of
5552      * data->start_class */
5553      INIT_AND_WITHP;
5554      StructCopy(data->start_class, and_withp, regnode_ssc);
5555      flags &= ~SCF_DO_STCLASS_AND;
5556      StructCopy(&accum, data->start_class, regnode_ssc);
5557      flags |= SCF_DO_STCLASS_OR;
5558     }
5559    }
5560    scan= tail;
5561    continue;
5562   }
5563 #else
5564   else if (PL_regkind[OP(scan)] == TRIE) {
5565    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5566    U8*bang=NULL;
5567
5568    min += trie->minlen;
5569    delta += (trie->maxlen - trie->minlen);
5570    flags &= ~SCF_DO_STCLASS; /* xxx */
5571    if (flags & SCF_DO_SUBSTR) {
5572     /* Cannot expect anything... */
5573     scan_commit(pRExC_state, data, minlenp, is_inf);
5574      data->pos_min += trie->minlen;
5575      data->pos_delta += (trie->maxlen - trie->minlen);
5576     if (trie->maxlen != trie->minlen)
5577      data->longest = &(data->longest_float);
5578     }
5579     if (trie->jump) /* no more substrings -- for now /grr*/
5580    flags &= ~SCF_DO_SUBSTR;
5581   }
5582 #endif /* old or new */
5583 #endif /* TRIE_STUDY_OPT */
5584
5585   /* Else: zero-length, ignore. */
5586   scan = regnext(scan);
5587  }
5588  /* If we are exiting a recursion we can unset its recursed bit
5589  * and allow ourselves to enter it again - no danger of an
5590  * infinite loop there.
5591  if (stopparen > -1 && recursed) {
5592   DEBUG_STUDYDATA("unset:", data,depth);
5593   PAREN_UNSET( recursed, stopparen);
5594  }
5595  */
5596  if (frame) {
5597   depth = depth - 1;
5598
5599   DEBUG_STUDYDATA("frame-end:",data,depth);
5600   DEBUG_PEEP("fend", scan, depth);
5601
5602   /* restore previous context */
5603   last = frame->last_regnode;
5604   scan = frame->next_regnode;
5605   stopparen = frame->stopparen;
5606   recursed_depth = frame->prev_recursed_depth;
5607
5608   RExC_frame_last = frame->prev_frame;
5609   frame = frame->this_prev_frame;
5610   goto fake_study_recurse;
5611  }
5612
5613   finish:
5614  assert(!frame);
5615  DEBUG_STUDYDATA("pre-fin:",data,depth);
5616
5617  *scanp = scan;
5618  *deltap = is_inf_internal ? SSize_t_MAX : delta;
5619
5620  if (flags & SCF_DO_SUBSTR && is_inf)
5621   data->pos_delta = SSize_t_MAX - data->pos_min;
5622  if (is_par > (I32)U8_MAX)
5623   is_par = 0;
5624  if (is_par && pars==1 && data) {
5625   data->flags |= SF_IN_PAR;
5626   data->flags &= ~SF_HAS_PAR;
5627  }
5628  else if (pars && data) {
5629   data->flags |= SF_HAS_PAR;
5630   data->flags &= ~SF_IN_PAR;
5631  }
5632  if (flags & SCF_DO_STCLASS_OR)
5633   ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5634  if (flags & SCF_TRIE_RESTUDY)
5635   data->flags |=  SCF_TRIE_RESTUDY;
5636
5637  DEBUG_STUDYDATA("post-fin:",data,depth);
5638
5639  {
5640   SSize_t final_minlen= min < stopmin ? min : stopmin;
5641
5642   if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5643    if (final_minlen > SSize_t_MAX - delta)
5644     RExC_maxlen = SSize_t_MAX;
5645    else if (RExC_maxlen < final_minlen + delta)
5646     RExC_maxlen = final_minlen + delta;
5647   }
5648   return final_minlen;
5649  }
5650  NOT_REACHED; /* NOTREACHED */
5651 }
5652
5653 STATIC U32
5654 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5655 {
5656  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5657
5658  PERL_ARGS_ASSERT_ADD_DATA;
5659
5660  Renewc(RExC_rxi->data,
5661   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5662   char, struct reg_data);
5663  if(count)
5664   Renew(RExC_rxi->data->what, count + n, U8);
5665  else
5666   Newx(RExC_rxi->data->what, n, U8);
5667  RExC_rxi->data->count = count + n;
5668  Copy(s, RExC_rxi->data->what + count, n, U8);
5669  return count;
5670 }
5671
5672 /*XXX: todo make this not included in a non debugging perl, but appears to be
5673  * used anyway there, in 'use re' */
5674 #ifndef PERL_IN_XSUB_RE
5675 void
5676 Perl_reginitcolors(pTHX)
5677 {
5678  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5679  if (s) {
5680   char *t = savepv(s);
5681   int i = 0;
5682   PL_colors[0] = t;
5683   while (++i < 6) {
5684    t = strchr(t, '\t');
5685    if (t) {
5686     *t = '\0';
5687     PL_colors[i] = ++t;
5688    }
5689    else
5690     PL_colors[i] = t = (char *)"";
5691   }
5692  } else {
5693   int i = 0;
5694   while (i < 6)
5695    PL_colors[i++] = (char *)"";
5696  }
5697  PL_colorset = 1;
5698 }
5699 #endif
5700
5701
5702 #ifdef TRIE_STUDY_OPT
5703 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5704  STMT_START {                                            \
5705   if (                                                \
5706    (data.flags & SCF_TRIE_RESTUDY)               \
5707    && ! restudied++                              \
5708   ) {                                                 \
5709    dOsomething;                                    \
5710    goto reStudy;                                   \
5711   }                                                   \
5712  } STMT_END
5713 #else
5714 #define CHECK_RESTUDY_GOTO_butfirst
5715 #endif
5716
5717 /*
5718  * pregcomp - compile a regular expression into internal code
5719  *
5720  * Decides which engine's compiler to call based on the hint currently in
5721  * scope
5722  */
5723
5724 #ifndef PERL_IN_XSUB_RE
5725
5726 /* return the currently in-scope regex engine (or the default if none)  */
5727
5728 regexp_engine const *
5729 Perl_current_re_engine(pTHX)
5730 {
5731  if (IN_PERL_COMPILETIME) {
5732   HV * const table = GvHV(PL_hintgv);
5733   SV **ptr;
5734
5735   if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5736    return &reh_regexp_engine;
5737   ptr = hv_fetchs(table, "regcomp", FALSE);
5738   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5739    return &reh_regexp_engine;
5740   return INT2PTR(regexp_engine*,SvIV(*ptr));
5741  }
5742  else {
5743   SV *ptr;
5744   if (!PL_curcop->cop_hints_hash)
5745    return &reh_regexp_engine;
5746   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5747   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5748    return &reh_regexp_engine;
5749   return INT2PTR(regexp_engine*,SvIV(ptr));
5750  }
5751 }
5752
5753
5754 REGEXP *
5755 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5756 {
5757  regexp_engine const *eng = current_re_engine();
5758  GET_RE_DEBUG_FLAGS_DECL;
5759
5760  PERL_ARGS_ASSERT_PREGCOMP;
5761
5762  /* Dispatch a request to compile a regexp to correct regexp engine. */
5763  DEBUG_COMPILE_r({
5764   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5765       PTR2UV(eng));
5766  });
5767  return CALLREGCOMP_ENG(eng, pattern, flags);
5768 }
5769 #endif
5770
5771 /* public(ish) entry point for the perl core's own regex compiling code.
5772  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5773  * pattern rather than a list of OPs, and uses the internal engine rather
5774  * than the current one */
5775
5776 REGEXP *
5777 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5778 {
5779  SV *pat = pattern; /* defeat constness! */
5780  PERL_ARGS_ASSERT_RE_COMPILE;
5781  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5782 #ifdef PERL_IN_XSUB_RE
5783         &my_reg_engine,
5784 #else
5785         &reh_regexp_engine,
5786 #endif
5787         NULL, NULL, rx_flags, 0);
5788 }
5789
5790
5791 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5792  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5793  * point to the realloced string and length.
5794  *
5795  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5796  * stuff added */
5797
5798 static void
5799 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5800      char **pat_p, STRLEN *plen_p, int num_code_blocks)
5801 {
5802  U8 *const src = (U8*)*pat_p;
5803  U8 *dst, *d;
5804  int n=0;
5805  STRLEN s = 0;
5806  bool do_end = 0;
5807  GET_RE_DEBUG_FLAGS_DECL;
5808
5809  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5810   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5811
5812  Newx(dst, *plen_p * 2 + 1, U8);
5813  d = dst;
5814
5815  while (s < *plen_p) {
5816   append_utf8_from_native_byte(src[s], &d);
5817   if (n < num_code_blocks) {
5818    if (!do_end && pRExC_state->code_blocks[n].start == s) {
5819     pRExC_state->code_blocks[n].start = d - dst - 1;
5820     assert(*(d - 1) == '(');
5821     do_end = 1;
5822    }
5823    else if (do_end && pRExC_state->code_blocks[n].end == s) {
5824     pRExC_state->code_blocks[n].end = d - dst - 1;
5825     assert(*(d - 1) == ')');
5826     do_end = 0;
5827     n++;
5828    }
5829   }
5830   s++;
5831  }
5832  *d = '\0';
5833  *plen_p = d - dst;
5834  *pat_p = (char*) dst;
5835  SAVEFREEPV(*pat_p);
5836  RExC_orig_utf8 = RExC_utf8 = 1;
5837 }
5838
5839
5840
5841 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5842  * while recording any code block indices, and handling overloading,
5843  * nested qr// objects etc.  If pat is null, it will allocate a new
5844  * string, or just return the first arg, if there's only one.
5845  *
5846  * Returns the malloced/updated pat.
5847  * patternp and pat_count is the array of SVs to be concatted;
5848  * oplist is the optional list of ops that generated the SVs;
5849  * recompile_p is a pointer to a boolean that will be set if
5850  *   the regex will need to be recompiled.
5851  * delim, if non-null is an SV that will be inserted between each element
5852  */
5853
5854 static SV*
5855 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5856     SV *pat, SV ** const patternp, int pat_count,
5857     OP *oplist, bool *recompile_p, SV *delim)
5858 {
5859  SV **svp;
5860  int n = 0;
5861  bool use_delim = FALSE;
5862  bool alloced = FALSE;
5863
5864  /* if we know we have at least two args, create an empty string,
5865  * then concatenate args to that. For no args, return an empty string */
5866  if (!pat && pat_count != 1) {
5867   pat = newSVpvs("");
5868   SAVEFREESV(pat);
5869   alloced = TRUE;
5870  }
5871
5872  for (svp = patternp; svp < patternp + pat_count; svp++) {
5873   SV *sv;
5874   SV *rx  = NULL;
5875   STRLEN orig_patlen = 0;
5876   bool code = 0;
5877   SV *msv = use_delim ? delim : *svp;
5878   if (!msv) msv = &PL_sv_undef;
5879
5880   /* if we've got a delimiter, we go round the loop twice for each
5881   * svp slot (except the last), using the delimiter the second
5882   * time round */
5883   if (use_delim) {
5884    svp--;
5885    use_delim = FALSE;
5886   }
5887   else if (delim)
5888    use_delim = TRUE;
5889
5890   if (SvTYPE(msv) == SVt_PVAV) {
5891    /* we've encountered an interpolated array within
5892    * the pattern, e.g. /...@a..../. Expand the list of elements,
5893    * then recursively append elements.
5894    * The code in this block is based on S_pushav() */
5895
5896    AV *const av = (AV*)msv;
5897    const SSize_t maxarg = AvFILL(av) + 1;
5898    SV **array;
5899
5900    if (oplist) {
5901     assert(oplist->op_type == OP_PADAV
5902      || oplist->op_type == OP_RV2AV);
5903     oplist = OpSIBLING(oplist);
5904    }
5905
5906    if (SvRMAGICAL(av)) {
5907     SSize_t i;
5908
5909     Newx(array, maxarg, SV*);
5910     SAVEFREEPV(array);
5911     for (i=0; i < maxarg; i++) {
5912      SV ** const svp = av_fetch(av, i, FALSE);
5913      array[i] = svp ? *svp : &PL_sv_undef;
5914     }
5915    }
5916    else
5917     array = AvARRAY(av);
5918
5919    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5920         array, maxarg, NULL, recompile_p,
5921         /* $" */
5922         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5923
5924    continue;
5925   }
5926
5927
5928   /* we make the assumption here that each op in the list of
5929   * op_siblings maps to one SV pushed onto the stack,
5930   * except for code blocks, with have both an OP_NULL and
5931   * and OP_CONST.
5932   * This allows us to match up the list of SVs against the
5933   * list of OPs to find the next code block.
5934   *
5935   * Note that       PUSHMARK PADSV PADSV ..
5936   * is optimised to
5937   *                 PADRANGE PADSV  PADSV  ..
5938   * so the alignment still works. */
5939
5940   if (oplist) {
5941    if (oplist->op_type == OP_NULL
5942     && (oplist->op_flags & OPf_SPECIAL))
5943    {
5944     assert(n < pRExC_state->num_code_blocks);
5945     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5946     pRExC_state->code_blocks[n].block = oplist;
5947     pRExC_state->code_blocks[n].src_regex = NULL;
5948     n++;
5949     code = 1;
5950     oplist = OpSIBLING(oplist); /* skip CONST */
5951     assert(oplist);
5952    }
5953    oplist = OpSIBLING(oplist);;
5954   }
5955
5956   /* apply magic and QR overloading to arg */
5957
5958   SvGETMAGIC(msv);
5959   if (SvROK(msv) && SvAMAGIC(msv)) {
5960    SV *sv = AMG_CALLunary(msv, regexp_amg);
5961    if (sv) {
5962     if (SvROK(sv))
5963      sv = SvRV(sv);
5964     if (SvTYPE(sv) != SVt_REGEXP)
5965      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5966     msv = sv;
5967    }
5968   }
5969
5970   /* try concatenation overload ... */
5971   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5972     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5973   {
5974    sv_setsv(pat, sv);
5975    /* overloading involved: all bets are off over literal
5976    * code. Pretend we haven't seen it */
5977    pRExC_state->num_code_blocks -= n;
5978    n = 0;
5979   }
5980   else  {
5981    /* ... or failing that, try "" overload */
5982    while (SvAMAGIC(msv)
5983      && (sv = AMG_CALLunary(msv, string_amg))
5984      && sv != msv
5985      &&  !(   SvROK(msv)
5986       && SvROK(sv)
5987       && SvRV(msv) == SvRV(sv))
5988    ) {
5989     msv = sv;
5990     SvGETMAGIC(msv);
5991    }
5992    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5993     msv = SvRV(msv);
5994
5995    if (pat) {
5996     /* this is a partially unrolled
5997     *     sv_catsv_nomg(pat, msv);
5998     * that allows us to adjust code block indices if
5999     * needed */
6000     STRLEN dlen;
6001     char *dst = SvPV_force_nomg(pat, dlen);
6002     orig_patlen = dlen;
6003     if (SvUTF8(msv) && !SvUTF8(pat)) {
6004      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6005      sv_setpvn(pat, dst, dlen);
6006      SvUTF8_on(pat);
6007     }
6008     sv_catsv_nomg(pat, msv);
6009     rx = msv;
6010    }
6011    else
6012     pat = msv;
6013
6014    if (code)
6015     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6016   }
6017
6018   /* extract any code blocks within any embedded qr//'s */
6019   if (rx && SvTYPE(rx) == SVt_REGEXP
6020    && RX_ENGINE((REGEXP*)rx)->op_comp)
6021   {
6022
6023    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6024    if (ri->num_code_blocks) {
6025     int i;
6026     /* the presence of an embedded qr// with code means
6027     * we should always recompile: the text of the
6028     * qr// may not have changed, but it may be a
6029     * different closure than last time */
6030     *recompile_p = 1;
6031     Renew(pRExC_state->code_blocks,
6032      pRExC_state->num_code_blocks + ri->num_code_blocks,
6033      struct reg_code_block);
6034     pRExC_state->num_code_blocks += ri->num_code_blocks;
6035
6036     for (i=0; i < ri->num_code_blocks; i++) {
6037      struct reg_code_block *src, *dst;
6038      STRLEN offset =  orig_patlen
6039       + ReANY((REGEXP *)rx)->pre_prefix;
6040      assert(n < pRExC_state->num_code_blocks);
6041      src = &ri->code_blocks[i];
6042      dst = &pRExC_state->code_blocks[n];
6043      dst->start     = src->start + offset;
6044      dst->end     = src->end   + offset;
6045      dst->block     = src->block;
6046      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6047            src->src_regex
6048             ? src->src_regex
6049             : (REGEXP*)rx);
6050      n++;
6051     }
6052    }
6053   }
6054  }
6055  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6056  if (alloced)
6057   SvSETMAGIC(pat);
6058
6059  return pat;
6060 }
6061
6062
6063
6064 /* see if there are any run-time code blocks in the pattern.
6065  * False positives are allowed */
6066
6067 static bool
6068 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6069      char *pat, STRLEN plen)
6070 {
6071  int n = 0;
6072  STRLEN s;
6073
6074  PERL_UNUSED_CONTEXT;
6075
6076  for (s = 0; s < plen; s++) {
6077   if (n < pRExC_state->num_code_blocks
6078    && s == pRExC_state->code_blocks[n].start)
6079   {
6080    s = pRExC_state->code_blocks[n].end;
6081    n++;
6082    continue;
6083   }
6084   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6085   * positives here */
6086   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6087    (pat[s+2] == '{'
6088     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6089   )
6090    return 1;
6091  }
6092  return 0;
6093 }
6094
6095 /* Handle run-time code blocks. We will already have compiled any direct
6096  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6097  * copy of it, but with any literal code blocks blanked out and
6098  * appropriate chars escaped; then feed it into
6099  *
6100  *    eval "qr'modified_pattern'"
6101  *
6102  * For example,
6103  *
6104  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6105  *
6106  * becomes
6107  *
6108  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6109  *
6110  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6111  * and merge them with any code blocks of the original regexp.
6112  *
6113  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6114  * instead, just save the qr and return FALSE; this tells our caller that
6115  * the original pattern needs upgrading to utf8.
6116  */
6117
6118 static bool
6119 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6120  char *pat, STRLEN plen)
6121 {
6122  SV *qr;
6123
6124  GET_RE_DEBUG_FLAGS_DECL;
6125
6126  if (pRExC_state->runtime_code_qr) {
6127   /* this is the second time we've been called; this should
6128   * only happen if the main pattern got upgraded to utf8
6129   * during compilation; re-use the qr we compiled first time
6130   * round (which should be utf8 too)
6131   */
6132   qr = pRExC_state->runtime_code_qr;
6133   pRExC_state->runtime_code_qr = NULL;
6134   assert(RExC_utf8 && SvUTF8(qr));
6135  }
6136  else {
6137   int n = 0;
6138   STRLEN s;
6139   char *p, *newpat;
6140   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6141   SV *sv, *qr_ref;
6142   dSP;
6143
6144   /* determine how many extra chars we need for ' and \ escaping */
6145   for (s = 0; s < plen; s++) {
6146    if (pat[s] == '\'' || pat[s] == '\\')
6147     newlen++;
6148   }
6149
6150   Newx(newpat, newlen, char);
6151   p = newpat;
6152   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6153
6154   for (s = 0; s < plen; s++) {
6155    if (n < pRExC_state->num_code_blocks
6156     && s == pRExC_state->code_blocks[n].start)
6157    {
6158     /* blank out literal code block */
6159     assert(pat[s] == '(');
6160     while (s <= pRExC_state->code_blocks[n].end) {
6161      *p++ = '_';
6162      s++;
6163     }
6164     s--;
6165     n++;
6166     continue;
6167    }
6168    if (pat[s] == '\'' || pat[s] == '\\')
6169     *p++ = '\\';
6170    *p++ = pat[s];
6171   }
6172   *p++ = '\'';
6173   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6174    *p++ = 'x';
6175   *p++ = '\0';
6176   DEBUG_COMPILE_r({
6177    PerlIO_printf(Perl_debug_log,
6178     "%sre-parsing pattern for runtime code:%s %s\n",
6179     PL_colors[4],PL_colors[5],newpat);
6180   });
6181
6182   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6183   Safefree(newpat);
6184
6185   ENTER;
6186   SAVETMPS;
6187   save_re_context();
6188   PUSHSTACKi(PERLSI_REQUIRE);
6189   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6190   * parsing qr''; normally only q'' does this. It also alters
6191   * hints handling */
6192   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6193   SvREFCNT_dec_NN(sv);
6194   SPAGAIN;
6195   qr_ref = POPs;
6196   PUTBACK;
6197   {
6198    SV * const errsv = ERRSV;
6199    if (SvTRUE_NN(errsv))
6200    {
6201     Safefree(pRExC_state->code_blocks);
6202     /* use croak_sv ? */
6203     Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6204    }
6205   }
6206   assert(SvROK(qr_ref));
6207   qr = SvRV(qr_ref);
6208   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6209   /* the leaving below frees the tmp qr_ref.
6210   * Give qr a life of its own */
6211   SvREFCNT_inc(qr);
6212   POPSTACK;
6213   FREETMPS;
6214   LEAVE;
6215
6216  }
6217
6218  if (!RExC_utf8 && SvUTF8(qr)) {
6219   /* first time through; the pattern got upgraded; save the
6220   * qr for the next time through */
6221   assert(!pRExC_state->runtime_code_qr);
6222   pRExC_state->runtime_code_qr = qr;
6223   return 0;
6224  }
6225
6226
6227  /* extract any code blocks within the returned qr//  */
6228
6229
6230  /* merge the main (r1) and run-time (r2) code blocks into one */
6231  {
6232   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6233   struct reg_code_block *new_block, *dst;
6234   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6235   int i1 = 0, i2 = 0;
6236
6237   if (!r2->num_code_blocks) /* we guessed wrong */
6238   {
6239    SvREFCNT_dec_NN(qr);
6240    return 1;
6241   }
6242
6243   Newx(new_block,
6244    r1->num_code_blocks + r2->num_code_blocks,
6245    struct reg_code_block);
6246   dst = new_block;
6247
6248   while (    i1 < r1->num_code_blocks
6249     || i2 < r2->num_code_blocks)
6250   {
6251    struct reg_code_block *src;
6252    bool is_qr = 0;
6253
6254    if (i1 == r1->num_code_blocks) {
6255     src = &r2->code_blocks[i2++];
6256     is_qr = 1;
6257    }
6258    else if (i2 == r2->num_code_blocks)
6259     src = &r1->code_blocks[i1++];
6260    else if (  r1->code_blocks[i1].start
6261      < r2->code_blocks[i2].start)
6262    {
6263     src = &r1->code_blocks[i1++];
6264     assert(src->end < r2->code_blocks[i2].start);
6265    }
6266    else {
6267     assert(  r1->code_blocks[i1].start
6268      > r2->code_blocks[i2].start);
6269     src = &r2->code_blocks[i2++];
6270     is_qr = 1;
6271     assert(src->end < r1->code_blocks[i1].start);
6272    }
6273
6274    assert(pat[src->start] == '(');
6275    assert(pat[src->end]   == ')');
6276    dst->start     = src->start;
6277    dst->end     = src->end;
6278    dst->block     = src->block;
6279    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6280          : src->src_regex;
6281    dst++;
6282   }
6283   r1->num_code_blocks += r2->num_code_blocks;
6284   Safefree(r1->code_blocks);
6285   r1->code_blocks = new_block;
6286  }
6287
6288  SvREFCNT_dec_NN(qr);
6289  return 1;
6290 }
6291
6292
6293 STATIC bool
6294 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6295      SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6296      SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6297      STRLEN longest_length, bool eol, bool meol)
6298 {
6299  /* This is the common code for setting up the floating and fixed length
6300  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6301  * as to whether succeeded or not */
6302
6303  I32 t;
6304  SSize_t ml;
6305
6306  if (! (longest_length
6307   || (eol /* Can't have SEOL and MULTI */
6308    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6309   )
6310    /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6311   || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6312  {
6313   return FALSE;
6314  }
6315
6316  /* copy the information about the longest from the reg_scan_data
6317   over to the program. */
6318  if (SvUTF8(sv_longest)) {
6319   *rx_utf8 = sv_longest;
6320   *rx_substr = NULL;
6321  } else {
6322   *rx_substr = sv_longest;
6323   *rx_utf8 = NULL;
6324  }
6325  /* end_shift is how many chars that must be matched that
6326   follow this item. We calculate it ahead of time as once the
6327   lookbehind offset is added in we lose the ability to correctly
6328   calculate it.*/
6329  ml = minlen ? *(minlen) : (SSize_t)longest_length;
6330  *rx_end_shift = ml - offset
6331   - longest_length + (SvTAIL(sv_longest) != 0)
6332   + lookbehind;
6333
6334  t = (eol/* Can't have SEOL and MULTI */
6335   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6336  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6337
6338  return TRUE;
6339 }
6340
6341 /*
6342  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6343  * regular expression into internal code.
6344  * The pattern may be passed either as:
6345  *    a list of SVs (patternp plus pat_count)
6346  *    a list of OPs (expr)
6347  * If both are passed, the SV list is used, but the OP list indicates
6348  * which SVs are actually pre-compiled code blocks
6349  *
6350  * The SVs in the list have magic and qr overloading applied to them (and
6351  * the list may be modified in-place with replacement SVs in the latter
6352  * case).
6353  *
6354  * If the pattern hasn't changed from old_re, then old_re will be
6355  * returned.
6356  *
6357  * eng is the current engine. If that engine has an op_comp method, then
6358  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6359  * do the initial concatenation of arguments and pass on to the external
6360  * engine.
6361  *
6362  * If is_bare_re is not null, set it to a boolean indicating whether the
6363  * arg list reduced (after overloading) to a single bare regex which has
6364  * been returned (i.e. /$qr/).
6365  *
6366  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6367  *
6368  * pm_flags contains the PMf_* flags, typically based on those from the
6369  * pm_flags field of the related PMOP. Currently we're only interested in
6370  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6371  *
6372  * We can't allocate space until we know how big the compiled form will be,
6373  * but we can't compile it (and thus know how big it is) until we've got a
6374  * place to put the code.  So we cheat:  we compile it twice, once with code
6375  * generation turned off and size counting turned on, and once "for real".
6376  * This also means that we don't allocate space until we are sure that the
6377  * thing really will compile successfully, and we never have to move the
6378  * code and thus invalidate pointers into it.  (Note that it has to be in
6379  * one piece because free() must be able to free it all.) [NB: not true in perl]
6380  *
6381  * Beware that the optimization-preparation code in here knows about some
6382  * of the structure of the compiled regexp.  [I'll say.]
6383  */
6384
6385 REGEXP *
6386 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6387      OP *expr, const regexp_engine* eng, REGEXP *old_re,
6388      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6389 {
6390  REGEXP *rx;
6391  struct regexp *r;
6392  regexp_internal *ri;
6393  STRLEN plen;
6394  char *exp;
6395  regnode *scan;
6396  I32 flags;
6397  SSize_t minlen = 0;
6398  U32 rx_flags;
6399  SV *pat;
6400  SV *code_blocksv = NULL;
6401  SV** new_patternp = patternp;
6402
6403  /* these are all flags - maybe they should be turned
6404  * into a single int with different bit masks */
6405  I32 sawlookahead = 0;
6406  I32 sawplus = 0;
6407  I32 sawopen = 0;
6408  I32 sawminmod = 0;
6409
6410  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6411  bool recompile = 0;
6412  bool runtime_code = 0;
6413  scan_data_t data;
6414  RExC_state_t RExC_state;
6415  RExC_state_t * const pRExC_state = &RExC_state;
6416 #ifdef TRIE_STUDY_OPT
6417  int restudied = 0;
6418  RExC_state_t copyRExC_state;
6419 #endif
6420  GET_RE_DEBUG_FLAGS_DECL;
6421
6422  PERL_ARGS_ASSERT_RE_OP_COMPILE;
6423
6424  DEBUG_r(if (!PL_colorset) reginitcolors());
6425
6426  /* Initialize these here instead of as-needed, as is quick and avoids
6427  * having to test them each time otherwise */
6428  if (! PL_AboveLatin1) {
6429   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6430   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6431   PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6432   PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6433   PL_HasMultiCharFold =
6434      _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6435
6436   /* This is calculated here, because the Perl program that generates the
6437   * static global ones doesn't currently have access to
6438   * NUM_ANYOF_CODE_POINTS */
6439   PL_InBitmap = _new_invlist(2);
6440   PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6441              NUM_ANYOF_CODE_POINTS - 1);
6442  }
6443
6444  pRExC_state->code_blocks = NULL;
6445  pRExC_state->num_code_blocks = 0;
6446
6447  if (is_bare_re)
6448   *is_bare_re = FALSE;
6449
6450  if (expr && (expr->op_type == OP_LIST ||
6451     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6452   /* allocate code_blocks if needed */
6453   OP *o;
6454   int ncode = 0;
6455
6456   for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6457    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6458     ncode++; /* count of DO blocks */
6459   if (ncode) {
6460    pRExC_state->num_code_blocks = ncode;
6461    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6462   }
6463  }
6464
6465  if (!pat_count) {
6466   /* compile-time pattern with just OP_CONSTs and DO blocks */
6467
6468   int n;
6469   OP *o;
6470
6471   /* find how many CONSTs there are */
6472   assert(expr);
6473   n = 0;
6474   if (expr->op_type == OP_CONST)
6475    n = 1;
6476   else
6477    for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6478     if (o->op_type == OP_CONST)
6479      n++;
6480    }
6481
6482   /* fake up an SV array */
6483
6484   assert(!new_patternp);
6485   Newx(new_patternp, n, SV*);
6486   SAVEFREEPV(new_patternp);
6487   pat_count = n;
6488
6489   n = 0;
6490   if (expr->op_type == OP_CONST)
6491    new_patternp[n] = cSVOPx_sv(expr);
6492   else
6493    for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6494     if (o->op_type == OP_CONST)
6495      new_patternp[n++] = cSVOPo_sv;
6496    }
6497
6498  }
6499
6500  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6501   "Assembling pattern from %d elements%s\n", pat_count,
6502    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6503
6504  /* set expr to the first arg op */
6505
6506  if (pRExC_state->num_code_blocks
6507   && expr->op_type != OP_CONST)
6508  {
6509    expr = cLISTOPx(expr)->op_first;
6510    assert(   expr->op_type == OP_PUSHMARK
6511     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6512     || expr->op_type == OP_PADRANGE);
6513    expr = OpSIBLING(expr);
6514  }
6515
6516  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6517       expr, &recompile, NULL);
6518
6519  /* handle bare (possibly after overloading) regex: foo =~ $re */
6520  {
6521   SV *re = pat;
6522   if (SvROK(re))
6523    re = SvRV(re);
6524   if (SvTYPE(re) == SVt_REGEXP) {
6525    if (is_bare_re)
6526     *is_bare_re = TRUE;
6527    SvREFCNT_inc(re);
6528    Safefree(pRExC_state->code_blocks);
6529    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6530     "Precompiled pattern%s\n",
6531      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6532
6533    return (REGEXP*)re;
6534   }
6535  }
6536
6537  exp = SvPV_nomg(pat, plen);
6538
6539  if (!eng->op_comp) {
6540   if ((SvUTF8(pat) && IN_BYTES)
6541     || SvGMAGICAL(pat) || SvAMAGIC(pat))
6542   {
6543    /* make a temporary copy; either to convert to bytes,
6544    * or to avoid repeating get-magic / overloaded stringify */
6545    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6546           (IN_BYTES ? 0 : SvUTF8(pat)));
6547   }
6548   Safefree(pRExC_state->code_blocks);
6549   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6550  }
6551
6552  /* ignore the utf8ness if the pattern is 0 length */
6553  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6554  RExC_uni_semantics = 0;
6555  RExC_contains_locale = 0;
6556  RExC_contains_i = 0;
6557  RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6558  pRExC_state->runtime_code_qr = NULL;
6559  RExC_frame_head= NULL;
6560  RExC_frame_last= NULL;
6561  RExC_frame_count= 0;
6562
6563  DEBUG_r({
6564   RExC_mysv1= sv_newmortal();
6565   RExC_mysv2= sv_newmortal();
6566  });
6567  DEBUG_COMPILE_r({
6568    SV *dsv= sv_newmortal();
6569    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6570    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6571       PL_colors[4],PL_colors[5],s);
6572   });
6573
6574   redo_first_pass:
6575  /* we jump here if we upgrade the pattern to utf8 and have to
6576  * recompile */
6577
6578  if ((pm_flags & PMf_USE_RE_EVAL)
6579     /* this second condition covers the non-regex literal case,
6580     * i.e.  $foo =~ '(?{})'. */
6581     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6582  )
6583   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6584
6585  /* return old regex if pattern hasn't changed */
6586  /* XXX: note in the below we have to check the flags as well as the
6587  * pattern.
6588  *
6589  * Things get a touch tricky as we have to compare the utf8 flag
6590  * independently from the compile flags.  */
6591
6592  if (   old_re
6593   && !recompile
6594   && !!RX_UTF8(old_re) == !!RExC_utf8
6595   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6596   && RX_PRECOMP(old_re)
6597   && RX_PRELEN(old_re) == plen
6598   && memEQ(RX_PRECOMP(old_re), exp, plen)
6599   && !runtime_code /* with runtime code, always recompile */ )
6600  {
6601   Safefree(pRExC_state->code_blocks);
6602   return old_re;
6603  }
6604
6605  rx_flags = orig_rx_flags;
6606
6607  if (rx_flags & PMf_FOLD) {
6608   RExC_contains_i = 1;
6609  }
6610  if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6611
6612   /* Set to use unicode semantics if the pattern is in utf8 and has the
6613   * 'depends' charset specified, as it means unicode when utf8  */
6614   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6615  }
6616
6617  RExC_precomp = exp;
6618  RExC_flags = rx_flags;
6619  RExC_pm_flags = pm_flags;
6620
6621  if (runtime_code) {
6622   if (TAINTING_get && TAINT_get)
6623    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6624
6625   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6626    /* whoops, we have a non-utf8 pattern, whilst run-time code
6627    * got compiled as utf8. Try again with a utf8 pattern */
6628    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6629          pRExC_state->num_code_blocks);
6630    goto redo_first_pass;
6631   }
6632  }
6633  assert(!pRExC_state->runtime_code_qr);
6634
6635  RExC_sawback = 0;
6636
6637  RExC_seen = 0;
6638  RExC_maxlen = 0;
6639  RExC_in_lookbehind = 0;
6640  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6641  RExC_extralen = 0;
6642  RExC_override_recoding = 0;
6643 #ifdef EBCDIC
6644  RExC_recode_x_to_native = 0;
6645 #endif
6646  RExC_in_multi_char_class = 0;
6647
6648  /* First pass: determine size, legality. */
6649  RExC_parse = exp;
6650  RExC_start = exp;
6651  RExC_end = exp + plen;
6652  RExC_naughty = 0;
6653  RExC_npar = 1;
6654  RExC_nestroot = 0;
6655  RExC_size = 0L;
6656  RExC_emit = (regnode *) &RExC_emit_dummy;
6657  RExC_whilem_seen = 0;
6658  RExC_open_parens = NULL;
6659  RExC_close_parens = NULL;
6660  RExC_opend = NULL;
6661  RExC_paren_names = NULL;
6662 #ifdef DEBUGGING
6663  RExC_paren_name_list = NULL;
6664 #endif
6665  RExC_recurse = NULL;
6666  RExC_study_chunk_recursed = NULL;
6667  RExC_study_chunk_recursed_bytes= 0;
6668  RExC_recurse_count = 0;
6669  pRExC_state->code_index = 0;
6670
6671  DEBUG_PARSE_r(
6672   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6673   RExC_lastnum=0;
6674   RExC_lastparse=NULL;
6675  );
6676  /* reg may croak on us, not giving us a chance to free
6677  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6678  need it to survive as long as the regexp (qr/(?{})/).
6679  We must check that code_blocksv is not already set, because we may
6680  have jumped back to restart the sizing pass. */
6681  if (pRExC_state->code_blocks && !code_blocksv) {
6682   code_blocksv = newSV_type(SVt_PV);
6683   SAVEFREESV(code_blocksv);
6684   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6685   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6686  }
6687  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6688   /* It's possible to write a regexp in ascii that represents Unicode
6689   codepoints outside of the byte range, such as via \x{100}. If we
6690   detect such a sequence we have to convert the entire pattern to utf8
6691   and then recompile, as our sizing calculation will have been based
6692   on 1 byte == 1 character, but we will need to use utf8 to encode
6693   at least some part of the pattern, and therefore must convert the whole
6694   thing.
6695   -- dmq */
6696   if (flags & RESTART_UTF8) {
6697    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6698          pRExC_state->num_code_blocks);
6699    goto redo_first_pass;
6700   }
6701   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6702  }
6703  if (code_blocksv)
6704   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6705
6706  DEBUG_PARSE_r({
6707   PerlIO_printf(Perl_debug_log,
6708    "Required size %"IVdf" nodes\n"
6709    "Starting second pass (creation)\n",
6710    (IV)RExC_size);
6711   RExC_lastnum=0;
6712   RExC_lastparse=NULL;
6713  });
6714
6715  /* The first pass could have found things that force Unicode semantics */
6716  if ((RExC_utf8 || RExC_uni_semantics)
6717   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6718  {
6719   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6720  }
6721
6722  /* Small enough for pointer-storage convention?
6723  If extralen==0, this means that we will not need long jumps. */
6724  if (RExC_size >= 0x10000L && RExC_extralen)
6725   RExC_size += RExC_extralen;
6726  else
6727   RExC_extralen = 0;
6728  if (RExC_whilem_seen > 15)
6729   RExC_whilem_seen = 15;
6730
6731  /* Allocate space and zero-initialize. Note, the two step process
6732  of zeroing when in debug mode, thus anything assigned has to
6733  happen after that */
6734  rx = (REGEXP*) newSV_type(SVt_REGEXP);
6735  r = ReANY(rx);
6736  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6737   char, regexp_internal);
6738  if ( r == NULL || ri == NULL )
6739   FAIL("Regexp out of space");
6740 #ifdef DEBUGGING
6741  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6742  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6743   char);
6744 #else
6745  /* bulk initialize base fields with 0. */
6746  Zero(ri, sizeof(regexp_internal), char);
6747 #endif
6748
6749  /* non-zero initialization begins here */
6750  RXi_SET( r, ri );
6751  r->engine= eng;
6752  r->extflags = rx_flags;
6753  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6754
6755  if (pm_flags & PMf_IS_QR) {
6756   ri->code_blocks = pRExC_state->code_blocks;
6757   ri->num_code_blocks = pRExC_state->num_code_blocks;
6758  }
6759  else
6760  {
6761   int n;
6762   for (n = 0; n < pRExC_state->num_code_blocks; n++)
6763    if (pRExC_state->code_blocks[n].src_regex)
6764     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6765   SAVEFREEPV(pRExC_state->code_blocks);
6766  }
6767
6768  {
6769   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6770   bool has_charset = (get_regex_charset(r->extflags)
6771              != REGEX_DEPENDS_CHARSET);
6772
6773   /* The caret is output if there are any defaults: if not all the STD
6774   * flags are set, or if no character set specifier is needed */
6775   bool has_default =
6776      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6777      || ! has_charset);
6778   bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6779             == REG_RUN_ON_COMMENT_SEEN);
6780   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6781        >> RXf_PMf_STD_PMMOD_SHIFT);
6782   const char *fptr = STD_PAT_MODS;        /*"msixn"*/
6783   char *p;
6784   /* Allocate for the worst case, which is all the std flags are turned
6785   * on.  If more precision is desired, we could do a population count of
6786   * the flags set.  This could be done with a small lookup table, or by
6787   * shifting, masking and adding, or even, when available, assembly
6788   * language for a machine-language population count.
6789   * We never output a minus, as all those are defaults, so are
6790   * covered by the caret */
6791   const STRLEN wraplen = plen + has_p + has_runon
6792    + has_default       /* If needs a caret */
6793
6794     /* If needs a character set specifier */
6795    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6796    + (sizeof(STD_PAT_MODS) - 1)
6797    + (sizeof("(?:)") - 1);
6798
6799   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6800   r->xpv_len_u.xpvlenu_pv = p;
6801   if (RExC_utf8)
6802    SvFLAGS(rx) |= SVf_UTF8;
6803   *p++='('; *p++='?';
6804
6805   /* If a default, cover it using the caret */
6806   if (has_default) {
6807    *p++= DEFAULT_PAT_MOD;
6808   }
6809   if (has_charset) {
6810    STRLEN len;
6811    const char* const name = get_regex_charset_name(r->extflags, &len);
6812    Copy(name, p, len, char);
6813    p += len;
6814   }
6815   if (has_p)
6816    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6817   {
6818    char ch;
6819    while((ch = *fptr++)) {
6820     if(reganch & 1)
6821      *p++ = ch;
6822     reganch >>= 1;
6823    }
6824   }
6825
6826   *p++ = ':';
6827   Copy(RExC_precomp, p, plen, char);
6828   assert ((RX_WRAPPED(rx) - p) < 16);
6829   r->pre_prefix = p - RX_WRAPPED(rx);
6830   p += plen;
6831   if (has_runon)
6832    *p++ = '\n';
6833   *p++ = ')';
6834   *p = 0;
6835   SvCUR_set(rx, p - RX_WRAPPED(rx));
6836  }
6837
6838  r->intflags = 0;
6839  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6840
6841  /* setup various meta data about recursion, this all requires
6842  * RExC_npar to be correctly set, and a bit later on we clear it */
6843  if (RExC_seen & REG_RECURSE_SEEN) {
6844   Newxz(RExC_open_parens, RExC_npar,regnode *);
6845   SAVEFREEPV(RExC_open_parens);
6846   Newxz(RExC_close_parens,RExC_npar,regnode *);
6847   SAVEFREEPV(RExC_close_parens);
6848  }
6849  if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6850   /* Note, RExC_npar is 1 + the number of parens in a pattern.
6851   * So its 1 if there are no parens. */
6852   RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6853           ((RExC_npar & 0x07) != 0);
6854   Newx(RExC_study_chunk_recursed,
6855    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6856   SAVEFREEPV(RExC_study_chunk_recursed);
6857  }
6858
6859  /* Useful during FAIL. */
6860 #ifdef RE_TRACK_PATTERN_OFFSETS
6861  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6862  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6863       "%s %"UVuf" bytes for offset annotations.\n",
6864       ri->u.offsets ? "Got" : "Couldn't get",
6865       (UV)((2*RExC_size+1) * sizeof(U32))));
6866 #endif
6867  SetProgLen(ri,RExC_size);
6868  RExC_rx_sv = rx;
6869  RExC_rx = r;
6870  RExC_rxi = ri;
6871  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6872
6873  /* Second pass: emit code. */
6874  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6875  RExC_pm_flags = pm_flags;
6876  RExC_parse = exp;
6877  RExC_end = exp + plen;
6878  RExC_naughty = 0;
6879  RExC_npar = 1;
6880  RExC_emit_start = ri->program;
6881  RExC_emit = ri->program;
6882  RExC_emit_bound = ri->program + RExC_size + 1;
6883  pRExC_state->code_index = 0;
6884
6885  *((char*) RExC_emit++) = (char) REG_MAGIC;
6886  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6887   ReREFCNT_dec(rx);
6888   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6889  }
6890  /* XXXX To minimize changes to RE engine we always allocate
6891  3-units-long substrs field. */
6892  Newx(r->substrs, 1, struct reg_substr_data);
6893  if (RExC_recurse_count) {
6894   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6895   SAVEFREEPV(RExC_recurse);
6896  }
6897
6898   reStudy:
6899  r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6900  DEBUG_r(
6901   RExC_study_chunk_recursed_count= 0;
6902  );
6903  Zero(r->substrs, 1, struct reg_substr_data);
6904  if (RExC_study_chunk_recursed) {
6905   Zero(RExC_study_chunk_recursed,
6906    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6907  }
6908
6909
6910 #ifdef TRIE_STUDY_OPT
6911  if (!restudied) {
6912   StructCopy(&zero_scan_data, &data, scan_data_t);
6913   copyRExC_state = RExC_state;
6914  } else {
6915   U32 seen=RExC_seen;
6916   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6917
6918   RExC_state = copyRExC_state;
6919   if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6920    RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6921   else
6922    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6923   StructCopy(&zero_scan_data, &data, scan_data_t);
6924  }
6925 #else
6926  StructCopy(&zero_scan_data, &data, scan_data_t);
6927 #endif
6928
6929  /* Dig out information for optimizations. */
6930  r->extflags = RExC_flags; /* was pm_op */
6931  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6932
6933  if (UTF)
6934   SvUTF8_on(rx); /* Unicode in it? */
6935  ri->regstclass = NULL;
6936  if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
6937   r->intflags |= PREGf_NAUGHTY;
6938  scan = ri->program + 1;  /* First BRANCH. */
6939
6940  /* testing for BRANCH here tells us whether there is "must appear"
6941  data in the pattern. If there is then we can use it for optimisations */
6942  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6943             */
6944   SSize_t fake;
6945   STRLEN longest_float_length, longest_fixed_length;
6946   regnode_ssc ch_class; /* pointed to by data */
6947   int stclass_flag;
6948   SSize_t last_close = 0; /* pointed to by data */
6949   regnode *first= scan;
6950   regnode *first_next= regnext(first);
6951   /*
6952   * Skip introductions and multiplicators >= 1
6953   * so that we can extract the 'meat' of the pattern that must
6954   * match in the large if() sequence following.
6955   * NOTE that EXACT is NOT covered here, as it is normally
6956   * picked up by the optimiser separately.
6957   *
6958   * This is unfortunate as the optimiser isnt handling lookahead
6959   * properly currently.
6960   *
6961   */
6962   while ((OP(first) == OPEN && (sawopen = 1)) ||
6963    /* An OR of *one* alternative - should not happen now. */
6964    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6965    /* for now we can't handle lookbehind IFMATCH*/
6966    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6967    (OP(first) == PLUS) ||
6968    (OP(first) == MINMOD) ||
6969    /* An {n,m} with n>0 */
6970    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6971    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6972   {
6973     /*
6974     * the only op that could be a regnode is PLUS, all the rest
6975     * will be regnode_1 or regnode_2.
6976     *
6977     * (yves doesn't think this is true)
6978     */
6979     if (OP(first) == PLUS)
6980      sawplus = 1;
6981     else {
6982      if (OP(first) == MINMOD)
6983       sawminmod = 1;
6984      first += regarglen[OP(first)];
6985     }
6986     first = NEXTOPER(first);
6987     first_next= regnext(first);
6988   }
6989
6990   /* Starting-point info. */
6991  again:
6992   DEBUG_PEEP("first:",first,0);
6993   /* Ignore EXACT as we deal with it later. */
6994   if (PL_regkind[OP(first)] == EXACT) {
6995    if (OP(first) == EXACT || OP(first) == EXACTL)
6996     NOOP; /* Empty, get anchored substr later. */
6997    else
6998     ri->regstclass = first;
6999   }
7000 #ifdef TRIE_STCLASS
7001   else if (PL_regkind[OP(first)] == TRIE &&
7002     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7003   {
7004    /* this can happen only on restudy */
7005    ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7006   }
7007 #endif
7008   else if (REGNODE_SIMPLE(OP(first)))
7009    ri->regstclass = first;
7010   else if (PL_regkind[OP(first)] == BOUND ||
7011     PL_regkind[OP(first)] == NBOUND)
7012    ri->regstclass = first;
7013   else if (PL_regkind[OP(first)] == BOL) {
7014    r->intflags |= (OP(first) == MBOL
7015       ? PREGf_ANCH_MBOL
7016       : PREGf_ANCH_SBOL);
7017    first = NEXTOPER(first);
7018    goto again;
7019   }
7020   else if (OP(first) == GPOS) {
7021    r->intflags |= PREGf_ANCH_GPOS;
7022    first = NEXTOPER(first);
7023    goto again;
7024   }
7025   else if ((!sawopen || !RExC_sawback) &&
7026    !sawlookahead &&
7027    (OP(first) == STAR &&
7028    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7029    !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7030   {
7031    /* turn .* into ^.* with an implied $*=1 */
7032    const int type =
7033     (OP(NEXTOPER(first)) == REG_ANY)
7034      ? PREGf_ANCH_MBOL
7035      : PREGf_ANCH_SBOL;
7036    r->intflags |= (type | PREGf_IMPLICIT);
7037    first = NEXTOPER(first);
7038    goto again;
7039   }
7040   if (sawplus && !sawminmod && !sawlookahead
7041    && (!sawopen || !RExC_sawback)
7042    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7043    /* x+ must match at the 1st pos of run of x's */
7044    r->intflags |= PREGf_SKIP;
7045
7046   /* Scan is after the zeroth branch, first is atomic matcher. */
7047 #ifdef TRIE_STUDY_OPT
7048   DEBUG_PARSE_r(
7049    if (!restudied)
7050     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7051        (IV)(first - scan + 1))
7052   );
7053 #else
7054   DEBUG_PARSE_r(
7055    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7056     (IV)(first - scan + 1))
7057   );
7058 #endif
7059
7060
7061   /*
7062   * If there's something expensive in the r.e., find the
7063   * longest literal string that must appear and make it the
7064   * regmust.  Resolve ties in favor of later strings, since
7065   * the regstart check works with the beginning of the r.e.
7066   * and avoiding duplication strengthens checking.  Not a
7067   * strong reason, but sufficient in the absence of others.
7068   * [Now we resolve ties in favor of the earlier string if
7069   * it happens that c_offset_min has been invalidated, since the
7070   * earlier string may buy us something the later one won't.]
7071   */
7072
7073   data.longest_fixed = newSVpvs("");
7074   data.longest_float = newSVpvs("");
7075   data.last_found = newSVpvs("");
7076   data.longest = &(data.longest_fixed);
7077   ENTER_with_name("study_chunk");
7078   SAVEFREESV(data.longest_fixed);
7079   SAVEFREESV(data.longest_float);
7080   SAVEFREESV(data.last_found);
7081   first = scan;
7082   if (!ri->regstclass) {
7083    ssc_init(pRExC_state, &ch_class);
7084    data.start_class = &ch_class;
7085    stclass_flag = SCF_DO_STCLASS_AND;
7086   } else    /* XXXX Check for BOUND? */
7087    stclass_flag = 0;
7088   data.last_closep = &last_close;
7089
7090   DEBUG_RExC_seen();
7091   minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7092        scan + RExC_size, /* Up to end */
7093    &data, -1, 0, NULL,
7094    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7095       | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7096    0);
7097
7098
7099   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7100
7101
7102   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7103    && data.last_start_min == 0 && data.last_end > 0
7104    && !RExC_seen_zerolen
7105    && !(RExC_seen & REG_VERBARG_SEEN)
7106    && !(RExC_seen & REG_GPOS_SEEN)
7107   ){
7108    r->extflags |= RXf_CHECK_ALL;
7109   }
7110   scan_commit(pRExC_state, &data,&minlen,0);
7111
7112   longest_float_length = CHR_SVLEN(data.longest_float);
7113
7114   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
7115     && data.offset_fixed == data.offset_float_min
7116     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7117    && S_setup_longest (aTHX_ pRExC_state,
7118          data.longest_float,
7119          &(r->float_utf8),
7120          &(r->float_substr),
7121          &(r->float_end_shift),
7122          data.lookbehind_float,
7123          data.offset_float_min,
7124          data.minlen_float,
7125          longest_float_length,
7126          cBOOL(data.flags & SF_FL_BEFORE_EOL),
7127          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7128   {
7129    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7130    r->float_max_offset = data.offset_float_max;
7131    if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7132     r->float_max_offset -= data.lookbehind_float;
7133    SvREFCNT_inc_simple_void_NN(data.longest_float);
7134   }
7135   else {
7136    r->float_substr = r->float_utf8 = NULL;
7137    longest_float_length = 0;
7138   }
7139
7140   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7141
7142   if (S_setup_longest (aTHX_ pRExC_state,
7143         data.longest_fixed,
7144         &(r->anchored_utf8),
7145         &(r->anchored_substr),
7146         &(r->anchored_end_shift),
7147         data.lookbehind_fixed,
7148         data.offset_fixed,
7149         data.minlen_fixed,
7150         longest_fixed_length,
7151         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7152         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7153   {
7154    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7155    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7156   }
7157   else {
7158    r->anchored_substr = r->anchored_utf8 = NULL;
7159    longest_fixed_length = 0;
7160   }
7161   LEAVE_with_name("study_chunk");
7162
7163   if (ri->regstclass
7164    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7165    ri->regstclass = NULL;
7166
7167   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7168    && stclass_flag
7169    && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7170    && is_ssc_worth_it(pRExC_state, data.start_class))
7171   {
7172    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7173
7174    ssc_finalize(pRExC_state, data.start_class);
7175
7176    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7177    StructCopy(data.start_class,
7178      (regnode_ssc*)RExC_rxi->data->data[n],
7179      regnode_ssc);
7180    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7181    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7182    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7183      regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7184      PerlIO_printf(Perl_debug_log,
7185          "synthetic stclass \"%s\".\n",
7186          SvPVX_const(sv));});
7187    data.start_class = NULL;
7188   }
7189
7190   /* A temporary algorithm prefers floated substr to fixed one to dig
7191   * more info. */
7192   if (longest_fixed_length > longest_float_length) {
7193    r->substrs->check_ix = 0;
7194    r->check_end_shift = r->anchored_end_shift;
7195    r->check_substr = r->anchored_substr;
7196    r->check_utf8 = r->anchored_utf8;
7197    r->check_offset_min = r->check_offset_max = r->anchored_offset;
7198    if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7199     r->intflags |= PREGf_NOSCAN;
7200   }
7201   else {
7202    r->substrs->check_ix = 1;
7203    r->check_end_shift = r->float_end_shift;
7204    r->check_substr = r->float_substr;
7205    r->check_utf8 = r->float_utf8;
7206    r->check_offset_min = r->float_min_offset;
7207    r->check_offset_max = r->float_max_offset;
7208   }
7209   if ((r->check_substr || r->check_utf8) ) {
7210    r->extflags |= RXf_USE_INTUIT;
7211    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7212     r->extflags |= RXf_INTUIT_TAIL;
7213   }
7214   r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7215
7216   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7217   if ( (STRLEN)minlen < longest_float_length )
7218    minlen= longest_float_length;
7219   if ( (STRLEN)minlen < longest_fixed_length )
7220    minlen= longest_fixed_length;
7221   */
7222  }
7223  else {
7224   /* Several toplevels. Best we can is to set minlen. */
7225   SSize_t fake;
7226   regnode_ssc ch_class;
7227   SSize_t last_close = 0;
7228
7229   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7230
7231   scan = ri->program + 1;
7232   ssc_init(pRExC_state, &ch_class);
7233   data.start_class = &ch_class;
7234   data.last_closep = &last_close;
7235
7236   DEBUG_RExC_seen();
7237   minlen = study_chunk(pRExC_state,
7238    &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7239    SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7240              ? SCF_TRIE_DOING_RESTUDY
7241              : 0),
7242    0);
7243
7244   CHECK_RESTUDY_GOTO_butfirst(NOOP);
7245
7246   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7247     = r->float_substr = r->float_utf8 = NULL;
7248
7249   if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7250    && is_ssc_worth_it(pRExC_state, data.start_class))
7251   {
7252    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7253
7254    ssc_finalize(pRExC_state, data.start_class);
7255
7256    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7257    StructCopy(data.start_class,
7258      (regnode_ssc*)RExC_rxi->data->data[n],
7259      regnode_ssc);
7260    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7261    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7262    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7263      regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7264      PerlIO_printf(Perl_debug_log,
7265          "synthetic stclass \"%s\".\n",
7266          SvPVX_const(sv));});
7267    data.start_class = NULL;
7268   }
7269  }
7270
7271  if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7272   r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7273   r->maxlen = REG_INFTY;
7274  }
7275  else {
7276   r->maxlen = RExC_maxlen;
7277  }
7278
7279  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7280  the "real" pattern. */
7281  DEBUG_OPTIMISE_r({
7282   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7283      (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7284  });
7285  r->minlenret = minlen;
7286  if (r->minlen < minlen)
7287   r->minlen = minlen;
7288
7289  if (RExC_seen & REG_GPOS_SEEN)
7290   r->intflags |= PREGf_GPOS_SEEN;
7291  if (RExC_seen & REG_LOOKBEHIND_SEEN)
7292   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7293             lookbehind */
7294  if (pRExC_state->num_code_blocks)
7295   r->extflags |= RXf_EVAL_SEEN;
7296  if (RExC_seen & REG_CANY_SEEN)
7297   r->intflags |= PREGf_CANY_SEEN;
7298  if (RExC_seen & REG_VERBARG_SEEN)
7299  {
7300   r->intflags |= PREGf_VERBARG_SEEN;
7301   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7302  }
7303  if (RExC_seen & REG_CUTGROUP_SEEN)
7304   r->intflags |= PREGf_CUTGROUP_SEEN;
7305  if (pm_flags & PMf_USE_RE_EVAL)
7306   r->intflags |= PREGf_USE_RE_EVAL;
7307  if (RExC_paren_names)
7308   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7309  else
7310   RXp_PAREN_NAMES(r) = NULL;
7311
7312  /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7313  * so it can be used in pp.c */
7314  if (r->intflags & PREGf_ANCH)
7315   r->extflags |= RXf_IS_ANCHORED;
7316
7317
7318  {
7319   /* this is used to identify "special" patterns that might result
7320   * in Perl NOT calling the regex engine and instead doing the match "itself",
7321   * particularly special cases in split//. By having the regex compiler
7322   * do this pattern matching at a regop level (instead of by inspecting the pattern)
7323   * we avoid weird issues with equivalent patterns resulting in different behavior,
7324   * AND we allow non Perl engines to get the same optimizations by the setting the
7325   * flags appropriately - Yves */
7326   regnode *first = ri->program + 1;
7327   U8 fop = OP(first);
7328   regnode *next = regnext(first);
7329   U8 nop = OP(next);
7330
7331   if (PL_regkind[fop] == NOTHING && nop == END)
7332    r->extflags |= RXf_NULL;
7333   else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7334    /* when fop is SBOL first->flags will be true only when it was
7335    * produced by parsing /\A/, and not when parsing /^/. This is
7336    * very important for the split code as there we want to
7337    * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7338    * See rt #122761 for more details. -- Yves */
7339    r->extflags |= RXf_START_ONLY;
7340   else if (fop == PLUS
7341     && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7342     && nop == END)
7343    r->extflags |= RXf_WHITE;
7344   else if ( r->extflags & RXf_SPLIT
7345     && (fop == EXACT || fop == EXACTL)
7346     && STR_LEN(first) == 1
7347     && *(STRING(first)) == ' '
7348     && nop == END )
7349    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7350
7351  }
7352
7353  if (RExC_contains_locale) {
7354   RXp_EXTFLAGS(r) |= RXf_TAINTED;
7355  }
7356
7357 #ifdef DEBUGGING
7358  if (RExC_paren_names) {
7359   ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7360   ri->data->data[ri->name_list_idx]
7361         = (void*)SvREFCNT_inc(RExC_paren_name_list);
7362  } else
7363 #endif
7364   ri->name_list_idx = 0;
7365
7366  if (RExC_recurse_count) {
7367   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7368    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7369    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7370   }
7371  }
7372  Newxz(r->offs, RExC_npar, regexp_paren_pair);
7373  /* assume we don't need to swap parens around before we match */
7374  DEBUG_TEST_r({
7375   PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7376    (unsigned long)RExC_study_chunk_recursed_count);
7377  });
7378  DEBUG_DUMP_r({
7379   DEBUG_RExC_seen();
7380   PerlIO_printf(Perl_debug_log,"Final program:\n");
7381   regdump(r);
7382  });
7383 #ifdef RE_TRACK_PATTERN_OFFSETS
7384  DEBUG_OFFSETS_r(if (ri->u.offsets) {
7385   const STRLEN len = ri->u.offsets[0];
7386   STRLEN i;
7387   GET_RE_DEBUG_FLAGS_DECL;
7388   PerlIO_printf(Perl_debug_log,
7389      "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7390   for (i = 1; i <= len; i++) {
7391    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7392     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7393     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7394    }
7395   PerlIO_printf(Perl_debug_log, "\n");
7396  });
7397 #endif
7398
7399 #ifdef USE_ITHREADS
7400  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7401  * by setting the regexp SV to readonly-only instead. If the
7402  * pattern's been recompiled, the USEDness should remain. */
7403  if (old_re && SvREADONLY(old_re))
7404   SvREADONLY_on(rx);
7405 #endif
7406  return rx;
7407 }
7408
7409
7410 SV*
7411 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7412      const U32 flags)
7413 {
7414  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7415
7416  PERL_UNUSED_ARG(value);
7417
7418  if (flags & RXapif_FETCH) {
7419   return reg_named_buff_fetch(rx, key, flags);
7420  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7421   Perl_croak_no_modify();
7422   return NULL;
7423  } else if (flags & RXapif_EXISTS) {
7424   return reg_named_buff_exists(rx, key, flags)
7425    ? &PL_sv_yes
7426    : &PL_sv_no;
7427  } else if (flags & RXapif_REGNAMES) {
7428   return reg_named_buff_all(rx, flags);
7429  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7430   return reg_named_buff_scalar(rx, flags);
7431  } else {
7432   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7433   return NULL;
7434  }
7435 }
7436
7437 SV*
7438 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7439       const U32 flags)
7440 {
7441  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7442  PERL_UNUSED_ARG(lastkey);
7443
7444  if (flags & RXapif_FIRSTKEY)
7445   return reg_named_buff_firstkey(rx, flags);
7446  else if (flags & RXapif_NEXTKEY)
7447   return reg_named_buff_nextkey(rx, flags);
7448  else {
7449   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7450            (int)flags);
7451   return NULL;
7452  }
7453 }
7454
7455 SV*
7456 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7457       const U32 flags)
7458 {
7459  AV *retarray = NULL;
7460  SV *ret;
7461  struct regexp *const rx = ReANY(r);
7462
7463  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7464
7465  if (flags & RXapif_ALL)
7466   retarray=newAV();
7467
7468  if (rx && RXp_PAREN_NAMES(rx)) {
7469   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7470   if (he_str) {
7471    IV i;
7472    SV* sv_dat=HeVAL(he_str);
7473    I32 *nums=(I32*)SvPVX(sv_dat);
7474    for ( i=0; i<SvIVX(sv_dat); i++ ) {
7475     if ((I32)(rx->nparens) >= nums[i]
7476      && rx->offs[nums[i]].start != -1
7477      && rx->offs[nums[i]].end != -1)
7478     {
7479      ret = newSVpvs("");
7480      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7481      if (!retarray)
7482       return ret;
7483     } else {
7484      if (retarray)
7485       ret = newSVsv(&PL_sv_undef);
7486     }
7487     if (retarray)
7488      av_push(retarray, ret);
7489    }
7490    if (retarray)
7491     return newRV_noinc(MUTABLE_SV(retarray));
7492   }
7493  }
7494  return NULL;
7495 }
7496
7497 bool
7498 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7499       const U32 flags)
7500 {
7501  struct regexp *const rx = ReANY(r);
7502
7503  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7504
7505  if (rx && RXp_PAREN_NAMES(rx)) {
7506   if (flags & RXapif_ALL) {
7507    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7508   } else {
7509    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7510    if (sv) {
7511     SvREFCNT_dec_NN(sv);
7512     return TRUE;
7513    } else {
7514     return FALSE;
7515    }
7516   }
7517  } else {
7518   return FALSE;
7519  }
7520 }
7521
7522 SV*
7523 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7524 {
7525  struct regexp *const rx = ReANY(r);
7526
7527  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7528
7529  if ( rx && RXp_PAREN_NAMES(rx) ) {
7530   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7531
7532   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7533  } else {
7534   return FALSE;
7535  }
7536 }
7537
7538 SV*
7539 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7540 {
7541  struct regexp *const rx = ReANY(r);
7542  GET_RE_DEBUG_FLAGS_DECL;
7543
7544  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7545
7546  if (rx && RXp_PAREN_NAMES(rx)) {
7547   HV *hv = RXp_PAREN_NAMES(rx);
7548   HE *temphe;
7549   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7550    IV i;
7551    IV parno = 0;
7552    SV* sv_dat = HeVAL(temphe);
7553    I32 *nums = (I32*)SvPVX(sv_dat);
7554    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7555     if ((I32)(rx->lastparen) >= nums[i] &&
7556      rx->offs[nums[i]].start != -1 &&
7557      rx->offs[nums[i]].end != -1)
7558     {
7559      parno = nums[i];
7560      break;
7561     }
7562    }
7563    if (parno || flags & RXapif_ALL) {
7564     return newSVhek(HeKEY_hek(temphe));
7565    }
7566   }
7567  }
7568  return NULL;
7569 }
7570
7571 SV*
7572 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7573 {
7574  SV *ret;
7575  AV *av;
7576  SSize_t length;
7577  struct regexp *const rx = ReANY(r);
7578
7579  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7580
7581  if (rx && RXp_PAREN_NAMES(rx)) {
7582   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7583    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7584   } else if (flags & RXapif_ONE) {
7585    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7586    av = MUTABLE_AV(SvRV(ret));
7587    length = av_tindex(av);
7588    SvREFCNT_dec_NN(ret);
7589    return newSViv(length + 1);
7590   } else {
7591    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7592             (int)flags);
7593    return NULL;
7594   }
7595  }
7596  return &PL_sv_undef;
7597 }
7598
7599 SV*
7600 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7601 {
7602  struct regexp *const rx = ReANY(r);
7603  AV *av = newAV();
7604
7605  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7606
7607  if (rx && RXp_PAREN_NAMES(rx)) {
7608   HV *hv= RXp_PAREN_NAMES(rx);
7609   HE *temphe;
7610   (void)hv_iterinit(hv);
7611   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7612    IV i;
7613    IV parno = 0;
7614    SV* sv_dat = HeVAL(temphe);
7615    I32 *nums = (I32*)SvPVX(sv_dat);
7616    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7617     if ((I32)(rx->lastparen) >= nums[i] &&
7618      rx->offs[nums[i]].start != -1 &&
7619      rx->offs[nums[i]].end != -1)
7620     {
7621      parno = nums[i];
7622      break;
7623     }
7624    }
7625    if (parno || flags & RXapif_ALL) {
7626     av_push(av, newSVhek(HeKEY_hek(temphe)));
7627    }
7628   }
7629  }
7630
7631  return newRV_noinc(MUTABLE_SV(av));
7632 }
7633
7634 void
7635 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7636        SV * const sv)
7637 {
7638  struct regexp *const rx = ReANY(r);
7639  char *s = NULL;
7640  SSize_t i = 0;
7641  SSize_t s1, t1;
7642  I32 n = paren;
7643
7644  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7645
7646  if (      n == RX_BUFF_IDX_CARET_PREMATCH
7647   || n == RX_BUFF_IDX_CARET_FULLMATCH
7648   || n == RX_BUFF_IDX_CARET_POSTMATCH
7649  )
7650  {
7651   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7652   if (!keepcopy) {
7653    /* on something like
7654    *    $r = qr/.../;
7655    *    /$qr/p;
7656    * the KEEPCOPY is set on the PMOP rather than the regex */
7657    if (PL_curpm && r == PM_GETRE(PL_curpm))
7658     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7659   }
7660   if (!keepcopy)
7661    goto ret_undef;
7662  }
7663
7664  if (!rx->subbeg)
7665   goto ret_undef;
7666
7667  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7668   /* no need to distinguish between them any more */
7669   n = RX_BUFF_IDX_FULLMATCH;
7670
7671  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7672   && rx->offs[0].start != -1)
7673  {
7674   /* $`, ${^PREMATCH} */
7675   i = rx->offs[0].start;
7676   s = rx->subbeg;
7677  }
7678  else
7679  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7680   && rx->offs[0].end != -1)
7681  {
7682   /* $', ${^POSTMATCH} */
7683   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7684   i = rx->sublen + rx->suboffset - rx->offs[0].end;
7685  }
7686  else
7687  if ( 0 <= n && n <= (I32)rx->nparens &&
7688   (s1 = rx->offs[n].start) != -1 &&
7689   (t1 = rx->offs[n].end) != -1)
7690  {
7691   /* $&, ${^MATCH},  $1 ... */
7692   i = t1 - s1;
7693   s = rx->subbeg + s1 - rx->suboffset;
7694  } else {
7695   goto ret_undef;
7696  }
7697
7698  assert(s >= rx->subbeg);
7699  assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7700  if (i >= 0) {
7701 #ifdef NO_TAINT_SUPPORT
7702   sv_setpvn(sv, s, i);
7703 #else
7704   const int oldtainted = TAINT_get;
7705   TAINT_NOT;
7706   sv_setpvn(sv, s, i);
7707   TAINT_set(oldtainted);
7708 #endif
7709   if ( (rx->intflags & PREGf_CANY_SEEN)
7710    ? (RXp_MATCH_UTF8(rx)
7711       && (!i || is_utf8_string((U8*)s, i)))
7712    : (RXp_MATCH_UTF8(rx)) )
7713   {
7714    SvUTF8_on(sv);
7715   }
7716   else
7717    SvUTF8_off(sv);
7718   if (TAINTING_get) {
7719    if (RXp_MATCH_TAINTED(rx)) {
7720     if (SvTYPE(sv) >= SVt_PVMG) {
7721      MAGIC* const mg = SvMAGIC(sv);
7722      MAGIC* mgt;
7723      TAINT;
7724      SvMAGIC_set(sv, mg->mg_moremagic);
7725      SvTAINT(sv);
7726      if ((mgt = SvMAGIC(sv))) {
7727       mg->mg_moremagic = mgt;
7728       SvMAGIC_set(sv, mg);
7729      }
7730     } else {
7731      TAINT;
7732      SvTAINT(sv);
7733     }
7734    } else
7735     SvTAINTED_off(sv);
7736   }
7737  } else {
7738  ret_undef:
7739   sv_setsv(sv,&PL_sv_undef);
7740   return;
7741  }
7742 }
7743
7744 void
7745 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7746               SV const * const value)
7747 {
7748  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7749
7750  PERL_UNUSED_ARG(rx);
7751  PERL_UNUSED_ARG(paren);
7752  PERL_UNUSED_ARG(value);
7753
7754  if (!PL_localizing)
7755   Perl_croak_no_modify();
7756 }
7757
7758 I32
7759 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7760        const I32 paren)
7761 {
7762  struct regexp *const rx = ReANY(r);
7763  I32 i;
7764  I32 s1, t1;
7765
7766  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7767
7768  if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7769   || paren == RX_BUFF_IDX_CARET_FULLMATCH
7770   || paren == RX_BUFF_IDX_CARET_POSTMATCH
7771  )
7772  {
7773   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7774   if (!keepcopy) {
7775    /* on something like
7776    *    $r = qr/.../;
7777    *    /$qr/p;
7778    * the KEEPCOPY is set on the PMOP rather than the regex */
7779    if (PL_curpm && r == PM_GETRE(PL_curpm))
7780     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7781   }
7782   if (!keepcopy)
7783    goto warn_undef;
7784  }
7785
7786  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7787  switch (paren) {
7788  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7789  case RX_BUFF_IDX_PREMATCH:       /* $` */
7790   if (rx->offs[0].start != -1) {
7791       i = rx->offs[0].start;
7792       if (i > 0) {
7793         s1 = 0;
7794         t1 = i;
7795         goto getlen;
7796       }
7797    }
7798   return 0;
7799
7800  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7801  case RX_BUFF_IDX_POSTMATCH:       /* $' */
7802    if (rx->offs[0].end != -1) {
7803       i = rx->sublen - rx->offs[0].end;
7804       if (i > 0) {
7805         s1 = rx->offs[0].end;
7806         t1 = rx->sublen;
7807         goto getlen;
7808       }
7809    }
7810   return 0;
7811
7812  default: /* $& / ${^MATCH}, $1, $2, ... */
7813    if (paren <= (I32)rx->nparens &&
7814    (s1 = rx->offs[paren].start) != -1 &&
7815    (t1 = rx->offs[paren].end) != -1)
7816    {
7817    i = t1 - s1;
7818    goto getlen;
7819   } else {
7820   warn_undef:
7821    if (ckWARN(WARN_UNINITIALIZED))
7822     report_uninit((const SV *)sv);
7823    return 0;
7824   }
7825  }
7826   getlen:
7827  if (i > 0 && RXp_MATCH_UTF8(rx)) {
7828   const char * const s = rx->subbeg - rx->suboffset + s1;
7829   const U8 *ep;
7830   STRLEN el;
7831
7832   i = t1 - s1;
7833   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7834       i = el;
7835  }
7836  return i;
7837 }
7838
7839 SV*
7840 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7841 {
7842  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7843   PERL_UNUSED_ARG(rx);
7844   if (0)
7845    return NULL;
7846   else
7847    return newSVpvs("Regexp");
7848 }
7849
7850 /* Scans the name of a named buffer from the pattern.
7851  * If flags is REG_RSN_RETURN_NULL returns null.
7852  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7853  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7854  * to the parsed name as looked up in the RExC_paren_names hash.
7855  * If there is an error throws a vFAIL().. type exception.
7856  */
7857
7858 #define REG_RSN_RETURN_NULL    0
7859 #define REG_RSN_RETURN_NAME    1
7860 #define REG_RSN_RETURN_DATA    2
7861
7862 STATIC SV*
7863 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7864 {
7865  char *name_start = RExC_parse;
7866
7867  PERL_ARGS_ASSERT_REG_SCAN_NAME;
7868
7869  assert (RExC_parse <= RExC_end);
7870  if (RExC_parse == RExC_end) NOOP;
7871  else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7872   /* skip IDFIRST by using do...while */
7873   if (UTF)
7874    do {
7875     RExC_parse += UTF8SKIP(RExC_parse);
7876    } while (isWORDCHAR_utf8((U8*)RExC_parse));
7877   else
7878    do {
7879     RExC_parse++;
7880    } while (isWORDCHAR(*RExC_parse));
7881  } else {
7882   RExC_parse++; /* so the <- from the vFAIL is after the offending
7883       character */
7884   vFAIL("Group name must start with a non-digit word character");
7885  }
7886  if ( flags ) {
7887   SV* sv_name
7888    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7889        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7890   if ( flags == REG_RSN_RETURN_NAME)
7891    return sv_name;
7892   else if (flags==REG_RSN_RETURN_DATA) {
7893    HE *he_str = NULL;
7894    SV *sv_dat = NULL;
7895    if ( ! sv_name )      /* should not happen*/
7896     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7897    if (RExC_paren_names)
7898     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7899    if ( he_str )
7900     sv_dat = HeVAL(he_str);
7901    if ( ! sv_dat )
7902     vFAIL("Reference to nonexistent named group");
7903    return sv_dat;
7904   }
7905   else {
7906    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7907      (unsigned long) flags);
7908   }
7909   NOT_REACHED; /* NOTREACHED */
7910  }
7911  return NULL;
7912 }
7913
7914 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7915  int num;                                                    \
7916  if (RExC_lastparse!=RExC_parse) {                           \
7917   PerlIO_printf(Perl_debug_log, "%s",                     \
7918    Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
7919     RExC_end - RExC_parse, 16,                      \
7920     "", "",                                         \
7921     PERL_PV_ESCAPE_UNI_DETECT |                     \
7922     PERL_PV_PRETTY_ELLIPSES   |                     \
7923     PERL_PV_PRETTY_LTGT       |                     \
7924     PERL_PV_ESCAPE_RE         |                     \
7925     PERL_PV_PRETTY_EXACTSIZE                        \
7926    )                                                   \
7927   );                                                      \
7928  } else                                                      \
7929   PerlIO_printf(Perl_debug_log,"%16s","");                \
7930                 \
7931  if (SIZE_ONLY)                                              \
7932  num = RExC_size + 1;                                     \
7933  else                                                        \
7934  num=REG_NODE_NUM(RExC_emit);                             \
7935  if (RExC_lastnum!=num)                                      \
7936  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7937  else                                                        \
7938  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7939  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7940   (int)((depth*2)), "",                                   \
7941   (funcname)                                              \
7942  );                                                          \
7943  RExC_lastnum=num;                                           \
7944  RExC_lastparse=RExC_parse;                                  \
7945 })
7946
7947
7948
7949 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7950  DEBUG_PARSE_MSG((funcname));                            \
7951  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7952 })
7953 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7954  DEBUG_PARSE_MSG((funcname));                            \
7955  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7956 })
7957
7958 /* This section of code defines the inversion list object and its methods.  The
7959  * interfaces are highly subject to change, so as much as possible is static to
7960  * this file.  An inversion list is here implemented as a malloc'd C UV array
7961  * as an SVt_INVLIST scalar.
7962  *
7963  * An inversion list for Unicode is an array of code points, sorted by ordinal
7964  * number.  The zeroth element is the first code point in the list.  The 1th
7965  * element is the first element beyond that not in the list.  In other words,
7966  * the first range is
7967  *  invlist[0]..(invlist[1]-1)
7968  * The other ranges follow.  Thus every element whose index is divisible by two
7969  * marks the beginning of a range that is in the list, and every element not
7970  * divisible by two marks the beginning of a range not in the list.  A single
7971  * element inversion list that contains the single code point N generally
7972  * consists of two elements
7973  *  invlist[0] == N
7974  *  invlist[1] == N+1
7975  * (The exception is when N is the highest representable value on the
7976  * machine, in which case the list containing just it would be a single
7977  * element, itself.  By extension, if the last range in the list extends to
7978  * infinity, then the first element of that range will be in the inversion list
7979  * at a position that is divisible by two, and is the final element in the
7980  * list.)
7981  * Taking the complement (inverting) an inversion list is quite simple, if the
7982  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7983  * This implementation reserves an element at the beginning of each inversion
7984  * list to always contain 0; there is an additional flag in the header which
7985  * indicates if the list begins at the 0, or is offset to begin at the next
7986  * element.
7987  *
7988  * More about inversion lists can be found in "Unicode Demystified"
7989  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7990  * More will be coming when functionality is added later.
7991  *
7992  * The inversion list data structure is currently implemented as an SV pointing
7993  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7994  * array of UV whose memory management is automatically handled by the existing
7995  * facilities for SV's.
7996  *
7997  * Some of the methods should always be private to the implementation, and some
7998  * should eventually be made public */
7999
8000 /* The header definitions are in F<inline_invlist.c> */
8001
8002 PERL_STATIC_INLINE UV*
8003 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8004 {
8005  /* Returns a pointer to the first element in the inversion list's array.
8006  * This is called upon initialization of an inversion list.  Where the
8007  * array begins depends on whether the list has the code point U+0000 in it
8008  * or not.  The other parameter tells it whether the code that follows this
8009  * call is about to put a 0 in the inversion list or not.  The first
8010  * element is either the element reserved for 0, if TRUE, or the element
8011  * after it, if FALSE */
8012
8013  bool* offset = get_invlist_offset_addr(invlist);
8014  UV* zero_addr = (UV *) SvPVX(invlist);
8015
8016  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8017
8018  /* Must be empty */
8019  assert(! _invlist_len(invlist));
8020
8021  *zero_addr = 0;
8022
8023  /* 1^1 = 0; 1^0 = 1 */
8024  *offset = 1 ^ will_have_0;
8025  return zero_addr + *offset;
8026 }
8027
8028 PERL_STATIC_INLINE void
8029 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8030 {
8031  /* Sets the current number of elements stored in the inversion list.
8032  * Updates SvCUR correspondingly */
8033  PERL_UNUSED_CONTEXT;
8034  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8035
8036  assert(SvTYPE(invlist) == SVt_INVLIST);
8037
8038  SvCUR_set(invlist,
8039    (len == 0)
8040    ? 0
8041    : TO_INTERNAL_SIZE(len + offset));
8042  assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8043 }
8044
8045 #ifndef PERL_IN_XSUB_RE
8046
8047 PERL_STATIC_INLINE IV*
8048 S_get_invlist_previous_index_addr(SV* invlist)
8049 {
8050  /* Return the address of the IV that is reserved to hold the cached index
8051  * */
8052  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8053
8054  assert(SvTYPE(invlist) == SVt_INVLIST);
8055
8056  return &(((XINVLIST*) SvANY(invlist))->prev_index);
8057 }
8058
8059 PERL_STATIC_INLINE IV
8060 S_invlist_previous_index(SV* const invlist)
8061 {
8062  /* Returns cached index of previous search */
8063
8064  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8065
8066  return *get_invlist_previous_index_addr(invlist);
8067 }
8068
8069 PERL_STATIC_INLINE void
8070 S_invlist_set_previous_index(SV* const invlist, const IV index)
8071 {
8072  /* Caches <index> for later retrieval */
8073
8074  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8075
8076  assert(index == 0 || index < (int) _invlist_len(invlist));
8077
8078  *get_invlist_previous_index_addr(invlist) = index;
8079 }
8080
8081 PERL_STATIC_INLINE void
8082 S_invlist_trim(SV* const invlist)
8083 {
8084  PERL_ARGS_ASSERT_INVLIST_TRIM;
8085
8086  assert(SvTYPE(invlist) == SVt_INVLIST);
8087
8088  /* Change the length of the inversion list to how many entries it currently
8089  * has */
8090  SvPV_shrink_to_cur((SV *) invlist);
8091 }
8092
8093 PERL_STATIC_INLINE bool
8094 S_invlist_is_iterating(SV* const invlist)
8095 {
8096  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8097
8098  return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8099 }
8100
8101 #endif /* ifndef PERL_IN_XSUB_RE */
8102
8103 PERL_STATIC_INLINE UV
8104 S_invlist_max(SV* const invlist)
8105 {
8106  /* Returns the maximum number of elements storable in the inversion list's
8107  * array, without having to realloc() */
8108
8109  PERL_ARGS_ASSERT_INVLIST_MAX;
8110
8111  assert(SvTYPE(invlist) == SVt_INVLIST);
8112
8113  /* Assumes worst case, in which the 0 element is not counted in the
8114  * inversion list, so subtracts 1 for that */
8115  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8116   ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8117   : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8118 }
8119
8120 #ifndef PERL_IN_XSUB_RE
8121 SV*
8122 Perl__new_invlist(pTHX_ IV initial_size)
8123 {
8124
8125  /* Return a pointer to a newly constructed inversion list, with enough
8126  * space to store 'initial_size' elements.  If that number is negative, a
8127  * system default is used instead */
8128
8129  SV* new_list;
8130
8131  if (initial_size < 0) {
8132   initial_size = 10;
8133  }
8134
8135  /* Allocate the initial space */
8136  new_list = newSV_type(SVt_INVLIST);
8137
8138  /* First 1 is in case the zero element isn't in the list; second 1 is for
8139  * trailing NUL */
8140  SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8141  invlist_set_len(new_list, 0, 0);
8142
8143  /* Force iterinit() to be used to get iteration to work */
8144  *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8145
8146  *get_invlist_previous_index_addr(new_list) = 0;
8147
8148  return new_list;
8149 }
8150
8151 SV*
8152 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8153 {
8154  /* Return a pointer to a newly constructed inversion list, initialized to
8155  * point to <list>, which has to be in the exact correct inversion list
8156  * form, including internal fields.  Thus this is a dangerous routine that
8157  * should not be used in the wrong hands.  The passed in 'list' contains
8158  * several header fields at the beginning that are not part of the
8159  * inversion list body proper */
8160
8161  const STRLEN length = (STRLEN) list[0];
8162  const UV version_id =          list[1];
8163  const bool offset   =    cBOOL(list[2]);
8164 #define HEADER_LENGTH 3
8165  /* If any of the above changes in any way, you must change HEADER_LENGTH
8166  * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8167  *      perl -E 'say int(rand 2**31-1)'
8168  */
8169 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8170           data structure type, so that one being
8171           passed in can be validated to be an
8172           inversion list of the correct vintage.
8173          */
8174
8175  SV* invlist = newSV_type(SVt_INVLIST);
8176
8177  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8178
8179  if (version_id != INVLIST_VERSION_ID) {
8180   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8181  }
8182
8183  /* The generated array passed in includes header elements that aren't part
8184  * of the list proper, so start it just after them */
8185  SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8186
8187  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8188        shouldn't touch it */
8189
8190  *(get_invlist_offset_addr(invlist)) = offset;
8191
8192  /* The 'length' passed to us is the physical number of elements in the
8193  * inversion list.  But if there is an offset the logical number is one
8194  * less than that */
8195  invlist_set_len(invlist, length  - offset, offset);
8196
8197  invlist_set_previous_index(invlist, 0);
8198
8199  /* Initialize the iteration pointer. */
8200  invlist_iterfinish(invlist);
8201
8202  SvREADONLY_on(invlist);
8203
8204  return invlist;
8205 }
8206 #endif /* ifndef PERL_IN_XSUB_RE */
8207
8208 STATIC void
8209 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8210 {
8211  /* Grow the maximum size of an inversion list */
8212
8213  PERL_ARGS_ASSERT_INVLIST_EXTEND;
8214
8215  assert(SvTYPE(invlist) == SVt_INVLIST);
8216
8217  /* Add one to account for the zero element at the beginning which may not
8218  * be counted by the calling parameters */
8219  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8220 }
8221
8222 STATIC void
8223 S__append_range_to_invlist(pTHX_ SV* const invlist,
8224         const UV start, const UV end)
8225 {
8226    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8227  * the end of the inversion list.  The range must be above any existing
8228  * ones. */
8229
8230  UV* array;
8231  UV max = invlist_max(invlist);
8232  UV len = _invlist_len(invlist);
8233  bool offset;
8234
8235  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8236
8237  if (len == 0) { /* Empty lists must be initialized */
8238   offset = start != 0;
8239   array = _invlist_array_init(invlist, ! offset);
8240  }
8241  else {
8242   /* Here, the existing list is non-empty. The current max entry in the
8243   * list is generally the first value not in the set, except when the
8244   * set extends to the end of permissible values, in which case it is
8245   * the first entry in that final set, and so this call is an attempt to
8246   * append out-of-order */
8247
8248   UV final_element = len - 1;
8249   array = invlist_array(invlist);
8250   if (array[final_element] > start
8251    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8252   {
8253    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8254      array[final_element], start,
8255      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8256   }
8257
8258   /* Here, it is a legal append.  If the new range begins with the first
8259   * value not in the set, it is extending the set, so the new first
8260   * value not in the set is one greater than the newly extended range.
8261   * */
8262   offset = *get_invlist_offset_addr(invlist);
8263   if (array[final_element] == start) {
8264    if (end != UV_MAX) {
8265     array[final_element] = end + 1;
8266    }
8267    else {
8268     /* But if the end is the maximum representable on the machine,
8269     * just let the range that this would extend to have no end */
8270     invlist_set_len(invlist, len - 1, offset);
8271    }
8272    return;
8273   }
8274  }
8275
8276  /* Here the new range doesn't extend any existing set.  Add it */
8277
8278  len += 2; /* Includes an element each for the start and end of range */
8279
8280  /* If wll overflow the existing space, extend, which may cause the array to
8281  * be moved */
8282  if (max < len) {
8283   invlist_extend(invlist, len);
8284
8285   /* Have to set len here to avoid assert failure in invlist_array() */
8286   invlist_set_len(invlist, len, offset);
8287
8288   array = invlist_array(invlist);
8289  }
8290  else {
8291   invlist_set_len(invlist, len, offset);
8292  }
8293
8294  /* The next item on the list starts the range, the one after that is
8295  * one past the new range.  */
8296  array[len - 2] = start;
8297  if (end != UV_MAX) {
8298   array[len - 1] = end + 1;
8299  }
8300  else {
8301   /* But if the end is the maximum representable on the machine, just let
8302   * the range have no end */
8303   invlist_set_len(invlist, len - 1, offset);
8304  }
8305 }
8306
8307 #ifndef PERL_IN_XSUB_RE
8308
8309 IV
8310 Perl__invlist_search(SV* const invlist, const UV cp)
8311 {
8312  /* Searches the inversion list for the entry that contains the input code
8313  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8314  * return value is the index into the list's array of the range that
8315  * contains <cp> */
8316
8317  IV low = 0;
8318  IV mid;
8319  IV high = _invlist_len(invlist);
8320  const IV highest_element = high - 1;
8321  const UV* array;
8322
8323  PERL_ARGS_ASSERT__INVLIST_SEARCH;
8324
8325  /* If list is empty, return failure. */
8326  if (high == 0) {
8327   return -1;
8328  }
8329
8330  /* (We can't get the array unless we know the list is non-empty) */
8331  array = invlist_array(invlist);
8332
8333  mid = invlist_previous_index(invlist);
8334  assert(mid >=0 && mid <= highest_element);
8335
8336  /* <mid> contains the cache of the result of the previous call to this
8337  * function (0 the first time).  See if this call is for the same result,
8338  * or if it is for mid-1.  This is under the theory that calls to this
8339  * function will often be for related code points that are near each other.
8340  * And benchmarks show that caching gives better results.  We also test
8341  * here if the code point is within the bounds of the list.  These tests
8342  * replace others that would have had to be made anyway to make sure that
8343  * the array bounds were not exceeded, and these give us extra information
8344  * at the same time */
8345  if (cp >= array[mid]) {
8346   if (cp >= array[highest_element]) {
8347    return highest_element;
8348   }
8349
8350   /* Here, array[mid] <= cp < array[highest_element].  This means that
8351   * the final element is not the answer, so can exclude it; it also
8352   * means that <mid> is not the final element, so can refer to 'mid + 1'
8353   * safely */
8354   if (cp < array[mid + 1]) {
8355    return mid;
8356   }
8357   high--;
8358   low = mid + 1;
8359  }
8360  else { /* cp < aray[mid] */
8361   if (cp < array[0]) { /* Fail if outside the array */
8362    return -1;
8363   }
8364   high = mid;
8365   if (cp >= array[mid - 1]) {
8366    goto found_entry;
8367   }
8368  }
8369
8370  /* Binary search.  What we are looking for is <i> such that
8371  * array[i] <= cp < array[i+1]
8372  * The loop below converges on the i+1.  Note that there may not be an
8373  * (i+1)th element in the array, and things work nonetheless */
8374  while (low < high) {
8375   mid = (low + high) / 2;
8376   assert(mid <= highest_element);
8377   if (array[mid] <= cp) { /* cp >= array[mid] */
8378    low = mid + 1;
8379
8380    /* We could do this extra test to exit the loop early.
8381    if (cp < array[low]) {
8382     return mid;
8383    }
8384    */
8385   }
8386   else { /* cp < array[mid] */
8387    high = mid;
8388   }
8389  }
8390
8391   found_entry:
8392  high--;
8393  invlist_set_previous_index(invlist, high);
8394  return high;
8395 }
8396
8397 void
8398 Perl__invlist_populate_swatch(SV* const invlist,
8399        const UV start, const UV end, U8* swatch)
8400 {
8401  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8402  * but is used when the swash has an inversion list.  This makes this much
8403  * faster, as it uses a binary search instead of a linear one.  This is
8404  * intimately tied to that function, and perhaps should be in utf8.c,
8405  * except it is intimately tied to inversion lists as well.  It assumes
8406  * that <swatch> is all 0's on input */
8407
8408  UV current = start;
8409  const IV len = _invlist_len(invlist);
8410  IV i;
8411  const UV * array;
8412
8413  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8414
8415  if (len == 0) { /* Empty inversion list */
8416   return;
8417  }
8418
8419  array = invlist_array(invlist);
8420
8421  /* Find which element it is */
8422  i = _invlist_search(invlist, start);
8423
8424  /* We populate from <start> to <end> */
8425  while (current < end) {
8426   UV upper;
8427
8428   /* The inversion list gives the results for every possible code point
8429   * after the first one in the list.  Only those ranges whose index is
8430   * even are ones that the inversion list matches.  For the odd ones,
8431   * and if the initial code point is not in the list, we have to skip
8432   * forward to the next element */
8433   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8434    i++;
8435    if (i >= len) { /* Finished if beyond the end of the array */
8436     return;
8437    }
8438    current = array[i];
8439    if (current >= end) {   /* Finished if beyond the end of what we
8440          are populating */
8441     if (LIKELY(end < UV_MAX)) {
8442      return;
8443     }
8444
8445     /* We get here when the upper bound is the maximum
8446     * representable on the machine, and we are looking for just
8447     * that code point.  Have to special case it */
8448     i = len;
8449     goto join_end_of_list;
8450    }
8451   }
8452   assert(current >= start);
8453
8454   /* The current range ends one below the next one, except don't go past
8455   * <end> */
8456   i++;
8457   upper = (i < len && array[i] < end) ? array[i] : end;
8458
8459   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8460   * for each code point in it */
8461   for (; current < upper; current++) {
8462    const STRLEN offset = (STRLEN)(current - start);
8463    swatch[offset >> 3] |= 1 << (offset & 7);
8464   }
8465
8466  join_end_of_list:
8467
8468   /* Quit if at the end of the list */
8469   if (i >= len) {
8470
8471    /* But first, have to deal with the highest possible code point on
8472    * the platform.  The previous code assumes that <end> is one
8473    * beyond where we want to populate, but that is impossible at the
8474    * platform's infinity, so have to handle it specially */
8475    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8476    {
8477     const STRLEN offset = (STRLEN)(end - start);
8478     swatch[offset >> 3] |= 1 << (offset & 7);
8479    }
8480    return;
8481   }
8482
8483   /* Advance to the next range, which will be for code points not in the
8484   * inversion list */
8485   current = array[i];
8486  }
8487
8488  return;
8489 }
8490
8491 void
8492 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8493           const bool complement_b, SV** output)
8494 {
8495  /* Take the union of two inversion lists and point <output> to it.  *output
8496  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8497  * the reference count to that list will be decremented if not already a
8498  * temporary (mortal); otherwise *output will be made correspondingly
8499  * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8500  * second list is returned.  If <complement_b> is TRUE, the union is taken
8501  * of the complement (inversion) of <b> instead of b itself.
8502  *
8503  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8504  * Richard Gillam, published by Addison-Wesley, and explained at some
8505  * length there.  The preface says to incorporate its examples into your
8506  * code at your own risk.
8507  *
8508  * The algorithm is like a merge sort.
8509  *
8510  * XXX A potential performance improvement is to keep track as we go along
8511  * if only one of the inputs contributes to the result, meaning the other
8512  * is a subset of that one.  In that case, we can skip the final copy and
8513  * return the larger of the input lists, but then outside code might need
8514  * to keep track of whether to free the input list or not */
8515
8516  const UV* array_a;    /* a's array */
8517  const UV* array_b;
8518  UV len_a;     /* length of a's array */
8519  UV len_b;
8520
8521  SV* u;   /* the resulting union */
8522  UV* array_u;
8523  UV len_u;
8524
8525  UV i_a = 0;      /* current index into a's array */
8526  UV i_b = 0;
8527  UV i_u = 0;
8528
8529  /* running count, as explained in the algorithm source book; items are
8530  * stopped accumulating and are output when the count changes to/from 0.
8531  * The count is incremented when we start a range that's in the set, and
8532  * decremented when we start a range that's not in the set.  So its range
8533  * is 0 to 2.  Only when the count is zero is something not in the set.
8534  */
8535  UV count = 0;
8536
8537  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8538  assert(a != b);
8539
8540  /* If either one is empty, the union is the other one */
8541  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8542   bool make_temp = FALSE; /* Should we mortalize the result? */
8543
8544   if (*output == a) {
8545    if (a != NULL) {
8546     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8547      SvREFCNT_dec_NN(a);
8548     }
8549    }
8550   }
8551   if (*output != b) {
8552    *output = invlist_clone(b);
8553    if (complement_b) {
8554     _invlist_invert(*output);
8555    }
8556   } /* else *output already = b; */
8557
8558   if (make_temp) {
8559    sv_2mortal(*output);
8560   }
8561   return;
8562  }
8563  else if ((len_b = _invlist_len(b)) == 0) {
8564   bool make_temp = FALSE;
8565   if (*output == b) {
8566    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8567     SvREFCNT_dec_NN(b);
8568    }
8569   }
8570
8571   /* The complement of an empty list is a list that has everything in it,
8572   * so the union with <a> includes everything too */
8573   if (complement_b) {
8574    if (a == *output) {
8575     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8576      SvREFCNT_dec_NN(a);
8577     }
8578    }
8579    *output = _new_invlist(1);
8580    _append_range_to_invlist(*output, 0, UV_MAX);
8581   }
8582   else if (*output != a) {
8583    *output = invlist_clone(a);
8584   }
8585   /* else *output already = a; */
8586
8587   if (make_temp) {
8588    sv_2mortal(*output);
8589   }
8590   return;
8591  }
8592
8593  /* Here both lists exist and are non-empty */
8594  array_a = invlist_array(a);
8595  array_b = invlist_array(b);
8596
8597  /* If are to take the union of 'a' with the complement of b, set it
8598  * up so are looking at b's complement. */
8599  if (complement_b) {
8600
8601   /* To complement, we invert: if the first element is 0, remove it.  To
8602   * do this, we just pretend the array starts one later */
8603   if (array_b[0] == 0) {
8604    array_b++;
8605    len_b--;
8606   }
8607   else {
8608
8609    /* But if the first element is not zero, we pretend the list starts
8610    * at the 0 that is always stored immediately before the array. */
8611    array_b--;
8612    len_b++;
8613   }
8614  }
8615
8616  /* Size the union for the worst case: that the sets are completely
8617  * disjoint */
8618  u = _new_invlist(len_a + len_b);
8619
8620  /* Will contain U+0000 if either component does */
8621  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8622          || (len_b > 0 && array_b[0] == 0));
8623
8624  /* Go through each list item by item, stopping when exhausted one of
8625  * them */
8626  while (i_a < len_a && i_b < len_b) {
8627   UV cp;     /* The element to potentially add to the union's array */
8628   bool cp_in_set;   /* is it in the the input list's set or not */
8629
8630   /* We need to take one or the other of the two inputs for the union.
8631   * Since we are merging two sorted lists, we take the smaller of the
8632   * next items.  In case of a tie, we take the one that is in its set
8633   * first.  If we took one not in the set first, it would decrement the
8634   * count, possibly to 0 which would cause it to be output as ending the
8635   * range, and the next time through we would take the same number, and
8636   * output it again as beginning the next range.  By doing it the
8637   * opposite way, there is no possibility that the count will be
8638   * momentarily decremented to 0, and thus the two adjoining ranges will
8639   * be seamlessly merged.  (In a tie and both are in the set or both not
8640   * in the set, it doesn't matter which we take first.) */
8641   if (array_a[i_a] < array_b[i_b]
8642    || (array_a[i_a] == array_b[i_b]
8643     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8644   {
8645    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8646    cp= array_a[i_a++];
8647   }
8648   else {
8649    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8650    cp = array_b[i_b++];
8651   }
8652
8653   /* Here, have chosen which of the two inputs to look at.  Only output
8654   * if the running count changes to/from 0, which marks the
8655   * beginning/end of a range in that's in the set */
8656   if (cp_in_set) {
8657    if (count == 0) {
8658     array_u[i_u++] = cp;
8659    }
8660    count++;
8661   }
8662   else {
8663    count--;
8664    if (count == 0) {
8665     array_u[i_u++] = cp;
8666    }
8667   }
8668  }
8669
8670  /* Here, we are finished going through at least one of the lists, which
8671  * means there is something remaining in at most one.  We check if the list
8672  * that hasn't been exhausted is positioned such that we are in the middle
8673  * of a range in its set or not.  (i_a and i_b point to the element beyond
8674  * the one we care about.) If in the set, we decrement 'count'; if 0, there
8675  * is potentially more to output.
8676  * There are four cases:
8677  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
8678  *    in the union is entirely from the non-exhausted set.
8679  * 2) Both were in their sets, count is 2.  Nothing further should
8680  *    be output, as everything that remains will be in the exhausted
8681  *    list's set, hence in the union; decrementing to 1 but not 0 insures
8682  *    that
8683  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8684  *    Nothing further should be output because the union includes
8685  *    everything from the exhausted set.  Not decrementing ensures that.
8686  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8687  *    decrementing to 0 insures that we look at the remainder of the
8688  *    non-exhausted set */
8689  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8690   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8691  {
8692   count--;
8693  }
8694
8695  /* The final length is what we've output so far, plus what else is about to
8696  * be output.  (If 'count' is non-zero, then the input list we exhausted
8697  * has everything remaining up to the machine's limit in its set, and hence
8698  * in the union, so there will be no further output. */
8699  len_u = i_u;
8700  if (count == 0) {
8701   /* At most one of the subexpressions will be non-zero */
8702   len_u += (len_a - i_a) + (len_b - i_b);
8703  }
8704
8705  /* Set result to final length, which can change the pointer to array_u, so
8706  * re-find it */
8707  if (len_u != _invlist_len(u)) {
8708   invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8709   invlist_trim(u);
8710   array_u = invlist_array(u);
8711  }
8712
8713  /* When 'count' is 0, the list that was exhausted (if one was shorter than
8714  * the other) ended with everything above it not in its set.  That means
8715  * that the remaining part of the union is precisely the same as the
8716  * non-exhausted list, so can just copy it unchanged.  (If both list were
8717  * exhausted at the same time, then the operations below will be both 0.)
8718  */
8719  if (count == 0) {
8720   IV copy_count; /* At most one will have a non-zero copy count */
8721   if ((copy_count = len_a - i_a) > 0) {
8722    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8723   }
8724   else if ((copy_count = len_b - i_b) > 0) {
8725    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8726   }
8727  }
8728
8729  /*  We may be removing a reference to one of the inputs.  If so, the output
8730  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8731  *  count decremented) */
8732  if (a == *output || b == *output) {
8733   assert(! invlist_is_iterating(*output));
8734   if ((SvTEMP(*output))) {
8735    sv_2mortal(u);
8736   }
8737   else {
8738    SvREFCNT_dec_NN(*output);
8739   }
8740  }
8741
8742  *output = u;
8743
8744  return;
8745 }
8746
8747 void
8748 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8749            const bool complement_b, SV** i)
8750 {
8751  /* Take the intersection of two inversion lists and point <i> to it.  *i
8752  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8753  * the reference count to that list will be decremented if not already a
8754  * temporary (mortal); otherwise *i will be made correspondingly mortal.
8755  * The first list, <a>, may be NULL, in which case an empty list is
8756  * returned.  If <complement_b> is TRUE, the result will be the
8757  * intersection of <a> and the complement (or inversion) of <b> instead of
8758  * <b> directly.
8759  *
8760  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8761  * Richard Gillam, published by Addison-Wesley, and explained at some
8762  * length there.  The preface says to incorporate its examples into your
8763  * code at your own risk.  In fact, it had bugs
8764  *
8765  * The algorithm is like a merge sort, and is essentially the same as the
8766  * union above
8767  */
8768
8769  const UV* array_a;  /* a's array */
8770  const UV* array_b;
8771  UV len_a; /* length of a's array */
8772  UV len_b;
8773
8774  SV* r;       /* the resulting intersection */
8775  UV* array_r;
8776  UV len_r;
8777
8778  UV i_a = 0;      /* current index into a's array */
8779  UV i_b = 0;
8780  UV i_r = 0;
8781
8782  /* running count, as explained in the algorithm source book; items are
8783  * stopped accumulating and are output when the count changes to/from 2.
8784  * The count is incremented when we start a range that's in the set, and
8785  * decremented when we start a range that's not in the set.  So its range
8786  * is 0 to 2.  Only when the count is 2 is something in the intersection.
8787  */
8788  UV count = 0;
8789
8790  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8791  assert(a != b);
8792
8793  /* Special case if either one is empty */
8794  len_a = (a == NULL) ? 0 : _invlist_len(a);
8795  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8796   bool make_temp = FALSE;
8797
8798   if (len_a != 0 && complement_b) {
8799
8800    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8801    * be empty.  Here, also we are using 'b's complement, which hence
8802    * must be every possible code point.  Thus the intersection is
8803    * simply 'a'. */
8804    if (*i != a) {
8805     if (*i == b) {
8806      if (! (make_temp = cBOOL(SvTEMP(b)))) {
8807       SvREFCNT_dec_NN(b);
8808      }
8809     }
8810
8811     *i = invlist_clone(a);
8812    }
8813    /* else *i is already 'a' */
8814
8815    if (make_temp) {
8816     sv_2mortal(*i);
8817    }
8818    return;
8819   }
8820
8821   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8822   * intersection must be empty */
8823   if (*i == a) {
8824    if (! (make_temp = cBOOL(SvTEMP(a)))) {
8825     SvREFCNT_dec_NN(a);
8826    }
8827   }
8828   else if (*i == b) {
8829    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8830     SvREFCNT_dec_NN(b);
8831    }
8832   }
8833   *i = _new_invlist(0);
8834   if (make_temp) {
8835    sv_2mortal(*i);
8836   }
8837
8838   return;
8839  }
8840
8841  /* Here both lists exist and are non-empty */
8842  array_a = invlist_array(a);
8843  array_b = invlist_array(b);
8844
8845  /* If are to take the intersection of 'a' with the complement of b, set it
8846  * up so are looking at b's complement. */
8847  if (complement_b) {
8848
8849   /* To complement, we invert: if the first element is 0, remove it.  To
8850   * do this, we just pretend the array starts one later */
8851   if (array_b[0] == 0) {
8852    array_b++;
8853    len_b--;
8854   }
8855   else {
8856
8857    /* But if the first element is not zero, we pretend the list starts
8858    * at the 0 that is always stored immediately before the array. */
8859    array_b--;
8860    len_b++;
8861   }
8862  }
8863
8864  /* Size the intersection for the worst case: that the intersection ends up
8865  * fragmenting everything to be completely disjoint */
8866  r= _new_invlist(len_a + len_b);
8867
8868  /* Will contain U+0000 iff both components do */
8869  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8870          && len_b > 0 && array_b[0] == 0);
8871
8872  /* Go through each list item by item, stopping when exhausted one of
8873  * them */
8874  while (i_a < len_a && i_b < len_b) {
8875   UV cp;     /* The element to potentially add to the intersection's
8876      array */
8877   bool cp_in_set; /* Is it in the input list's set or not */
8878
8879   /* We need to take one or the other of the two inputs for the
8880   * intersection.  Since we are merging two sorted lists, we take the
8881   * smaller of the next items.  In case of a tie, we take the one that
8882   * is not in its set first (a difference from the union algorithm).  If
8883   * we took one in the set first, it would increment the count, possibly
8884   * to 2 which would cause it to be output as starting a range in the
8885   * intersection, and the next time through we would take that same
8886   * number, and output it again as ending the set.  By doing it the
8887   * opposite of this, there is no possibility that the count will be
8888   * momentarily incremented to 2.  (In a tie and both are in the set or
8889   * both not in the set, it doesn't matter which we take first.) */
8890   if (array_a[i_a] < array_b[i_b]
8891    || (array_a[i_a] == array_b[i_b]
8892     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8893   {
8894    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8895    cp= array_a[i_a++];
8896   }
8897   else {
8898    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8899    cp= array_b[i_b++];
8900   }
8901
8902   /* Here, have chosen which of the two inputs to look at.  Only output
8903   * if the running count changes to/from 2, which marks the
8904   * beginning/end of a range that's in the intersection */
8905   if (cp_in_set) {
8906    count++;
8907    if (count == 2) {
8908     array_r[i_r++] = cp;
8909    }
8910   }
8911   else {
8912    if (count == 2) {
8913     array_r[i_r++] = cp;
8914    }
8915    count--;
8916   }
8917  }
8918
8919  /* Here, we are finished going through at least one of the lists, which
8920  * means there is something remaining in at most one.  We check if the list
8921  * that has been exhausted is positioned such that we are in the middle
8922  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8923  * the ones we care about.)  There are four cases:
8924  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
8925  *    nothing left in the intersection.
8926  * 2) Both were in their sets, count is 2 and perhaps is incremented to
8927  *    above 2.  What should be output is exactly that which is in the
8928  *    non-exhausted set, as everything it has is also in the intersection
8929  *    set, and everything it doesn't have can't be in the intersection
8930  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8931  *    gets incremented to 2.  Like the previous case, the intersection is
8932  *    everything that remains in the non-exhausted set.
8933  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8934  *    remains 1.  And the intersection has nothing more. */
8935  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8936   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8937  {
8938   count++;
8939  }
8940
8941  /* The final length is what we've output so far plus what else is in the
8942  * intersection.  At most one of the subexpressions below will be non-zero
8943  * */
8944  len_r = i_r;
8945  if (count >= 2) {
8946   len_r += (len_a - i_a) + (len_b - i_b);
8947  }
8948
8949  /* Set result to final length, which can change the pointer to array_r, so
8950  * re-find it */
8951  if (len_r != _invlist_len(r)) {
8952   invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8953   invlist_trim(r);
8954   array_r = invlist_array(r);
8955  }
8956
8957  /* Finish outputting any remaining */
8958  if (count >= 2) { /* At most one will have a non-zero copy count */
8959   IV copy_count;
8960   if ((copy_count = len_a - i_a) > 0) {
8961    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8962   }
8963   else if ((copy_count = len_b - i_b) > 0) {
8964    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8965   }
8966  }
8967
8968  /*  We may be removing a reference to one of the inputs.  If so, the output
8969  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8970  *  count decremented) */
8971  if (a == *i || b == *i) {
8972   assert(! invlist_is_iterating(*i));
8973   if (SvTEMP(*i)) {
8974    sv_2mortal(r);
8975   }
8976   else {
8977    SvREFCNT_dec_NN(*i);
8978   }
8979  }
8980
8981  *i = r;
8982
8983  return;
8984 }
8985
8986 SV*
8987 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8988 {
8989  /* Add the range from 'start' to 'end' inclusive to the inversion list's
8990  * set.  A pointer to the inversion list is returned.  This may actually be
8991  * a new list, in which case the passed in one has been destroyed.  The
8992  * passed-in inversion list can be NULL, in which case a new one is created
8993  * with just the one range in it */
8994
8995  SV* range_invlist;
8996  UV len;
8997
8998  if (invlist == NULL) {
8999   invlist = _new_invlist(2);
9000   len = 0;
9001  }
9002  else {
9003   len = _invlist_len(invlist);
9004  }
9005
9006  /* If comes after the final entry actually in the list, can just append it
9007  * to the end, */
9008  if (len == 0
9009   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9010    && start >= invlist_array(invlist)[len - 1]))
9011  {
9012   _append_range_to_invlist(invlist, start, end);
9013   return invlist;
9014  }
9015
9016  /* Here, can't just append things, create and return a new inversion list
9017  * which is the union of this range and the existing inversion list */
9018  range_invlist = _new_invlist(2);
9019  _append_range_to_invlist(range_invlist, start, end);
9020
9021  _invlist_union(invlist, range_invlist, &invlist);
9022
9023  /* The temporary can be freed */
9024  SvREFCNT_dec_NN(range_invlist);
9025
9026  return invlist;
9027 }
9028
9029 SV*
9030 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9031         UV** other_elements_ptr)
9032 {
9033  /* Create and return an inversion list whose contents are to be populated
9034  * by the caller.  The caller gives the number of elements (in 'size') and
9035  * the very first element ('element0').  This function will set
9036  * '*other_elements_ptr' to an array of UVs, where the remaining elements
9037  * are to be placed.
9038  *
9039  * Obviously there is some trust involved that the caller will properly
9040  * fill in the other elements of the array.
9041  *
9042  * (The first element needs to be passed in, as the underlying code does
9043  * things differently depending on whether it is zero or non-zero) */
9044
9045  SV* invlist = _new_invlist(size);
9046  bool offset;
9047
9048  PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9049
9050  _append_range_to_invlist(invlist, element0, element0);
9051  offset = *get_invlist_offset_addr(invlist);
9052
9053  invlist_set_len(invlist, size, offset);
9054  *other_elements_ptr = invlist_array(invlist) + 1;
9055  return invlist;
9056 }
9057
9058 #endif
9059
9060 PERL_STATIC_INLINE SV*
9061 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9062  return _add_range_to_invlist(invlist, cp, cp);
9063 }
9064
9065 #ifndef PERL_IN_XSUB_RE
9066 void
9067 Perl__invlist_invert(pTHX_ SV* const invlist)
9068 {
9069  /* Complement the input inversion list.  This adds a 0 if the list didn't
9070  * have a zero; removes it otherwise.  As described above, the data
9071  * structure is set up so that this is very efficient */
9072
9073  PERL_ARGS_ASSERT__INVLIST_INVERT;
9074
9075  assert(! invlist_is_iterating(invlist));
9076
9077  /* The inverse of matching nothing is matching everything */
9078  if (_invlist_len(invlist) == 0) {
9079   _append_range_to_invlist(invlist, 0, UV_MAX);
9080   return;
9081  }
9082
9083  *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9084 }
9085
9086 #endif
9087
9088 PERL_STATIC_INLINE SV*
9089 S_invlist_clone(pTHX_ SV* const invlist)
9090 {
9091
9092  /* Return a new inversion list that is a copy of the input one, which is
9093  * unchanged.  The new list will not be mortal even if the old one was. */
9094
9095  /* Need to allocate extra space to accommodate Perl's addition of a
9096  * trailing NUL to SvPV's, since it thinks they are always strings */
9097  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9098  STRLEN physical_length = SvCUR(invlist);
9099  bool offset = *(get_invlist_offset_addr(invlist));
9100
9101  PERL_ARGS_ASSERT_INVLIST_CLONE;
9102
9103  *(get_invlist_offset_addr(new_invlist)) = offset;
9104  invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9105  Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9106
9107  return new_invlist;
9108 }
9109
9110 PERL_STATIC_INLINE STRLEN*
9111 S_get_invlist_iter_addr(SV* invlist)
9112 {
9113  /* Return the address of the UV that contains the current iteration
9114  * position */
9115
9116  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9117
9118  assert(SvTYPE(invlist) == SVt_INVLIST);
9119
9120  return &(((XINVLIST*) SvANY(invlist))->iterator);
9121 }
9122
9123 PERL_STATIC_INLINE void
9124 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9125 {
9126  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9127
9128  *get_invlist_iter_addr(invlist) = 0;
9129 }
9130
9131 PERL_STATIC_INLINE void
9132 S_invlist_iterfinish(SV* invlist)
9133 {
9134  /* Terminate iterator for invlist.  This is to catch development errors.
9135  * Any iteration that is interrupted before completed should call this
9136  * function.  Functions that add code points anywhere else but to the end
9137  * of an inversion list assert that they are not in the middle of an
9138  * iteration.  If they were, the addition would make the iteration
9139  * problematical: if the iteration hadn't reached the place where things
9140  * were being added, it would be ok */
9141
9142  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9143
9144  *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9145 }
9146
9147 STATIC bool
9148 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9149 {
9150  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9151  * This call sets in <*start> and <*end>, the next range in <invlist>.
9152  * Returns <TRUE> if successful and the next call will return the next
9153  * range; <FALSE> if was already at the end of the list.  If the latter,
9154  * <*start> and <*end> are unchanged, and the next call to this function
9155  * will start over at the beginning of the list */
9156
9157  STRLEN* pos = get_invlist_iter_addr(invlist);
9158  UV len = _invlist_len(invlist);
9159  UV *array;
9160
9161  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9162
9163  if (*pos >= len) {
9164   *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9165   return FALSE;
9166  }
9167
9168  array = invlist_array(invlist);
9169
9170  *start = array[(*pos)++];
9171
9172  if (*pos >= len) {
9173   *end = UV_MAX;
9174  }
9175  else {
9176   *end = array[(*pos)++] - 1;
9177  }
9178
9179  return TRUE;
9180 }
9181
9182 PERL_STATIC_INLINE UV
9183 S_invlist_highest(SV* const invlist)
9184 {
9185  /* Returns the highest code point that matches an inversion list.  This API
9186  * has an ambiguity, as it returns 0 under either the highest is actually
9187  * 0, or if the list is empty.  If this distinction matters to you, check
9188  * for emptiness before calling this function */
9189
9190  UV len = _invlist_len(invlist);
9191  UV *array;
9192
9193  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9194
9195  if (len == 0) {
9196   return 0;
9197  }
9198
9199  array = invlist_array(invlist);
9200
9201  /* The last element in the array in the inversion list always starts a
9202  * range that goes to infinity.  That range may be for code points that are
9203  * matched in the inversion list, or it may be for ones that aren't
9204  * matched.  In the latter case, the highest code point in the set is one
9205  * less than the beginning of this range; otherwise it is the final element
9206  * of this range: infinity */
9207  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9208   ? UV_MAX
9209   : array[len - 1] - 1;
9210 }
9211
9212 #ifndef PERL_IN_XSUB_RE
9213 SV *
9214 Perl__invlist_contents(pTHX_ SV* const invlist)
9215 {
9216  /* Get the contents of an inversion list into a string SV so that they can
9217  * be printed out.  It uses the format traditionally done for debug tracing
9218  */
9219
9220  UV start, end;
9221  SV* output = newSVpvs("\n");
9222
9223  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9224
9225  assert(! invlist_is_iterating(invlist));
9226
9227  invlist_iterinit(invlist);
9228  while (invlist_iternext(invlist, &start, &end)) {
9229   if (end == UV_MAX) {
9230    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9231   }
9232   else if (end != start) {
9233    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9234      start,       end);
9235   }
9236   else {
9237    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9238   }
9239  }
9240
9241  return output;
9242 }
9243 #endif
9244
9245 #ifndef PERL_IN_XSUB_RE
9246 void
9247 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9248       const char * const indent, SV* const invlist)
9249 {
9250  /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9251  * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9252  * the string 'indent'.  The output looks like this:
9253   [0] 0x000A .. 0x000D
9254   [2] 0x0085
9255   [4] 0x2028 .. 0x2029
9256   [6] 0x3104 .. INFINITY
9257  * This means that the first range of code points matched by the list are
9258  * 0xA through 0xD; the second range contains only the single code point
9259  * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9260  * are used to define each range (except if the final range extends to
9261  * infinity, only a single element is needed).  The array index of the
9262  * first element for the corresponding range is given in brackets. */
9263
9264  UV start, end;
9265  STRLEN count = 0;
9266
9267  PERL_ARGS_ASSERT__INVLIST_DUMP;
9268
9269  if (invlist_is_iterating(invlist)) {
9270   Perl_dump_indent(aTHX_ level, file,
9271    "%sCan't dump inversion list because is in middle of iterating\n",
9272    indent);
9273   return;
9274  }
9275
9276  invlist_iterinit(invlist);
9277  while (invlist_iternext(invlist, &start, &end)) {
9278   if (end == UV_MAX) {
9279    Perl_dump_indent(aTHX_ level, file,
9280          "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9281         indent, (UV)count, start);
9282   }
9283   else if (end != start) {
9284    Perl_dump_indent(aTHX_ level, file,
9285          "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9286         indent, (UV)count, start,         end);
9287   }
9288   else {
9289    Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9290            indent, (UV)count, start);
9291   }
9292   count += 2;
9293  }
9294 }
9295
9296 void
9297 Perl__load_PL_utf8_foldclosures (pTHX)
9298 {
9299  assert(! PL_utf8_foldclosures);
9300
9301  /* If the folds haven't been read in, call a fold function
9302  * to force that */
9303  if (! PL_utf8_tofold) {
9304   U8 dummy[UTF8_MAXBYTES_CASE+1];
9305
9306   /* This string is just a short named one above \xff */
9307   to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9308   assert(PL_utf8_tofold); /* Verify that worked */
9309  }
9310  PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9311 }
9312 #endif
9313
9314 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9315 bool
9316 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9317 {
9318  /* Return a boolean as to if the two passed in inversion lists are
9319  * identical.  The final argument, if TRUE, says to take the complement of
9320  * the second inversion list before doing the comparison */
9321
9322  const UV* array_a = invlist_array(a);
9323  const UV* array_b = invlist_array(b);
9324  UV len_a = _invlist_len(a);
9325  UV len_b = _invlist_len(b);
9326
9327  UV i = 0;      /* current index into the arrays */
9328  bool retval = TRUE;     /* Assume are identical until proven otherwise */
9329
9330  PERL_ARGS_ASSERT__INVLISTEQ;
9331
9332  /* If are to compare 'a' with the complement of b, set it
9333  * up so are looking at b's complement. */
9334  if (complement_b) {
9335
9336   /* The complement of nothing is everything, so <a> would have to have
9337   * just one element, starting at zero (ending at infinity) */
9338   if (len_b == 0) {
9339    return (len_a == 1 && array_a[0] == 0);
9340   }
9341   else if (array_b[0] == 0) {
9342
9343    /* Otherwise, to complement, we invert.  Here, the first element is
9344    * 0, just remove it.  To do this, we just pretend the array starts
9345    * one later */
9346
9347    array_b++;
9348    len_b--;
9349   }
9350   else {
9351
9352    /* But if the first element is not zero, we pretend the list starts
9353    * at the 0 that is always stored immediately before the array. */
9354    array_b--;
9355    len_b++;
9356   }
9357  }
9358
9359  /* Make sure that the lengths are the same, as well as the final element
9360  * before looping through the remainder.  (Thus we test the length, final,
9361  * and first elements right off the bat) */
9362  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9363   retval = FALSE;
9364  }
9365  else for (i = 0; i < len_a - 1; i++) {
9366   if (array_a[i] != array_b[i]) {
9367    retval = FALSE;
9368    break;
9369   }
9370  }
9371
9372  return retval;
9373 }
9374 #endif
9375
9376 /*
9377  * As best we can, determine the characters that can match the start of
9378  * the given EXACTF-ish node.
9379  *
9380  * Returns the invlist as a new SV*; it is the caller's responsibility to
9381  * call SvREFCNT_dec() when done with it.
9382  */
9383 STATIC SV*
9384 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9385 {
9386  const U8 * s = (U8*)STRING(node);
9387  SSize_t bytelen = STR_LEN(node);
9388  UV uc;
9389  /* Start out big enough for 2 separate code points */
9390  SV* invlist = _new_invlist(4);
9391
9392  PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9393
9394  if (! UTF) {
9395   uc = *s;
9396
9397   /* We punt and assume can match anything if the node begins
9398   * with a multi-character fold.  Things are complicated.  For
9399   * example, /ffi/i could match any of:
9400   *  "\N{LATIN SMALL LIGATURE FFI}"
9401   *  "\N{LATIN SMALL LIGATURE FF}I"
9402   *  "F\N{LATIN SMALL LIGATURE FI}"
9403   *  plus several other things; and making sure we have all the
9404   *  possibilities is hard. */
9405   if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9406    invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9407   }
9408   else {
9409    /* Any Latin1 range character can potentially match any
9410    * other depending on the locale */
9411    if (OP(node) == EXACTFL) {
9412     _invlist_union(invlist, PL_Latin1, &invlist);
9413    }
9414    else {
9415     /* But otherwise, it matches at least itself.  We can
9416     * quickly tell if it has a distinct fold, and if so,
9417     * it matches that as well */
9418     invlist = add_cp_to_invlist(invlist, uc);
9419     if (IS_IN_SOME_FOLD_L1(uc))
9420      invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9421    }
9422
9423    /* Some characters match above-Latin1 ones under /i.  This
9424    * is true of EXACTFL ones when the locale is UTF-8 */
9425    if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9426     && (! isASCII(uc) || (OP(node) != EXACTFA
9427          && OP(node) != EXACTFA_NO_TRIE)))
9428    {
9429     add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9430    }
9431   }
9432  }
9433  else {  /* Pattern is UTF-8 */
9434   U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9435   STRLEN foldlen = UTF8SKIP(s);
9436   const U8* e = s + bytelen;
9437   SV** listp;
9438
9439   uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9440
9441   /* The only code points that aren't folded in a UTF EXACTFish
9442   * node are are the problematic ones in EXACTFL nodes */
9443   if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9444    /* We need to check for the possibility that this EXACTFL
9445    * node begins with a multi-char fold.  Therefore we fold
9446    * the first few characters of it so that we can make that
9447    * check */
9448    U8 *d = folded;
9449    int i;
9450
9451    for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9452     if (isASCII(*s)) {
9453      *(d++) = (U8) toFOLD(*s);
9454      s++;
9455     }
9456     else {
9457      STRLEN len;
9458      to_utf8_fold(s, d, &len);
9459      d += len;
9460      s += UTF8SKIP(s);
9461     }
9462    }
9463
9464    /* And set up so the code below that looks in this folded
9465    * buffer instead of the node's string */
9466    e = d;
9467    foldlen = UTF8SKIP(folded);
9468    s = folded;
9469   }
9470
9471   /* When we reach here 's' points to the fold of the first
9472   * character(s) of the node; and 'e' points to far enough along
9473   * the folded string to be just past any possible multi-char
9474   * fold. 'foldlen' is the length in bytes of the first
9475   * character in 's'
9476   *
9477   * Unlike the non-UTF-8 case, the macro for determining if a
9478   * string is a multi-char fold requires all the characters to
9479   * already be folded.  This is because of all the complications
9480   * if not.  Note that they are folded anyway, except in EXACTFL
9481   * nodes.  Like the non-UTF case above, we punt if the node
9482   * begins with a multi-char fold  */
9483
9484   if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
9485    invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9486   }
9487   else {  /* Single char fold */
9488
9489    /* It matches all the things that fold to it, which are
9490    * found in PL_utf8_foldclosures (including itself) */
9491    invlist = add_cp_to_invlist(invlist, uc);
9492    if (! PL_utf8_foldclosures)
9493     _load_PL_utf8_foldclosures();
9494    if ((listp = hv_fetch(PL_utf8_foldclosures,
9495         (char *) s, foldlen, FALSE)))
9496    {
9497     AV* list = (AV*) *listp;
9498     IV k;
9499     for (k = 0; k <= av_tindex(list); k++) {
9500      SV** c_p = av_fetch(list, k, FALSE);
9501      UV c;
9502      assert(c_p);
9503
9504      c = SvUV(*c_p);
9505
9506      /* /aa doesn't allow folds between ASCII and non- */
9507      if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
9508       && isASCII(c) != isASCII(uc))
9509      {
9510       continue;
9511      }
9512
9513      invlist = add_cp_to_invlist(invlist, c);
9514     }
9515    }
9516   }
9517  }
9518
9519  return invlist;
9520 }
9521
9522 #undef HEADER_LENGTH
9523 #undef TO_INTERNAL_SIZE
9524 #undef FROM_INTERNAL_SIZE
9525 #undef INVLIST_VERSION_ID
9526
9527 /* End of inversion list object */
9528
9529 STATIC void
9530 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9531 {
9532  /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9533  * constructs, and updates RExC_flags with them.  On input, RExC_parse
9534  * should point to the first flag; it is updated on output to point to the
9535  * final ')' or ':'.  There needs to be at least one flag, or this will
9536  * abort */
9537
9538  /* for (?g), (?gc), and (?o) warnings; warning
9539  about (?c) will warn about (?g) -- japhy    */
9540
9541 #define WASTED_O  0x01
9542 #define WASTED_G  0x02
9543 #define WASTED_C  0x04
9544 #define WASTED_GC (WASTED_G|WASTED_C)
9545  I32 wastedflags = 0x00;
9546  U32 posflags = 0, negflags = 0;
9547  U32 *flagsp = &posflags;
9548  char has_charset_modifier = '\0';
9549  regex_charset cs;
9550  bool has_use_defaults = FALSE;
9551  const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9552  int x_mod_count = 0;
9553
9554  PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9555
9556  /* '^' as an initial flag sets certain defaults */
9557  if (UCHARAT(RExC_parse) == '^') {
9558   RExC_parse++;
9559   has_use_defaults = TRUE;
9560   STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9561   set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9562           ? REGEX_UNICODE_CHARSET
9563           : REGEX_DEPENDS_CHARSET);
9564  }
9565
9566  cs = get_regex_charset(RExC_flags);
9567  if (cs == REGEX_DEPENDS_CHARSET
9568   && (RExC_utf8 || RExC_uni_semantics))
9569  {
9570   cs = REGEX_UNICODE_CHARSET;
9571  }
9572
9573  while (*RExC_parse) {
9574   /* && strchr("iogcmsx", *RExC_parse) */
9575   /* (?g), (?gc) and (?o) are useless here
9576   and must be globally applied -- japhy */
9577   switch (*RExC_parse) {
9578
9579    /* Code for the imsxn flags */
9580    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9581
9582    case LOCALE_PAT_MOD:
9583     if (has_charset_modifier) {
9584      goto excess_modifier;
9585     }
9586     else if (flagsp == &negflags) {
9587      goto neg_modifier;
9588     }
9589     cs = REGEX_LOCALE_CHARSET;
9590     has_charset_modifier = LOCALE_PAT_MOD;
9591     break;
9592    case UNICODE_PAT_MOD:
9593     if (has_charset_modifier) {
9594      goto excess_modifier;
9595     }
9596     else if (flagsp == &negflags) {
9597      goto neg_modifier;
9598     }
9599     cs = REGEX_UNICODE_CHARSET;
9600     has_charset_modifier = UNICODE_PAT_MOD;
9601     break;
9602    case ASCII_RESTRICT_PAT_MOD:
9603     if (flagsp == &negflags) {
9604      goto neg_modifier;
9605     }
9606     if (has_charset_modifier) {
9607      if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9608       goto excess_modifier;
9609      }
9610      /* Doubled modifier implies more restricted */
9611      cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9612     }
9613     else {
9614      cs = REGEX_ASCII_RESTRICTED_CHARSET;
9615     }
9616     has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9617     break;
9618    case DEPENDS_PAT_MOD:
9619     if (has_use_defaults) {
9620      goto fail_modifiers;
9621     }
9622     else if (flagsp == &negflags) {
9623      goto neg_modifier;
9624     }
9625     else if (has_charset_modifier) {
9626      goto excess_modifier;
9627     }
9628
9629     /* The dual charset means unicode semantics if the
9630     * pattern (or target, not known until runtime) are
9631     * utf8, or something in the pattern indicates unicode
9632     * semantics */
9633     cs = (RExC_utf8 || RExC_uni_semantics)
9634      ? REGEX_UNICODE_CHARSET
9635      : REGEX_DEPENDS_CHARSET;
9636     has_charset_modifier = DEPENDS_PAT_MOD;
9637     break;
9638    excess_modifier:
9639     RExC_parse++;
9640     if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9641      vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9642     }
9643     else if (has_charset_modifier == *(RExC_parse - 1)) {
9644      vFAIL2("Regexp modifier \"%c\" may not appear twice",
9645           *(RExC_parse - 1));
9646     }
9647     else {
9648      vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9649     }
9650     NOT_REACHED; /*NOTREACHED*/
9651    neg_modifier:
9652     RExC_parse++;
9653     vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9654          *(RExC_parse - 1));
9655     NOT_REACHED; /*NOTREACHED*/
9656    case ONCE_PAT_MOD: /* 'o' */
9657    case GLOBAL_PAT_MOD: /* 'g' */
9658     if (PASS2 && ckWARN(WARN_REGEXP)) {
9659      const I32 wflagbit = *RExC_parse == 'o'
9660           ? WASTED_O
9661           : WASTED_G;
9662      if (! (wastedflags & wflagbit) ) {
9663       wastedflags |= wflagbit;
9664       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9665       vWARN5(
9666        RExC_parse + 1,
9667        "Useless (%s%c) - %suse /%c modifier",
9668        flagsp == &negflags ? "?-" : "?",
9669        *RExC_parse,
9670        flagsp == &negflags ? "don't " : "",
9671        *RExC_parse
9672       );
9673      }
9674     }
9675     break;
9676
9677    case CONTINUE_PAT_MOD: /* 'c' */
9678     if (PASS2 && ckWARN(WARN_REGEXP)) {
9679      if (! (wastedflags & WASTED_C) ) {
9680       wastedflags |= WASTED_GC;
9681       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9682       vWARN3(
9683        RExC_parse + 1,
9684        "Useless (%sc) - %suse /gc modifier",
9685        flagsp == &negflags ? "?-" : "?",
9686        flagsp == &negflags ? "don't " : ""
9687       );
9688      }
9689     }
9690     break;
9691    case KEEPCOPY_PAT_MOD: /* 'p' */
9692     if (flagsp == &negflags) {
9693      if (PASS2)
9694       ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9695     } else {
9696      *flagsp |= RXf_PMf_KEEPCOPY;
9697     }
9698     break;
9699    case '-':
9700     /* A flag is a default iff it is following a minus, so
9701     * if there is a minus, it means will be trying to
9702     * re-specify a default which is an error */
9703     if (has_use_defaults || flagsp == &negflags) {
9704      goto fail_modifiers;
9705     }
9706     flagsp = &negflags;
9707     wastedflags = 0;  /* reset so (?g-c) warns twice */
9708     break;
9709    case ':':
9710    case ')':
9711     RExC_flags |= posflags;
9712     RExC_flags &= ~negflags;
9713     set_regex_charset(&RExC_flags, cs);
9714     if (RExC_flags & RXf_PMf_FOLD) {
9715      RExC_contains_i = 1;
9716     }
9717     if (PASS2) {
9718      STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9719     }
9720     return;
9721     /*NOTREACHED*/
9722    default:
9723    fail_modifiers:
9724     RExC_parse += SKIP_IF_CHAR(RExC_parse);
9725     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9726     vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9727      UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9728     NOT_REACHED; /*NOTREACHED*/
9729   }
9730
9731   ++RExC_parse;
9732  }
9733
9734  if (PASS2) {
9735   STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9736  }
9737 }
9738
9739 /*
9740  - reg - regular expression, i.e. main body or parenthesized thing
9741  *
9742  * Caller must absorb opening parenthesis.
9743  *
9744  * Combining parenthesis handling with the base level of regular expression
9745  * is a trifle forced, but the need to tie the tails of the branches to what
9746  * follows makes it hard to avoid.
9747  */
9748 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9749 #ifdef DEBUGGING
9750 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9751 #else
9752 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9753 #endif
9754
9755 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9756    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9757    needs to be restarted.
9758    Otherwise would only return NULL if regbranch() returns NULL, which
9759    cannot happen.  */
9760 STATIC regnode *
9761 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9762  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9763  * 2 is like 1, but indicates that nextchar() has been called to advance
9764  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9765  * this flag alerts us to the need to check for that */
9766 {
9767  regnode *ret;  /* Will be the head of the group. */
9768  regnode *br;
9769  regnode *lastbr;
9770  regnode *ender = NULL;
9771  I32 parno = 0;
9772  I32 flags;
9773  U32 oregflags = RExC_flags;
9774  bool have_branch = 0;
9775  bool is_open = 0;
9776  I32 freeze_paren = 0;
9777  I32 after_freeze = 0;
9778  I32 num; /* numeric backreferences */
9779
9780  char * parse_start = RExC_parse; /* MJD */
9781  char * const oregcomp_parse = RExC_parse;
9782
9783  GET_RE_DEBUG_FLAGS_DECL;
9784
9785  PERL_ARGS_ASSERT_REG;
9786  DEBUG_PARSE("reg ");
9787
9788  *flagp = 0;    /* Tentatively. */
9789
9790
9791  /* Make an OPEN node, if parenthesized. */
9792  if (paren) {
9793
9794   /* Under /x, space and comments can be gobbled up between the '(' and
9795   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9796   * intervening space, as the sequence is a token, and a token should be
9797   * indivisible */
9798   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9799
9800   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9801    char *start_verb = RExC_parse;
9802    STRLEN verb_len = 0;
9803    char *start_arg = NULL;
9804    unsigned char op = 0;
9805    int argok = 1;
9806    int internal_argval = 0; /* internal_argval is only useful if
9807           !argok */
9808
9809    if (has_intervening_patws) {
9810     RExC_parse++;
9811     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9812    }
9813    while ( *RExC_parse && *RExC_parse != ')' ) {
9814     if ( *RExC_parse == ':' ) {
9815      start_arg = RExC_parse + 1;
9816      break;
9817     }
9818     RExC_parse++;
9819    }
9820    ++start_verb;
9821    verb_len = RExC_parse - start_verb;
9822    if ( start_arg ) {
9823     RExC_parse++;
9824     while ( *RExC_parse && *RExC_parse != ')' )
9825      RExC_parse++;
9826     if ( *RExC_parse != ')' )
9827      vFAIL("Unterminated verb pattern argument");
9828     if ( RExC_parse == start_arg )
9829      start_arg = NULL;
9830    } else {
9831     if ( *RExC_parse != ')' )
9832      vFAIL("Unterminated verb pattern");
9833    }
9834
9835    switch ( *start_verb ) {
9836    case 'A':  /* (*ACCEPT) */
9837     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9838      op = ACCEPT;
9839      internal_argval = RExC_nestroot;
9840     }
9841     break;
9842    case 'C':  /* (*COMMIT) */
9843     if ( memEQs(start_verb,verb_len,"COMMIT") )
9844      op = COMMIT;
9845     break;
9846    case 'F':  /* (*FAIL) */
9847     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9848      op = OPFAIL;
9849      argok = 0;
9850     }
9851     break;
9852    case ':':  /* (*:NAME) */
9853    case 'M':  /* (*MARK:NAME) */
9854     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9855      op = MARKPOINT;
9856      argok = -1;
9857     }
9858     break;
9859    case 'P':  /* (*PRUNE) */
9860     if ( memEQs(start_verb,verb_len,"PRUNE") )
9861      op = PRUNE;
9862     break;
9863    case 'S':   /* (*SKIP) */
9864     if ( memEQs(start_verb,verb_len,"SKIP") )
9865      op = SKIP;
9866     break;
9867    case 'T':  /* (*THEN) */
9868     /* [19:06] <TimToady> :: is then */
9869     if ( memEQs(start_verb,verb_len,"THEN") ) {
9870      op = CUTGROUP;
9871      RExC_seen |= REG_CUTGROUP_SEEN;
9872     }
9873     break;
9874    }
9875    if ( ! op ) {
9876     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9877     vFAIL2utf8f(
9878      "Unknown verb pattern '%"UTF8f"'",
9879      UTF8fARG(UTF, verb_len, start_verb));
9880    }
9881    if ( argok ) {
9882     if ( start_arg && internal_argval ) {
9883      vFAIL3("Verb pattern '%.*s' may not have an argument",
9884       verb_len, start_verb);
9885     } else if ( argok < 0 && !start_arg ) {
9886      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9887       verb_len, start_verb);
9888     } else {
9889      ret = reganode(pRExC_state, op, internal_argval);
9890      if ( ! internal_argval && ! SIZE_ONLY ) {
9891       if (start_arg) {
9892        SV *sv = newSVpvn( start_arg,
9893            RExC_parse - start_arg);
9894        ARG(ret) = add_data( pRExC_state,
9895             STR_WITH_LEN("S"));
9896        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9897        ret->flags = 0;
9898       } else {
9899        ret->flags = 1;
9900       }
9901      }
9902     }
9903     if (!internal_argval)
9904      RExC_seen |= REG_VERBARG_SEEN;
9905    } else if ( start_arg ) {
9906     vFAIL3("Verb pattern '%.*s' may not have an argument",
9907       verb_len, start_verb);
9908    } else {
9909     ret = reg_node(pRExC_state, op);
9910    }
9911    nextchar(pRExC_state);
9912    return ret;
9913   }
9914   else if (*RExC_parse == '?') { /* (?...) */
9915    bool is_logical = 0;
9916    const char * const seqstart = RExC_parse;
9917    const char * endptr;
9918    if (has_intervening_patws) {
9919     RExC_parse++;
9920     vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9921    }
9922
9923    RExC_parse++;
9924    paren = *RExC_parse++;
9925    ret = NULL;   /* For look-ahead/behind. */
9926    switch (paren) {
9927
9928    case 'P': /* (?P...) variants for those used to PCRE/Python */
9929     paren = *RExC_parse++;
9930     if ( paren == '<')         /* (?P<...>) named capture */
9931      goto named_capture;
9932     else if (paren == '>') {   /* (?P>name) named recursion */
9933      goto named_recursion;
9934     }
9935     else if (paren == '=') {   /* (?P=...)  named backref */
9936      /* this pretty much dupes the code for \k<NAME> in
9937      * regatom(), if you change this make sure you change that
9938      * */
9939      char* name_start = RExC_parse;
9940      U32 num = 0;
9941      SV *sv_dat = reg_scan_name(pRExC_state,
9942       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9943      if (RExC_parse == name_start || *RExC_parse != ')')
9944       /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9945       vFAIL2("Sequence %.3s... not terminated",parse_start);
9946
9947      if (!SIZE_ONLY) {
9948       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9949       RExC_rxi->data->data[num]=(void*)sv_dat;
9950       SvREFCNT_inc_simple_void(sv_dat);
9951      }
9952      RExC_sawback = 1;
9953      ret = reganode(pRExC_state,
9954         ((! FOLD)
9955          ? NREF
9956          : (ASCII_FOLD_RESTRICTED)
9957          ? NREFFA
9958          : (AT_LEAST_UNI_SEMANTICS)
9959           ? NREFFU
9960           : (LOC)
9961           ? NREFFL
9962           : NREFF),
9963          num);
9964      *flagp |= HASWIDTH;
9965
9966      Set_Node_Offset(ret, parse_start+1);
9967      Set_Node_Cur_Length(ret, parse_start);
9968
9969      nextchar(pRExC_state);
9970      return ret;
9971     }
9972     --RExC_parse;
9973     RExC_parse += SKIP_IF_CHAR(RExC_parse);
9974     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9975     vFAIL3("Sequence (%.*s...) not recognized",
9976         RExC_parse-seqstart, seqstart);
9977     NOT_REACHED; /*NOTREACHED*/
9978    case '<':           /* (?<...) */
9979     if (*RExC_parse == '!')
9980      paren = ',';
9981     else if (*RExC_parse != '=')
9982    named_capture:
9983     {               /* (?<...>) */
9984      char *name_start;
9985      SV *svname;
9986      paren= '>';
9987    case '\'':          /* (?'...') */
9988       name_start= RExC_parse;
9989       svname = reg_scan_name(pRExC_state,
9990       SIZE_ONLY    /* reverse test from the others */
9991       ? REG_RSN_RETURN_NAME
9992       : REG_RSN_RETURN_NULL);
9993      if (RExC_parse == name_start || *RExC_parse != paren)
9994       vFAIL2("Sequence (?%c... not terminated",
9995        paren=='>' ? '<' : paren);
9996      if (SIZE_ONLY) {
9997       HE *he_str;
9998       SV *sv_dat = NULL;
9999       if (!svname) /* shouldn't happen */
10000        Perl_croak(aTHX_
10001         "panic: reg_scan_name returned NULL");
10002       if (!RExC_paren_names) {
10003        RExC_paren_names= newHV();
10004        sv_2mortal(MUTABLE_SV(RExC_paren_names));
10005 #ifdef DEBUGGING
10006        RExC_paren_name_list= newAV();
10007        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10008 #endif
10009       }
10010       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10011       if ( he_str )
10012        sv_dat = HeVAL(he_str);
10013       if ( ! sv_dat ) {
10014        /* croak baby croak */
10015        Perl_croak(aTHX_
10016         "panic: paren_name hash element allocation failed");
10017       } else if ( SvPOK(sv_dat) ) {
10018        /* (?|...) can mean we have dupes so scan to check
10019        its already been stored. Maybe a flag indicating
10020        we are inside such a construct would be useful,
10021        but the arrays are likely to be quite small, so
10022        for now we punt -- dmq */
10023        IV count = SvIV(sv_dat);
10024        I32 *pv = (I32*)SvPVX(sv_dat);
10025        IV i;
10026        for ( i = 0 ; i < count ; i++ ) {
10027         if ( pv[i] == RExC_npar ) {
10028          count = 0;
10029          break;
10030         }
10031        }
10032        if ( count ) {
10033         pv = (I32*)SvGROW(sv_dat,
10034             SvCUR(sv_dat) + sizeof(I32)+1);
10035         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10036         pv[count] = RExC_npar;
10037         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10038        }
10039       } else {
10040        (void)SvUPGRADE(sv_dat,SVt_PVNV);
10041        sv_setpvn(sv_dat, (char *)&(RExC_npar),
10042                 sizeof(I32));
10043        SvIOK_on(sv_dat);
10044        SvIV_set(sv_dat, 1);
10045       }
10046 #ifdef DEBUGGING
10047       /* Yes this does cause a memory leak in debugging Perls
10048       * */
10049       if (!av_store(RExC_paren_name_list,
10050          RExC_npar, SvREFCNT_inc(svname)))
10051        SvREFCNT_dec_NN(svname);
10052 #endif
10053
10054       /*sv_dump(sv_dat);*/
10055      }
10056      nextchar(pRExC_state);
10057      paren = 1;
10058      goto capturing_parens;
10059     }
10060     RExC_seen |= REG_LOOKBEHIND_SEEN;
10061     RExC_in_lookbehind++;
10062     RExC_parse++;
10063     /* FALLTHROUGH */
10064    case '=':           /* (?=...) */
10065     RExC_seen_zerolen++;
10066     break;
10067    case '!':           /* (?!...) */
10068     RExC_seen_zerolen++;
10069     /* check if we're really just a "FAIL" assertion */
10070     --RExC_parse;
10071     nextchar(pRExC_state);
10072     if (*RExC_parse == ')') {
10073      ret=reg_node(pRExC_state, OPFAIL);
10074      nextchar(pRExC_state);
10075      return ret;
10076     }
10077     break;
10078    case '|':           /* (?|...) */
10079     /* branch reset, behave like a (?:...) except that
10080     buffers in alternations share the same numbers */
10081     paren = ':';
10082     after_freeze = freeze_paren = RExC_npar;
10083     break;
10084    case ':':           /* (?:...) */
10085    case '>':           /* (?>...) */
10086     break;
10087    case '$':           /* (?$...) */
10088    case '@':           /* (?@...) */
10089     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10090     break;
10091    case '0' :           /* (?0) */
10092    case 'R' :           /* (?R) */
10093     if (*RExC_parse != ')')
10094      FAIL("Sequence (?R) not terminated");
10095     ret = reg_node(pRExC_state, GOSTART);
10096      RExC_seen |= REG_GOSTART_SEEN;
10097     *flagp |= POSTPONED;
10098     nextchar(pRExC_state);
10099     return ret;
10100     /*notreached*/
10101    /* named and numeric backreferences */
10102    case '&':            /* (?&NAME) */
10103     parse_start = RExC_parse - 1;
10104    named_recursion:
10105     {
10106       SV *sv_dat = reg_scan_name(pRExC_state,
10107        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10108       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10109     }
10110     if (RExC_parse == RExC_end || *RExC_parse != ')')
10111      vFAIL("Sequence (?&... not terminated");
10112     goto gen_recurse_regop;
10113     /* NOTREACHED */
10114    case '+':
10115     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10116      RExC_parse++;
10117      vFAIL("Illegal pattern");
10118     }
10119     goto parse_recursion;
10120     /* NOTREACHED*/
10121    case '-': /* (?-1) */
10122     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10123      RExC_parse--; /* rewind to let it be handled later */
10124      goto parse_flags;
10125     }
10126     /* FALLTHROUGH */
10127    case '1': case '2': case '3': case '4': /* (?1) */
10128    case '5': case '6': case '7': case '8': case '9':
10129     RExC_parse--;
10130    parse_recursion:
10131     {
10132      bool is_neg = FALSE;
10133      UV unum;
10134      parse_start = RExC_parse - 1; /* MJD */
10135      if (*RExC_parse == '-') {
10136       RExC_parse++;
10137       is_neg = TRUE;
10138      }
10139      if (grok_atoUV(RExC_parse, &unum, &endptr)
10140       && unum <= I32_MAX
10141      ) {
10142       num = (I32)unum;
10143       RExC_parse = (char*)endptr;
10144      } else
10145       num = I32_MAX;
10146      if (is_neg) {
10147       /* Some limit for num? */
10148       num = -num;
10149      }
10150     }
10151     if (*RExC_parse!=')')
10152      vFAIL("Expecting close bracket");
10153
10154    gen_recurse_regop:
10155     if ( paren == '-' ) {
10156      /*
10157      Diagram of capture buffer numbering.
10158      Top line is the normal capture buffer numbers
10159      Bottom line is the negative indexing as from
10160      the X (the (?-2))
10161
10162      +   1 2    3 4 5 X          6 7
10163      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10164      -   5 4    3 2 1 X          x x
10165
10166      */
10167      num = RExC_npar + num;
10168      if (num < 1)  {
10169       RExC_parse++;
10170       vFAIL("Reference to nonexistent group");
10171      }
10172     } else if ( paren == '+' ) {
10173      num = RExC_npar + num - 1;
10174     }
10175
10176     ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10177     if (!SIZE_ONLY) {
10178      if (num > (I32)RExC_rx->nparens) {
10179       RExC_parse++;
10180       vFAIL("Reference to nonexistent group");
10181      }
10182      RExC_recurse_count++;
10183      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10184       "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10185        22, "|    |", (int)(depth * 2 + 1), "",
10186        (UV)ARG(ret), (IV)ARG2L(ret)));
10187     }
10188     RExC_seen |= REG_RECURSE_SEEN;
10189     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10190     Set_Node_Offset(ret, parse_start); /* MJD */
10191
10192     *flagp |= POSTPONED;
10193     nextchar(pRExC_state);
10194     return ret;
10195
10196    /* NOTREACHED */
10197
10198    case '?':           /* (??...) */
10199     is_logical = 1;
10200     if (*RExC_parse != '{') {
10201      RExC_parse += SKIP_IF_CHAR(RExC_parse);
10202      /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10203      vFAIL2utf8f(
10204       "Sequence (%"UTF8f"...) not recognized",
10205       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10206      NOT_REACHED; /*NOTREACHED*/
10207     }
10208     *flagp |= POSTPONED;
10209     paren = *RExC_parse++;
10210     /* FALLTHROUGH */
10211    case '{':           /* (?{...}) */
10212    {
10213     U32 n = 0;
10214     struct reg_code_block *cb;
10215
10216     RExC_seen_zerolen++;
10217
10218     if (   !pRExC_state->num_code_blocks
10219      || pRExC_state->code_index >= pRExC_state->num_code_blocks
10220      || pRExC_state->code_blocks[pRExC_state->code_index].start
10221       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10222        - RExC_start)
10223     ) {
10224      if (RExC_pm_flags & PMf_USE_RE_EVAL)
10225       FAIL("panic: Sequence (?{...}): no code block found\n");
10226      FAIL("Eval-group not allowed at runtime, use re 'eval'");
10227     }
10228     /* this is a pre-compiled code block (?{...}) */
10229     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10230     RExC_parse = RExC_start + cb->end;
10231     if (!SIZE_ONLY) {
10232      OP *o = cb->block;
10233      if (cb->src_regex) {
10234       n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10235       RExC_rxi->data->data[n] =
10236        (void*)SvREFCNT_inc((SV*)cb->src_regex);
10237       RExC_rxi->data->data[n+1] = (void*)o;
10238      }
10239      else {
10240       n = add_data(pRExC_state,
10241        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10242       RExC_rxi->data->data[n] = (void*)o;
10243      }
10244     }
10245     pRExC_state->code_index++;
10246     nextchar(pRExC_state);
10247
10248     if (is_logical) {
10249      regnode *eval;
10250      ret = reg_node(pRExC_state, LOGICAL);
10251
10252      eval = reg2Lanode(pRExC_state, EVAL,
10253          n,
10254
10255          /* for later propagation into (??{})
10256           * return value */
10257          RExC_flags & RXf_PMf_COMPILETIME
10258          );
10259      if (!SIZE_ONLY) {
10260       ret->flags = 2;
10261      }
10262      REGTAIL(pRExC_state, ret, eval);
10263      /* deal with the length of this later - MJD */
10264      return ret;
10265     }
10266     ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10267     Set_Node_Length(ret, RExC_parse - parse_start + 1);
10268     Set_Node_Offset(ret, parse_start);
10269     return ret;
10270    }
10271    case '(':           /* (?(?{...})...) and (?(?=...)...) */
10272    {
10273     int is_define= 0;
10274     const int DEFINE_len = sizeof("DEFINE") - 1;
10275     if (RExC_parse[0] == '?') {        /* (?(?...)) */
10276      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10277       || RExC_parse[1] == '<'
10278       || RExC_parse[1] == '{') { /* Lookahead or eval. */
10279       I32 flag;
10280       regnode *tail;
10281
10282       ret = reg_node(pRExC_state, LOGICAL);
10283       if (!SIZE_ONLY)
10284        ret->flags = 1;
10285
10286       tail = reg(pRExC_state, 1, &flag, depth+1);
10287       if (flag & RESTART_UTF8) {
10288        *flagp = RESTART_UTF8;
10289        return NULL;
10290       }
10291       REGTAIL(pRExC_state, ret, tail);
10292       goto insert_if;
10293      }
10294      /* Fall through to ‘Unknown switch condition’ at the
10295      end of the if/else chain. */
10296     }
10297     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
10298       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10299     {
10300      char ch = RExC_parse[0] == '<' ? '>' : '\'';
10301      char *name_start= RExC_parse++;
10302      U32 num = 0;
10303      SV *sv_dat=reg_scan_name(pRExC_state,
10304       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10305      if (RExC_parse == name_start || *RExC_parse != ch)
10306       vFAIL2("Sequence (?(%c... not terminated",
10307        (ch == '>' ? '<' : ch));
10308      RExC_parse++;
10309      if (!SIZE_ONLY) {
10310       num = add_data( pRExC_state, STR_WITH_LEN("S"));
10311       RExC_rxi->data->data[num]=(void*)sv_dat;
10312       SvREFCNT_inc_simple_void(sv_dat);
10313      }
10314      ret = reganode(pRExC_state,NGROUPP,num);
10315      goto insert_if_check_paren;
10316     }
10317     else if (RExC_end - RExC_parse >= DEFINE_len
10318       && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
10319     {
10320      ret = reganode(pRExC_state,DEFINEP,0);
10321      RExC_parse += DEFINE_len;
10322      is_define = 1;
10323      goto insert_if_check_paren;
10324     }
10325     else if (RExC_parse[0] == 'R') {
10326      RExC_parse++;
10327      parno = 0;
10328      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10329       UV uv;
10330       if (grok_atoUV(RExC_parse, &uv, &endptr)
10331        && uv <= I32_MAX
10332       ) {
10333        parno = (I32)uv;
10334        RExC_parse = (char*)endptr;
10335       }
10336       /* else "Switch condition not recognized" below */
10337      } else if (RExC_parse[0] == '&') {
10338       SV *sv_dat;
10339       RExC_parse++;
10340       sv_dat = reg_scan_name(pRExC_state,
10341        SIZE_ONLY
10342        ? REG_RSN_RETURN_NULL
10343        : REG_RSN_RETURN_DATA);
10344        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10345      }
10346      ret = reganode(pRExC_state,INSUBP,parno);
10347      goto insert_if_check_paren;
10348     }
10349     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10350      /* (?(1)...) */
10351      char c;
10352      char *tmp;
10353      UV uv;
10354      if (grok_atoUV(RExC_parse, &uv, &endptr)
10355       && uv <= I32_MAX
10356      ) {
10357       parno = (I32)uv;
10358       RExC_parse = (char*)endptr;
10359      }
10360      /* XXX else what? */
10361      ret = reganode(pRExC_state, GROUPP, parno);
10362
10363     insert_if_check_paren:
10364      if (*(tmp = nextchar(pRExC_state)) != ')') {
10365       /* nextchar also skips comments, so undo its work
10366       * and skip over the the next character.
10367       */
10368       RExC_parse = tmp;
10369       RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10370       vFAIL("Switch condition not recognized");
10371      }
10372     insert_if:
10373      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10374      br = regbranch(pRExC_state, &flags, 1,depth+1);
10375      if (br == NULL) {
10376       if (flags & RESTART_UTF8) {
10377        *flagp = RESTART_UTF8;
10378        return NULL;
10379       }
10380       FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10381        (UV) flags);
10382      } else
10383       REGTAIL(pRExC_state, br, reganode(pRExC_state,
10384               LONGJMP, 0));
10385      c = *nextchar(pRExC_state);
10386      if (flags&HASWIDTH)
10387       *flagp |= HASWIDTH;
10388      if (c == '|') {
10389       if (is_define)
10390        vFAIL("(?(DEFINE)....) does not allow branches");
10391
10392       /* Fake one for optimizer.  */
10393       lastbr = reganode(pRExC_state, IFTHEN, 0);
10394
10395       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10396        if (flags & RESTART_UTF8) {
10397         *flagp = RESTART_UTF8;
10398         return NULL;
10399        }
10400        FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10401         (UV) flags);
10402       }
10403       REGTAIL(pRExC_state, ret, lastbr);
10404       if (flags&HASWIDTH)
10405        *flagp |= HASWIDTH;
10406       c = *nextchar(pRExC_state);
10407      }
10408      else
10409       lastbr = NULL;
10410      if (c != ')') {
10411       if (RExC_parse>RExC_end)
10412        vFAIL("Switch (?(condition)... not terminated");
10413       else
10414        vFAIL("Switch (?(condition)... contains too many branches");
10415      }
10416      ender = reg_node(pRExC_state, TAIL);
10417      REGTAIL(pRExC_state, br, ender);
10418      if (lastbr) {
10419       REGTAIL(pRExC_state, lastbr, ender);
10420       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10421      }
10422      else
10423       REGTAIL(pRExC_state, ret, ender);
10424      RExC_size++; /* XXX WHY do we need this?!!
10425          For large programs it seems to be required
10426          but I can't figure out why. -- dmq*/
10427      return ret;
10428     }
10429     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10430     vFAIL("Unknown switch condition (?(...))");
10431    }
10432    case '[':           /* (?[ ... ]) */
10433     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10434           oregcomp_parse);
10435    case 0:
10436     RExC_parse--; /* for vFAIL to print correctly */
10437     vFAIL("Sequence (? incomplete");
10438     break;
10439    default: /* e.g., (?i) */
10440     --RExC_parse;
10441    parse_flags:
10442     parse_lparen_question_flags(pRExC_state);
10443     if (UCHARAT(RExC_parse) != ':') {
10444      if (*RExC_parse)
10445       nextchar(pRExC_state);
10446      *flagp = TRYAGAIN;
10447      return NULL;
10448     }
10449     paren = ':';
10450     nextchar(pRExC_state);
10451     ret = NULL;
10452     goto parse_rest;
10453    } /* end switch */
10454   }
10455   else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
10456   capturing_parens:
10457    parno = RExC_npar;
10458    RExC_npar++;
10459
10460    ret = reganode(pRExC_state, OPEN, parno);
10461    if (!SIZE_ONLY ){
10462     if (!RExC_nestroot)
10463      RExC_nestroot = parno;
10464     if (RExC_seen & REG_RECURSE_SEEN
10465      && !RExC_open_parens[parno-1])
10466     {
10467      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10468       "%*s%*s Setting open paren #%"IVdf" to %d\n",
10469       22, "|    |", (int)(depth * 2 + 1), "",
10470       (IV)parno, REG_NODE_NUM(ret)));
10471      RExC_open_parens[parno-1]= ret;
10472     }
10473    }
10474    Set_Node_Length(ret, 1); /* MJD */
10475    Set_Node_Offset(ret, RExC_parse); /* MJD */
10476    is_open = 1;
10477   } else {
10478    /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
10479    paren = ':';
10480    ret = NULL;
10481   }
10482  }
10483  else                        /* ! paren */
10484   ret = NULL;
10485
10486    parse_rest:
10487  /* Pick up the branches, linking them together. */
10488  parse_start = RExC_parse;   /* MJD */
10489  br = regbranch(pRExC_state, &flags, 1,depth+1);
10490
10491  /*     branch_len = (paren != 0); */
10492
10493  if (br == NULL) {
10494   if (flags & RESTART_UTF8) {
10495    *flagp = RESTART_UTF8;
10496    return NULL;
10497   }
10498   FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10499  }
10500  if (*RExC_parse == '|') {
10501   if (!SIZE_ONLY && RExC_extralen) {
10502    reginsert(pRExC_state, BRANCHJ, br, depth+1);
10503   }
10504   else {                  /* MJD */
10505    reginsert(pRExC_state, BRANCH, br, depth+1);
10506    Set_Node_Length(br, paren != 0);
10507    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10508   }
10509   have_branch = 1;
10510   if (SIZE_ONLY)
10511    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
10512  }
10513  else if (paren == ':') {
10514   *flagp |= flags&SIMPLE;
10515  }
10516  if (is_open) {    /* Starts with OPEN. */
10517   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10518  }
10519  else if (paren != '?')  /* Not Conditional */
10520   ret = br;
10521  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10522  lastbr = br;
10523  while (*RExC_parse == '|') {
10524   if (!SIZE_ONLY && RExC_extralen) {
10525    ender = reganode(pRExC_state, LONGJMP,0);
10526
10527    /* Append to the previous. */
10528    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10529   }
10530   if (SIZE_ONLY)
10531    RExC_extralen += 2;  /* Account for LONGJMP. */
10532   nextchar(pRExC_state);
10533   if (freeze_paren) {
10534    if (RExC_npar > after_freeze)
10535     after_freeze = RExC_npar;
10536    RExC_npar = freeze_paren;
10537   }
10538   br = regbranch(pRExC_state, &flags, 0, depth+1);
10539
10540   if (br == NULL) {
10541    if (flags & RESTART_UTF8) {
10542     *flagp = RESTART_UTF8;
10543     return NULL;
10544    }
10545    FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10546   }
10547   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10548   lastbr = br;
10549   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10550  }
10551
10552  if (have_branch || paren != ':') {
10553   /* Make a closing node, and hook it on the end. */
10554   switch (paren) {
10555   case ':':
10556    ender = reg_node(pRExC_state, TAIL);
10557    break;
10558   case 1: case 2:
10559    ender = reganode(pRExC_state, CLOSE, parno);
10560    if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10561     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10562       "%*s%*s Setting close paren #%"IVdf" to %d\n",
10563       22, "|    |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10564     RExC_close_parens[parno-1]= ender;
10565     if (RExC_nestroot == parno)
10566      RExC_nestroot = 0;
10567    }
10568    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10569    Set_Node_Length(ender,1); /* MJD */
10570    break;
10571   case '<':
10572   case ',':
10573   case '=':
10574   case '!':
10575    *flagp &= ~HASWIDTH;
10576    /* FALLTHROUGH */
10577   case '>':
10578    ender = reg_node(pRExC_state, SUCCEED);
10579    break;
10580   case 0:
10581    ender = reg_node(pRExC_state, END);
10582    if (!SIZE_ONLY) {
10583     assert(!RExC_opend); /* there can only be one! */
10584     RExC_opend = ender;
10585    }
10586    break;
10587   }
10588   DEBUG_PARSE_r(if (!SIZE_ONLY) {
10589    DEBUG_PARSE_MSG("lsbr");
10590    regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10591    regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10592    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10593       SvPV_nolen_const(RExC_mysv1),
10594       (IV)REG_NODE_NUM(lastbr),
10595       SvPV_nolen_const(RExC_mysv2),
10596       (IV)REG_NODE_NUM(ender),
10597       (IV)(ender - lastbr)
10598    );
10599   });
10600   REGTAIL(pRExC_state, lastbr, ender);
10601
10602   if (have_branch && !SIZE_ONLY) {
10603    char is_nothing= 1;
10604    if (depth==1)
10605     RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10606
10607    /* Hook the tails of the branches to the closing node. */
10608    for (br = ret; br; br = regnext(br)) {
10609     const U8 op = PL_regkind[OP(br)];
10610     if (op == BRANCH) {
10611      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10612      if ( OP(NEXTOPER(br)) != NOTHING
10613       || regnext(NEXTOPER(br)) != ender)
10614       is_nothing= 0;
10615     }
10616     else if (op == BRANCHJ) {
10617      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10618      /* for now we always disable this optimisation * /
10619      if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10620       || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10621      */
10622       is_nothing= 0;
10623     }
10624    }
10625    if (is_nothing) {
10626     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10627     DEBUG_PARSE_r(if (!SIZE_ONLY) {
10628      DEBUG_PARSE_MSG("NADA");
10629      regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10630      regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10631      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10632         SvPV_nolen_const(RExC_mysv1),
10633         (IV)REG_NODE_NUM(ret),
10634         SvPV_nolen_const(RExC_mysv2),
10635         (IV)REG_NODE_NUM(ender),
10636         (IV)(ender - ret)
10637      );
10638     });
10639     OP(br)= NOTHING;
10640     if (OP(ender) == TAIL) {
10641      NEXT_OFF(br)= 0;
10642      RExC_emit= br + 1;
10643     } else {
10644      regnode *opt;
10645      for ( opt= br + 1; opt < ender ; opt++ )
10646       OP(opt)= OPTIMIZED;
10647      NEXT_OFF(br)= ender - br;
10648     }
10649    }
10650   }
10651  }
10652
10653  {
10654   const char *p;
10655   static const char parens[] = "=!<,>";
10656
10657   if (paren && (p = strchr(parens, paren))) {
10658    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10659    int flag = (p - parens) > 1;
10660
10661    if (paren == '>')
10662     node = SUSPEND, flag = 0;
10663    reginsert(pRExC_state, node,ret, depth+1);
10664    Set_Node_Cur_Length(ret, parse_start);
10665    Set_Node_Offset(ret, parse_start + 1);
10666    ret->flags = flag;
10667    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10668   }
10669  }
10670
10671  /* Check for proper termination. */
10672  if (paren) {
10673   /* restore original flags, but keep (?p) */
10674   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10675   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10676    RExC_parse = oregcomp_parse;
10677    vFAIL("Unmatched (");
10678   }
10679  }
10680  else if (!paren && RExC_parse < RExC_end) {
10681   if (*RExC_parse == ')') {
10682    RExC_parse++;
10683    vFAIL("Unmatched )");
10684   }
10685   else
10686    FAIL("Junk on end of regexp"); /* "Can't happen". */
10687   NOT_REACHED; /* NOTREACHED */
10688  }
10689
10690  if (RExC_in_lookbehind) {
10691   RExC_in_lookbehind--;
10692  }
10693  if (after_freeze > RExC_npar)
10694   RExC_npar = after_freeze;
10695  return(ret);
10696 }
10697
10698 /*
10699  - regbranch - one alternative of an | operator
10700  *
10701  * Implements the concatenation operator.
10702  *
10703  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10704  * restarted.
10705  */
10706 STATIC regnode *
10707 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10708 {
10709  regnode *ret;
10710  regnode *chain = NULL;
10711  regnode *latest;
10712  I32 flags = 0, c = 0;
10713  GET_RE_DEBUG_FLAGS_DECL;
10714
10715  PERL_ARGS_ASSERT_REGBRANCH;
10716
10717  DEBUG_PARSE("brnc");
10718
10719  if (first)
10720   ret = NULL;
10721  else {
10722   if (!SIZE_ONLY && RExC_extralen)
10723    ret = reganode(pRExC_state, BRANCHJ,0);
10724   else {
10725    ret = reg_node(pRExC_state, BRANCH);
10726    Set_Node_Length(ret, 1);
10727   }
10728  }
10729
10730  if (!first && SIZE_ONLY)
10731   RExC_extralen += 1;   /* BRANCHJ */
10732
10733  *flagp = WORST;   /* Tentatively. */
10734
10735  RExC_parse--;
10736  nextchar(pRExC_state);
10737  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10738   flags &= ~TRYAGAIN;
10739   latest = regpiece(pRExC_state, &flags,depth+1);
10740   if (latest == NULL) {
10741    if (flags & TRYAGAIN)
10742     continue;
10743    if (flags & RESTART_UTF8) {
10744     *flagp = RESTART_UTF8;
10745     return NULL;
10746    }
10747    FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10748   }
10749   else if (ret == NULL)
10750    ret = latest;
10751   *flagp |= flags&(HASWIDTH|POSTPONED);
10752   if (chain == NULL)  /* First piece. */
10753    *flagp |= flags&SPSTART;
10754   else {
10755    /* FIXME adding one for every branch after the first is probably
10756    * excessive now we have TRIE support. (hv) */
10757    MARK_NAUGHTY(1);
10758    REGTAIL(pRExC_state, chain, latest);
10759   }
10760   chain = latest;
10761   c++;
10762  }
10763  if (chain == NULL) { /* Loop ran zero times. */
10764   chain = reg_node(pRExC_state, NOTHING);
10765   if (ret == NULL)
10766    ret = chain;
10767  }
10768  if (c == 1) {
10769   *flagp |= flags&SIMPLE;
10770  }
10771
10772  return ret;
10773 }
10774
10775 /*
10776  - regpiece - something followed by possible [*+?]
10777  *
10778  * Note that the branching code sequences used for ? and the general cases
10779  * of * and + are somewhat optimized:  they use the same NOTHING node as
10780  * both the endmarker for their branch list and the body of the last branch.
10781  * It might seem that this node could be dispensed with entirely, but the
10782  * endmarker role is not redundant.
10783  *
10784  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10785  * TRYAGAIN.
10786  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10787  * restarted.
10788  */
10789 STATIC regnode *
10790 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10791 {
10792  regnode *ret;
10793  char op;
10794  char *next;
10795  I32 flags;
10796  const char * const origparse = RExC_parse;
10797  I32 min;
10798  I32 max = REG_INFTY;
10799 #ifdef RE_TRACK_PATTERN_OFFSETS
10800  char *parse_start;
10801 #endif
10802  const char *maxpos = NULL;
10803  UV uv;
10804
10805  /* Save the original in case we change the emitted regop to a FAIL. */
10806  regnode * const orig_emit = RExC_emit;
10807
10808  GET_RE_DEBUG_FLAGS_DECL;
10809
10810  PERL_ARGS_ASSERT_REGPIECE;
10811
10812  DEBUG_PARSE("piec");
10813
10814  ret = regatom(pRExC_state, &flags,depth+1);
10815  if (ret == NULL) {
10816   if (flags & (TRYAGAIN|RESTART_UTF8))
10817    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10818   else
10819    FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10820   return(NULL);
10821  }
10822
10823  op = *RExC_parse;
10824
10825  if (op == '{' && regcurly(RExC_parse)) {
10826   maxpos = NULL;
10827 #ifdef RE_TRACK_PATTERN_OFFSETS
10828   parse_start = RExC_parse; /* MJD */
10829 #endif
10830   next = RExC_parse + 1;
10831   while (isDIGIT(*next) || *next == ',') {
10832    if (*next == ',') {
10833     if (maxpos)
10834      break;
10835     else
10836      maxpos = next;
10837    }
10838    next++;
10839   }
10840   if (*next == '}') {  /* got one */
10841    const char* endptr;
10842    if (!maxpos)
10843     maxpos = next;
10844    RExC_parse++;
10845    if (isDIGIT(*RExC_parse)) {
10846     if (!grok_atoUV(RExC_parse, &uv, &endptr))
10847      vFAIL("Invalid quantifier in {,}");
10848     if (uv >= REG_INFTY)
10849      vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10850     min = (I32)uv;
10851    } else {
10852     min = 0;
10853    }
10854    if (*maxpos == ',')
10855     maxpos++;
10856    else
10857     maxpos = RExC_parse;
10858    if (isDIGIT(*maxpos)) {
10859     if (!grok_atoUV(maxpos, &uv, &endptr))
10860      vFAIL("Invalid quantifier in {,}");
10861     if (uv >= REG_INFTY)
10862      vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10863     max = (I32)uv;
10864    } else {
10865     max = REG_INFTY;  /* meaning "infinity" */
10866    }
10867    RExC_parse = next;
10868    nextchar(pRExC_state);
10869    if (max < min) {    /* If can't match, warn and optimize to fail
10870         unconditionally */
10871     if (SIZE_ONLY) {
10872
10873      /* We can't back off the size because we have to reserve
10874      * enough space for all the things we are about to throw
10875      * away, but we can shrink it by the ammount we are about
10876      * to re-use here */
10877      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10878     }
10879     else {
10880      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10881      RExC_emit = orig_emit;
10882     }
10883     ret = reg_node(pRExC_state, OPFAIL);
10884     return ret;
10885    }
10886    else if (min == max
10887      && RExC_parse < RExC_end
10888      && (*RExC_parse == '?' || *RExC_parse == '+'))
10889    {
10890     if (PASS2) {
10891      ckWARN2reg(RExC_parse + 1,
10892        "Useless use of greediness modifier '%c'",
10893        *RExC_parse);
10894     }
10895     /* Absorb the modifier, so later code doesn't see nor use
10896      * it */
10897     nextchar(pRExC_state);
10898    }
10899
10900   do_curly:
10901    if ((flags&SIMPLE)) {
10902     MARK_NAUGHTY_EXP(2, 2);
10903     reginsert(pRExC_state, CURLY, ret, depth+1);
10904     Set_Node_Offset(ret, parse_start+1); /* MJD */
10905     Set_Node_Cur_Length(ret, parse_start);
10906    }
10907    else {
10908     regnode * const w = reg_node(pRExC_state, WHILEM);
10909
10910     w->flags = 0;
10911     REGTAIL(pRExC_state, ret, w);
10912     if (!SIZE_ONLY && RExC_extralen) {
10913      reginsert(pRExC_state, LONGJMP,ret, depth+1);
10914      reginsert(pRExC_state, NOTHING,ret, depth+1);
10915      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10916     }
10917     reginsert(pRExC_state, CURLYX,ret, depth+1);
10918         /* MJD hk */
10919     Set_Node_Offset(ret, parse_start+1);
10920     Set_Node_Length(ret,
10921         op == '{' ? (RExC_parse - parse_start) : 1);
10922
10923     if (!SIZE_ONLY && RExC_extralen)
10924      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10925     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10926     if (SIZE_ONLY)
10927      RExC_whilem_seen++, RExC_extralen += 3;
10928     MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
10929    }
10930    ret->flags = 0;
10931
10932    if (min > 0)
10933     *flagp = WORST;
10934    if (max > 0)
10935     *flagp |= HASWIDTH;
10936    if (!SIZE_ONLY) {
10937     ARG1_SET(ret, (U16)min);
10938     ARG2_SET(ret, (U16)max);
10939    }
10940    if (max == REG_INFTY)
10941     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10942
10943    goto nest_check;
10944   }
10945  }
10946
10947  if (!ISMULT1(op)) {
10948   *flagp = flags;
10949   return(ret);
10950  }
10951
10952 #if 0    /* Now runtime fix should be reliable. */
10953
10954  /* if this is reinstated, don't forget to put this back into perldiag:
10955
10956    =item Regexp *+ operand could be empty at {#} in regex m/%s/
10957
10958   (F) The part of the regexp subject to either the * or + quantifier
10959   could match an empty string. The {#} shows in the regular
10960   expression about where the problem was discovered.
10961
10962  */
10963
10964  if (!(flags&HASWIDTH) && op != '?')
10965  vFAIL("Regexp *+ operand could be empty");
10966 #endif
10967
10968 #ifdef RE_TRACK_PATTERN_OFFSETS
10969  parse_start = RExC_parse;
10970 #endif
10971  nextchar(pRExC_state);
10972
10973  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10974
10975  if (op == '*' && (flags&SIMPLE)) {
10976   reginsert(pRExC_state, STAR, ret, depth+1);
10977   ret->flags = 0;
10978   MARK_NAUGHTY(4);
10979   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10980  }
10981  else if (op == '*') {
10982   min = 0;
10983   goto do_curly;
10984  }
10985  else if (op == '+' && (flags&SIMPLE)) {
10986   reginsert(pRExC_state, PLUS, ret, depth+1);
10987   ret->flags = 0;
10988   MARK_NAUGHTY(3);
10989   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10990  }
10991  else if (op == '+') {
10992   min = 1;
10993   goto do_curly;
10994  }
10995  else if (op == '?') {
10996   min = 0; max = 1;
10997   goto do_curly;
10998  }
10999   nest_check:
11000  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
11001   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11002   ckWARN2reg(RExC_parse,
11003     "%"UTF8f" matches null string many times",
11004     UTF8fARG(UTF, (RExC_parse >= origparse
11005         ? RExC_parse - origparse
11006         : 0),
11007     origparse));
11008   (void)ReREFCNT_inc(RExC_rx_sv);
11009  }
11010
11011  if (RExC_parse < RExC_end && *RExC_parse == '?') {
11012   nextchar(pRExC_state);
11013   reginsert(pRExC_state, MINMOD, ret, depth+1);
11014   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11015  }
11016  else
11017  if (RExC_parse < RExC_end && *RExC_parse == '+') {
11018   regnode *ender;
11019   nextchar(pRExC_state);
11020   ender = reg_node(pRExC_state, SUCCEED);
11021   REGTAIL(pRExC_state, ret, ender);
11022   reginsert(pRExC_state, SUSPEND, ret, depth+1);
11023   ret->flags = 0;
11024   ender = reg_node(pRExC_state, TAIL);
11025   REGTAIL(pRExC_state, ret, ender);
11026  }
11027
11028  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
11029   RExC_parse++;
11030   vFAIL("Nested quantifiers");
11031  }
11032
11033  return(ret);
11034 }
11035
11036 STATIC bool
11037 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
11038     regnode ** node_p,
11039     UV * code_point_p,
11040     int * cp_count,
11041     I32 * flagp,
11042     const U32 depth
11043  )
11044 {
11045  /* This routine teases apart the various meanings of \N and returns
11046   * accordingly.  The input parameters constrain which meaning(s) is/are valid
11047   * in the current context.
11048   *
11049   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
11050   *
11051   * If <code_point_p> is not NULL, the context is expecting the result to be a
11052   * single code point.  If this \N instance turns out to a single code point,
11053   * the function returns TRUE and sets *code_point_p to that code point.
11054   *
11055   * If <node_p> is not NULL, the context is expecting the result to be one of
11056   * the things representable by a regnode.  If this \N instance turns out to be
11057   * one such, the function generates the regnode, returns TRUE and sets *node_p
11058   * to point to that regnode.
11059   *
11060   * If this instance of \N isn't legal in any context, this function will
11061   * generate a fatal error and not return.
11062   *
11063   * On input, RExC_parse should point to the first char following the \N at the
11064   * time of the call.  On successful return, RExC_parse will have been updated
11065   * to point to just after the sequence identified by this routine.  Also
11066   * *flagp has been updated as needed.
11067   *
11068   * When there is some problem with the current context and this \N instance,
11069   * the function returns FALSE, without advancing RExC_parse, nor setting
11070   * *node_p, nor *code_point_p, nor *flagp.
11071   *
11072   * If <cp_count> is not NULL, the caller wants to know the length (in code
11073   * points) that this \N sequence matches.  This is set even if the function
11074   * returns FALSE, as detailed below.
11075   *
11076   * There are 5 possibilities here, as detailed in the next 5 paragraphs.
11077   *
11078   * Probably the most common case is for the \N to specify a single code point.
11079   * *cp_count will be set to 1, and *code_point_p will be set to that code
11080   * point.
11081   *
11082   * Another possibility is for the input to be an empty \N{}, which for
11083   * backwards compatibility we accept.  *cp_count will be set to 0. *node_p
11084   * will be set to a generated NOTHING node.
11085   *
11086   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
11087   * set to 0. *node_p will be set to a generated REG_ANY node.
11088   *
11089   * The fourth possibility is that \N resolves to a sequence of more than one
11090   * code points.  *cp_count will be set to the number of code points in the
11091   * sequence. *node_p * will be set to a generated node returned by this
11092   * function calling S_reg().
11093   *
11094   * The final possibility, which happens only when the fourth one would
11095   * otherwise be in effect, is that one of those code points requires the
11096   * pattern to be recompiled as UTF-8.  The function returns FALSE, and sets
11097   * the RESTART_UTF8 flag in *flagp.  When this happens, the caller needs to
11098   * desist from continuing parsing, and return this information to its caller.
11099   * This is not set for when there is only one code point, as this can be
11100   * called as part of an ANYOF node, and they can store above-Latin1 code
11101   * points without the pattern having to be in UTF-8.
11102   *
11103   * For non-single-quoted regexes, the tokenizer has resolved character and
11104   * sequence names inside \N{...} into their Unicode values, normalizing the
11105   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
11106   * hex-represented code points in the sequence.  This is done there because
11107   * the names can vary based on what charnames pragma is in scope at the time,
11108   * so we need a way to take a snapshot of what they resolve to at the time of
11109   * the original parse. [perl #56444].
11110   *
11111   * That parsing is skipped for single-quoted regexes, so we may here get
11112   * '\N{NAME}'.  This is a fatal error.  These names have to be resolved by the
11113   * parser.  But if the single-quoted regex is something like '\N{U+41}', that
11114   * is legal and handled here.  The code point is Unicode, and has to be
11115   * translated into the native character set for non-ASCII platforms.
11116   * the tokenizer passes the \N sequence through unchanged; this code will not
11117   * attempt to determine this nor expand those, instead raising a syntax error.
11118   */
11119
11120  char * endbrace;    /* points to '}' following the name */
11121  char *endchar; /* Points to '.' or '}' ending cur char in the input
11122       stream */
11123  char* p;            /* Temporary */
11124
11125  GET_RE_DEBUG_FLAGS_DECL;
11126
11127  PERL_ARGS_ASSERT_GROK_BSLASH_N;
11128
11129  GET_RE_DEBUG_FLAGS;
11130
11131  assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
11132  assert(! (node_p && cp_count));               /* At most 1 should be set */
11133
11134  if (cp_count) {     /* Initialize return for the most common case */
11135   *cp_count = 1;
11136  }
11137
11138  /* The [^\n] meaning of \N ignores spaces and comments under the /x
11139  * modifier.  The other meanings do not, so use a temporary until we find
11140  * out which we are being called with */
11141  p = (RExC_flags & RXf_PMf_EXTENDED)
11142   ? regpatws(pRExC_state, RExC_parse,
11143         TRUE) /* means recognize comments */
11144   : RExC_parse;
11145
11146  /* Disambiguate between \N meaning a named character versus \N meaning
11147  * [^\n].  The latter is assumed when the {...} following the \N is a legal
11148  * quantifier, or there is no a '{' at all */
11149  if (*p != '{' || regcurly(p)) {
11150   RExC_parse = p;
11151   if (cp_count) {
11152    *cp_count = -1;
11153   }
11154
11155   if (! node_p) {
11156    return FALSE;
11157   }
11158   RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
11159       current char */
11160   nextchar(pRExC_state);
11161   *node_p = reg_node(pRExC_state, REG_ANY);
11162   *flagp |= HASWIDTH|SIMPLE;
11163   MARK_NAUGHTY(1);
11164   Set_Node_Length(*node_p, 1); /* MJD */
11165   return TRUE;
11166  }
11167
11168  /* Here, we have decided it should be a named character or sequence */
11169
11170  /* The test above made sure that the next real character is a '{', but
11171  * under the /x modifier, it could be separated by space (or a comment and
11172  * \n) and this is not allowed (for consistency with \x{...} and the
11173  * tokenizer handling of \N{NAME}). */
11174  if (*RExC_parse != '{') {
11175   vFAIL("Missing braces on \\N{}");
11176  }
11177
11178  RExC_parse++; /* Skip past the '{' */
11179
11180  if (! (endbrace = strchr(RExC_parse, '}'))  /* no trailing brace */
11181   || ! (endbrace == RExC_parse  /* nothing between the {} */
11182    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
11183     && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11184              error msg) */
11185  {
11186   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11187   vFAIL("\\N{NAME} must be resolved by the lexer");
11188  }
11189
11190  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11191
11192  if (endbrace == RExC_parse) {   /* empty: \N{} */
11193   if (cp_count) {
11194    *cp_count = 0;
11195   }
11196   nextchar(pRExC_state);
11197   if (! node_p) {
11198    return FALSE;
11199   }
11200
11201   *node_p = reg_node(pRExC_state,NOTHING);
11202   return TRUE;
11203  }
11204
11205  RExC_parse += 2; /* Skip past the 'U+' */
11206
11207  endchar = RExC_parse + strcspn(RExC_parse, ".}");
11208
11209  /* Code points are separated by dots.  If none, there is only one code
11210  * point, and is terminated by the brace */
11211
11212  if (endchar >= endbrace) {
11213   STRLEN length_of_hex;
11214   I32 grok_hex_flags;
11215
11216   /* Here, exactly one code point.  If that isn't what is wanted, fail */
11217   if (! code_point_p) {
11218    RExC_parse = p;
11219    return FALSE;
11220   }
11221
11222   /* Convert code point from hex */
11223   length_of_hex = (STRLEN)(endchar - RExC_parse);
11224   grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11225       | PERL_SCAN_DISALLOW_PREFIX
11226
11227        /* No errors in the first pass (See [perl
11228        * #122671].)  We let the code below find the
11229        * errors when there are multiple chars. */
11230       | ((SIZE_ONLY)
11231        ? PERL_SCAN_SILENT_ILLDIGIT
11232        : 0);
11233
11234   /* This routine is the one place where both single- and double-quotish
11235   * \N{U+xxxx} are evaluated.  The value is a Unicode code point which
11236   * must be converted to native. */
11237   *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
11238           &length_of_hex,
11239           &grok_hex_flags,
11240           NULL));
11241
11242   /* The tokenizer should have guaranteed validity, but it's possible to
11243   * bypass it by using single quoting, so check.  Don't do the check
11244   * here when there are multiple chars; we do it below anyway. */
11245   if (length_of_hex == 0
11246    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11247   {
11248    RExC_parse += length_of_hex; /* Includes all the valid */
11249    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11250        ? UTF8SKIP(RExC_parse)
11251        : 1;
11252    /* Guard against malformed utf8 */
11253    if (RExC_parse >= endchar) {
11254     RExC_parse = endchar;
11255    }
11256    vFAIL("Invalid hexadecimal number in \\N{U+...}");
11257   }
11258
11259   RExC_parse = endbrace + 1;
11260   return TRUE;
11261  }
11262  else {  /* Is a multiple character sequence */
11263   SV * substitute_parse;
11264   STRLEN len;
11265   char *orig_end = RExC_end;
11266   I32 flags;
11267
11268   /* Count the code points, if desired, in the sequence */
11269   if (cp_count) {
11270    *cp_count = 0;
11271    while (RExC_parse < endbrace) {
11272     /* Point to the beginning of the next character in the sequence. */
11273     RExC_parse = endchar + 1;
11274     endchar = RExC_parse + strcspn(RExC_parse, ".}");
11275     (*cp_count)++;
11276    }
11277   }
11278
11279   /* Fail if caller doesn't want to handle a multi-code-point sequence.
11280   * But don't backup up the pointer if the caller want to know how many
11281   * code points there are (they can then handle things) */
11282   if (! node_p) {
11283    if (! cp_count) {
11284     RExC_parse = p;
11285    }
11286    return FALSE;
11287   }
11288
11289   /* What is done here is to convert this to a sub-pattern of the form
11290   * \x{char1}\x{char2}...  and then call reg recursively to parse it
11291   * (enclosing in "(?: ... )" ).  That way, it retains its atomicness,
11292   * while not having to worry about special handling that some code
11293   * points may have. */
11294
11295   substitute_parse = newSVpvs("?:");
11296
11297   while (RExC_parse < endbrace) {
11298
11299    /* Convert to notation the rest of the code understands */
11300    sv_catpv(substitute_parse, "\\x{");
11301    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
11302    sv_catpv(substitute_parse, "}");
11303
11304    /* Point to the beginning of the next character in the sequence. */
11305    RExC_parse = endchar + 1;
11306    endchar = RExC_parse + strcspn(RExC_parse, ".}");
11307
11308   }
11309   sv_catpv(substitute_parse, ")");
11310
11311   RExC_parse = SvPV(substitute_parse, len);
11312
11313   /* Don't allow empty number */
11314   if (len < (STRLEN) 8) {
11315    RExC_parse = endbrace;
11316    vFAIL("Invalid hexadecimal number in \\N{U+...}");
11317   }
11318   RExC_end = RExC_parse + len;
11319
11320   /* The values are Unicode, and therefore not subject to recoding, but
11321   * have to be converted to native on a non-Unicode (meaning non-ASCII)
11322   * platform. */
11323   RExC_override_recoding = 1;
11324 #ifdef EBCDIC
11325   RExC_recode_x_to_native = 1;
11326 #endif
11327
11328   if (node_p) {
11329    if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11330     if (flags & RESTART_UTF8) {
11331      *flagp = RESTART_UTF8;
11332      return FALSE;
11333     }
11334     FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11335      (UV) flags);
11336    }
11337    *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11338   }
11339
11340   /* Restore the saved values */
11341   RExC_parse = endbrace;
11342   RExC_end = orig_end;
11343   RExC_override_recoding = 0;
11344 #ifdef EBCDIC
11345   RExC_recode_x_to_native = 0;
11346 #endif
11347
11348   SvREFCNT_dec_NN(substitute_parse);
11349   nextchar(pRExC_state);
11350
11351   return TRUE;
11352  }
11353 }
11354
11355
11356 /*
11357  * reg_recode
11358  *
11359  * It returns the code point in utf8 for the value in *encp.
11360  *    value: a code value in the source encoding
11361  *    encp:  a pointer to an Encode object
11362  *
11363  * If the result from Encode is not a single character,
11364  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11365  */
11366 STATIC UV
11367 S_reg_recode(pTHX_ const char value, SV **encp)
11368 {
11369  STRLEN numlen = 1;
11370  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11371  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11372  const STRLEN newlen = SvCUR(sv);
11373  UV uv = UNICODE_REPLACEMENT;
11374
11375  PERL_ARGS_ASSERT_REG_RECODE;
11376
11377  if (newlen)
11378   uv = SvUTF8(sv)
11379    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11380    : *(U8*)s;
11381
11382  if (!newlen || numlen != newlen) {
11383   uv = UNICODE_REPLACEMENT;
11384   *encp = NULL;
11385  }
11386  return uv;
11387 }
11388
11389 PERL_STATIC_INLINE U8
11390 S_compute_EXACTish(RExC_state_t *pRExC_state)
11391 {
11392  U8 op;
11393
11394  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11395
11396  if (! FOLD) {
11397   return (LOC)
11398     ? EXACTL
11399     : EXACT;
11400  }
11401
11402  op = get_regex_charset(RExC_flags);
11403  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11404   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11405     been, so there is no hole */
11406  }
11407
11408  return op + EXACTF;
11409 }
11410
11411 PERL_STATIC_INLINE void
11412 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11413       regnode *node, I32* flagp, STRLEN len, UV code_point,
11414       bool downgradable)
11415 {
11416  /* This knows the details about sizing an EXACTish node, setting flags for
11417  * it (by setting <*flagp>, and potentially populating it with a single
11418  * character.
11419  *
11420  * If <len> (the length in bytes) is non-zero, this function assumes that
11421  * the node has already been populated, and just does the sizing.  In this
11422  * case <code_point> should be the final code point that has already been
11423  * placed into the node.  This value will be ignored except that under some
11424  * circumstances <*flagp> is set based on it.
11425  *
11426  * If <len> is zero, the function assumes that the node is to contain only
11427  * the single character given by <code_point> and calculates what <len>
11428  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11429  * additionally will populate the node's STRING with <code_point> or its
11430  * fold if folding.
11431  *
11432  * In both cases <*flagp> is appropriately set
11433  *
11434  * It knows that under FOLD, the Latin Sharp S and UTF characters above
11435  * 255, must be folded (the former only when the rules indicate it can
11436  * match 'ss')
11437  *
11438  * When it does the populating, it looks at the flag 'downgradable'.  If
11439  * true with a node that folds, it checks if the single code point
11440  * participates in a fold, and if not downgrades the node to an EXACT.
11441  * This helps the optimizer */
11442
11443  bool len_passed_in = cBOOL(len != 0);
11444  U8 character[UTF8_MAXBYTES_CASE+1];
11445
11446  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11447
11448  /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11449  * sizing difference, and is extra work that is thrown away */
11450  if (downgradable && ! PASS2) {
11451   downgradable = FALSE;
11452  }
11453
11454  if (! len_passed_in) {
11455   if (UTF) {
11456    if (UVCHR_IS_INVARIANT(code_point)) {
11457     if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11458      *character = (U8) code_point;
11459     }
11460     else { /* Here is /i and not /l. (toFOLD() is defined on just
11461       ASCII, which isn't the same thing as INVARIANT on
11462       EBCDIC, but it works there, as the extra invariants
11463       fold to themselves) */
11464      *character = toFOLD((U8) code_point);
11465
11466      /* We can downgrade to an EXACT node if this character
11467      * isn't a folding one.  Note that this assumes that
11468      * nothing above Latin1 folds to some other invariant than
11469      * one of these alphabetics; otherwise we would also have
11470      * to check:
11471      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11472      *      || ASCII_FOLD_RESTRICTED))
11473      */
11474      if (downgradable && PL_fold[code_point] == code_point) {
11475       OP(node) = EXACT;
11476      }
11477     }
11478     len = 1;
11479    }
11480    else if (FOLD && (! LOC
11481        || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11482    {   /* Folding, and ok to do so now */
11483     UV folded = _to_uni_fold_flags(
11484         code_point,
11485         character,
11486         &len,
11487         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11488              ? FOLD_FLAGS_NOMIX_ASCII
11489              : 0));
11490     if (downgradable
11491      && folded == code_point /* This quickly rules out many
11492            cases, avoiding the
11493            _invlist_contains_cp() overhead
11494            for those.  */
11495      && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11496     {
11497      OP(node) = (LOC)
11498        ? EXACTL
11499        : EXACT;
11500     }
11501    }
11502    else if (code_point <= MAX_UTF8_TWO_BYTE) {
11503
11504     /* Not folding this cp, and can output it directly */
11505     *character = UTF8_TWO_BYTE_HI(code_point);
11506     *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11507     len = 2;
11508    }
11509    else {
11510     uvchr_to_utf8( character, code_point);
11511     len = UTF8SKIP(character);
11512    }
11513   } /* Else pattern isn't UTF8.  */
11514   else if (! FOLD) {
11515    *character = (U8) code_point;
11516    len = 1;
11517   } /* Else is folded non-UTF8 */
11518   else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11519
11520    /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11521    * comments at join_exact()); */
11522    *character = (U8) code_point;
11523    len = 1;
11524
11525    /* Can turn into an EXACT node if we know the fold at compile time,
11526    * and it folds to itself and doesn't particpate in other folds */
11527    if (downgradable
11528     && ! LOC
11529     && PL_fold_latin1[code_point] == code_point
11530     && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11531      || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11532    {
11533     OP(node) = EXACT;
11534    }
11535   } /* else is Sharp s.  May need to fold it */
11536   else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11537    *character = 's';
11538    *(character + 1) = 's';
11539    len = 2;
11540   }
11541   else {
11542    *character = LATIN_SMALL_LETTER_SHARP_S;
11543    len = 1;
11544   }
11545  }
11546
11547  if (SIZE_ONLY) {
11548   RExC_size += STR_SZ(len);
11549  }
11550  else {
11551   RExC_emit += STR_SZ(len);
11552   STR_LEN(node) = len;
11553   if (! len_passed_in) {
11554    Copy((char *) character, STRING(node), len, char);
11555   }
11556  }
11557
11558  *flagp |= HASWIDTH;
11559
11560  /* A single character node is SIMPLE, except for the special-cased SHARP S
11561  * under /di. */
11562  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11563   && (code_point != LATIN_SMALL_LETTER_SHARP_S
11564    || ! FOLD || ! DEPENDS_SEMANTICS))
11565  {
11566   *flagp |= SIMPLE;
11567  }
11568
11569  /* The OP may not be well defined in PASS1 */
11570  if (PASS2 && OP(node) == EXACTFL) {
11571   RExC_contains_locale = 1;
11572  }
11573 }
11574
11575
11576 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11577  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11578
11579 static I32
11580 S_backref_value(char *p)
11581 {
11582  const char* endptr;
11583  UV val;
11584  if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
11585   return (I32)val;
11586  return I32_MAX;
11587 }
11588
11589
11590 /*
11591  - regatom - the lowest level
11592
11593    Try to identify anything special at the start of the pattern. If there
11594    is, then handle it as required. This may involve generating a single regop,
11595    such as for an assertion; or it may involve recursing, such as to
11596    handle a () structure.
11597
11598    If the string doesn't start with something special then we gobble up
11599    as much literal text as we can.
11600
11601    Once we have been able to handle whatever type of thing started the
11602    sequence, we return.
11603
11604    Note: we have to be careful with escapes, as they can be both literal
11605    and special, and in the case of \10 and friends, context determines which.
11606
11607    A summary of the code structure is:
11608
11609    switch (first_byte) {
11610   cases for each special:
11611    handle this special;
11612    break;
11613   case '\\':
11614    switch (2nd byte) {
11615     cases for each unambiguous special:
11616      handle this special;
11617      break;
11618     cases for each ambigous special/literal:
11619      disambiguate;
11620      if (special)  handle here
11621      else goto defchar;
11622     default: // unambiguously literal:
11623      goto defchar;
11624    }
11625   default:  // is a literal char
11626    // FALL THROUGH
11627   defchar:
11628    create EXACTish node for literal;
11629    while (more input and node isn't full) {
11630     switch (input_byte) {
11631     cases for each special;
11632      make sure parse pointer is set so that the next call to
11633       regatom will see this special first
11634      goto loopdone; // EXACTish node terminated by prev. char
11635     default:
11636      append char to EXACTISH node;
11637     }
11638     get next input byte;
11639    }
11640   loopdone:
11641    }
11642    return the generated node;
11643
11644    Specifically there are two separate switches for handling
11645    escape sequences, with the one for handling literal escapes requiring
11646    a dummy entry for all of the special escapes that are actually handled
11647    by the other.
11648
11649    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11650    TRYAGAIN.
11651    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11652    restarted.
11653    Otherwise does not return NULL.
11654 */
11655
11656 STATIC regnode *
11657 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11658 {
11659  regnode *ret = NULL;
11660  I32 flags = 0;
11661  char *parse_start = RExC_parse;
11662  U8 op;
11663  int invert = 0;
11664  U8 arg;
11665
11666  GET_RE_DEBUG_FLAGS_DECL;
11667
11668  *flagp = WORST;  /* Tentatively. */
11669
11670  DEBUG_PARSE("atom");
11671
11672  PERL_ARGS_ASSERT_REGATOM;
11673
11674   tryagain:
11675  switch ((U8)*RExC_parse) {
11676  case '^':
11677   RExC_seen_zerolen++;
11678   nextchar(pRExC_state);
11679   if (RExC_flags & RXf_PMf_MULTILINE)
11680    ret = reg_node(pRExC_state, MBOL);
11681   else
11682    ret = reg_node(pRExC_state, SBOL);
11683   Set_Node_Length(ret, 1); /* MJD */
11684   break;
11685  case '$':
11686   nextchar(pRExC_state);
11687   if (*RExC_parse)
11688    RExC_seen_zerolen++;
11689   if (RExC_flags & RXf_PMf_MULTILINE)
11690    ret = reg_node(pRExC_state, MEOL);
11691   else
11692    ret = reg_node(pRExC_state, SEOL);
11693   Set_Node_Length(ret, 1); /* MJD */
11694   break;
11695  case '.':
11696   nextchar(pRExC_state);
11697   if (RExC_flags & RXf_PMf_SINGLELINE)
11698    ret = reg_node(pRExC_state, SANY);
11699   else
11700    ret = reg_node(pRExC_state, REG_ANY);
11701   *flagp |= HASWIDTH|SIMPLE;
11702   MARK_NAUGHTY(1);
11703   Set_Node_Length(ret, 1); /* MJD */
11704   break;
11705  case '[':
11706  {
11707   char * const oregcomp_parse = ++RExC_parse;
11708   ret = regclass(pRExC_state, flagp,depth+1,
11709      FALSE, /* means parse the whole char class */
11710      TRUE, /* allow multi-char folds */
11711      FALSE, /* don't silence non-portable warnings. */
11712      (bool) RExC_strict,
11713      NULL);
11714   if (*RExC_parse != ']') {
11715    RExC_parse = oregcomp_parse;
11716    vFAIL("Unmatched [");
11717   }
11718   if (ret == NULL) {
11719    if (*flagp & RESTART_UTF8)
11720     return NULL;
11721    FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11722     (UV) *flagp);
11723   }
11724   nextchar(pRExC_state);
11725   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11726   break;
11727  }
11728  case '(':
11729   nextchar(pRExC_state);
11730   ret = reg(pRExC_state, 2, &flags,depth+1);
11731   if (ret == NULL) {
11732     if (flags & TRYAGAIN) {
11733      if (RExC_parse == RExC_end) {
11734       /* Make parent create an empty node if needed. */
11735       *flagp |= TRYAGAIN;
11736       return(NULL);
11737      }
11738      goto tryagain;
11739     }
11740     if (flags & RESTART_UTF8) {
11741      *flagp = RESTART_UTF8;
11742      return NULL;
11743     }
11744     FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11745                 (UV) flags);
11746   }
11747   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11748   break;
11749  case '|':
11750  case ')':
11751   if (flags & TRYAGAIN) {
11752    *flagp |= TRYAGAIN;
11753    return NULL;
11754   }
11755   vFAIL("Internal urp");
11756         /* Supposed to be caught earlier. */
11757   break;
11758  case '?':
11759  case '+':
11760  case '*':
11761   RExC_parse++;
11762   vFAIL("Quantifier follows nothing");
11763   break;
11764  case '\\':
11765   /* Special Escapes
11766
11767   This switch handles escape sequences that resolve to some kind
11768   of special regop and not to literal text. Escape sequnces that
11769   resolve to literal text are handled below in the switch marked
11770   "Literal Escapes".
11771
11772   Every entry in this switch *must* have a corresponding entry
11773   in the literal escape switch. However, the opposite is not
11774   required, as the default for this switch is to jump to the
11775   literal text handling code.
11776   */
11777   switch ((U8)*++RExC_parse) {
11778   /* Special Escapes */
11779   case 'A':
11780    RExC_seen_zerolen++;
11781    ret = reg_node(pRExC_state, SBOL);
11782    /* SBOL is shared with /^/ so we set the flags so we can tell
11783    * /\A/ from /^/ in split. We check ret because first pass we
11784    * have no regop struct to set the flags on. */
11785    if (PASS2)
11786     ret->flags = 1;
11787    *flagp |= SIMPLE;
11788    goto finish_meta_pat;
11789   case 'G':
11790    ret = reg_node(pRExC_state, GPOS);
11791    RExC_seen |= REG_GPOS_SEEN;
11792    *flagp |= SIMPLE;
11793    goto finish_meta_pat;
11794   case 'K':
11795    RExC_seen_zerolen++;
11796    ret = reg_node(pRExC_state, KEEPS);
11797    *flagp |= SIMPLE;
11798    /* XXX:dmq : disabling in-place substitution seems to
11799    * be necessary here to avoid cases of memory corruption, as
11800    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11801    */
11802    RExC_seen |= REG_LOOKBEHIND_SEEN;
11803    goto finish_meta_pat;
11804   case 'Z':
11805    ret = reg_node(pRExC_state, SEOL);
11806    *flagp |= SIMPLE;
11807    RExC_seen_zerolen++;  /* Do not optimize RE away */
11808    goto finish_meta_pat;
11809   case 'z':
11810    ret = reg_node(pRExC_state, EOS);
11811    *flagp |= SIMPLE;
11812    RExC_seen_zerolen++;  /* Do not optimize RE away */
11813    goto finish_meta_pat;
11814   case 'C':
11815    ret = reg_node(pRExC_state, CANY);
11816    RExC_seen |= REG_CANY_SEEN;
11817    *flagp |= HASWIDTH|SIMPLE;
11818    if (PASS2) {
11819     ckWARNdep(RExC_parse+1, "\\C is deprecated");
11820    }
11821    goto finish_meta_pat;
11822   case 'X':
11823    ret = reg_node(pRExC_state, CLUMP);
11824    *flagp |= HASWIDTH;
11825    goto finish_meta_pat;
11826
11827   case 'W':
11828    invert = 1;
11829    /* FALLTHROUGH */
11830   case 'w':
11831    arg = ANYOF_WORDCHAR;
11832    goto join_posix;
11833
11834   case 'B':
11835    invert = 1;
11836    /* FALLTHROUGH */
11837   case 'b':
11838   {
11839    regex_charset charset = get_regex_charset(RExC_flags);
11840
11841    RExC_seen_zerolen++;
11842    RExC_seen |= REG_LOOKBEHIND_SEEN;
11843    op = BOUND + charset;
11844
11845    if (op == BOUNDL) {
11846     RExC_contains_locale = 1;
11847    }
11848
11849    ret = reg_node(pRExC_state, op);
11850    *flagp |= SIMPLE;
11851    if (*(RExC_parse + 1) != '{') {
11852     FLAGS(ret) = TRADITIONAL_BOUND;
11853     if (PASS2 && op > BOUNDA) {  /* /aa is same as /a */
11854      OP(ret) = BOUNDA;
11855     }
11856    }
11857    else {
11858     STRLEN length;
11859     char name = *RExC_parse;
11860     char * endbrace;
11861     RExC_parse += 2;
11862     endbrace = strchr(RExC_parse, '}');
11863
11864     if (! endbrace) {
11865      vFAIL2("Missing right brace on \\%c{}", name);
11866     }
11867     /* XXX Need to decide whether to take spaces or not.  Should be
11868     * consistent with \p{}, but that currently is SPACE, which
11869     * means vertical too, which seems wrong
11870     * while (isBLANK(*RExC_parse)) {
11871      RExC_parse++;
11872     }*/
11873     if (endbrace == RExC_parse) {
11874      RExC_parse++;  /* After the '}' */
11875      vFAIL2("Empty \\%c{}", name);
11876     }
11877     length = endbrace - RExC_parse;
11878     /*while (isBLANK(*(RExC_parse + length - 1))) {
11879      length--;
11880     }*/
11881     switch (*RExC_parse) {
11882      case 'g':
11883       if (length != 1
11884        && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
11885       {
11886        goto bad_bound_type;
11887       }
11888       FLAGS(ret) = GCB_BOUND;
11889       break;
11890      case 's':
11891       if (length != 2 || *(RExC_parse + 1) != 'b') {
11892        goto bad_bound_type;
11893       }
11894       FLAGS(ret) = SB_BOUND;
11895       break;
11896      case 'w':
11897       if (length != 2 || *(RExC_parse + 1) != 'b') {
11898        goto bad_bound_type;
11899       }
11900       FLAGS(ret) = WB_BOUND;
11901       break;
11902      default:
11903      bad_bound_type:
11904       RExC_parse = endbrace;
11905       vFAIL2utf8f(
11906        "'%"UTF8f"' is an unknown bound type",
11907        UTF8fARG(UTF, length, endbrace - length));
11908       NOT_REACHED; /*NOTREACHED*/
11909     }
11910     RExC_parse = endbrace;
11911     RExC_uni_semantics = 1;
11912
11913     if (PASS2 && op >= BOUNDA) {  /* /aa is same as /a */
11914      OP(ret) = BOUNDU;
11915      length += 4;
11916
11917      /* Don't have to worry about UTF-8, in this message because
11918      * to get here the contents of the \b must be ASCII */
11919      ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
11920        "Using /u for '%.*s' instead of /%s",
11921        (unsigned) length,
11922        endbrace - length + 1,
11923        (charset == REGEX_ASCII_RESTRICTED_CHARSET)
11924        ? ASCII_RESTRICT_PAT_MODS
11925        : ASCII_MORE_RESTRICT_PAT_MODS);
11926     }
11927    }
11928
11929    if (PASS2 && invert) {
11930     OP(ret) += NBOUND - BOUND;
11931    }
11932    goto finish_meta_pat;
11933   }
11934
11935   case 'D':
11936    invert = 1;
11937    /* FALLTHROUGH */
11938   case 'd':
11939    arg = ANYOF_DIGIT;
11940    if (! DEPENDS_SEMANTICS) {
11941     goto join_posix;
11942    }
11943
11944    /* \d doesn't have any matches in the upper Latin1 range, hence /d
11945    * is equivalent to /u.  Changing to /u saves some branches at
11946    * runtime */
11947    op = POSIXU;
11948    goto join_posix_op_known;
11949
11950   case 'R':
11951    ret = reg_node(pRExC_state, LNBREAK);
11952    *flagp |= HASWIDTH|SIMPLE;
11953    goto finish_meta_pat;
11954
11955   case 'H':
11956    invert = 1;
11957    /* FALLTHROUGH */
11958   case 'h':
11959    arg = ANYOF_BLANK;
11960    op = POSIXU;
11961    goto join_posix_op_known;
11962
11963   case 'V':
11964    invert = 1;
11965    /* FALLTHROUGH */
11966   case 'v':
11967    arg = ANYOF_VERTWS;
11968    op = POSIXU;
11969    goto join_posix_op_known;
11970
11971   case 'S':
11972    invert = 1;
11973    /* FALLTHROUGH */
11974   case 's':
11975    arg = ANYOF_SPACE;
11976
11977   join_posix:
11978
11979    op = POSIXD + get_regex_charset(RExC_flags);
11980    if (op > POSIXA) {  /* /aa is same as /a */
11981     op = POSIXA;
11982    }
11983    else if (op == POSIXL) {
11984     RExC_contains_locale = 1;
11985    }
11986
11987   join_posix_op_known:
11988
11989    if (invert) {
11990     op += NPOSIXD - POSIXD;
11991    }
11992
11993    ret = reg_node(pRExC_state, op);
11994    if (! SIZE_ONLY) {
11995     FLAGS(ret) = namedclass_to_classnum(arg);
11996    }
11997
11998    *flagp |= HASWIDTH|SIMPLE;
11999    /* FALLTHROUGH */
12000
12001   finish_meta_pat:
12002    nextchar(pRExC_state);
12003    Set_Node_Length(ret, 2); /* MJD */
12004    break;
12005   case 'p':
12006   case 'P':
12007    {
12008 #ifdef DEBUGGING
12009     char* parse_start = RExC_parse - 2;
12010 #endif
12011
12012     RExC_parse--;
12013
12014     ret = regclass(pRExC_state, flagp,depth+1,
12015        TRUE, /* means just parse this element */
12016        FALSE, /* don't allow multi-char folds */
12017        FALSE, /* don't silence non-portable warnings.
12018           It would be a bug if these returned
12019           non-portables */
12020        (bool) RExC_strict,
12021        NULL);
12022     /* regclass() can only return RESTART_UTF8 if multi-char folds
12023     are allowed.  */
12024     if (!ret)
12025      FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12026       (UV) *flagp);
12027
12028     RExC_parse--;
12029
12030     Set_Node_Offset(ret, parse_start + 2);
12031     Set_Node_Cur_Length(ret, parse_start);
12032     nextchar(pRExC_state);
12033    }
12034    break;
12035   case 'N':
12036    /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
12037    * \N{...} evaluates to a sequence of more than one code points).
12038    * The function call below returns a regnode, which is our result.
12039    * The parameters cause it to fail if the \N{} evaluates to a
12040    * single code point; we handle those like any other literal.  The
12041    * reason that the multicharacter case is handled here and not as
12042    * part of the EXACtish code is because of quantifiers.  In
12043    * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
12044    * this way makes that Just Happen. dmq.
12045    * join_exact() will join this up with adjacent EXACTish nodes
12046    * later on, if appropriate. */
12047    ++RExC_parse;
12048    if (grok_bslash_N(pRExC_state,
12049        &ret,     /* Want a regnode returned */
12050        NULL,     /* Fail if evaluates to a single code
12051           point */
12052        NULL,     /* Don't need a count of how many code
12053           points */
12054        flagp,
12055        depth)
12056    ) {
12057     break;
12058    }
12059
12060    if (*flagp & RESTART_UTF8)
12061     return NULL;
12062    RExC_parse--;
12063    goto defchar;
12064
12065   case 'k':    /* Handle \k<NAME> and \k'NAME' */
12066  parse_named_seq:
12067   {
12068    char ch= RExC_parse[1];
12069    if (ch != '<' && ch != '\'' && ch != '{') {
12070     RExC_parse++;
12071     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12072     vFAIL2("Sequence %.2s... not terminated",parse_start);
12073    } else {
12074     /* this pretty much dupes the code for (?P=...) in reg(), if
12075     you change this make sure you change that */
12076     char* name_start = (RExC_parse += 2);
12077     U32 num = 0;
12078     SV *sv_dat = reg_scan_name(pRExC_state,
12079      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
12080     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
12081     if (RExC_parse == name_start || *RExC_parse != ch)
12082      /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12083      vFAIL2("Sequence %.3s... not terminated",parse_start);
12084
12085     if (!SIZE_ONLY) {
12086      num = add_data( pRExC_state, STR_WITH_LEN("S"));
12087      RExC_rxi->data->data[num]=(void*)sv_dat;
12088      SvREFCNT_inc_simple_void(sv_dat);
12089     }
12090
12091     RExC_sawback = 1;
12092     ret = reganode(pRExC_state,
12093        ((! FOLD)
12094         ? NREF
12095         : (ASCII_FOLD_RESTRICTED)
12096         ? NREFFA
12097         : (AT_LEAST_UNI_SEMANTICS)
12098          ? NREFFU
12099          : (LOC)
12100          ? NREFFL
12101          : NREFF),
12102         num);
12103     *flagp |= HASWIDTH;
12104
12105     /* override incorrect value set in reganode MJD */
12106     Set_Node_Offset(ret, parse_start+1);
12107     Set_Node_Cur_Length(ret, parse_start);
12108     nextchar(pRExC_state);
12109
12110    }
12111    break;
12112   }
12113   case 'g':
12114   case '1': case '2': case '3': case '4':
12115   case '5': case '6': case '7': case '8': case '9':
12116    {
12117     I32 num;
12118     bool hasbrace = 0;
12119
12120     if (*RExC_parse == 'g') {
12121      bool isrel = 0;
12122
12123      RExC_parse++;
12124      if (*RExC_parse == '{') {
12125       RExC_parse++;
12126       hasbrace = 1;
12127      }
12128      if (*RExC_parse == '-') {
12129       RExC_parse++;
12130       isrel = 1;
12131      }
12132      if (hasbrace && !isDIGIT(*RExC_parse)) {
12133       if (isrel) RExC_parse--;
12134       RExC_parse -= 2;
12135       goto parse_named_seq;
12136      }
12137
12138      num = S_backref_value(RExC_parse);
12139      if (num == 0)
12140       vFAIL("Reference to invalid group 0");
12141      else if (num == I32_MAX) {
12142       if (isDIGIT(*RExC_parse))
12143        vFAIL("Reference to nonexistent group");
12144       else
12145        vFAIL("Unterminated \\g... pattern");
12146      }
12147
12148      if (isrel) {
12149       num = RExC_npar - num;
12150       if (num < 1)
12151        vFAIL("Reference to nonexistent or unclosed group");
12152      }
12153     }
12154     else {
12155      num = S_backref_value(RExC_parse);
12156      /* bare \NNN might be backref or octal - if it is larger
12157      * than or equal RExC_npar then it is assumed to be an
12158      * octal escape. Note RExC_npar is +1 from the actual
12159      * number of parens. */
12160      /* Note we do NOT check if num == I32_MAX here, as that is
12161      * handled by the RExC_npar check */
12162
12163      if (
12164       /* any numeric escape < 10 is always a backref */
12165       num > 9
12166       /* any numeric escape < RExC_npar is a backref */
12167       && num >= RExC_npar
12168       /* cannot be an octal escape if it starts with 8 */
12169       && *RExC_parse != '8'
12170       /* cannot be an octal escape it it starts with 9 */
12171       && *RExC_parse != '9'
12172      )
12173      {
12174       /* Probably not a backref, instead likely to be an
12175       * octal character escape, e.g. \35 or \777.
12176       * The above logic should make it obvious why using
12177       * octal escapes in patterns is problematic. - Yves */
12178       goto defchar;
12179      }
12180     }
12181
12182     /* At this point RExC_parse points at a numeric escape like
12183     * \12 or \88 or something similar, which we should NOT treat
12184     * as an octal escape. It may or may not be a valid backref
12185     * escape. For instance \88888888 is unlikely to be a valid
12186     * backref. */
12187     {
12188 #ifdef RE_TRACK_PATTERN_OFFSETS
12189      char * const parse_start = RExC_parse - 1; /* MJD */
12190 #endif
12191      while (isDIGIT(*RExC_parse))
12192       RExC_parse++;
12193      if (hasbrace) {
12194       if (*RExC_parse != '}')
12195        vFAIL("Unterminated \\g{...} pattern");
12196       RExC_parse++;
12197      }
12198      if (!SIZE_ONLY) {
12199       if (num > (I32)RExC_rx->nparens)
12200        vFAIL("Reference to nonexistent group");
12201      }
12202      RExC_sawback = 1;
12203      ret = reganode(pRExC_state,
12204         ((! FOLD)
12205          ? REF
12206          : (ASCII_FOLD_RESTRICTED)
12207          ? REFFA
12208          : (AT_LEAST_UNI_SEMANTICS)
12209           ? REFFU
12210           : (LOC)
12211           ? REFFL
12212           : REFF),
12213          num);
12214      *flagp |= HASWIDTH;
12215
12216      /* override incorrect value set in reganode MJD */
12217      Set_Node_Offset(ret, parse_start+1);
12218      Set_Node_Cur_Length(ret, parse_start);
12219      RExC_parse--;
12220      nextchar(pRExC_state);
12221     }
12222    }
12223    break;
12224   case '\0':
12225    if (RExC_parse >= RExC_end)
12226     FAIL("Trailing \\");
12227    /* FALLTHROUGH */
12228   default:
12229    /* Do not generate "unrecognized" warnings here, we fall
12230    back into the quick-grab loop below */
12231    parse_start--;
12232    goto defchar;
12233   }
12234   break;
12235
12236  case '#':
12237   if (RExC_flags & RXf_PMf_EXTENDED) {
12238    RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12239    if (RExC_parse < RExC_end)
12240     goto tryagain;
12241   }
12242   /* FALLTHROUGH */
12243
12244  default:
12245
12246    parse_start = RExC_parse - 1;
12247
12248    RExC_parse++;
12249
12250   defchar: {
12251    STRLEN len = 0;
12252    UV ender = 0;
12253    char *p;
12254    char *s;
12255 #define MAX_NODE_STRING_SIZE 127
12256    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12257    char *s0;
12258    U8 upper_parse = MAX_NODE_STRING_SIZE;
12259    U8 node_type = compute_EXACTish(pRExC_state);
12260    bool next_is_quantifier;
12261    char * oldp = NULL;
12262
12263    /* We can convert EXACTF nodes to EXACTFU if they contain only
12264    * characters that match identically regardless of the target
12265    * string's UTF8ness.  The reason to do this is that EXACTF is not
12266    * trie-able, EXACTFU is.
12267    *
12268    * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12269    * contain only above-Latin1 characters (hence must be in UTF8),
12270    * which don't participate in folds with Latin1-range characters,
12271    * as the latter's folds aren't known until runtime.  (We don't
12272    * need to figure this out until pass 2) */
12273    bool maybe_exactfu = PASS2
12274        && (node_type == EXACTF || node_type == EXACTFL);
12275
12276    /* If a folding node contains only code points that don't
12277    * participate in folds, it can be changed into an EXACT node,
12278    * which allows the optimizer more things to look for */
12279    bool maybe_exact;
12280
12281    ret = reg_node(pRExC_state, node_type);
12282
12283    /* In pass1, folded, we use a temporary buffer instead of the
12284    * actual node, as the node doesn't exist yet */
12285    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12286
12287    s0 = s;
12288
12289   reparse:
12290
12291    /* We do the EXACTFish to EXACT node only if folding.  (And we
12292    * don't need to figure this out until pass 2) */
12293    maybe_exact = FOLD && PASS2;
12294
12295    /* XXX The node can hold up to 255 bytes, yet this only goes to
12296    * 127.  I (khw) do not know why.  Keeping it somewhat less than
12297    * 255 allows us to not have to worry about overflow due to
12298    * converting to utf8 and fold expansion, but that value is
12299    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
12300    * split up by this limit into a single one using the real max of
12301    * 255.  Even at 127, this breaks under rare circumstances.  If
12302    * folding, we do not want to split a node at a character that is a
12303    * non-final in a multi-char fold, as an input string could just
12304    * happen to want to match across the node boundary.  The join
12305    * would solve that problem if the join actually happens.  But a
12306    * series of more than two nodes in a row each of 127 would cause
12307    * the first join to succeed to get to 254, but then there wouldn't
12308    * be room for the next one, which could at be one of those split
12309    * multi-char folds.  I don't know of any fool-proof solution.  One
12310    * could back off to end with only a code point that isn't such a
12311    * non-final, but it is possible for there not to be any in the
12312    * entire node. */
12313    for (p = RExC_parse - 1;
12314     len < upper_parse && p < RExC_end;
12315     len++)
12316    {
12317     oldp = p;
12318
12319     if (RExC_flags & RXf_PMf_EXTENDED)
12320      p = regpatws(pRExC_state, p,
12321           TRUE); /* means recognize comments */
12322     switch ((U8)*p) {
12323     case '^':
12324     case '$':
12325     case '.':
12326     case '[':
12327     case '(':
12328     case ')':
12329     case '|':
12330      goto loopdone;
12331     case '\\':
12332      /* Literal Escapes Switch
12333
12334      This switch is meant to handle escape sequences that
12335      resolve to a literal character.
12336
12337      Every escape sequence that represents something
12338      else, like an assertion or a char class, is handled
12339      in the switch marked 'Special Escapes' above in this
12340      routine, but also has an entry here as anything that
12341      isn't explicitly mentioned here will be treated as
12342      an unescaped equivalent literal.
12343      */
12344
12345      switch ((U8)*++p) {
12346      /* These are all the special escapes. */
12347      case 'A':             /* Start assertion */
12348      case 'b': case 'B':   /* Word-boundary assertion*/
12349      case 'C':             /* Single char !DANGEROUS! */
12350      case 'd': case 'D':   /* digit class */
12351      case 'g': case 'G':   /* generic-backref, pos assertion */
12352      case 'h': case 'H':   /* HORIZWS */
12353      case 'k': case 'K':   /* named backref, keep marker */
12354      case 'p': case 'P':   /* Unicode property */
12355        case 'R':   /* LNBREAK */
12356      case 's': case 'S':   /* space class */
12357      case 'v': case 'V':   /* VERTWS */
12358      case 'w': case 'W':   /* word class */
12359      case 'X':             /* eXtended Unicode "combining
12360            character sequence" */
12361      case 'z': case 'Z':   /* End of line/string assertion */
12362       --p;
12363       goto loopdone;
12364
12365      /* Anything after here is an escape that resolves to a
12366      literal. (Except digits, which may or may not)
12367      */
12368      case 'n':
12369       ender = '\n';
12370       p++;
12371       break;
12372      case 'N': /* Handle a single-code point named character. */
12373       RExC_parse = p + 1;
12374       if (! grok_bslash_N(pRExC_state,
12375            NULL,   /* Fail if evaluates to
12376              anything other than a
12377              single code point */
12378            &ender, /* The returned single code
12379              point */
12380            NULL,   /* Don't need a count of
12381              how many code points */
12382            flagp,
12383            depth)
12384       ) {
12385        if (*flagp & RESTART_UTF8)
12386         FAIL("panic: grok_bslash_N set RESTART_UTF8");
12387
12388        /* Here, it wasn't a single code point.  Go close
12389        * up this EXACTish node.  The switch() prior to
12390        * this switch handles the other cases */
12391        RExC_parse = p = oldp;
12392        goto loopdone;
12393       }
12394       p = RExC_parse;
12395       if (ender > 0xff) {
12396        REQUIRE_UTF8;
12397       }
12398       break;
12399      case 'r':
12400       ender = '\r';
12401       p++;
12402       break;
12403      case 't':
12404       ender = '\t';
12405       p++;
12406       break;
12407      case 'f':
12408       ender = '\f';
12409       p++;
12410       break;
12411      case 'e':
12412       ender = ESC_NATIVE;
12413       p++;
12414       break;
12415      case 'a':
12416       ender = '\a';
12417       p++;
12418       break;
12419      case 'o':
12420       {
12421        UV result;
12422        const char* error_msg;
12423
12424        bool valid = grok_bslash_o(&p,
12425              &result,
12426              &error_msg,
12427              PASS2, /* out warnings */
12428              (bool) RExC_strict,
12429              TRUE, /* Output warnings
12430                 for non-
12431                 portables */
12432              UTF);
12433        if (! valid) {
12434         RExC_parse = p; /* going to die anyway; point
12435             to exact spot of failure */
12436         vFAIL(error_msg);
12437        }
12438        ender = result;
12439        if (IN_ENCODING && ender < 0x100) {
12440         goto recode_encoding;
12441        }
12442        if (ender > 0xff) {
12443         REQUIRE_UTF8;
12444        }
12445        break;
12446       }
12447      case 'x':
12448       {
12449        UV result = UV_MAX; /* initialize to erroneous
12450             value */
12451        const char* error_msg;
12452
12453        bool valid = grok_bslash_x(&p,
12454              &result,
12455              &error_msg,
12456              PASS2, /* out warnings */
12457              (bool) RExC_strict,
12458              TRUE, /* Silence warnings
12459                 for non-
12460                 portables */
12461              UTF);
12462        if (! valid) {
12463         RExC_parse = p; /* going to die anyway; point
12464             to exact spot of failure */
12465         vFAIL(error_msg);
12466        }
12467        ender = result;
12468
12469        if (ender < 0x100) {
12470 #ifdef EBCDIC
12471         if (RExC_recode_x_to_native) {
12472          ender = LATIN1_TO_NATIVE(ender);
12473         }
12474         else
12475 #endif
12476         if (IN_ENCODING) {
12477          goto recode_encoding;
12478         }
12479        }
12480        else {
12481         REQUIRE_UTF8;
12482        }
12483        break;
12484       }
12485      case 'c':
12486       p++;
12487       ender = grok_bslash_c(*p++, PASS2);
12488       break;
12489      case '8': case '9': /* must be a backreference */
12490       --p;
12491       /* we have an escape like \8 which cannot be an octal escape
12492       * so we exit the loop, and let the outer loop handle this
12493       * escape which may or may not be a legitimate backref. */
12494       goto loopdone;
12495      case '1': case '2': case '3':case '4':
12496      case '5': case '6': case '7':
12497       /* When we parse backslash escapes there is ambiguity
12498       * between backreferences and octal escapes. Any escape
12499       * from \1 - \9 is a backreference, any multi-digit
12500       * escape which does not start with 0 and which when
12501       * evaluated as decimal could refer to an already
12502       * parsed capture buffer is a back reference. Anything
12503       * else is octal.
12504       *
12505       * Note this implies that \118 could be interpreted as
12506       * 118 OR as "\11" . "8" depending on whether there
12507       * were 118 capture buffers defined already in the
12508       * pattern.  */
12509
12510       /* NOTE, RExC_npar is 1 more than the actual number of
12511       * parens we have seen so far, hence the < RExC_npar below. */
12512
12513       if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12514       {  /* Not to be treated as an octal constant, go
12515         find backref */
12516        --p;
12517        goto loopdone;
12518       }
12519       /* FALLTHROUGH */
12520      case '0':
12521       {
12522        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12523        STRLEN numlen = 3;
12524        ender = grok_oct(p, &numlen, &flags, NULL);
12525        if (ender > 0xff) {
12526         REQUIRE_UTF8;
12527        }
12528        p += numlen;
12529        if (PASS2   /* like \08, \178 */
12530         && numlen < 3
12531         && p < RExC_end
12532         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12533        {
12534         reg_warn_non_literal_string(
12535           p + 1,
12536           form_short_octal_warning(p, numlen));
12537        }
12538       }
12539       if (IN_ENCODING && ender < 0x100)
12540        goto recode_encoding;
12541       break;
12542      recode_encoding:
12543       if (! RExC_override_recoding) {
12544        SV* enc = _get_encoding();
12545        ender = reg_recode((const char)(U8)ender, &enc);
12546        if (!enc && PASS2)
12547         ckWARNreg(p, "Invalid escape in the specified encoding");
12548        REQUIRE_UTF8;
12549       }
12550       break;
12551      case '\0':
12552       if (p >= RExC_end)
12553        FAIL("Trailing \\");
12554       /* FALLTHROUGH */
12555      default:
12556       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12557        /* Include any { following the alpha to emphasize
12558        * that it could be part of an escape at some point
12559        * in the future */
12560        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12561        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12562       }
12563       goto normal_default;
12564      } /* End of switch on '\' */
12565      break;
12566     case '{':
12567      /* Currently we don't warn when the lbrace is at the start
12568      * of a construct.  This catches it in the middle of a
12569      * literal string, or when its the first thing after
12570      * something like "\b" */
12571      if (! SIZE_ONLY
12572       && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12573      {
12574       ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12575      }
12576      /*FALLTHROUGH*/
12577     default:    /* A literal character */
12578     normal_default:
12579      if (UTF8_IS_START(*p) && UTF) {
12580       STRLEN numlen;
12581       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12582            &numlen, UTF8_ALLOW_DEFAULT);
12583       p += numlen;
12584      }
12585      else
12586       ender = (U8) *p++;
12587      break;
12588     } /* End of switch on the literal */
12589
12590     /* Here, have looked at the literal character and <ender>
12591     * contains its ordinal, <p> points to the character after it
12592     */
12593
12594     if ( RExC_flags & RXf_PMf_EXTENDED)
12595      p = regpatws(pRExC_state, p,
12596           TRUE); /* means recognize comments */
12597
12598     /* If the next thing is a quantifier, it applies to this
12599     * character only, which means that this character has to be in
12600     * its own node and can't just be appended to the string in an
12601     * existing node, so if there are already other characters in
12602     * the node, close the node with just them, and set up to do
12603     * this character again next time through, when it will be the
12604     * only thing in its new node */
12605     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12606     {
12607      p = oldp;
12608      goto loopdone;
12609     }
12610
12611     if (! FOLD) {  /* The simple case, just append the literal */
12612
12613      /* In the sizing pass, we need only the size of the
12614      * character we are appending, hence we can delay getting
12615      * its representation until PASS2. */
12616      if (SIZE_ONLY) {
12617       if (UTF) {
12618        const STRLEN unilen = UNISKIP(ender);
12619        s += unilen;
12620
12621        /* We have to subtract 1 just below (and again in
12622        * the corresponding PASS2 code) because the loop
12623        * increments <len> each time, as all but this path
12624        * (and one other) through it add a single byte to
12625        * the EXACTish node.  But these paths would change
12626        * len to be the correct final value, so cancel out
12627        * the increment that follows */
12628        len += unilen - 1;
12629       }
12630       else {
12631        s++;
12632       }
12633      } else { /* PASS2 */
12634      not_fold_common:
12635       if (UTF) {
12636        U8 * new_s = uvchr_to_utf8((U8*)s, ender);
12637        len += (char *) new_s - s - 1;
12638        s = (char *) new_s;
12639       }
12640       else {
12641        *(s++) = (char) ender;
12642       }
12643      }
12644     }
12645     else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
12646
12647      /* Here are folding under /l, and the code point is
12648      * problematic.  First, we know we can't simplify things */
12649      maybe_exact = FALSE;
12650      maybe_exactfu = FALSE;
12651
12652      /* A problematic code point in this context means that its
12653      * fold isn't known until runtime, so we can't fold it now.
12654      * (The non-problematic code points are the above-Latin1
12655      * ones that fold to also all above-Latin1.  Their folds
12656      * don't vary no matter what the locale is.) But here we
12657      * have characters whose fold depends on the locale.
12658      * Unlike the non-folding case above, we have to keep track
12659      * of these in the sizing pass, so that we can make sure we
12660      * don't split too-long nodes in the middle of a potential
12661      * multi-char fold.  And unlike the regular fold case
12662      * handled in the else clauses below, we don't actually
12663      * fold and don't have special cases to consider.  What we
12664      * do for both passes is the PASS2 code for non-folding */
12665      goto not_fold_common;
12666     }
12667     else /* A regular FOLD code point */
12668      if (! ( UTF
12669       /* See comments for join_exact() as to why we fold this
12670       * non-UTF at compile time */
12671       || (node_type == EXACTFU
12672        && ender == LATIN_SMALL_LETTER_SHARP_S)))
12673     {
12674      /* Here, are folding and are not UTF-8 encoded; therefore
12675      * the character must be in the range 0-255, and is not /l
12676      * (Not /l because we already handled these under /l in
12677      * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12678      if (IS_IN_SOME_FOLD_L1(ender)) {
12679       maybe_exact = FALSE;
12680
12681       /* See if the character's fold differs between /d and
12682       * /u.  This includes the multi-char fold SHARP S to
12683       * 'ss' */
12684       if (maybe_exactfu
12685        && (PL_fold[ender] != PL_fold_latin1[ender]
12686         || ender == LATIN_SMALL_LETTER_SHARP_S
12687         || (len > 0
12688         && isALPHA_FOLD_EQ(ender, 's')
12689         && isALPHA_FOLD_EQ(*(s-1), 's'))))
12690       {
12691        maybe_exactfu = FALSE;
12692       }
12693      }
12694
12695      /* Even when folding, we store just the input character, as
12696      * we have an array that finds its fold quickly */
12697      *(s++) = (char) ender;
12698     }
12699     else {  /* FOLD and UTF */
12700      /* Unlike the non-fold case, we do actually have to
12701      * calculate the results here in pass 1.  This is for two
12702      * reasons, the folded length may be longer than the
12703      * unfolded, and we have to calculate how many EXACTish
12704      * nodes it will take; and we may run out of room in a node
12705      * in the middle of a potential multi-char fold, and have
12706      * to back off accordingly.  */
12707
12708      UV folded;
12709      if (isASCII_uni(ender)) {
12710       folded = toFOLD(ender);
12711       *(s)++ = (U8) folded;
12712      }
12713      else {
12714       STRLEN foldlen;
12715
12716       folded = _to_uni_fold_flags(
12717          ender,
12718          (U8 *) s,
12719          &foldlen,
12720          FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12721               ? FOLD_FLAGS_NOMIX_ASCII
12722               : 0));
12723       s += foldlen;
12724
12725       /* The loop increments <len> each time, as all but this
12726       * path (and one other) through it add a single byte to
12727       * the EXACTish node.  But this one has changed len to
12728       * be the correct final value, so subtract one to
12729       * cancel out the increment that follows */
12730       len += foldlen - 1;
12731      }
12732      /* If this node only contains non-folding code points so
12733      * far, see if this new one is also non-folding */
12734      if (maybe_exact) {
12735       if (folded != ender) {
12736        maybe_exact = FALSE;
12737       }
12738       else {
12739        /* Here the fold is the original; we have to check
12740        * further to see if anything folds to it */
12741        if (_invlist_contains_cp(PL_utf8_foldable,
12742               ender))
12743        {
12744         maybe_exact = FALSE;
12745        }
12746       }
12747      }
12748      ender = folded;
12749     }
12750
12751     if (next_is_quantifier) {
12752
12753      /* Here, the next input is a quantifier, and to get here,
12754      * the current character is the only one in the node.
12755      * Also, here <len> doesn't include the final byte for this
12756      * character */
12757      len++;
12758      goto loopdone;
12759     }
12760
12761    } /* End of loop through literal characters */
12762
12763    /* Here we have either exhausted the input or ran out of room in
12764    * the node.  (If we encountered a character that can't be in the
12765    * node, transfer is made directly to <loopdone>, and so we
12766    * wouldn't have fallen off the end of the loop.)  In the latter
12767    * case, we artificially have to split the node into two, because
12768    * we just don't have enough space to hold everything.  This
12769    * creates a problem if the final character participates in a
12770    * multi-character fold in the non-final position, as a match that
12771    * should have occurred won't, due to the way nodes are matched,
12772    * and our artificial boundary.  So back off until we find a non-
12773    * problematic character -- one that isn't at the beginning or
12774    * middle of such a fold.  (Either it doesn't participate in any
12775    * folds, or appears only in the final position of all the folds it
12776    * does participate in.)  A better solution with far fewer false
12777    * positives, and that would fill the nodes more completely, would
12778    * be to actually have available all the multi-character folds to
12779    * test against, and to back-off only far enough to be sure that
12780    * this node isn't ending with a partial one.  <upper_parse> is set
12781    * further below (if we need to reparse the node) to include just
12782    * up through that final non-problematic character that this code
12783    * identifies, so when it is set to less than the full node, we can
12784    * skip the rest of this */
12785    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12786
12787     const STRLEN full_len = len;
12788
12789     assert(len >= MAX_NODE_STRING_SIZE);
12790
12791     /* Here, <s> points to the final byte of the final character.
12792     * Look backwards through the string until find a non-
12793     * problematic character */
12794
12795     if (! UTF) {
12796
12797      /* This has no multi-char folds to non-UTF characters */
12798      if (ASCII_FOLD_RESTRICTED) {
12799       goto loopdone;
12800      }
12801
12802      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12803      len = s - s0 + 1;
12804     }
12805     else {
12806      if (!  PL_NonL1NonFinalFold) {
12807       PL_NonL1NonFinalFold = _new_invlist_C_array(
12808           NonL1_Perl_Non_Final_Folds_invlist);
12809      }
12810
12811      /* Point to the first byte of the final character */
12812      s = (char *) utf8_hop((U8 *) s, -1);
12813
12814      while (s >= s0) {   /* Search backwards until find
12815           non-problematic char */
12816       if (UTF8_IS_INVARIANT(*s)) {
12817
12818        /* There are no ascii characters that participate
12819        * in multi-char folds under /aa.  In EBCDIC, the
12820        * non-ascii invariants are all control characters,
12821        * so don't ever participate in any folds. */
12822        if (ASCII_FOLD_RESTRICTED
12823         || ! IS_NON_FINAL_FOLD(*s))
12824        {
12825         break;
12826        }
12827       }
12828       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12829        if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12830                 *s, *(s+1))))
12831        {
12832         break;
12833        }
12834       }
12835       else if (! _invlist_contains_cp(
12836           PL_NonL1NonFinalFold,
12837           valid_utf8_to_uvchr((U8 *) s, NULL)))
12838       {
12839        break;
12840       }
12841
12842       /* Here, the current character is problematic in that
12843       * it does occur in the non-final position of some
12844       * fold, so try the character before it, but have to
12845       * special case the very first byte in the string, so
12846       * we don't read outside the string */
12847       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12848      } /* End of loop backwards through the string */
12849
12850      /* If there were only problematic characters in the string,
12851      * <s> will point to before s0, in which case the length
12852      * should be 0, otherwise include the length of the
12853      * non-problematic character just found */
12854      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12855     }
12856
12857     /* Here, have found the final character, if any, that is
12858     * non-problematic as far as ending the node without splitting
12859     * it across a potential multi-char fold.  <len> contains the
12860     * number of bytes in the node up-to and including that
12861     * character, or is 0 if there is no such character, meaning
12862     * the whole node contains only problematic characters.  In
12863     * this case, give up and just take the node as-is.  We can't
12864     * do any better */
12865     if (len == 0) {
12866      len = full_len;
12867
12868      /* If the node ends in an 's' we make sure it stays EXACTF,
12869      * as if it turns into an EXACTFU, it could later get
12870      * joined with another 's' that would then wrongly match
12871      * the sharp s */
12872      if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12873      {
12874       maybe_exactfu = FALSE;
12875      }
12876     } else {
12877
12878      /* Here, the node does contain some characters that aren't
12879      * problematic.  If one such is the final character in the
12880      * node, we are done */
12881      if (len == full_len) {
12882       goto loopdone;
12883      }
12884      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12885
12886       /* If the final character is problematic, but the
12887       * penultimate is not, back-off that last character to
12888       * later start a new node with it */
12889       p = oldp;
12890       goto loopdone;
12891      }
12892
12893      /* Here, the final non-problematic character is earlier
12894      * in the input than the penultimate character.  What we do
12895      * is reparse from the beginning, going up only as far as
12896      * this final ok one, thus guaranteeing that the node ends
12897      * in an acceptable character.  The reason we reparse is
12898      * that we know how far in the character is, but we don't
12899      * know how to correlate its position with the input parse.
12900      * An alternate implementation would be to build that
12901      * correlation as we go along during the original parse,
12902      * but that would entail extra work for every node, whereas
12903      * this code gets executed only when the string is too
12904      * large for the node, and the final two characters are
12905      * problematic, an infrequent occurrence.  Yet another
12906      * possible strategy would be to save the tail of the
12907      * string, and the next time regatom is called, initialize
12908      * with that.  The problem with this is that unless you
12909      * back off one more character, you won't be guaranteed
12910      * regatom will get called again, unless regbranch,
12911      * regpiece ... are also changed.  If you do back off that
12912      * extra character, so that there is input guaranteed to
12913      * force calling regatom, you can't handle the case where
12914      * just the first character in the node is acceptable.  I
12915      * (khw) decided to try this method which doesn't have that
12916      * pitfall; if performance issues are found, we can do a
12917      * combination of the current approach plus that one */
12918      upper_parse = len;
12919      len = 0;
12920      s = s0;
12921      goto reparse;
12922     }
12923    }   /* End of verifying node ends with an appropriate char */
12924
12925   loopdone:   /* Jumped to when encounters something that shouldn't be
12926       in the node */
12927
12928    /* I (khw) don't know if you can get here with zero length, but the
12929    * old code handled this situation by creating a zero-length EXACT
12930    * node.  Might as well be NOTHING instead */
12931    if (len == 0) {
12932     OP(ret) = NOTHING;
12933    }
12934    else {
12935     if (FOLD) {
12936      /* If 'maybe_exact' is still set here, means there are no
12937      * code points in the node that participate in folds;
12938      * similarly for 'maybe_exactfu' and code points that match
12939      * differently depending on UTF8ness of the target string
12940      * (for /u), or depending on locale for /l */
12941      if (maybe_exact) {
12942       OP(ret) = (LOC)
12943         ? EXACTL
12944         : EXACT;
12945      }
12946      else if (maybe_exactfu) {
12947       OP(ret) = (LOC)
12948         ? EXACTFLU8
12949         : EXACTFU;
12950      }
12951     }
12952     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12953           FALSE /* Don't look to see if could
12954              be turned into an EXACT
12955              node, as we have already
12956              computed that */
12957           );
12958    }
12959
12960    RExC_parse = p - 1;
12961    Set_Node_Cur_Length(ret, parse_start);
12962    nextchar(pRExC_state);
12963    {
12964     /* len is STRLEN which is unsigned, need to copy to signed */
12965     IV iv = len;
12966     if (iv < 0)
12967      vFAIL("Internal disaster");
12968    }
12969
12970   } /* End of label 'defchar:' */
12971   break;
12972  } /* End of giant switch on input character */
12973
12974  return(ret);
12975 }
12976
12977 STATIC char *
12978 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12979 {
12980  /* Returns the next non-pattern-white space, non-comment character (the
12981  * latter only if 'recognize_comment is true) in the string p, which is
12982  * ended by RExC_end.  See also reg_skipcomment */
12983  const char *e = RExC_end;
12984
12985  PERL_ARGS_ASSERT_REGPATWS;
12986
12987  while (p < e) {
12988   STRLEN len;
12989   if ((len = is_PATWS_safe(p, e, UTF))) {
12990    p += len;
12991   }
12992   else if (recognize_comment && *p == '#') {
12993    p = reg_skipcomment(pRExC_state, p);
12994   }
12995   else
12996    break;
12997  }
12998  return p;
12999 }
13000
13001 STATIC void
13002 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
13003 {
13004  /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
13005  * sets up the bitmap and any flags, removing those code points from the
13006  * inversion list, setting it to NULL should it become completely empty */
13007
13008  PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
13009  assert(PL_regkind[OP(node)] == ANYOF);
13010
13011  ANYOF_BITMAP_ZERO(node);
13012  if (*invlist_ptr) {
13013
13014   /* This gets set if we actually need to modify things */
13015   bool change_invlist = FALSE;
13016
13017   UV start, end;
13018
13019   /* Start looking through *invlist_ptr */
13020   invlist_iterinit(*invlist_ptr);
13021   while (invlist_iternext(*invlist_ptr, &start, &end)) {
13022    UV high;
13023    int i;
13024
13025    if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
13026     ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
13027    }
13028    else if (end >= NUM_ANYOF_CODE_POINTS) {
13029     ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
13030    }
13031
13032    /* Quit if are above what we should change */
13033    if (start >= NUM_ANYOF_CODE_POINTS) {
13034     break;
13035    }
13036
13037    change_invlist = TRUE;
13038
13039    /* Set all the bits in the range, up to the max that we are doing */
13040    high = (end < NUM_ANYOF_CODE_POINTS - 1)
13041     ? end
13042     : NUM_ANYOF_CODE_POINTS - 1;
13043    for (i = start; i <= (int) high; i++) {
13044     if (! ANYOF_BITMAP_TEST(node, i)) {
13045      ANYOF_BITMAP_SET(node, i);
13046     }
13047    }
13048   }
13049   invlist_iterfinish(*invlist_ptr);
13050
13051   /* Done with loop; remove any code points that are in the bitmap from
13052   * *invlist_ptr; similarly for code points above the bitmap if we have
13053   * a flag to match all of them anyways */
13054   if (change_invlist) {
13055    _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
13056   }
13057   if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
13058    _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
13059   }
13060
13061   /* If have completely emptied it, remove it completely */
13062   if (_invlist_len(*invlist_ptr) == 0) {
13063    SvREFCNT_dec_NN(*invlist_ptr);
13064    *invlist_ptr = NULL;
13065   }
13066  }
13067 }
13068
13069 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
13070    Character classes ([:foo:]) can also be negated ([:^foo:]).
13071    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
13072    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
13073    but trigger failures because they are currently unimplemented. */
13074
13075 #define POSIXCC_DONE(c)   ((c) == ':')
13076 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
13077 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
13078
13079 PERL_STATIC_INLINE I32
13080 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
13081 {
13082  I32 namedclass = OOB_NAMEDCLASS;
13083
13084  PERL_ARGS_ASSERT_REGPPOSIXCC;
13085
13086  if (value == '[' && RExC_parse + 1 < RExC_end &&
13087   /* I smell either [: or [= or [. -- POSIX has been here, right? */
13088   POSIXCC(UCHARAT(RExC_parse)))
13089  {
13090   const char c = UCHARAT(RExC_parse);
13091   char* const s = RExC_parse++;
13092
13093   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
13094    RExC_parse++;
13095   if (RExC_parse == RExC_end) {
13096    if (strict) {
13097
13098     /* Try to give a better location for the error (than the end of
13099     * the string) by looking for the matching ']' */
13100     RExC_parse = s;
13101     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
13102      RExC_parse++;
13103     }
13104     vFAIL2("Unmatched '%c' in POSIX class", c);
13105    }
13106    /* Grandfather lone [:, [=, [. */
13107    RExC_parse = s;
13108   }
13109   else {
13110    const char* const t = RExC_parse++; /* skip over the c */
13111    assert(*t == c);
13112
13113    if (UCHARAT(RExC_parse) == ']') {
13114     const char *posixcc = s + 1;
13115     RExC_parse++; /* skip over the ending ] */
13116
13117     if (*s == ':') {
13118      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
13119      const I32 skip = t - posixcc;
13120
13121      /* Initially switch on the length of the name.  */
13122      switch (skip) {
13123      case 4:
13124       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
13125               this is the Perl \w
13126               */
13127        namedclass = ANYOF_WORDCHAR;
13128       break;
13129      case 5:
13130       /* Names all of length 5.  */
13131       /* alnum alpha ascii blank cntrl digit graph lower
13132       print punct space upper  */
13133       /* Offset 4 gives the best switch position.  */
13134       switch (posixcc[4]) {
13135       case 'a':
13136        if (memEQ(posixcc, "alph", 4)) /* alpha */
13137         namedclass = ANYOF_ALPHA;
13138        break;
13139       case 'e':
13140        if (memEQ(posixcc, "spac", 4)) /* space */
13141         namedclass = ANYOF_SPACE;
13142        break;
13143       case 'h':
13144        if (memEQ(posixcc, "grap", 4)) /* graph */
13145         namedclass = ANYOF_GRAPH;
13146        break;
13147       case 'i':
13148        if (memEQ(posixcc, "asci", 4)) /* ascii */
13149         namedclass = ANYOF_ASCII;
13150        break;
13151       case 'k':
13152        if (memEQ(posixcc, "blan", 4)) /* blank */
13153         namedclass = ANYOF_BLANK;
13154        break;
13155       case 'l':
13156        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
13157         namedclass = ANYOF_CNTRL;
13158        break;
13159       case 'm':
13160        if (memEQ(posixcc, "alnu", 4)) /* alnum */
13161         namedclass = ANYOF_ALPHANUMERIC;
13162        break;
13163       case 'r':
13164        if (memEQ(posixcc, "lowe", 4)) /* lower */
13165         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
13166        else if (memEQ(posixcc, "uppe", 4)) /* upper */
13167         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
13168        break;
13169       case 't':
13170        if (memEQ(posixcc, "digi", 4)) /* digit */
13171         namedclass = ANYOF_DIGIT;
13172        else if (memEQ(posixcc, "prin", 4)) /* print */
13173         namedclass = ANYOF_PRINT;
13174        else if (memEQ(posixcc, "punc", 4)) /* punct */
13175         namedclass = ANYOF_PUNCT;
13176        break;
13177       }
13178       break;
13179      case 6:
13180       if (memEQ(posixcc, "xdigit", 6))
13181        namedclass = ANYOF_XDIGIT;
13182       break;
13183      }
13184
13185      if (namedclass == OOB_NAMEDCLASS)
13186       vFAIL2utf8f(
13187        "POSIX class [:%"UTF8f":] unknown",
13188        UTF8fARG(UTF, t - s - 1, s + 1));
13189
13190      /* The #defines are structured so each complement is +1 to
13191      * the normal one */
13192      if (complement) {
13193       namedclass++;
13194      }
13195      assert (posixcc[skip] == ':');
13196      assert (posixcc[skip+1] == ']');
13197     } else if (!SIZE_ONLY) {
13198      /* [[=foo=]] and [[.foo.]] are still future. */
13199
13200      /* adjust RExC_parse so the warning shows after
13201      the class closes */
13202      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
13203       RExC_parse++;
13204      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
13205     }
13206    } else {
13207     /* Maternal grandfather:
13208     * "[:" ending in ":" but not in ":]" */
13209     if (strict) {
13210      vFAIL("Unmatched '[' in POSIX class");
13211     }
13212
13213     /* Grandfather lone [:, [=, [. */
13214     RExC_parse = s;
13215    }
13216   }
13217  }
13218
13219  return namedclass;
13220 }
13221
13222 STATIC bool
13223 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
13224 {
13225  /* This applies some heuristics at the current parse position (which should
13226  * be at a '[') to see if what follows might be intended to be a [:posix:]
13227  * class.  It returns true if it really is a posix class, of course, but it
13228  * also can return true if it thinks that what was intended was a posix
13229  * class that didn't quite make it.
13230  *
13231  * It will return true for
13232  *      [:alphanumerics:
13233  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
13234  *                         ')' indicating the end of the (?[
13235  *      [:any garbage including %^&$ punctuation:]
13236  *
13237  * This is designed to be called only from S_handle_regex_sets; it could be
13238  * easily adapted to be called from the spot at the beginning of regclass()
13239  * that checks to see in a normal bracketed class if the surrounding []
13240  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
13241  * change long-standing behavior, so I (khw) didn't do that */
13242  char* p = RExC_parse + 1;
13243  char first_char = *p;
13244
13245  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13246
13247  assert(*(p - 1) == '[');
13248
13249  if (! POSIXCC(first_char)) {
13250   return FALSE;
13251  }
13252
13253  p++;
13254  while (p < RExC_end && isWORDCHAR(*p)) p++;
13255
13256  if (p >= RExC_end) {
13257   return FALSE;
13258  }
13259
13260  if (p - RExC_parse > 2    /* Got at least 1 word character */
13261   && (*p == first_char
13262    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13263  {
13264   return TRUE;
13265  }
13266
13267  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13268
13269  return (p
13270    && p - RExC_parse > 2 /* [:] evaluates to colon;
13271          [::] is a bad posix class. */
13272    && first_char == *(p - 1));
13273 }
13274
13275 STATIC unsigned  int
13276 S_regex_set_precedence(const U8 my_operator) {
13277
13278  /* Returns the precedence in the (?[...]) construct of the input operator,
13279  * specified by its character representation.  The precedence follows
13280  * general Perl rules, but it extends this so that ')' and ']' have (low)
13281  * precedence even though they aren't really operators */
13282
13283  switch (my_operator) {
13284   case '!':
13285    return 5;
13286   case '&':
13287    return 4;
13288   case '^':
13289   case '|':
13290   case '+':
13291   case '-':
13292    return 3;
13293   case ')':
13294    return 2;
13295   case ']':
13296    return 1;
13297  }
13298
13299  NOT_REACHED; /* NOTREACHED */
13300  return 0;   /* Silence compiler warning */
13301 }
13302
13303 STATIC regnode *
13304 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13305      I32 *flagp, U32 depth,
13306      char * const oregcomp_parse)
13307 {
13308  /* Handle the (?[...]) construct to do set operations */
13309
13310  U8 curchar;                     /* Current character being parsed */
13311  UV start, end;             /* End points of code point ranges */
13312  SV* final = NULL;               /* The end result inversion list */
13313  SV* result_string;              /* 'final' stringified */
13314  AV* stack;                      /* stack of operators and operands not yet
13315          resolved */
13316  AV* fence_stack = NULL;         /* A stack containing the positions in
13317          'stack' of where the undealt-with left
13318          parens would be if they were actually
13319          put there */
13320  IV fence = 0;                   /* Position of where most recent undealt-
13321          with left paren in stack is; -1 if none.
13322          */
13323  STRLEN len;                     /* Temporary */
13324  regnode* node;                  /* Temporary, and final regnode returned by
13325          this function */
13326  const bool save_fold = FOLD;    /* Temporary */
13327  char *save_end, *save_parse;    /* Temporaries */
13328
13329  GET_RE_DEBUG_FLAGS_DECL;
13330
13331  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13332
13333  if (LOC) {  /* XXX could make valid in UTF-8 locales */
13334   vFAIL("(?[...]) not valid in locale");
13335  }
13336  RExC_uni_semantics = 1;     /* The use of this operator implies /u.  This
13337         is required so that the compile time values
13338         are valid in all runtime cases */
13339
13340  /* This will return only an ANYOF regnode, or (unlikely) something smaller
13341  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
13342  * call regclass to handle '[]' so as to not have to reinvent its parsing
13343  * rules here (throwing away the size it computes each time).  And, we exit
13344  * upon an unescaped ']' that isn't one ending a regclass.  To do both
13345  * these things, we need to realize that something preceded by a backslash
13346  * is escaped, so we have to keep track of backslashes */
13347  if (SIZE_ONLY) {
13348   UV depth = 0; /* how many nested (?[...]) constructs */
13349
13350   while (RExC_parse < RExC_end) {
13351    SV* current = NULL;
13352    RExC_parse = regpatws(pRExC_state, RExC_parse,
13353           TRUE); /* means recognize comments */
13354    switch (*RExC_parse) {
13355     case '?':
13356      if (RExC_parse[1] == '[') depth++, RExC_parse++;
13357      /* FALLTHROUGH */
13358     default:
13359      break;
13360     case '\\':
13361      /* Skip the next byte (which could cause us to end up in
13362      * the middle of a UTF-8 character, but since none of those
13363      * are confusable with anything we currently handle in this
13364      * switch (invariants all), it's safe.  We'll just hit the
13365      * default: case next time and keep on incrementing until
13366      * we find one of the invariants we do handle. */
13367      RExC_parse++;
13368      break;
13369     case '[':
13370     {
13371      /* If this looks like it is a [:posix:] class, leave the
13372      * parse pointer at the '[' to fool regclass() into
13373      * thinking it is part of a '[[:posix:]]'.  That function
13374      * will use strict checking to force a syntax error if it
13375      * doesn't work out to a legitimate class */
13376      bool is_posix_class
13377          = could_it_be_a_POSIX_class(pRExC_state);
13378      if (! is_posix_class) {
13379       RExC_parse++;
13380      }
13381
13382      /* regclass() can only return RESTART_UTF8 if multi-char
13383      folds are allowed.  */
13384      if (!regclass(pRExC_state, flagp,depth+1,
13385         is_posix_class, /* parse the whole char
13386              class only if not a
13387              posix class */
13388         FALSE, /* don't allow multi-char folds */
13389         TRUE, /* silence non-portable warnings. */
13390         TRUE, /* strict */
13391         &current
13392         ))
13393       FAIL2("panic: regclass returned NULL to handle_sets, "
13394        "flags=%#"UVxf"", (UV) *flagp);
13395
13396      /* function call leaves parse pointing to the ']', except
13397      * if we faked it */
13398      if (is_posix_class) {
13399       RExC_parse--;
13400      }
13401
13402      SvREFCNT_dec(current);   /* In case it returned something */
13403      break;
13404     }
13405
13406     case ']':
13407      if (depth--) break;
13408      RExC_parse++;
13409      if (RExC_parse < RExC_end
13410       && *RExC_parse == ')')
13411      {
13412       node = reganode(pRExC_state, ANYOF, 0);
13413       RExC_size += ANYOF_SKIP;
13414       nextchar(pRExC_state);
13415       Set_Node_Length(node,
13416         RExC_parse - oregcomp_parse + 1); /* MJD */
13417       return node;
13418      }
13419      goto no_close;
13420    }
13421    RExC_parse++;
13422   }
13423
13424  no_close:
13425   FAIL("Syntax error in (?[...])");
13426  }
13427
13428  /* Pass 2 only after this. */
13429  Perl_ck_warner_d(aTHX_
13430   packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13431   "The regex_sets feature is experimental" REPORT_LOCATION,
13432    UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13433    UTF8fARG(UTF,
13434      RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13435      RExC_precomp + (RExC_parse - RExC_precomp)));
13436
13437  /* Everything in this construct is a metacharacter.  Operands begin with
13438  * either a '\' (for an escape sequence), or a '[' for a bracketed
13439  * character class.  Any other character should be an operator, or
13440  * parenthesis for grouping.  Both types of operands are handled by calling
13441  * regclass() to parse them.  It is called with a parameter to indicate to
13442  * return the computed inversion list.  The parsing here is implemented via
13443  * a stack.  Each entry on the stack is a single character representing one
13444  * of the operators; or else a pointer to an operand inversion list. */
13445
13446 #define IS_OPERAND(a)  (! SvIOK(a))
13447
13448  /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
13449  * to luke-a-shave-itch (or -itz), but people who didn't want to bother
13450  * with prounouncing it called it Reverse Polish instead, but now that YOU
13451  * know how to prounounce it you can use the correct term, thus giving due
13452  * credit to the person who invented it, and impressing your geek friends.
13453  * Wikipedia says that the pronounciation of "Ł" has been changing so that
13454  * it is now more like an English initial W (as in wonk) than an L.)
13455  *
13456  * This means that, for example, 'a | b & c' is stored on the stack as
13457  *
13458  * c  [4]
13459  * b  [3]
13460  * &  [2]
13461  * a  [1]
13462  * |  [0]
13463  *
13464  * where the numbers in brackets give the stack [array] element number.
13465  * In this implementation, parentheses are not stored on the stack.
13466  * Instead a '(' creates a "fence" so that the part of the stack below the
13467  * fence is invisible except to the corresponding ')' (this allows us to
13468  * replace testing for parens, by using instead subtraction of the fence
13469  * position).  As new operands are processed they are pushed onto the stack
13470  * (except as noted in the next paragraph).  New operators of higher
13471  * precedence than the current final one are inserted on the stack before
13472  * the lhs operand (so that when the rhs is pushed next, everything will be
13473  * in the correct positions shown above.  When an operator of equal or
13474  * lower precedence is encountered in parsing, all the stacked operations
13475  * of equal or higher precedence are evaluated, leaving the result as the
13476  * top entry on the stack.  This makes higher precedence operations
13477  * evaluate before lower precedence ones, and causes operations of equal
13478  * precedence to left associate.
13479  *
13480  * The only unary operator '!' is immediately pushed onto the stack when
13481  * encountered.  When an operand is encountered, if the top of the stack is
13482  * a '!", the complement is immediately performed, and the '!' popped.  The
13483  * resulting value is treated as a new operand, and the logic in the
13484  * previous paragraph is executed.  Thus in the expression
13485  *      [a] + ! [b]
13486  * the stack looks like
13487  *
13488  * !
13489  * a
13490  * +
13491  *
13492  * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
13493  * becomes
13494  *
13495  * !b
13496  * a
13497  * +
13498  *
13499  * A ')' is treated as an operator with lower precedence than all the
13500  * aforementioned ones, which causes all operations on the stack above the
13501  * corresponding '(' to be evaluated down to a single resultant operand.
13502  * Then the fence for the '(' is removed, and the operand goes through the
13503  * algorithm above, without the fence.
13504  *
13505  * A separate stack is kept of the fence positions, so that the position of
13506  * the latest so-far unbalanced '(' is at the top of it.
13507  *
13508  * The ']' ending the construct is treated as the lowest operator of all,
13509  * so that everything gets evaluated down to a single operand, which is the
13510  * result */
13511
13512  sv_2mortal((SV *)(stack = newAV()));
13513  sv_2mortal((SV *)(fence_stack = newAV()));
13514
13515  while (RExC_parse < RExC_end) {
13516   I32 top_index;              /* Index of top-most element in 'stack' */
13517   SV** top_ptr;               /* Pointer to top 'stack' element */
13518   SV* current = NULL;         /* To contain the current inversion list
13519          operand */
13520   SV* only_to_avoid_leaks;
13521
13522   /* Skip white space */
13523   RExC_parse = regpatws(pRExC_state, RExC_parse,
13524     TRUE /* means recognize comments */ );
13525   if (RExC_parse >= RExC_end) {
13526    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13527   }
13528
13529   curchar = UCHARAT(RExC_parse);
13530
13531 redo_curchar:
13532
13533   top_index = av_tindex(stack);
13534
13535   switch (curchar) {
13536    SV** stacked_ptr;       /* Ptr to something already on 'stack' */
13537    char stacked_operator;  /* The topmost operator on the 'stack'. */
13538    SV* lhs;                /* Operand to the left of the operator */
13539    SV* rhs;                /* Operand to the right of the operator */
13540    SV* fence_ptr;          /* Pointer to top element of the fence
13541          stack */
13542
13543    case '(':
13544
13545     if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
13546     {
13547      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13548      * This happens when we have some thing like
13549      *
13550      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13551      *   ...
13552      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
13553      *
13554      * Here we would be handling the interpolated
13555      * '$thai_or_lao'.  We handle this by a recursive call to
13556      * ourselves which returns the inversion list the
13557      * interpolated expression evaluates to.  We use the flags
13558      * from the interpolated pattern. */
13559      U32 save_flags = RExC_flags;
13560      const char * save_parse;
13561
13562      RExC_parse += 2;        /* Skip past the '(?' */
13563      save_parse = RExC_parse;
13564
13565      /* Parse any flags for the '(?' */
13566      parse_lparen_question_flags(pRExC_state);
13567
13568      if (RExC_parse == save_parse  /* Makes sure there was at
13569              least one flag (or else
13570              this embedding wasn't
13571              compiled) */
13572       || RExC_parse >= RExC_end - 4
13573       || UCHARAT(RExC_parse) != ':'
13574       || UCHARAT(++RExC_parse) != '('
13575       || UCHARAT(++RExC_parse) != '?'
13576       || UCHARAT(++RExC_parse) != '[')
13577      {
13578
13579       /* In combination with the above, this moves the
13580       * pointer to the point just after the first erroneous
13581       * character (or if there are no flags, to where they
13582       * should have been) */
13583       if (RExC_parse >= RExC_end - 4) {
13584        RExC_parse = RExC_end;
13585       }
13586       else if (RExC_parse != save_parse) {
13587        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13588       }
13589       vFAIL("Expecting '(?flags:(?[...'");
13590      }
13591
13592      /* Recurse, with the meat of the embedded expression */
13593      RExC_parse++;
13594      (void) handle_regex_sets(pRExC_state, &current, flagp,
13595              depth+1, oregcomp_parse);
13596
13597      /* Here, 'current' contains the embedded expression's
13598      * inversion list, and RExC_parse points to the trailing
13599      * ']'; the next character should be the ')' */
13600      RExC_parse++;
13601      assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13602
13603      /* Then the ')' matching the original '(' handled by this
13604      * case: statement */
13605      RExC_parse++;
13606      assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13607
13608      RExC_parse++;
13609      RExC_flags = save_flags;
13610      goto handle_operand;
13611     }
13612
13613     /* A regular '('.  Look behind for illegal syntax */
13614     if (top_index - fence >= 0) {
13615      /* If the top entry on the stack is an operator, it had
13616      * better be a '!', otherwise the entry below the top
13617      * operand should be an operator */
13618      if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
13619       || (! IS_OPERAND(*top_ptr) && SvUV(*top_ptr) != '!')
13620       || top_index - fence < 1
13621       || ! (stacked_ptr = av_fetch(stack,
13622              top_index - 1,
13623              FALSE))
13624       || IS_OPERAND(*stacked_ptr))
13625      {
13626       RExC_parse++;
13627       vFAIL("Unexpected '(' with no preceding operator");
13628      }
13629     }
13630
13631     /* Stack the position of this undealt-with left paren */
13632     fence = top_index + 1;
13633     av_push(fence_stack, newSViv(fence));
13634     break;
13635
13636    case '\\':
13637     /* regclass() can only return RESTART_UTF8 if multi-char
13638     folds are allowed.  */
13639     if (!regclass(pRExC_state, flagp,depth+1,
13640        TRUE, /* means parse just the next thing */
13641        FALSE, /* don't allow multi-char folds */
13642        FALSE, /* don't silence non-portable warnings.  */
13643        TRUE,  /* strict */
13644        &current))
13645     {
13646      FAIL2("panic: regclass returned NULL to handle_sets, "
13647       "flags=%#"UVxf"", (UV) *flagp);
13648     }
13649
13650     /* regclass() will return with parsing just the \ sequence,
13651     * leaving the parse pointer at the next thing to parse */
13652     RExC_parse--;
13653     goto handle_operand;
13654
13655    case '[':   /* Is a bracketed character class */
13656    {
13657     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13658
13659     if (! is_posix_class) {
13660      RExC_parse++;
13661     }
13662
13663     /* regclass() can only return RESTART_UTF8 if multi-char
13664     folds are allowed.  */
13665     if(!regclass(pRExC_state, flagp,depth+1,
13666        is_posix_class, /* parse the whole char class
13667             only if not a posix class */
13668        FALSE, /* don't allow multi-char folds */
13669        FALSE, /* don't silence non-portable warnings.  */
13670        TRUE,   /* strict */
13671        &current
13672        ))
13673     {
13674      FAIL2("panic: regclass returned NULL to handle_sets, "
13675       "flags=%#"UVxf"", (UV) *flagp);
13676     }
13677
13678     /* function call leaves parse pointing to the ']', except if we
13679     * faked it */
13680     if (is_posix_class) {
13681      RExC_parse--;
13682     }
13683
13684     goto handle_operand;
13685    }
13686
13687    case ']':
13688     if (top_index >= 1) {
13689      goto join_operators;
13690     }
13691
13692     /* Only a single operand on the stack: are done */
13693     goto done;
13694
13695    case ')':
13696     if (av_tindex(fence_stack) < 0) {
13697      RExC_parse++;
13698      vFAIL("Unexpected ')'");
13699     }
13700
13701     /* If at least two thing on the stack, treat this as an
13702     * operator */
13703     if (top_index - fence >= 1) {
13704      goto join_operators;
13705     }
13706
13707     /* Here only a single thing on the fenced stack, and there is a
13708     * fence.  Get rid of it */
13709     fence_ptr = av_pop(fence_stack);
13710     assert(fence_ptr);
13711     fence = SvIV(fence_ptr) - 1;
13712     SvREFCNT_dec_NN(fence_ptr);
13713     fence_ptr = NULL;
13714
13715     if (fence < 0) {
13716      fence = 0;
13717     }
13718
13719     /* Having gotten rid of the fence, we pop the operand at the
13720     * stack top and process it as a newly encountered operand */
13721     current = av_pop(stack);
13722     assert(IS_OPERAND(current));
13723     goto handle_operand;
13724
13725    case '&':
13726    case '|':
13727    case '+':
13728    case '-':
13729    case '^':
13730
13731     /* These binary operators should have a left operand already
13732     * parsed */
13733     if (   top_index - fence < 0
13734      || top_index - fence == 1
13735      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13736      || ! IS_OPERAND(*top_ptr))
13737     {
13738      goto unexpected_binary;
13739     }
13740
13741     /* If only the one operand is on the part of the stack visible
13742     * to us, we just place this operator in the proper position */
13743     if (top_index - fence < 2) {
13744
13745      /* Place the operator before the operand */
13746
13747      SV* lhs = av_pop(stack);
13748      av_push(stack, newSVuv(curchar));
13749      av_push(stack, lhs);
13750      break;
13751     }
13752
13753     /* But if there is something else on the stack, we need to
13754     * process it before this new operator if and only if the
13755     * stacked operation has equal or higher precedence than the
13756     * new one */
13757
13758    join_operators:
13759
13760     /* The operator on the stack is supposed to be below both its
13761     * operands */
13762     if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
13763      || IS_OPERAND(*stacked_ptr))
13764     {
13765      /* But if not, it's legal and indicates we are completely
13766      * done if and only if we're currently processing a ']',
13767      * which should be the final thing in the expression */
13768      if (curchar == ']') {
13769       goto done;
13770      }
13771
13772     unexpected_binary:
13773      RExC_parse++;
13774      vFAIL2("Unexpected binary operator '%c' with no "
13775       "preceding operand", curchar);
13776     }
13777     stacked_operator = (char) SvUV(*stacked_ptr);
13778
13779     if (regex_set_precedence(curchar)
13780      > regex_set_precedence(stacked_operator))
13781     {
13782      /* Here, the new operator has higher precedence than the
13783      * stacked one.  This means we need to add the new one to
13784      * the stack to await its rhs operand (and maybe more
13785      * stuff).  We put it before the lhs operand, leaving
13786      * untouched the stacked operator and everything below it
13787      * */
13788      lhs = av_pop(stack);
13789      assert(IS_OPERAND(lhs));
13790
13791      av_push(stack, newSVuv(curchar));
13792      av_push(stack, lhs);
13793      break;
13794     }
13795
13796     /* Here, the new operator has equal or lower precedence than
13797     * what's already there.  This means the operation already
13798     * there should be performed now, before the new one. */
13799     rhs = av_pop(stack);
13800     lhs = av_pop(stack);
13801
13802     assert(IS_OPERAND(rhs));
13803     assert(IS_OPERAND(lhs));
13804
13805     switch (stacked_operator) {
13806      case '&':
13807       _invlist_intersection(lhs, rhs, &rhs);
13808       break;
13809
13810      case '|':
13811      case '+':
13812       _invlist_union(lhs, rhs, &rhs);
13813       break;
13814
13815      case '-':
13816       _invlist_subtract(lhs, rhs, &rhs);
13817       break;
13818
13819      case '^':   /* The union minus the intersection */
13820      {
13821       SV* i = NULL;
13822       SV* u = NULL;
13823       SV* element;
13824
13825       _invlist_union(lhs, rhs, &u);
13826       _invlist_intersection(lhs, rhs, &i);
13827       /* _invlist_subtract will overwrite rhs
13828        without freeing what it already contains */
13829       element = rhs;
13830       _invlist_subtract(u, i, &rhs);
13831       SvREFCNT_dec_NN(i);
13832       SvREFCNT_dec_NN(u);
13833       SvREFCNT_dec_NN(element);
13834       break;
13835      }
13836     }
13837     SvREFCNT_dec(lhs);
13838
13839     /* Here, the higher precedence operation has been done, and the
13840     * result is in 'rhs'.  We overwrite the stacked operator with
13841     * the result.  Then we redo this code to either push the new
13842     * operator onto the stack or perform any higher precedence
13843     * stacked operation */
13844     only_to_avoid_leaks = av_pop(stack);
13845     SvREFCNT_dec(only_to_avoid_leaks);
13846     av_push(stack, rhs);
13847     goto redo_curchar;
13848
13849    case '!':   /* Highest priority, right associative, so just push
13850       onto stack */
13851     av_push(stack, newSVuv(curchar));
13852     break;
13853
13854    default:
13855     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13856     vFAIL("Unexpected character");
13857
13858   handle_operand:
13859
13860    /* Here 'current' is the operand.  If something is already on the
13861    * stack, we have to check if it is a !. */
13862    top_index = av_tindex(stack);   /* Code above may have altered the
13863            * stack in the time since we
13864            * earlier set 'top_index'. */
13865    if (top_index - fence >= 0) {
13866     /* If the top entry on the stack is an operator, it had better
13867     * be a '!', otherwise the entry below the top operand should
13868     * be an operator */
13869     top_ptr = av_fetch(stack, top_index, FALSE);
13870     assert(top_ptr);
13871     if (! IS_OPERAND(*top_ptr)) {
13872
13873      /* The only permissible operator at the top of the stack is
13874      * '!', which is applied immediately to this operand. */
13875      curchar = (char) SvUV(*top_ptr);
13876      if (curchar != '!') {
13877       SvREFCNT_dec(current);
13878       vFAIL2("Unexpected binary operator '%c' with no "
13879         "preceding operand", curchar);
13880      }
13881
13882      _invlist_invert(current);
13883
13884      only_to_avoid_leaks = av_pop(stack);
13885      SvREFCNT_dec(only_to_avoid_leaks);
13886      top_index = av_tindex(stack);
13887
13888      /* And we redo with the inverted operand.  This allows
13889      * handling multiple ! in a row */
13890      goto handle_operand;
13891     }
13892       /* Single operand is ok only for the non-binary ')'
13893       * operator */
13894     else if ((top_index - fence == 0 && curchar != ')')
13895       || (top_index - fence > 0
13896        && (! (stacked_ptr = av_fetch(stack,
13897               top_index - 1,
13898               FALSE))
13899         || IS_OPERAND(*stacked_ptr))))
13900     {
13901      SvREFCNT_dec(current);
13902      vFAIL("Operand with no preceding operator");
13903     }
13904    }
13905
13906    /* Here there was nothing on the stack or the top element was
13907    * another operand.  Just add this new one */
13908    av_push(stack, current);
13909
13910   } /* End of switch on next parse token */
13911
13912   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13913  } /* End of loop parsing through the construct */
13914
13915   done:
13916  if (av_tindex(fence_stack) >= 0) {
13917   vFAIL("Unmatched (");
13918  }
13919
13920  if (av_tindex(stack) < 0   /* Was empty */
13921   || ((final = av_pop(stack)) == NULL)
13922   || ! IS_OPERAND(final)
13923   || av_tindex(stack) >= 0)  /* More left on stack */
13924  {
13925   SvREFCNT_dec(final);
13926   vFAIL("Incomplete expression within '(?[ ])'");
13927  }
13928
13929  /* Here, 'final' is the resultant inversion list from evaluating the
13930  * expression.  Return it if so requested */
13931  if (return_invlist) {
13932   *return_invlist = final;
13933   return END;
13934  }
13935
13936  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13937  * expecting a string of ranges and individual code points */
13938  invlist_iterinit(final);
13939  result_string = newSVpvs("");
13940  while (invlist_iternext(final, &start, &end)) {
13941   if (start == end) {
13942    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13943   }
13944   else {
13945    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13946              start,          end);
13947   }
13948  }
13949
13950  /* About to generate an ANYOF (or similar) node from the inversion list we
13951  * have calculated */
13952  save_parse = RExC_parse;
13953  RExC_parse = SvPV(result_string, len);
13954  save_end = RExC_end;
13955  RExC_end = RExC_parse + len;
13956
13957  /* We turn off folding around the call, as the class we have constructed
13958  * already has all folding taken into consideration, and we don't want
13959  * regclass() to add to that */
13960  RExC_flags &= ~RXf_PMf_FOLD;
13961  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13962  */
13963  node = regclass(pRExC_state, flagp,depth+1,
13964      FALSE, /* means parse the whole char class */
13965      FALSE, /* don't allow multi-char folds */
13966      TRUE, /* silence non-portable warnings.  The above may very
13967        well have generated non-portable code points, but
13968        they're valid on this machine */
13969      FALSE, /* similarly, no need for strict */
13970      NULL
13971     );
13972  if (!node)
13973   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13974      PTR2UV(flagp));
13975  if (save_fold) {
13976   RExC_flags |= RXf_PMf_FOLD;
13977  }
13978  RExC_parse = save_parse + 1;
13979  RExC_end = save_end;
13980  SvREFCNT_dec_NN(final);
13981  SvREFCNT_dec_NN(result_string);
13982
13983  nextchar(pRExC_state);
13984  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13985  return node;
13986 }
13987 #undef IS_OPERAND
13988
13989 STATIC void
13990 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13991 {
13992  /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13993  * innocent-looking character class, like /[ks]/i won't have to go out to
13994  * disk to find the possible matches.
13995  *
13996  * This should be called only for a Latin1-range code points, cp, which is
13997  * known to be involved in a simple fold with other code points above
13998  * Latin1.  It would give false results if /aa has been specified.
13999  * Multi-char folds are outside the scope of this, and must be handled
14000  * specially.
14001  *
14002  * XXX It would be better to generate these via regen, in case a new
14003  * version of the Unicode standard adds new mappings, though that is not
14004  * really likely, and may be caught by the default: case of the switch
14005  * below. */
14006
14007  PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
14008
14009  assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
14010
14011  switch (cp) {
14012   case 'k':
14013   case 'K':
14014   *invlist =
14015    add_cp_to_invlist(*invlist, KELVIN_SIGN);
14016    break;
14017   case 's':
14018   case 'S':
14019   *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
14020    break;
14021   case MICRO_SIGN:
14022   *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
14023   *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
14024    break;
14025   case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14026   case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14027   *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
14028    break;
14029   case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14030   *invlist = add_cp_to_invlist(*invlist,
14031           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14032    break;
14033   case LATIN_SMALL_LETTER_SHARP_S:
14034   *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
14035    break;
14036   default:
14037    /* Use deprecated warning to increase the chances of this being
14038    * output */
14039    if (PASS2) {
14040     ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
14041    }
14042    break;
14043  }
14044 }
14045
14046 STATIC AV *
14047 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
14048 {
14049  /* This adds the string scalar <multi_string> to the array
14050  * <multi_char_matches>.  <multi_string> is known to have exactly
14051  * <cp_count> code points in it.  This is used when constructing a
14052  * bracketed character class and we find something that needs to match more
14053  * than a single character.
14054  *
14055  * <multi_char_matches> is actually an array of arrays.  Each top-level
14056  * element is an array that contains all the strings known so far that are
14057  * the same length.  And that length (in number of code points) is the same
14058  * as the index of the top-level array.  Hence, the [2] element is an
14059  * array, each element thereof is a string containing TWO code points;
14060  * while element [3] is for strings of THREE characters, and so on.  Since
14061  * this is for multi-char strings there can never be a [0] nor [1] element.
14062  *
14063  * When we rewrite the character class below, we will do so such that the
14064  * longest strings are written first, so that it prefers the longest
14065  * matching strings first.  This is done even if it turns out that any
14066  * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
14067  * Christiansen has agreed that this is ok.  This makes the test for the
14068  * ligature 'ffi' come before the test for 'ff', for example */
14069
14070  AV* this_array;
14071  AV** this_array_ptr;
14072
14073  PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
14074
14075  if (! multi_char_matches) {
14076   multi_char_matches = newAV();
14077  }
14078
14079  if (av_exists(multi_char_matches, cp_count)) {
14080   this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
14081   this_array = *this_array_ptr;
14082  }
14083  else {
14084   this_array = newAV();
14085   av_store(multi_char_matches, cp_count,
14086     (SV*) this_array);
14087  }
14088  av_push(this_array, multi_string);
14089
14090  return multi_char_matches;
14091 }
14092
14093 /* The names of properties whose definitions are not known at compile time are
14094  * stored in this SV, after a constant heading.  So if the length has been
14095  * changed since initialization, then there is a run-time definition. */
14096 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
14097           (SvCUR(listsv) != initial_listsv_len)
14098
14099 STATIC regnode *
14100 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
14101     const bool stop_at_1,  /* Just parse the next thing, don't
14102           look for a full character class */
14103     bool allow_multi_folds,
14104     const bool silence_non_portable,   /* Don't output warnings
14105              about too large
14106              characters */
14107     const bool strict,
14108     SV** ret_invlist  /* Return an inversion list, not a node */
14109   )
14110 {
14111  /* parse a bracketed class specification.  Most of these will produce an
14112  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
14113  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
14114  * under /i with multi-character folds: it will be rewritten following the
14115  * paradigm of this example, where the <multi-fold>s are characters which
14116  * fold to multiple character sequences:
14117  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
14118  * gets effectively rewritten as:
14119  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
14120  * reg() gets called (recursively) on the rewritten version, and this
14121  * function will return what it constructs.  (Actually the <multi-fold>s
14122  * aren't physically removed from the [abcdefghi], it's just that they are
14123  * ignored in the recursion by means of a flag:
14124  * <RExC_in_multi_char_class>.)
14125  *
14126  * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
14127  * characters, with the corresponding bit set if that character is in the
14128  * list.  For characters above this, a range list or swash is used.  There
14129  * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
14130  * determinable at compile time
14131  *
14132  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
14133  * to be restarted.  This can only happen if ret_invlist is non-NULL.
14134  */
14135
14136  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
14137  IV range = 0;
14138  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
14139  regnode *ret;
14140  STRLEN numlen;
14141  IV namedclass = OOB_NAMEDCLASS;
14142  char *rangebegin = NULL;
14143  bool need_class = 0;
14144  SV *listsv = NULL;
14145  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
14146          than just initialized.  */
14147  SV* properties = NULL;    /* Code points that match \p{} \P{} */
14148  SV* posixes = NULL;     /* Code points that match classes like [:word:],
14149        extended beyond the Latin1 range.  These have to
14150        be kept separate from other code points for much
14151        of this function because their handling  is
14152        different under /i, and for most classes under
14153        /d as well */
14154  SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
14155        separate for a while from the non-complemented
14156        versions because of complications with /d
14157        matching */
14158  SV* simple_posixes = NULL; /* But under some conditions, the classes can be
14159         treated more simply than the general case,
14160         leading to less compilation and execution
14161         work */
14162  UV element_count = 0;   /* Number of distinct elements in the class.
14163        Optimizations may be possible if this is tiny */
14164  AV * multi_char_matches = NULL; /* Code points that fold to more than one
14165          character; used under /i */
14166  UV n;
14167  char * stop_ptr = RExC_end;    /* where to stop parsing */
14168  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
14169             space? */
14170
14171  /* Unicode properties are stored in a swash; this holds the current one
14172  * being parsed.  If this swash is the only above-latin1 component of the
14173  * character class, an optimization is to pass it directly on to the
14174  * execution engine.  Otherwise, it is set to NULL to indicate that there
14175  * are other things in the class that have to be dealt with at execution
14176  * time */
14177  SV* swash = NULL;  /* Code points that match \p{} \P{} */
14178
14179  /* Set if a component of this character class is user-defined; just passed
14180  * on to the engine */
14181  bool has_user_defined_property = FALSE;
14182
14183  /* inversion list of code points this node matches only when the target
14184  * string is in UTF-8.  (Because is under /d) */
14185  SV* depends_list = NULL;
14186
14187  /* Inversion list of code points this node matches regardless of things
14188  * like locale, folding, utf8ness of the target string */
14189  SV* cp_list = NULL;
14190
14191  /* Like cp_list, but code points on this list need to be checked for things
14192  * that fold to/from them under /i */
14193  SV* cp_foldable_list = NULL;
14194
14195  /* Like cp_list, but code points on this list are valid only when the
14196  * runtime locale is UTF-8 */
14197  SV* only_utf8_locale_list = NULL;
14198
14199  /* In a range, if one of the endpoints is non-character-set portable,
14200  * meaning that it hard-codes a code point that may mean a different
14201  * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
14202  * mnemonic '\t' which each mean the same character no matter which
14203  * character set the platform is on. */
14204  unsigned int non_portable_endpoint = 0;
14205
14206  /* Is the range unicode? which means on a platform that isn't 1-1 native
14207  * to Unicode (i.e. non-ASCII), each code point in it should be considered
14208  * to be a Unicode value.  */
14209  bool unicode_range = FALSE;
14210  bool invert = FALSE;    /* Is this class to be complemented */
14211
14212  bool warn_super = ALWAYS_WARN_SUPER;
14213
14214  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
14215   case we need to change the emitted regop to an EXACT. */
14216  const char * orig_parse = RExC_parse;
14217  const SSize_t orig_size = RExC_size;
14218  bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
14219  GET_RE_DEBUG_FLAGS_DECL;
14220
14221  PERL_ARGS_ASSERT_REGCLASS;
14222 #ifndef DEBUGGING
14223  PERL_UNUSED_ARG(depth);
14224 #endif
14225
14226  DEBUG_PARSE("clas");
14227
14228  /* Assume we are going to generate an ANYOF node. */
14229  ret = reganode(pRExC_state,
14230     (LOC)
14231      ? ANYOFL
14232      : ANYOF,
14233     0);
14234
14235  if (SIZE_ONLY) {
14236   RExC_size += ANYOF_SKIP;
14237   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
14238  }
14239  else {
14240   ANYOF_FLAGS(ret) = 0;
14241
14242   RExC_emit += ANYOF_SKIP;
14243   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
14244   initial_listsv_len = SvCUR(listsv);
14245   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
14246  }
14247
14248  if (skip_white) {
14249   RExC_parse = regpatws(pRExC_state, RExC_parse,
14250        FALSE /* means don't recognize comments */ );
14251  }
14252
14253  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
14254   RExC_parse++;
14255   invert = TRUE;
14256   allow_multi_folds = FALSE;
14257   MARK_NAUGHTY(1);
14258   if (skip_white) {
14259    RExC_parse = regpatws(pRExC_state, RExC_parse,
14260         FALSE /* means don't recognize comments */ );
14261   }
14262  }
14263
14264  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
14265  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
14266   const char *s = RExC_parse;
14267   const char  c = *s++;
14268
14269   if (*s == '^') {
14270    s++;
14271   }
14272   while (isWORDCHAR(*s))
14273    s++;
14274   if (*s && c == *s && s[1] == ']') {
14275    SAVEFREESV(RExC_rx_sv);
14276    ckWARN3reg(s+2,
14277      "POSIX syntax [%c %c] belongs inside character classes",
14278      c, c);
14279    (void)ReREFCNT_inc(RExC_rx_sv);
14280   }
14281  }
14282
14283  /* If the caller wants us to just parse a single element, accomplish this
14284  * by faking the loop ending condition */
14285  if (stop_at_1 && RExC_end > RExC_parse) {
14286   stop_ptr = RExC_parse + 1;
14287  }
14288
14289  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
14290  if (UCHARAT(RExC_parse) == ']')
14291   goto charclassloop;
14292
14293  while (1) {
14294   if  (RExC_parse >= stop_ptr) {
14295    break;
14296   }
14297
14298   if (skip_white) {
14299    RExC_parse = regpatws(pRExC_state, RExC_parse,
14300         FALSE /* means don't recognize comments */ );
14301   }
14302
14303   if  (UCHARAT(RExC_parse) == ']') {
14304    break;
14305   }
14306
14307  charclassloop:
14308
14309   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
14310   save_value = value;
14311   save_prevvalue = prevvalue;
14312
14313   if (!range) {
14314    rangebegin = RExC_parse;
14315    element_count++;
14316    non_portable_endpoint = 0;
14317   }
14318   if (UTF) {
14319    value = utf8n_to_uvchr((U8*)RExC_parse,
14320         RExC_end - RExC_parse,
14321         &numlen, UTF8_ALLOW_DEFAULT);
14322    RExC_parse += numlen;
14323   }
14324   else
14325    value = UCHARAT(RExC_parse++);
14326
14327   if (value == '['
14328    && RExC_parse < RExC_end
14329    && POSIXCC(UCHARAT(RExC_parse)))
14330   {
14331    namedclass = regpposixcc(pRExC_state, value, strict);
14332   }
14333   else if (value == '\\') {
14334    /* Is a backslash; get the code point of the char after it */
14335    if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
14336     value = utf8n_to_uvchr((U8*)RExC_parse,
14337         RExC_end - RExC_parse,
14338         &numlen, UTF8_ALLOW_DEFAULT);
14339     RExC_parse += numlen;
14340    }
14341    else
14342     value = UCHARAT(RExC_parse++);
14343
14344    /* Some compilers cannot handle switching on 64-bit integer
14345    * values, therefore value cannot be an UV.  Yes, this will
14346    * be a problem later if we want switch on Unicode.
14347    * A similar issue a little bit later when switching on
14348    * namedclass. --jhi */
14349
14350    /* If the \ is escaping white space when white space is being
14351    * skipped, it means that that white space is wanted literally, and
14352    * is already in 'value'.  Otherwise, need to translate the escape
14353    * into what it signifies. */
14354    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
14355
14356    case 'w': namedclass = ANYOF_WORDCHAR; break;
14357    case 'W': namedclass = ANYOF_NWORDCHAR; break;
14358    case 's': namedclass = ANYOF_SPACE; break;
14359    case 'S': namedclass = ANYOF_NSPACE; break;
14360    case 'd': namedclass = ANYOF_DIGIT; break;
14361    case 'D': namedclass = ANYOF_NDIGIT; break;
14362    case 'v': namedclass = ANYOF_VERTWS; break;
14363    case 'V': namedclass = ANYOF_NVERTWS; break;
14364    case 'h': namedclass = ANYOF_HORIZWS; break;
14365    case 'H': namedclass = ANYOF_NHORIZWS; break;
14366    case 'N':  /* Handle \N{NAME} in class */
14367     {
14368      const char * const backslash_N_beg = RExC_parse - 2;
14369      int cp_count;
14370
14371      if (! grok_bslash_N(pRExC_state,
14372           NULL,      /* No regnode */
14373           &value,    /* Yes single value */
14374           &cp_count, /* Multiple code pt count */
14375           flagp,
14376           depth)
14377      ) {
14378
14379       if (*flagp & RESTART_UTF8)
14380        FAIL("panic: grok_bslash_N set RESTART_UTF8");
14381
14382       if (cp_count < 0) {
14383        vFAIL("\\N in a character class must be a named character: \\N{...}");
14384       }
14385       else if (cp_count == 0) {
14386        if (strict) {
14387         RExC_parse++;   /* Position after the "}" */
14388         vFAIL("Zero length \\N{}");
14389        }
14390        else if (PASS2) {
14391         ckWARNreg(RExC_parse,
14392           "Ignoring zero length \\N{} in character class");
14393        }
14394       }
14395       else { /* cp_count > 1 */
14396        if (! RExC_in_multi_char_class) {
14397         if (invert || range || *RExC_parse == '-') {
14398          if (strict) {
14399           RExC_parse--;
14400           vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
14401          }
14402          else if (PASS2) {
14403           ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
14404          }
14405          break; /* <value> contains the first code
14406            point. Drop out of the switch to
14407            process it */
14408         }
14409         else {
14410          SV * multi_char_N = newSVpvn(backslash_N_beg,
14411             RExC_parse - backslash_N_beg);
14412          multi_char_matches
14413           = add_multi_match(multi_char_matches,
14414               multi_char_N,
14415               cp_count);
14416         }
14417        }
14418       } /* End of cp_count != 1 */
14419
14420       /* This element should not be processed further in this
14421       * class */
14422       element_count--;
14423       value = save_value;
14424       prevvalue = save_prevvalue;
14425       continue;   /* Back to top of loop to get next char */
14426      }
14427
14428      /* Here, is a single code point, and <value> contains it */
14429      unicode_range = TRUE;   /* \N{} are Unicode */
14430     }
14431     break;
14432    case 'p':
14433    case 'P':
14434     {
14435     char *e;
14436
14437     /* We will handle any undefined properties ourselves */
14438     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
14439          /* And we actually would prefer to get
14440           * the straight inversion list of the
14441           * swash, since we will be accessing it
14442           * anyway, to save a little time */
14443          |_CORE_SWASH_INIT_ACCEPT_INVLIST;
14444
14445     if (RExC_parse >= RExC_end)
14446      vFAIL2("Empty \\%c{}", (U8)value);
14447     if (*RExC_parse == '{') {
14448      const U8 c = (U8)value;
14449      e = strchr(RExC_parse++, '}');
14450      if (!e)
14451       vFAIL2("Missing right brace on \\%c{}", c);
14452      while (isSPACE(*RExC_parse))
14453       RExC_parse++;
14454      if (e == RExC_parse)
14455       vFAIL2("Empty \\%c{}", c);
14456      n = e - RExC_parse;
14457      while (isSPACE(*(RExC_parse + n - 1)))
14458       n--;
14459     }
14460     else {
14461      e = RExC_parse;
14462      n = 1;
14463     }
14464     if (!SIZE_ONLY) {
14465      SV* invlist;
14466      char* name;
14467
14468      if (UCHARAT(RExC_parse) == '^') {
14469       RExC_parse++;
14470       n--;
14471       /* toggle.  (The rhs xor gets the single bit that
14472       * differs between P and p; the other xor inverts just
14473       * that bit) */
14474       value ^= 'P' ^ 'p';
14475
14476       while (isSPACE(*RExC_parse)) {
14477        RExC_parse++;
14478        n--;
14479       }
14480      }
14481      /* Try to get the definition of the property into
14482      * <invlist>.  If /i is in effect, the effective property
14483      * will have its name be <__NAME_i>.  The design is
14484      * discussed in commit
14485      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14486      name = savepv(Perl_form(aTHX_
14487           "%s%.*s%s\n",
14488           (FOLD) ? "__" : "",
14489           (int)n,
14490           RExC_parse,
14491           (FOLD) ? "_i" : ""
14492         ));
14493
14494      /* Look up the property name, and get its swash and
14495      * inversion list, if the property is found  */
14496      if (swash) {
14497       SvREFCNT_dec_NN(swash);
14498      }
14499      swash = _core_swash_init("utf8", name, &PL_sv_undef,
14500            1, /* binary */
14501            0, /* not tr/// */
14502            NULL, /* No inversion list */
14503            &swash_init_flags
14504            );
14505      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14506       HV* curpkg = (IN_PERL_COMPILETIME)
14507          ? PL_curstash
14508          : CopSTASH(PL_curcop);
14509       if (swash) {
14510        SvREFCNT_dec_NN(swash);
14511        swash = NULL;
14512       }
14513
14514       /* Here didn't find it.  It could be a user-defined
14515       * property that will be available at run-time.  If we
14516       * accept only compile-time properties, is an error;
14517       * otherwise add it to the list for run-time look up */
14518       if (ret_invlist) {
14519        RExC_parse = e + 1;
14520        vFAIL2utf8f(
14521         "Property '%"UTF8f"' is unknown",
14522         UTF8fARG(UTF, n, name));
14523       }
14524
14525       /* If the property name doesn't already have a package
14526       * name, add the current one to it so that it can be
14527       * referred to outside it. [perl #121777] */
14528       if (curpkg && ! instr(name, "::")) {
14529        char* pkgname = HvNAME(curpkg);
14530        if (strNE(pkgname, "main")) {
14531         char* full_name = Perl_form(aTHX_
14532                "%s::%s",
14533                pkgname,
14534                name);
14535         n = strlen(full_name);
14536         Safefree(name);
14537         name = savepvn(full_name, n);
14538        }
14539       }
14540       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14541           (value == 'p' ? '+' : '!'),
14542           UTF8fARG(UTF, n, name));
14543       has_user_defined_property = TRUE;
14544
14545       /* We don't know yet, so have to assume that the
14546       * property could match something in the Latin1 range,
14547       * hence something that isn't utf8.  Note that this
14548       * would cause things in <depends_list> to match
14549       * inappropriately, except that any \p{}, including
14550       * this one forces Unicode semantics, which means there
14551       * is no <depends_list> */
14552       ANYOF_FLAGS(ret)
14553          |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14554      }
14555      else {
14556
14557       /* Here, did get the swash and its inversion list.  If
14558       * the swash is from a user-defined property, then this
14559       * whole character class should be regarded as such */
14560       if (swash_init_flags
14561        & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14562       {
14563        has_user_defined_property = TRUE;
14564       }
14565       else if
14566        /* We warn on matching an above-Unicode code point
14567        * if the match would return true, except don't
14568        * warn for \p{All}, which has exactly one element
14569        * = 0 */
14570        (_invlist_contains_cp(invlist, 0x110000)
14571         && (! (_invlist_len(invlist) == 1
14572          && *invlist_array(invlist) == 0)))
14573       {
14574        warn_super = TRUE;
14575       }
14576
14577
14578       /* Invert if asking for the complement */
14579       if (value == 'P') {
14580        _invlist_union_complement_2nd(properties,
14581               invlist,
14582               &properties);
14583
14584        /* The swash can't be used as-is, because we've
14585        * inverted things; delay removing it to here after
14586        * have copied its invlist above */
14587        SvREFCNT_dec_NN(swash);
14588        swash = NULL;
14589       }
14590       else {
14591        _invlist_union(properties, invlist, &properties);
14592       }
14593      }
14594      Safefree(name);
14595     }
14596     RExC_parse = e + 1;
14597     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
14598             named */
14599
14600     /* \p means they want Unicode semantics */
14601     RExC_uni_semantics = 1;
14602     }
14603     break;
14604    case 'n': value = '\n';   break;
14605    case 'r': value = '\r';   break;
14606    case 't': value = '\t';   break;
14607    case 'f': value = '\f';   break;
14608    case 'b': value = '\b';   break;
14609    case 'e': value = ESC_NATIVE;             break;
14610    case 'a': value = '\a';                   break;
14611    case 'o':
14612     RExC_parse--; /* function expects to be pointed at the 'o' */
14613     {
14614      const char* error_msg;
14615      bool valid = grok_bslash_o(&RExC_parse,
14616            &value,
14617            &error_msg,
14618            PASS2,   /* warnings only in
14619               pass 2 */
14620            strict,
14621            silence_non_portable,
14622            UTF);
14623      if (! valid) {
14624       vFAIL(error_msg);
14625      }
14626     }
14627     non_portable_endpoint++;
14628     if (IN_ENCODING && value < 0x100) {
14629      goto recode_encoding;
14630     }
14631     break;
14632    case 'x':
14633     RExC_parse--; /* function expects to be pointed at the 'x' */
14634     {
14635      const char* error_msg;
14636      bool valid = grok_bslash_x(&RExC_parse,
14637            &value,
14638            &error_msg,
14639            PASS2, /* Output warnings */
14640            strict,
14641            silence_non_portable,
14642            UTF);
14643      if (! valid) {
14644       vFAIL(error_msg);
14645      }
14646     }
14647     non_portable_endpoint++;
14648     if (IN_ENCODING && value < 0x100)
14649      goto recode_encoding;
14650     break;
14651    case 'c':
14652     value = grok_bslash_c(*RExC_parse++, PASS2);
14653     non_portable_endpoint++;
14654     break;
14655    case '0': case '1': case '2': case '3': case '4':
14656    case '5': case '6': case '7':
14657     {
14658      /* Take 1-3 octal digits */
14659      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14660      numlen = (strict) ? 4 : 3;
14661      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14662      RExC_parse += numlen;
14663      if (numlen != 3) {
14664       if (strict) {
14665        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14666        vFAIL("Need exactly 3 octal digits");
14667       }
14668       else if (! SIZE_ONLY /* like \08, \178 */
14669         && numlen < 3
14670         && RExC_parse < RExC_end
14671         && isDIGIT(*RExC_parse)
14672         && ckWARN(WARN_REGEXP))
14673       {
14674        SAVEFREESV(RExC_rx_sv);
14675        reg_warn_non_literal_string(
14676         RExC_parse + 1,
14677         form_short_octal_warning(RExC_parse, numlen));
14678        (void)ReREFCNT_inc(RExC_rx_sv);
14679       }
14680      }
14681      non_portable_endpoint++;
14682      if (IN_ENCODING && value < 0x100)
14683       goto recode_encoding;
14684      break;
14685     }
14686    recode_encoding:
14687     if (! RExC_override_recoding) {
14688      SV* enc = _get_encoding();
14689      value = reg_recode((const char)(U8)value, &enc);
14690      if (!enc) {
14691       if (strict) {
14692        vFAIL("Invalid escape in the specified encoding");
14693       }
14694       else if (PASS2) {
14695        ckWARNreg(RExC_parse,
14696         "Invalid escape in the specified encoding");
14697       }
14698      }
14699      break;
14700     }
14701    default:
14702     /* Allow \_ to not give an error */
14703     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14704      if (strict) {
14705       vFAIL2("Unrecognized escape \\%c in character class",
14706        (int)value);
14707      }
14708      else {
14709       SAVEFREESV(RExC_rx_sv);
14710       ckWARN2reg(RExC_parse,
14711        "Unrecognized escape \\%c in character class passed through",
14712        (int)value);
14713       (void)ReREFCNT_inc(RExC_rx_sv);
14714      }
14715     }
14716     break;
14717    }   /* End of switch on char following backslash */
14718   } /* end of handling backslash escape sequences */
14719
14720   /* Here, we have the current token in 'value' */
14721
14722   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14723    U8 classnum;
14724
14725    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
14726    * literal, as is the character that began the false range, i.e.
14727    * the 'a' in the examples */
14728    if (range) {
14729     if (!SIZE_ONLY) {
14730      const int w = (RExC_parse >= rangebegin)
14731         ? RExC_parse - rangebegin
14732         : 0;
14733      if (strict) {
14734       vFAIL2utf8f(
14735        "False [] range \"%"UTF8f"\"",
14736        UTF8fARG(UTF, w, rangebegin));
14737      }
14738      else {
14739       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14740       ckWARN2reg(RExC_parse,
14741        "False [] range \"%"UTF8f"\"",
14742        UTF8fARG(UTF, w, rangebegin));
14743       (void)ReREFCNT_inc(RExC_rx_sv);
14744       cp_list = add_cp_to_invlist(cp_list, '-');
14745       cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14746                prevvalue);
14747      }
14748     }
14749
14750     range = 0; /* this was not a true range */
14751     element_count += 2; /* So counts for three values */
14752    }
14753
14754    classnum = namedclass_to_classnum(namedclass);
14755
14756    if (LOC && namedclass < ANYOF_POSIXL_MAX
14757 #ifndef HAS_ISASCII
14758     && classnum != _CC_ASCII
14759 #endif
14760    ) {
14761     /* What the Posix classes (like \w, [:space:]) match in locale
14762     * isn't knowable under locale until actual match time.  Room
14763     * must be reserved (one time per outer bracketed class) to
14764     * store such classes.  The space will contain a bit for each
14765     * named class that is to be matched against.  This isn't
14766     * needed for \p{} and pseudo-classes, as they are not affected
14767     * by locale, and hence are dealt with separately */
14768     if (! need_class) {
14769      need_class = 1;
14770      if (SIZE_ONLY) {
14771       RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14772      }
14773      else {
14774       RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14775      }
14776      ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14777      ANYOF_POSIXL_ZERO(ret);
14778     }
14779
14780     /* Coverity thinks it is possible for this to be negative; both
14781     * jhi and khw think it's not, but be safer */
14782     assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14783      || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14784
14785     /* See if it already matches the complement of this POSIX
14786     * class */
14787     if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14788      && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14789                ? -1
14790                : 1)))
14791     {
14792      posixl_matches_all = TRUE;
14793      break;  /* No need to continue.  Since it matches both
14794        e.g., \w and \W, it matches everything, and the
14795        bracketed class can be optimized into qr/./s */
14796     }
14797
14798     /* Add this class to those that should be checked at runtime */
14799     ANYOF_POSIXL_SET(ret, namedclass);
14800
14801     /* The above-Latin1 characters are not subject to locale rules.
14802     * Just add them, in the second pass, to the
14803     * unconditionally-matched list */
14804     if (! SIZE_ONLY) {
14805      SV* scratch_list = NULL;
14806
14807      /* Get the list of the above-Latin1 code points this
14808      * matches */
14809      _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14810           PL_XPosix_ptrs[classnum],
14811
14812           /* Odd numbers are complements, like
14813           * NDIGIT, NASCII, ... */
14814           namedclass % 2 != 0,
14815           &scratch_list);
14816      /* Checking if 'cp_list' is NULL first saves an extra
14817      * clone.  Its reference count will be decremented at the
14818      * next union, etc, or if this is the only instance, at the
14819      * end of the routine */
14820      if (! cp_list) {
14821       cp_list = scratch_list;
14822      }
14823      else {
14824       _invlist_union(cp_list, scratch_list, &cp_list);
14825       SvREFCNT_dec_NN(scratch_list);
14826      }
14827      continue;   /* Go get next character */
14828     }
14829    }
14830    else if (! SIZE_ONLY) {
14831
14832     /* Here, not in pass1 (in that pass we skip calculating the
14833     * contents of this class), and is /l, or is a POSIX class for
14834     * which /l doesn't matter (or is a Unicode property, which is
14835     * skipped here). */
14836     if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
14837      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14838
14839       /* Here, should be \h, \H, \v, or \V.  None of /d, /i
14840       * nor /l make a difference in what these match,
14841       * therefore we just add what they match to cp_list. */
14842       if (classnum != _CC_VERTSPACE) {
14843        assert(   namedclass == ANYOF_HORIZWS
14844         || namedclass == ANYOF_NHORIZWS);
14845
14846        /* It turns out that \h is just a synonym for
14847        * XPosixBlank */
14848        classnum = _CC_BLANK;
14849       }
14850
14851       _invlist_union_maybe_complement_2nd(
14852         cp_list,
14853         PL_XPosix_ptrs[classnum],
14854         namedclass % 2 != 0,    /* Complement if odd
14855               (NHORIZWS, NVERTWS)
14856               */
14857         &cp_list);
14858      }
14859     }
14860     else if (UNI_SEMANTICS
14861       || classnum == _CC_ASCII
14862       || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
14863             || classnum == _CC_XDIGIT)))
14864     {
14865      /* We usually have to worry about /d and /a affecting what
14866      * POSIX classes match, with special code needed for /d
14867      * because we won't know until runtime what all matches.
14868      * But there is no extra work needed under /u, and
14869      * [:ascii:] is unaffected by /a and /d; and :digit: and
14870      * :xdigit: don't have runtime differences under /d.  So we
14871      * can special case these, and avoid some extra work below,
14872      * and at runtime. */
14873      _invlist_union_maybe_complement_2nd(
14874              simple_posixes,
14875              PL_XPosix_ptrs[classnum],
14876              namedclass % 2 != 0,
14877              &simple_posixes);
14878     }
14879     else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
14880       complement and use nposixes */
14881      SV** posixes_ptr = namedclass % 2 == 0
14882          ? &posixes
14883          : &nposixes;
14884      _invlist_union_maybe_complement_2nd(
14885              *posixes_ptr,
14886              PL_XPosix_ptrs[classnum],
14887              namedclass % 2 != 0,
14888              posixes_ptr);
14889     }
14890    }
14891   } /* end of namedclass \blah */
14892
14893   if (skip_white) {
14894    RExC_parse = regpatws(pRExC_state, RExC_parse,
14895         FALSE /* means don't recognize comments */ );
14896   }
14897
14898   /* If 'range' is set, 'value' is the ending of a range--check its
14899   * validity.  (If value isn't a single code point in the case of a
14900   * range, we should have figured that out above in the code that
14901   * catches false ranges).  Later, we will handle each individual code
14902   * point in the range.  If 'range' isn't set, this could be the
14903   * beginning of a range, so check for that by looking ahead to see if
14904   * the next real character to be processed is the range indicator--the
14905   * minus sign */
14906
14907   if (range) {
14908 #ifdef EBCDIC
14909    /* For unicode ranges, we have to test that the Unicode as opposed
14910    * to the native values are not decreasing.  (Above 255, there is
14911    * no difference between native and Unicode) */
14912    if (unicode_range && prevvalue < 255 && value < 255) {
14913     if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
14914      goto backwards_range;
14915     }
14916    }
14917    else
14918 #endif
14919    if (prevvalue > value) /* b-a */ {
14920     int w;
14921 #ifdef EBCDIC
14922    backwards_range:
14923 #endif
14924     w = RExC_parse - rangebegin;
14925     vFAIL2utf8f(
14926      "Invalid [] range \"%"UTF8f"\"",
14927      UTF8fARG(UTF, w, rangebegin));
14928     NOT_REACHED; /* NOTREACHED */
14929    }
14930   }
14931   else {
14932    prevvalue = value; /* save the beginning of the potential range */
14933    if (! stop_at_1     /* Can't be a range if parsing just one thing */
14934     && *RExC_parse == '-')
14935    {
14936     char* next_char_ptr = RExC_parse + 1;
14937     if (skip_white) {   /* Get the next real char after the '-' */
14938      next_char_ptr = regpatws(pRExC_state,
14939            RExC_parse + 1,
14940            FALSE); /* means don't recognize
14941               comments */
14942     }
14943
14944     /* If the '-' is at the end of the class (just before the ']',
14945     * it is a literal minus; otherwise it is a range */
14946     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14947      RExC_parse = next_char_ptr;
14948
14949      /* a bad range like \w-, [:word:]- ? */
14950      if (namedclass > OOB_NAMEDCLASS) {
14951       if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14952        const int w = RExC_parse >= rangebegin
14953           ?  RExC_parse - rangebegin
14954           : 0;
14955        if (strict) {
14956         vFAIL4("False [] range \"%*.*s\"",
14957          w, w, rangebegin);
14958        }
14959        else if (PASS2) {
14960         vWARN4(RExC_parse,
14961          "False [] range \"%*.*s\"",
14962          w, w, rangebegin);
14963        }
14964       }
14965       if (!SIZE_ONLY) {
14966        cp_list = add_cp_to_invlist(cp_list, '-');
14967       }
14968       element_count++;
14969      } else
14970       range = 1; /* yeah, it's a range! */
14971      continue; /* but do it the next time */
14972     }
14973    }
14974   }
14975
14976   if (namedclass > OOB_NAMEDCLASS) {
14977    continue;
14978   }
14979
14980   /* Here, we have a single value this time through the loop, and
14981   * <prevvalue> is the beginning of the range, if any; or <value> if
14982   * not. */
14983
14984   /* non-Latin1 code point implies unicode semantics.  Must be set in
14985   * pass1 so is there for the whole of pass 2 */
14986   if (value > 255) {
14987    RExC_uni_semantics = 1;
14988   }
14989
14990   /* Ready to process either the single value, or the completed range.
14991   * For single-valued non-inverted ranges, we consider the possibility
14992   * of multi-char folds.  (We made a conscious decision to not do this
14993   * for the other cases because it can often lead to non-intuitive
14994   * results.  For example, you have the peculiar case that:
14995   *  "s s" =~ /^[^\xDF]+$/i => Y
14996   *  "ss"  =~ /^[^\xDF]+$/i => N
14997   *
14998   * See [perl #89750] */
14999   if (FOLD && allow_multi_folds && value == prevvalue) {
15000    if (value == LATIN_SMALL_LETTER_SHARP_S
15001     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
15002               value)))
15003    {
15004     /* Here <value> is indeed a multi-char fold.  Get what it is */
15005
15006     U8 foldbuf[UTF8_MAXBYTES_CASE];
15007     STRLEN foldlen;
15008
15009     UV folded = _to_uni_fold_flags(
15010         value,
15011         foldbuf,
15012         &foldlen,
15013         FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
15014             ? FOLD_FLAGS_NOMIX_ASCII
15015             : 0)
15016         );
15017
15018     /* Here, <folded> should be the first character of the
15019     * multi-char fold of <value>, with <foldbuf> containing the
15020     * whole thing.  But, if this fold is not allowed (because of
15021     * the flags), <fold> will be the same as <value>, and should
15022     * be processed like any other character, so skip the special
15023     * handling */
15024     if (folded != value) {
15025
15026      /* Skip if we are recursed, currently parsing the class
15027      * again.  Otherwise add this character to the list of
15028      * multi-char folds. */
15029      if (! RExC_in_multi_char_class) {
15030       STRLEN cp_count = utf8_length(foldbuf,
15031              foldbuf + foldlen);
15032       SV* multi_fold = sv_2mortal(newSVpvs(""));
15033
15034       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
15035
15036       multi_char_matches
15037           = add_multi_match(multi_char_matches,
15038               multi_fold,
15039               cp_count);
15040
15041      }
15042
15043      /* This element should not be processed further in this
15044      * class */
15045      element_count--;
15046      value = save_value;
15047      prevvalue = save_prevvalue;
15048      continue;
15049     }
15050    }
15051   }
15052
15053   if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
15054    if (range) {
15055
15056     /* If the range starts above 255, everything is portable and
15057     * likely to be so for any forseeable character set, so don't
15058     * warn. */
15059     if (unicode_range && non_portable_endpoint && prevvalue < 256) {
15060      vWARN(RExC_parse, "Both or neither range ends should be Unicode");
15061     }
15062     else if (prevvalue != value) {
15063
15064      /* Under strict, ranges that stop and/or end in an ASCII
15065      * printable should have each end point be a portable value
15066      * for it (preferably like 'A', but we don't warn if it is
15067      * a (portable) Unicode name or code point), and the range
15068      * must be be all digits or all letters of the same case.
15069      * Otherwise, the range is non-portable and unclear as to
15070      * what it contains */
15071      if ((isPRINT_A(prevvalue) || isPRINT_A(value))
15072       && (non_portable_endpoint
15073        || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
15074         || (isLOWER_A(prevvalue) && isLOWER_A(value))
15075         || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
15076      {
15077       vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
15078      }
15079      else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
15080
15081       /* But the nature of Unicode and languages mean we
15082       * can't do the same checks for above-ASCII ranges,
15083       * except in the case of digit ones.  These should
15084       * contain only digits from the same group of 10.  The
15085       * ASCII case is handled just above.  0x660 is the
15086       * first digit character beyond ASCII.  Hence here, the
15087       * range could be a range of digits.  Find out.  */
15088       IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15089               prevvalue);
15090       IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15091               value);
15092
15093       /* If the range start and final points are in the same
15094       * inversion list element, it means that either both
15095       * are not digits, or both are digits in a consecutive
15096       * sequence of digits.  (So far, Unicode has kept all
15097       * such sequences as distinct groups of 10, but assert
15098       * to make sure).  If the end points are not in the
15099       * same element, neither should be a digit. */
15100       if (index_start == index_final) {
15101        assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
15102        || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15103        - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15104        == 10)
15105        /* But actually Unicode did have one group of 11
15106         * 'digits' in 5.2, so in case we are operating
15107         * on that version, let that pass */
15108        || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15109        - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15110         == 11
15111        && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15112         == 0x19D0)
15113        );
15114       }
15115       else if ((index_start >= 0
15116         && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
15117         || (index_final >= 0
15118          && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
15119       {
15120        vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
15121       }
15122      }
15123     }
15124    }
15125    if ((! range || prevvalue == value) && non_portable_endpoint) {
15126     if (isPRINT_A(value)) {
15127      char literal[3];
15128      unsigned d = 0;
15129      if (isBACKSLASHED_PUNCT(value)) {
15130       literal[d++] = '\\';
15131      }
15132      literal[d++] = (char) value;
15133      literal[d++] = '\0';
15134
15135      vWARN4(RExC_parse,
15136       "\"%.*s\" is more clearly written simply as \"%s\"",
15137       (int) (RExC_parse - rangebegin),
15138       rangebegin,
15139       literal
15140       );
15141     }
15142     else if isMNEMONIC_CNTRL(value) {
15143      vWARN4(RExC_parse,
15144       "\"%.*s\" is more clearly written simply as \"%s\"",
15145       (int) (RExC_parse - rangebegin),
15146       rangebegin,
15147       cntrl_to_mnemonic((char) value)
15148       );
15149     }
15150    }
15151   }
15152
15153   /* Deal with this element of the class */
15154   if (! SIZE_ONLY) {
15155
15156 #ifndef EBCDIC
15157    cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15158              prevvalue, value);
15159 #else
15160    /* On non-ASCII platforms, for ranges that span all of 0..255, and
15161    * ones that don't require special handling, we can just add the
15162    * range like we do for ASCII platforms */
15163    if ((UNLIKELY(prevvalue == 0) && value >= 255)
15164     || ! (prevvalue < 256
15165      && (unicode_range
15166       || (! non_portable_endpoint
15167        && ((isLOWER_A(prevvalue) && isLOWER_A(value))
15168         || (isUPPER_A(prevvalue)
15169          && isUPPER_A(value)))))))
15170    {
15171     cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15172               prevvalue, value);
15173    }
15174    else {
15175     /* Here, requires special handling.  This can be because it is
15176     * a range whose code points are considered to be Unicode, and
15177     * so must be individually translated into native, or because
15178     * its a subrange of 'A-Z' or 'a-z' which each aren't
15179     * contiguous in EBCDIC, but we have defined them to include
15180     * only the "expected" upper or lower case ASCII alphabetics.
15181     * Subranges above 255 are the same in native and Unicode, so
15182     * can be added as a range */
15183     U8 start = NATIVE_TO_LATIN1(prevvalue);
15184     unsigned j;
15185     U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
15186     for (j = start; j <= end; j++) {
15187      cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
15188     }
15189     if (value > 255) {
15190      cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15191                256, value);
15192     }
15193    }
15194 #endif
15195   }
15196
15197   range = 0; /* this range (if it was one) is done now */
15198  } /* End of loop through all the text within the brackets */
15199
15200  /* If anything in the class expands to more than one character, we have to
15201  * deal with them by building up a substitute parse string, and recursively
15202  * calling reg() on it, instead of proceeding */
15203  if (multi_char_matches) {
15204   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
15205   I32 cp_count;
15206   STRLEN len;
15207   char *save_end = RExC_end;
15208   char *save_parse = RExC_parse;
15209   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
15210          a "|" */
15211   I32 reg_flags;
15212
15213   assert(! invert);
15214 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
15215   because too confusing */
15216   if (invert) {
15217    sv_catpv(substitute_parse, "(?:");
15218   }
15219 #endif
15220
15221   /* Look at the longest folds first */
15222   for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
15223
15224    if (av_exists(multi_char_matches, cp_count)) {
15225     AV** this_array_ptr;
15226     SV* this_sequence;
15227
15228     this_array_ptr = (AV**) av_fetch(multi_char_matches,
15229             cp_count, FALSE);
15230     while ((this_sequence = av_pop(*this_array_ptr)) !=
15231                 &PL_sv_undef)
15232     {
15233      if (! first_time) {
15234       sv_catpv(substitute_parse, "|");
15235      }
15236      first_time = FALSE;
15237
15238      sv_catpv(substitute_parse, SvPVX(this_sequence));
15239     }
15240    }
15241   }
15242
15243   /* If the character class contains anything else besides these
15244   * multi-character folds, have to include it in recursive parsing */
15245   if (element_count) {
15246    sv_catpv(substitute_parse, "|[");
15247    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
15248    sv_catpv(substitute_parse, "]");
15249   }
15250
15251   sv_catpv(substitute_parse, ")");
15252 #if 0
15253   if (invert) {
15254    /* This is a way to get the parse to skip forward a whole named
15255    * sequence instead of matching the 2nd character when it fails the
15256    * first */
15257    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
15258   }
15259 #endif
15260
15261   RExC_parse = SvPV(substitute_parse, len);
15262   RExC_end = RExC_parse + len;
15263   RExC_in_multi_char_class = 1;
15264   RExC_override_recoding = 1;
15265   RExC_emit = (regnode *)orig_emit;
15266
15267   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
15268
15269   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
15270
15271   RExC_parse = save_parse;
15272   RExC_end = save_end;
15273   RExC_in_multi_char_class = 0;
15274   RExC_override_recoding = 0;
15275   SvREFCNT_dec_NN(multi_char_matches);
15276   return ret;
15277  }
15278
15279  /* Here, we've gone through the entire class and dealt with multi-char
15280  * folds.  We are now in a position that we can do some checks to see if we
15281  * can optimize this ANYOF node into a simpler one, even in Pass 1.
15282  * Currently we only do two checks:
15283  * 1) is in the unlikely event that the user has specified both, eg. \w and
15284  *    \W under /l, then the class matches everything.  (This optimization
15285  *    is done only to make the optimizer code run later work.)
15286  * 2) if the character class contains only a single element (including a
15287  *    single range), we see if there is an equivalent node for it.
15288  * Other checks are possible */
15289  if (! ret_invlist   /* Can't optimize if returning the constructed
15290       inversion list */
15291   && (UNLIKELY(posixl_matches_all) || element_count == 1))
15292  {
15293   U8 op = END;
15294   U8 arg = 0;
15295
15296   if (UNLIKELY(posixl_matches_all)) {
15297    op = SANY;
15298   }
15299   else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
15300             \w or [:digit:] or \p{foo}
15301             */
15302
15303    /* All named classes are mapped into POSIXish nodes, with its FLAG
15304    * argument giving which class it is */
15305    switch ((I32)namedclass) {
15306     case ANYOF_UNIPROP:
15307      break;
15308
15309     /* These don't depend on the charset modifiers.  They always
15310     * match under /u rules */
15311     case ANYOF_NHORIZWS:
15312     case ANYOF_HORIZWS:
15313      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
15314      /* FALLTHROUGH */
15315
15316     case ANYOF_NVERTWS:
15317     case ANYOF_VERTWS:
15318      op = POSIXU;
15319      goto join_posix;
15320
15321     /* The actual POSIXish node for all the rest depends on the
15322     * charset modifier.  The ones in the first set depend only on
15323     * ASCII or, if available on this platform, also locale */
15324     case ANYOF_ASCII:
15325     case ANYOF_NASCII:
15326 #ifdef HAS_ISASCII
15327      op = (LOC) ? POSIXL : POSIXA;
15328 #else
15329      op = POSIXA;
15330 #endif
15331      goto join_posix;
15332
15333     /* The following don't have any matches in the upper Latin1
15334     * range, hence /d is equivalent to /u for them.  Making it /u
15335     * saves some branches at runtime */
15336     case ANYOF_DIGIT:
15337     case ANYOF_NDIGIT:
15338     case ANYOF_XDIGIT:
15339     case ANYOF_NXDIGIT:
15340      if (! DEPENDS_SEMANTICS) {
15341       goto treat_as_default;
15342      }
15343
15344      op = POSIXU;
15345      goto join_posix;
15346
15347     /* The following change to CASED under /i */
15348     case ANYOF_LOWER:
15349     case ANYOF_NLOWER:
15350     case ANYOF_UPPER:
15351     case ANYOF_NUPPER:
15352      if (FOLD) {
15353       namedclass = ANYOF_CASED + (namedclass % 2);
15354      }
15355      /* FALLTHROUGH */
15356
15357     /* The rest have more possibilities depending on the charset.
15358     * We take advantage of the enum ordering of the charset
15359     * modifiers to get the exact node type, */
15360     default:
15361     treat_as_default:
15362      op = POSIXD + get_regex_charset(RExC_flags);
15363      if (op > POSIXA) { /* /aa is same as /a */
15364       op = POSIXA;
15365      }
15366
15367     join_posix:
15368      /* The odd numbered ones are the complements of the
15369      * next-lower even number one */
15370      if (namedclass % 2 == 1) {
15371       invert = ! invert;
15372       namedclass--;
15373      }
15374      arg = namedclass_to_classnum(namedclass);
15375      break;
15376    }
15377   }
15378   else if (value == prevvalue) {
15379
15380    /* Here, the class consists of just a single code point */
15381
15382    if (invert) {
15383     if (! LOC && value == '\n') {
15384      op = REG_ANY; /* Optimize [^\n] */
15385      *flagp |= HASWIDTH|SIMPLE;
15386      MARK_NAUGHTY(1);
15387     }
15388    }
15389    else if (value < 256 || UTF) {
15390
15391     /* Optimize a single value into an EXACTish node, but not if it
15392     * would require converting the pattern to UTF-8. */
15393     op = compute_EXACTish(pRExC_state);
15394    }
15395   } /* Otherwise is a range */
15396   else if (! LOC) {   /* locale could vary these */
15397    if (prevvalue == '0') {
15398     if (value == '9') {
15399      arg = _CC_DIGIT;
15400      op = POSIXA;
15401     }
15402    }
15403    else if (! FOLD || ASCII_FOLD_RESTRICTED) {
15404     /* We can optimize A-Z or a-z, but not if they could match
15405     * something like the KELVIN SIGN under /i. */
15406     if (prevvalue == 'A') {
15407      if (value == 'Z'
15408 #ifdef EBCDIC
15409       && ! non_portable_endpoint
15410 #endif
15411      ) {
15412       arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
15413       op = POSIXA;
15414      }
15415     }
15416     else if (prevvalue == 'a') {
15417      if (value == 'z'
15418 #ifdef EBCDIC
15419       && ! non_portable_endpoint
15420 #endif
15421      ) {
15422       arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
15423       op = POSIXA;
15424      }
15425     }
15426    }
15427   }
15428
15429   /* Here, we have changed <op> away from its initial value iff we found
15430   * an optimization */
15431   if (op != END) {
15432
15433    /* Throw away this ANYOF regnode, and emit the calculated one,
15434    * which should correspond to the beginning, not current, state of
15435    * the parse */
15436    const char * cur_parse = RExC_parse;
15437    RExC_parse = (char *)orig_parse;
15438    if ( SIZE_ONLY) {
15439     if (! LOC) {
15440
15441      /* To get locale nodes to not use the full ANYOF size would
15442      * require moving the code above that writes the portions
15443      * of it that aren't in other nodes to after this point.
15444      * e.g.  ANYOF_POSIXL_SET */
15445      RExC_size = orig_size;
15446     }
15447    }
15448    else {
15449     RExC_emit = (regnode *)orig_emit;
15450     if (PL_regkind[op] == POSIXD) {
15451      if (op == POSIXL) {
15452       RExC_contains_locale = 1;
15453      }
15454      if (invert) {
15455       op += NPOSIXD - POSIXD;
15456      }
15457     }
15458    }
15459
15460    ret = reg_node(pRExC_state, op);
15461
15462    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
15463     if (! SIZE_ONLY) {
15464      FLAGS(ret) = arg;
15465     }
15466     *flagp |= HASWIDTH|SIMPLE;
15467    }
15468    else if (PL_regkind[op] == EXACT) {
15469     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15470           TRUE /* downgradable to EXACT */
15471           );
15472    }
15473
15474    RExC_parse = (char *) cur_parse;
15475
15476    SvREFCNT_dec(posixes);
15477    SvREFCNT_dec(nposixes);
15478    SvREFCNT_dec(simple_posixes);
15479    SvREFCNT_dec(cp_list);
15480    SvREFCNT_dec(cp_foldable_list);
15481    return ret;
15482   }
15483  }
15484
15485  if (SIZE_ONLY)
15486   return ret;
15487  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
15488
15489  /* If folding, we calculate all characters that could fold to or from the
15490  * ones already on the list */
15491  if (cp_foldable_list) {
15492   if (FOLD) {
15493    UV start, end; /* End points of code point ranges */
15494
15495    SV* fold_intersection = NULL;
15496    SV** use_list;
15497
15498    /* Our calculated list will be for Unicode rules.  For locale
15499    * matching, we have to keep a separate list that is consulted at
15500    * runtime only when the locale indicates Unicode rules.  For
15501    * non-locale, we just use to the general list */
15502    if (LOC) {
15503     use_list = &only_utf8_locale_list;
15504    }
15505    else {
15506     use_list = &cp_list;
15507    }
15508
15509    /* Only the characters in this class that participate in folds need
15510    * be checked.  Get the intersection of this class and all the
15511    * possible characters that are foldable.  This can quickly narrow
15512    * down a large class */
15513    _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
15514         &fold_intersection);
15515
15516    /* The folds for all the Latin1 characters are hard-coded into this
15517    * program, but we have to go out to disk to get the others. */
15518    if (invlist_highest(cp_foldable_list) >= 256) {
15519
15520     /* This is a hash that for a particular fold gives all
15521     * characters that are involved in it */
15522     if (! PL_utf8_foldclosures) {
15523      _load_PL_utf8_foldclosures();
15524     }
15525    }
15526
15527    /* Now look at the foldable characters in this class individually */
15528    invlist_iterinit(fold_intersection);
15529    while (invlist_iternext(fold_intersection, &start, &end)) {
15530     UV j;
15531
15532     /* Look at every character in the range */
15533     for (j = start; j <= end; j++) {
15534      U8 foldbuf[UTF8_MAXBYTES_CASE+1];
15535      STRLEN foldlen;
15536      SV** listp;
15537
15538      if (j < 256) {
15539
15540       if (IS_IN_SOME_FOLD_L1(j)) {
15541
15542        /* ASCII is always matched; non-ASCII is matched
15543        * only under Unicode rules (which could happen
15544        * under /l if the locale is a UTF-8 one */
15545        if (isASCII(j) || ! DEPENDS_SEMANTICS) {
15546         *use_list = add_cp_to_invlist(*use_list,
15547                PL_fold_latin1[j]);
15548        }
15549        else {
15550         depends_list =
15551         add_cp_to_invlist(depends_list,
15552             PL_fold_latin1[j]);
15553        }
15554       }
15555
15556       if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
15557        && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
15558       {
15559        add_above_Latin1_folds(pRExC_state,
15560             (U8) j,
15561             use_list);
15562       }
15563       continue;
15564      }
15565
15566      /* Here is an above Latin1 character.  We don't have the
15567      * rules hard-coded for it.  First, get its fold.  This is
15568      * the simple fold, as the multi-character folds have been
15569      * handled earlier and separated out */
15570      _to_uni_fold_flags(j, foldbuf, &foldlen,
15571               (ASCII_FOLD_RESTRICTED)
15572               ? FOLD_FLAGS_NOMIX_ASCII
15573               : 0);
15574
15575      /* Single character fold of above Latin1.  Add everything in
15576      * its fold closure to the list that this node should match.
15577      * The fold closures data structure is a hash with the keys
15578      * being the UTF-8 of every character that is folded to, like
15579      * 'k', and the values each an array of all code points that
15580      * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
15581      * Multi-character folds are not included */
15582      if ((listp = hv_fetch(PL_utf8_foldclosures,
15583           (char *) foldbuf, foldlen, FALSE)))
15584      {
15585       AV* list = (AV*) *listp;
15586       IV k;
15587       for (k = 0; k <= av_tindex(list); k++) {
15588        SV** c_p = av_fetch(list, k, FALSE);
15589        UV c;
15590        assert(c_p);
15591
15592        c = SvUV(*c_p);
15593
15594        /* /aa doesn't allow folds between ASCII and non- */
15595        if ((ASCII_FOLD_RESTRICTED
15596         && (isASCII(c) != isASCII(j))))
15597        {
15598         continue;
15599        }
15600
15601        /* Folds under /l which cross the 255/256 boundary
15602        * are added to a separate list.  (These are valid
15603        * only when the locale is UTF-8.) */
15604        if (c < 256 && LOC) {
15605         *use_list = add_cp_to_invlist(*use_list, c);
15606         continue;
15607        }
15608
15609        if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15610        {
15611         cp_list = add_cp_to_invlist(cp_list, c);
15612        }
15613        else {
15614         /* Similarly folds involving non-ascii Latin1
15615         * characters under /d are added to their list */
15616         depends_list = add_cp_to_invlist(depends_list,
15617                 c);
15618        }
15619       }
15620      }
15621     }
15622    }
15623    SvREFCNT_dec_NN(fold_intersection);
15624   }
15625
15626   /* Now that we have finished adding all the folds, there is no reason
15627   * to keep the foldable list separate */
15628   _invlist_union(cp_list, cp_foldable_list, &cp_list);
15629   SvREFCNT_dec_NN(cp_foldable_list);
15630  }
15631
15632  /* And combine the result (if any) with any inversion list from posix
15633  * classes.  The lists are kept separate up to now because we don't want to
15634  * fold the classes (folding of those is automatically handled by the swash
15635  * fetching code) */
15636  if (simple_posixes) {
15637   _invlist_union(cp_list, simple_posixes, &cp_list);
15638   SvREFCNT_dec_NN(simple_posixes);
15639  }
15640  if (posixes || nposixes) {
15641   if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15642    /* Under /a and /aa, nothing above ASCII matches these */
15643    _invlist_intersection(posixes,
15644         PL_XPosix_ptrs[_CC_ASCII],
15645         &posixes);
15646   }
15647   if (nposixes) {
15648    if (DEPENDS_SEMANTICS) {
15649     /* Under /d, everything in the upper half of the Latin1 range
15650     * matches these complements */
15651     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15652    }
15653    else if (AT_LEAST_ASCII_RESTRICTED) {
15654     /* Under /a and /aa, everything above ASCII matches these
15655     * complements */
15656     _invlist_union_complement_2nd(nposixes,
15657            PL_XPosix_ptrs[_CC_ASCII],
15658            &nposixes);
15659    }
15660    if (posixes) {
15661     _invlist_union(posixes, nposixes, &posixes);
15662     SvREFCNT_dec_NN(nposixes);
15663    }
15664    else {
15665     posixes = nposixes;
15666    }
15667   }
15668   if (! DEPENDS_SEMANTICS) {
15669    if (cp_list) {
15670     _invlist_union(cp_list, posixes, &cp_list);
15671     SvREFCNT_dec_NN(posixes);
15672    }
15673    else {
15674     cp_list = posixes;
15675    }
15676   }
15677   else {
15678    /* Under /d, we put into a separate list the Latin1 things that
15679    * match only when the target string is utf8 */
15680    SV* nonascii_but_latin1_properties = NULL;
15681    _invlist_intersection(posixes, PL_UpperLatin1,
15682         &nonascii_but_latin1_properties);
15683    _invlist_subtract(posixes, nonascii_but_latin1_properties,
15684        &posixes);
15685    if (cp_list) {
15686     _invlist_union(cp_list, posixes, &cp_list);
15687     SvREFCNT_dec_NN(posixes);
15688    }
15689    else {
15690     cp_list = posixes;
15691    }
15692
15693    if (depends_list) {
15694     _invlist_union(depends_list, nonascii_but_latin1_properties,
15695        &depends_list);
15696     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15697    }
15698    else {
15699     depends_list = nonascii_but_latin1_properties;
15700    }
15701   }
15702  }
15703
15704  /* And combine the result (if any) with any inversion list from properties.
15705  * The lists are kept separate up to now so that we can distinguish the two
15706  * in regards to matching above-Unicode.  A run-time warning is generated
15707  * if a Unicode property is matched against a non-Unicode code point. But,
15708  * we allow user-defined properties to match anything, without any warning,
15709  * and we also suppress the warning if there is a portion of the character
15710  * class that isn't a Unicode property, and which matches above Unicode, \W
15711  * or [\x{110000}] for example.
15712  * (Note that in this case, unlike the Posix one above, there is no
15713  * <depends_list>, because having a Unicode property forces Unicode
15714  * semantics */
15715  if (properties) {
15716   if (cp_list) {
15717
15718    /* If it matters to the final outcome, see if a non-property
15719    * component of the class matches above Unicode.  If so, the
15720    * warning gets suppressed.  This is true even if just a single
15721    * such code point is specified, as though not strictly correct if
15722    * another such code point is matched against, the fact that they
15723    * are using above-Unicode code points indicates they should know
15724    * the issues involved */
15725    if (warn_super) {
15726     warn_super = ! (invert
15727        ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15728    }
15729
15730    _invlist_union(properties, cp_list, &cp_list);
15731    SvREFCNT_dec_NN(properties);
15732   }
15733   else {
15734    cp_list = properties;
15735   }
15736
15737   if (warn_super) {
15738    ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15739   }
15740  }
15741
15742  /* Here, we have calculated what code points should be in the character
15743  * class.
15744  *
15745  * Now we can see about various optimizations.  Fold calculation (which we
15746  * did above) needs to take place before inversion.  Otherwise /[^k]/i
15747  * would invert to include K, which under /i would match k, which it
15748  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
15749  * folded until runtime */
15750
15751  /* If we didn't do folding, it's because some information isn't available
15752  * until runtime; set the run-time fold flag for these.  (We don't have to
15753  * worry about properties folding, as that is taken care of by the swash
15754  * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
15755  * locales, or the class matches at least one 0-255 range code point */
15756  if (LOC && FOLD) {
15757   if (only_utf8_locale_list) {
15758    ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15759   }
15760   else if (cp_list) { /* Look to see if there a 0-255 code point is in
15761        the list */
15762    UV start, end;
15763    invlist_iterinit(cp_list);
15764    if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15765     ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15766    }
15767    invlist_iterfinish(cp_list);
15768   }
15769  }
15770
15771  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15772  * at compile time.  Besides not inverting folded locale now, we can't
15773  * invert if there are things such as \w, which aren't known until runtime
15774  * */
15775  if (cp_list
15776   && invert
15777   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15778   && ! depends_list
15779   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15780  {
15781   _invlist_invert(cp_list);
15782
15783   /* Any swash can't be used as-is, because we've inverted things */
15784   if (swash) {
15785    SvREFCNT_dec_NN(swash);
15786    swash = NULL;
15787   }
15788
15789   /* Clear the invert flag since have just done it here */
15790   invert = FALSE;
15791  }
15792
15793  if (ret_invlist) {
15794   assert(cp_list);
15795
15796   *ret_invlist = cp_list;
15797   SvREFCNT_dec(swash);
15798
15799   /* Discard the generated node */
15800   if (SIZE_ONLY) {
15801    RExC_size = orig_size;
15802   }
15803   else {
15804    RExC_emit = orig_emit;
15805   }
15806   return orig_emit;
15807  }
15808
15809  /* Some character classes are equivalent to other nodes.  Such nodes take
15810  * up less room and generally fewer operations to execute than ANYOF nodes.
15811  * Above, we checked for and optimized into some such equivalents for
15812  * certain common classes that are easy to test.  Getting to this point in
15813  * the code means that the class didn't get optimized there.  Since this
15814  * code is only executed in Pass 2, it is too late to save space--it has
15815  * been allocated in Pass 1, and currently isn't given back.  But turning
15816  * things into an EXACTish node can allow the optimizer to join it to any
15817  * adjacent such nodes.  And if the class is equivalent to things like /./,
15818  * expensive run-time swashes can be avoided.  Now that we have more
15819  * complete information, we can find things necessarily missed by the
15820  * earlier code.  I (khw) am not sure how much to look for here.  It would
15821  * be easy, but perhaps too slow, to check any candidates against all the
15822  * node types they could possibly match using _invlistEQ(). */
15823
15824  if (cp_list
15825   && ! invert
15826   && ! depends_list
15827   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15828   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15829
15830   /* We don't optimize if we are supposed to make sure all non-Unicode
15831    * code points raise a warning, as only ANYOF nodes have this check.
15832    * */
15833   && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15834  {
15835   UV start, end;
15836   U8 op = END;  /* The optimzation node-type */
15837   const char * cur_parse= RExC_parse;
15838
15839   invlist_iterinit(cp_list);
15840   if (! invlist_iternext(cp_list, &start, &end)) {
15841
15842    /* Here, the list is empty.  This happens, for example, when a
15843    * Unicode property is the only thing in the character class, and
15844    * it doesn't match anything.  (perluniprops.pod notes such
15845    * properties) */
15846    op = OPFAIL;
15847    *flagp |= HASWIDTH|SIMPLE;
15848   }
15849   else if (start == end) {    /* The range is a single code point */
15850    if (! invlist_iternext(cp_list, &start, &end)
15851
15852      /* Don't do this optimization if it would require changing
15853      * the pattern to UTF-8 */
15854     && (start < 256 || UTF))
15855    {
15856     /* Here, the list contains a single code point.  Can optimize
15857     * into an EXACTish node */
15858
15859     value = start;
15860
15861     if (! FOLD) {
15862      op = (LOC)
15863       ? EXACTL
15864       : EXACT;
15865     }
15866     else if (LOC) {
15867
15868      /* A locale node under folding with one code point can be
15869      * an EXACTFL, as its fold won't be calculated until
15870      * runtime */
15871      op = EXACTFL;
15872     }
15873     else {
15874
15875      /* Here, we are generally folding, but there is only one
15876      * code point to match.  If we have to, we use an EXACT
15877      * node, but it would be better for joining with adjacent
15878      * nodes in the optimization pass if we used the same
15879      * EXACTFish node that any such are likely to be.  We can
15880      * do this iff the code point doesn't participate in any
15881      * folds.  For example, an EXACTF of a colon is the same as
15882      * an EXACT one, since nothing folds to or from a colon. */
15883      if (value < 256) {
15884       if (IS_IN_SOME_FOLD_L1(value)) {
15885        op = EXACT;
15886       }
15887      }
15888      else {
15889       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15890        op = EXACT;
15891       }
15892      }
15893
15894      /* If we haven't found the node type, above, it means we
15895      * can use the prevailing one */
15896      if (op == END) {
15897       op = compute_EXACTish(pRExC_state);
15898      }
15899     }
15900    }
15901   }
15902   else if (start == 0) {
15903    if (end == UV_MAX) {
15904     op = SANY;
15905     *flagp |= HASWIDTH|SIMPLE;
15906     MARK_NAUGHTY(1);
15907    }
15908    else if (end == '\n' - 1
15909      && invlist_iternext(cp_list, &start, &end)
15910      && start == '\n' + 1 && end == UV_MAX)
15911    {
15912     op = REG_ANY;
15913     *flagp |= HASWIDTH|SIMPLE;
15914     MARK_NAUGHTY(1);
15915    }
15916   }
15917   invlist_iterfinish(cp_list);
15918
15919   if (op != END) {
15920    RExC_parse = (char *)orig_parse;
15921    RExC_emit = (regnode *)orig_emit;
15922
15923    ret = reg_node(pRExC_state, op);
15924
15925    RExC_parse = (char *)cur_parse;
15926
15927    if (PL_regkind[op] == EXACT) {
15928     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15929           TRUE /* downgradable to EXACT */
15930           );
15931    }
15932
15933    SvREFCNT_dec_NN(cp_list);
15934    return ret;
15935   }
15936  }
15937
15938  /* Here, <cp_list> contains all the code points we can determine at
15939  * compile time that match under all conditions.  Go through it, and
15940  * for things that belong in the bitmap, put them there, and delete from
15941  * <cp_list>.  While we are at it, see if everything above 255 is in the
15942  * list, and if so, set a flag to speed up execution */
15943
15944  populate_ANYOF_from_invlist(ret, &cp_list);
15945
15946  if (invert) {
15947   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15948  }
15949
15950  /* Here, the bitmap has been populated with all the Latin1 code points that
15951  * always match.  Can now add to the overall list those that match only
15952  * when the target string is UTF-8 (<depends_list>). */
15953  if (depends_list) {
15954   if (cp_list) {
15955    _invlist_union(cp_list, depends_list, &cp_list);
15956    SvREFCNT_dec_NN(depends_list);
15957   }
15958   else {
15959    cp_list = depends_list;
15960   }
15961   ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15962  }
15963
15964  /* If there is a swash and more than one element, we can't use the swash in
15965  * the optimization below. */
15966  if (swash && element_count > 1) {
15967   SvREFCNT_dec_NN(swash);
15968   swash = NULL;
15969  }
15970
15971  /* Note that the optimization of using 'swash' if it is the only thing in
15972  * the class doesn't have us change swash at all, so it can include things
15973  * that are also in the bitmap; otherwise we have purposely deleted that
15974  * duplicate information */
15975  set_ANYOF_arg(pRExC_state, ret, cp_list,
15976     (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15977     ? listsv : NULL,
15978     only_utf8_locale_list,
15979     swash, has_user_defined_property);
15980
15981  *flagp |= HASWIDTH|SIMPLE;
15982
15983  if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15984   RExC_contains_locale = 1;
15985  }
15986
15987  return ret;
15988 }
15989
15990 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15991
15992 STATIC void
15993 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15994     regnode* const node,
15995     SV* const cp_list,
15996     SV* const runtime_defns,
15997     SV* const only_utf8_locale_list,
15998     SV* const swash,
15999     const bool has_user_defined_property)
16000 {
16001  /* Sets the arg field of an ANYOF-type node 'node', using information about
16002  * the node passed-in.  If there is nothing outside the node's bitmap, the
16003  * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
16004  * the count returned by add_data(), having allocated and stored an array,
16005  * av, that that count references, as follows:
16006  *  av[0] stores the character class description in its textual form.
16007  *        This is used later (regexec.c:Perl_regclass_swash()) to
16008  *        initialize the appropriate swash, and is also useful for dumping
16009  *        the regnode.  This is set to &PL_sv_undef if the textual
16010  *        description is not needed at run-time (as happens if the other
16011  *        elements completely define the class)
16012  *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
16013  *        computed from av[0].  But if no further computation need be done,
16014  *        the swash is stored here now (and av[0] is &PL_sv_undef).
16015  *  av[2] stores the inversion list of code points that match only if the
16016  *        current locale is UTF-8
16017  *  av[3] stores the cp_list inversion list for use in addition or instead
16018  *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
16019  *        (Otherwise everything needed is already in av[0] and av[1])
16020  *  av[4] is set if any component of the class is from a user-defined
16021  *        property; used only if av[3] exists */
16022
16023  UV n;
16024
16025  PERL_ARGS_ASSERT_SET_ANYOF_ARG;
16026
16027  if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
16028   assert(! (ANYOF_FLAGS(node)
16029     & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16030      |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
16031   ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
16032  }
16033  else {
16034   AV * const av = newAV();
16035   SV *rv;
16036
16037   assert(ANYOF_FLAGS(node)
16038    & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16039     |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
16040
16041   av_store(av, 0, (runtime_defns)
16042       ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
16043   if (swash) {
16044    assert(cp_list);
16045    av_store(av, 1, swash);
16046    SvREFCNT_dec_NN(cp_list);
16047   }
16048   else {
16049    av_store(av, 1, &PL_sv_undef);
16050    if (cp_list) {
16051     av_store(av, 3, cp_list);
16052     av_store(av, 4, newSVuv(has_user_defined_property));
16053    }
16054   }
16055
16056   if (only_utf8_locale_list) {
16057    av_store(av, 2, only_utf8_locale_list);
16058   }
16059   else {
16060    av_store(av, 2, &PL_sv_undef);
16061   }
16062
16063   rv = newRV_noinc(MUTABLE_SV(av));
16064   n = add_data(pRExC_state, STR_WITH_LEN("s"));
16065   RExC_rxi->data->data[n] = (void*)rv;
16066   ARG_SET(node, n);
16067  }
16068 }
16069
16070 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
16071 SV *
16072 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
16073           const regnode* node,
16074           bool doinit,
16075           SV** listsvp,
16076           SV** only_utf8_locale_ptr,
16077           SV*  exclude_list)
16078
16079 {
16080  /* For internal core use only.
16081  * Returns the swash for the input 'node' in the regex 'prog'.
16082  * If <doinit> is 'true', will attempt to create the swash if not already
16083  *   done.
16084  * If <listsvp> is non-null, will return the printable contents of the
16085  *    swash.  This can be used to get debugging information even before the
16086  *    swash exists, by calling this function with 'doinit' set to false, in
16087  *    which case the components that will be used to eventually create the
16088  *    swash are returned  (in a printable form).
16089  * If <exclude_list> is not NULL, it is an inversion list of things to
16090  *    exclude from what's returned in <listsvp>.
16091  * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
16092  * that, in spite of this function's name, the swash it returns may include
16093  * the bitmap data as well */
16094
16095  SV *sw  = NULL;
16096  SV *si  = NULL;         /* Input swash initialization string */
16097  SV*  invlist = NULL;
16098
16099  RXi_GET_DECL(prog,progi);
16100  const struct reg_data * const data = prog ? progi->data : NULL;
16101
16102  PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
16103
16104  assert(ANYOF_FLAGS(node)
16105   & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16106   |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
16107
16108  if (data && data->count) {
16109   const U32 n = ARG(node);
16110
16111   if (data->what[n] == 's') {
16112    SV * const rv = MUTABLE_SV(data->data[n]);
16113    AV * const av = MUTABLE_AV(SvRV(rv));
16114    SV **const ary = AvARRAY(av);
16115    U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
16116
16117    si = *ary; /* ary[0] = the string to initialize the swash with */
16118
16119    /* Elements 3 and 4 are either both present or both absent. [3] is
16120    * any inversion list generated at compile time; [4] indicates if
16121    * that inversion list has any user-defined properties in it. */
16122    if (av_tindex(av) >= 2) {
16123     if (only_utf8_locale_ptr
16124      && ary[2]
16125      && ary[2] != &PL_sv_undef)
16126     {
16127      *only_utf8_locale_ptr = ary[2];
16128     }
16129     else {
16130      assert(only_utf8_locale_ptr);
16131      *only_utf8_locale_ptr = NULL;
16132     }
16133
16134     if (av_tindex(av) >= 3) {
16135      invlist = ary[3];
16136      if (SvUV(ary[4])) {
16137       swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
16138      }
16139     }
16140     else {
16141      invlist = NULL;
16142     }
16143    }
16144
16145    /* Element [1] is reserved for the set-up swash.  If already there,
16146    * return it; if not, create it and store it there */
16147    if (ary[1] && SvROK(ary[1])) {
16148     sw = ary[1];
16149    }
16150    else if (doinit && ((si && si != &PL_sv_undef)
16151         || (invlist && invlist != &PL_sv_undef))) {
16152     assert(si);
16153     sw = _core_swash_init("utf8", /* the utf8 package */
16154          "", /* nameless */
16155          si,
16156          1, /* binary */
16157          0, /* not from tr/// */
16158          invlist,
16159          &swash_init_flags);
16160     (void)av_store(av, 1, sw);
16161    }
16162   }
16163  }
16164
16165  /* If requested, return a printable version of what this swash matches */
16166  if (listsvp) {
16167   SV* matches_string = newSVpvs("");
16168
16169   /* The swash should be used, if possible, to get the data, as it
16170   * contains the resolved data.  But this function can be called at
16171   * compile-time, before everything gets resolved, in which case we
16172   * return the currently best available information, which is the string
16173   * that will eventually be used to do that resolving, 'si' */
16174   if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
16175    && (si && si != &PL_sv_undef))
16176   {
16177    sv_catsv(matches_string, si);
16178   }
16179
16180   /* Add the inversion list to whatever we have.  This may have come from
16181   * the swash, or from an input parameter */
16182   if (invlist) {
16183    if (exclude_list) {
16184     SV* clone = invlist_clone(invlist);
16185     _invlist_subtract(clone, exclude_list, &clone);
16186     sv_catsv(matches_string, _invlist_contents(clone));
16187     SvREFCNT_dec_NN(clone);
16188    }
16189    else {
16190     sv_catsv(matches_string, _invlist_contents(invlist));
16191    }
16192   }
16193   *listsvp = matches_string;
16194  }
16195
16196  return sw;
16197 }
16198 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
16199
16200 /* reg_skipcomment()
16201
16202    Absorbs an /x style # comment from the input stream,
16203    returning a pointer to the first character beyond the comment, or if the
16204    comment terminates the pattern without anything following it, this returns
16205    one past the final character of the pattern (in other words, RExC_end) and
16206    sets the REG_RUN_ON_COMMENT_SEEN flag.
16207
16208    Note it's the callers responsibility to ensure that we are
16209    actually in /x mode
16210
16211 */
16212
16213 PERL_STATIC_INLINE char*
16214 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
16215 {
16216  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
16217
16218  assert(*p == '#');
16219
16220  while (p < RExC_end) {
16221   if (*(++p) == '\n') {
16222    return p+1;
16223   }
16224  }
16225
16226  /* we ran off the end of the pattern without ending the comment, so we have
16227  * to add an \n when wrapping */
16228  RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
16229  return p;
16230 }
16231
16232 /* nextchar()
16233
16234    Advances the parse position, and optionally absorbs
16235    "whitespace" from the inputstream.
16236
16237    Without /x "whitespace" means (?#...) style comments only,
16238    with /x this means (?#...) and # comments and whitespace proper.
16239
16240    Returns the RExC_parse point from BEFORE the scan occurs.
16241
16242    This is the /x friendly way of saying RExC_parse++.
16243 */
16244
16245 STATIC char*
16246 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
16247 {
16248  char* const retval = RExC_parse++;
16249
16250  PERL_ARGS_ASSERT_NEXTCHAR;
16251
16252  for (;;) {
16253   if (RExC_end - RExC_parse >= 3
16254    && *RExC_parse == '('
16255    && RExC_parse[1] == '?'
16256    && RExC_parse[2] == '#')
16257   {
16258    while (*RExC_parse != ')') {
16259     if (RExC_parse == RExC_end)
16260      FAIL("Sequence (?#... not terminated");
16261     RExC_parse++;
16262    }
16263    RExC_parse++;
16264    continue;
16265   }
16266   if (RExC_flags & RXf_PMf_EXTENDED) {
16267    char * p = regpatws(pRExC_state, RExC_parse,
16268           TRUE); /* means recognize comments */
16269    if (p != RExC_parse) {
16270     RExC_parse = p;
16271     continue;
16272    }
16273   }
16274   return retval;
16275  }
16276 }
16277
16278 STATIC regnode *
16279 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
16280 {
16281  /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
16282  * space.  In pass1, it aligns and increments RExC_size; in pass2,
16283  * RExC_emit */
16284
16285  regnode * const ret = RExC_emit;
16286  GET_RE_DEBUG_FLAGS_DECL;
16287
16288  PERL_ARGS_ASSERT_REGNODE_GUTS;
16289
16290  assert(extra_size >= regarglen[op]);
16291
16292  if (SIZE_ONLY) {
16293   SIZE_ALIGN(RExC_size);
16294   RExC_size += 1 + extra_size;
16295   return(ret);
16296  }
16297  if (RExC_emit >= RExC_emit_bound)
16298   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
16299     op, (void*)RExC_emit, (void*)RExC_emit_bound);
16300
16301  NODE_ALIGN_FILL(ret);
16302 #ifndef RE_TRACK_PATTERN_OFFSETS
16303  PERL_UNUSED_ARG(name);
16304 #else
16305  if (RExC_offsets) {         /* MJD */
16306   MJD_OFFSET_DEBUG(
16307    ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
16308    name, __LINE__,
16309    PL_reg_name[op],
16310    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
16311     ? "Overwriting end of array!\n" : "OK",
16312    (UV)(RExC_emit - RExC_emit_start),
16313    (UV)(RExC_parse - RExC_start),
16314    (UV)RExC_offsets[0]));
16315   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
16316  }
16317 #endif
16318  return(ret);
16319 }
16320
16321 /*
16322 - reg_node - emit a node
16323 */
16324 STATIC regnode *   /* Location. */
16325 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
16326 {
16327  regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
16328
16329  PERL_ARGS_ASSERT_REG_NODE;
16330
16331  assert(regarglen[op] == 0);
16332
16333  if (PASS2) {
16334   regnode *ptr = ret;
16335   FILL_ADVANCE_NODE(ptr, op);
16336  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
16337   RExC_emit = ptr;
16338  }
16339  return(ret);
16340 }
16341
16342 /*
16343 - reganode - emit a node with an argument
16344 */
16345 STATIC regnode *   /* Location. */
16346 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
16347 {
16348  regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
16349
16350  PERL_ARGS_ASSERT_REGANODE;
16351
16352  assert(regarglen[op] == 1);
16353
16354  if (PASS2) {
16355   regnode *ptr = ret;
16356   FILL_ADVANCE_NODE_ARG(ptr, op, arg);
16357  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
16358   RExC_emit = ptr;
16359  }
16360  return(ret);
16361 }
16362
16363 STATIC regnode *
16364 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
16365 {
16366  /* emit a node with U32 and I32 arguments */
16367
16368  regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
16369
16370  PERL_ARGS_ASSERT_REG2LANODE;
16371
16372  assert(regarglen[op] == 2);
16373
16374  if (PASS2) {
16375   regnode *ptr = ret;
16376   FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
16377   RExC_emit = ptr;
16378  }
16379  return(ret);
16380 }
16381
16382 /*
16383 - reginsert - insert an operator in front of already-emitted operand
16384 *
16385 * Means relocating the operand.
16386 */
16387 STATIC void
16388 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
16389 {
16390  regnode *src;
16391  regnode *dst;
16392  regnode *place;
16393  const int offset = regarglen[(U8)op];
16394  const int size = NODE_STEP_REGNODE + offset;
16395  GET_RE_DEBUG_FLAGS_DECL;
16396
16397  PERL_ARGS_ASSERT_REGINSERT;
16398  PERL_UNUSED_CONTEXT;
16399  PERL_UNUSED_ARG(depth);
16400 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
16401  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
16402  if (SIZE_ONLY) {
16403   RExC_size += size;
16404   return;
16405  }
16406
16407  src = RExC_emit;
16408  RExC_emit += size;
16409  dst = RExC_emit;
16410  if (RExC_open_parens) {
16411   int paren;
16412   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
16413   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
16414    if ( RExC_open_parens[paren] >= opnd ) {
16415     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
16416     RExC_open_parens[paren] += size;
16417    } else {
16418     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
16419    }
16420    if ( RExC_close_parens[paren] >= opnd ) {
16421     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
16422     RExC_close_parens[paren] += size;
16423    } else {
16424     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
16425    }
16426   }
16427  }
16428
16429  while (src > opnd) {
16430   StructCopy(--src, --dst, regnode);
16431 #ifdef RE_TRACK_PATTERN_OFFSETS
16432   if (RExC_offsets) {     /* MJD 20010112 */
16433    MJD_OFFSET_DEBUG(
16434     ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
16435     "reg_insert",
16436     __LINE__,
16437     PL_reg_name[op],
16438     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
16439      ? "Overwriting end of array!\n" : "OK",
16440     (UV)(src - RExC_emit_start),
16441     (UV)(dst - RExC_emit_start),
16442     (UV)RExC_offsets[0]));
16443    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
16444    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
16445   }
16446 #endif
16447  }
16448
16449
16450  place = opnd;  /* Op node, where operand used to be. */
16451 #ifdef RE_TRACK_PATTERN_OFFSETS
16452  if (RExC_offsets) {         /* MJD */
16453   MJD_OFFSET_DEBUG(
16454    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
16455    "reginsert",
16456    __LINE__,
16457    PL_reg_name[op],
16458    (UV)(place - RExC_emit_start) > RExC_offsets[0]
16459    ? "Overwriting end of array!\n" : "OK",
16460    (UV)(place - RExC_emit_start),
16461    (UV)(RExC_parse - RExC_start),
16462    (UV)RExC_offsets[0]));
16463   Set_Node_Offset(place, RExC_parse);
16464   Set_Node_Length(place, 1);
16465  }
16466 #endif
16467  src = NEXTOPER(place);
16468  FILL_ADVANCE_NODE(place, op);
16469  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
16470  Zero(src, offset, regnode);
16471 }
16472
16473 /*
16474 - regtail - set the next-pointer at the end of a node chain of p to val.
16475 - SEE ALSO: regtail_study
16476 */
16477 /* TODO: All three parms should be const */
16478 STATIC void
16479 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16480     const regnode *val,U32 depth)
16481 {
16482  regnode *scan;
16483  GET_RE_DEBUG_FLAGS_DECL;
16484
16485  PERL_ARGS_ASSERT_REGTAIL;
16486 #ifndef DEBUGGING
16487  PERL_UNUSED_ARG(depth);
16488 #endif
16489
16490  if (SIZE_ONLY)
16491   return;
16492
16493  /* Find last node. */
16494  scan = p;
16495  for (;;) {
16496   regnode * const temp = regnext(scan);
16497   DEBUG_PARSE_r({
16498    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
16499    regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16500    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
16501     SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
16502      (temp == NULL ? "->" : ""),
16503      (temp == NULL ? PL_reg_name[OP(val)] : "")
16504    );
16505   });
16506   if (temp == NULL)
16507    break;
16508   scan = temp;
16509  }
16510
16511  if (reg_off_by_arg[OP(scan)]) {
16512   ARG_SET(scan, val - scan);
16513  }
16514  else {
16515   NEXT_OFF(scan) = val - scan;
16516  }
16517 }
16518
16519 #ifdef DEBUGGING
16520 /*
16521 - regtail_study - set the next-pointer at the end of a node chain of p to val.
16522 - Look for optimizable sequences at the same time.
16523 - currently only looks for EXACT chains.
16524
16525 This is experimental code. The idea is to use this routine to perform
16526 in place optimizations on branches and groups as they are constructed,
16527 with the long term intention of removing optimization from study_chunk so
16528 that it is purely analytical.
16529
16530 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
16531 to control which is which.
16532
16533 */
16534 /* TODO: All four parms should be const */
16535
16536 STATIC U8
16537 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16538      const regnode *val,U32 depth)
16539 {
16540  regnode *scan;
16541  U8 exact = PSEUDO;
16542 #ifdef EXPERIMENTAL_INPLACESCAN
16543  I32 min = 0;
16544 #endif
16545  GET_RE_DEBUG_FLAGS_DECL;
16546
16547  PERL_ARGS_ASSERT_REGTAIL_STUDY;
16548
16549
16550  if (SIZE_ONLY)
16551   return exact;
16552
16553  /* Find last node. */
16554
16555  scan = p;
16556  for (;;) {
16557   regnode * const temp = regnext(scan);
16558 #ifdef EXPERIMENTAL_INPLACESCAN
16559   if (PL_regkind[OP(scan)] == EXACT) {
16560    bool unfolded_multi_char; /* Unexamined in this routine */
16561    if (join_exact(pRExC_state, scan, &min,
16562       &unfolded_multi_char, 1, val, depth+1))
16563     return EXACT;
16564   }
16565 #endif
16566   if ( exact ) {
16567    switch (OP(scan)) {
16568     case EXACT:
16569     case EXACTL:
16570     case EXACTF:
16571     case EXACTFA_NO_TRIE:
16572     case EXACTFA:
16573     case EXACTFU:
16574     case EXACTFLU8:
16575     case EXACTFU_SS:
16576     case EXACTFL:
16577       if( exact == PSEUDO )
16578        exact= OP(scan);
16579       else if ( exact != OP(scan) )
16580        exact= 0;
16581     case NOTHING:
16582      break;
16583     default:
16584      exact= 0;
16585    }
16586   }
16587   DEBUG_PARSE_r({
16588    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
16589    regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16590    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
16591     SvPV_nolen_const(RExC_mysv),
16592     REG_NODE_NUM(scan),
16593     PL_reg_name[exact]);
16594   });
16595   if (temp == NULL)
16596    break;
16597   scan = temp;
16598  }
16599  DEBUG_PARSE_r({
16600   DEBUG_PARSE_MSG("");
16601   regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
16602   PerlIO_printf(Perl_debug_log,
16603      "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
16604      SvPV_nolen_const(RExC_mysv),
16605      (IV)REG_NODE_NUM(val),
16606      (IV)(val - scan)
16607   );
16608  });
16609  if (reg_off_by_arg[OP(scan)]) {
16610   ARG_SET(scan, val - scan);
16611  }
16612  else {
16613   NEXT_OFF(scan) = val - scan;
16614  }
16615
16616  return exact;
16617 }
16618 #endif
16619
16620 /*
16621  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
16622  */
16623 #ifdef DEBUGGING
16624
16625 static void
16626 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
16627 {
16628  int bit;
16629  int set=0;
16630
16631  ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16632
16633  for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
16634   if (flags & (1<<bit)) {
16635    if (!set++ && lead)
16636     PerlIO_printf(Perl_debug_log, "%s",lead);
16637    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16638   }
16639  }
16640  if (lead)  {
16641   if (set)
16642    PerlIO_printf(Perl_debug_log, "\n");
16643   else
16644    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16645  }
16646 }
16647
16648 static void
16649 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16650 {
16651  int bit;
16652  int set=0;
16653  regex_charset cs;
16654
16655  ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16656
16657  for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16658   if (flags & (1<<bit)) {
16659    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
16660     continue;
16661    }
16662    if (!set++ && lead)
16663     PerlIO_printf(Perl_debug_log, "%s",lead);
16664    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16665   }
16666  }
16667  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16668    if (!set++ && lead) {
16669     PerlIO_printf(Perl_debug_log, "%s",lead);
16670    }
16671    switch (cs) {
16672     case REGEX_UNICODE_CHARSET:
16673      PerlIO_printf(Perl_debug_log, "UNICODE");
16674      break;
16675     case REGEX_LOCALE_CHARSET:
16676      PerlIO_printf(Perl_debug_log, "LOCALE");
16677      break;
16678     case REGEX_ASCII_RESTRICTED_CHARSET:
16679      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16680      break;
16681     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16682      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16683      break;
16684     default:
16685      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16686      break;
16687    }
16688  }
16689  if (lead)  {
16690   if (set)
16691    PerlIO_printf(Perl_debug_log, "\n");
16692   else
16693    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16694  }
16695 }
16696 #endif
16697
16698 void
16699 Perl_regdump(pTHX_ const regexp *r)
16700 {
16701 #ifdef DEBUGGING
16702  SV * const sv = sv_newmortal();
16703  SV *dsv= sv_newmortal();
16704  RXi_GET_DECL(r,ri);
16705  GET_RE_DEBUG_FLAGS_DECL;
16706
16707  PERL_ARGS_ASSERT_REGDUMP;
16708
16709  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16710
16711  /* Header fields of interest. */
16712  if (r->anchored_substr) {
16713   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16714    RE_SV_DUMPLEN(r->anchored_substr), 30);
16715   PerlIO_printf(Perl_debug_log,
16716      "anchored %s%s at %"IVdf" ",
16717      s, RE_SV_TAIL(r->anchored_substr),
16718      (IV)r->anchored_offset);
16719  } else if (r->anchored_utf8) {
16720   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16721    RE_SV_DUMPLEN(r->anchored_utf8), 30);
16722   PerlIO_printf(Perl_debug_log,
16723      "anchored utf8 %s%s at %"IVdf" ",
16724      s, RE_SV_TAIL(r->anchored_utf8),
16725      (IV)r->anchored_offset);
16726  }
16727  if (r->float_substr) {
16728   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16729    RE_SV_DUMPLEN(r->float_substr), 30);
16730   PerlIO_printf(Perl_debug_log,
16731      "floating %s%s at %"IVdf"..%"UVuf" ",
16732      s, RE_SV_TAIL(r->float_substr),
16733      (IV)r->float_min_offset, (UV)r->float_max_offset);
16734  } else if (r->float_utf8) {
16735   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16736    RE_SV_DUMPLEN(r->float_utf8), 30);
16737   PerlIO_printf(Perl_debug_log,
16738      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16739      s, RE_SV_TAIL(r->float_utf8),
16740      (IV)r->float_min_offset, (UV)r->float_max_offset);
16741  }
16742  if (r->check_substr || r->check_utf8)
16743   PerlIO_printf(Perl_debug_log,
16744      (const char *)
16745      (r->check_substr == r->float_substr
16746      && r->check_utf8 == r->float_utf8
16747      ? "(checking floating" : "(checking anchored"));
16748  if (r->intflags & PREGf_NOSCAN)
16749   PerlIO_printf(Perl_debug_log, " noscan");
16750  if (r->extflags & RXf_CHECK_ALL)
16751   PerlIO_printf(Perl_debug_log, " isall");
16752  if (r->check_substr || r->check_utf8)
16753   PerlIO_printf(Perl_debug_log, ") ");
16754
16755  if (ri->regstclass) {
16756   regprop(r, sv, ri->regstclass, NULL, NULL);
16757   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16758  }
16759  if (r->intflags & PREGf_ANCH) {
16760   PerlIO_printf(Perl_debug_log, "anchored");
16761   if (r->intflags & PREGf_ANCH_MBOL)
16762    PerlIO_printf(Perl_debug_log, "(MBOL)");
16763   if (r->intflags & PREGf_ANCH_SBOL)
16764    PerlIO_printf(Perl_debug_log, "(SBOL)");
16765   if (r->intflags & PREGf_ANCH_GPOS)
16766    PerlIO_printf(Perl_debug_log, "(GPOS)");
16767   PerlIO_putc(Perl_debug_log, ' ');
16768  }
16769  if (r->intflags & PREGf_GPOS_SEEN)
16770   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16771  if (r->intflags & PREGf_SKIP)
16772   PerlIO_printf(Perl_debug_log, "plus ");
16773  if (r->intflags & PREGf_IMPLICIT)
16774   PerlIO_printf(Perl_debug_log, "implicit ");
16775  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16776  if (r->extflags & RXf_EVAL_SEEN)
16777   PerlIO_printf(Perl_debug_log, "with eval ");
16778  PerlIO_printf(Perl_debug_log, "\n");
16779  DEBUG_FLAGS_r({
16780   regdump_extflags("r->extflags: ",r->extflags);
16781   regdump_intflags("r->intflags: ",r->intflags);
16782  });
16783 #else
16784  PERL_ARGS_ASSERT_REGDUMP;
16785  PERL_UNUSED_CONTEXT;
16786  PERL_UNUSED_ARG(r);
16787 #endif /* DEBUGGING */
16788 }
16789
16790 /*
16791 - regprop - printable representation of opcode, with run time support
16792 */
16793
16794 void
16795 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16796 {
16797 #ifdef DEBUGGING
16798  int k;
16799
16800  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16801  static const char * const anyofs[] = {
16802 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16803  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
16804  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
16805  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
16806  || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
16807   #error Need to adjust order of anyofs[]
16808 #endif
16809   "\\w",
16810   "\\W",
16811   "\\d",
16812   "\\D",
16813   "[:alpha:]",
16814   "[:^alpha:]",
16815   "[:lower:]",
16816   "[:^lower:]",
16817   "[:upper:]",
16818   "[:^upper:]",
16819   "[:punct:]",
16820   "[:^punct:]",
16821   "[:print:]",
16822   "[:^print:]",
16823   "[:alnum:]",
16824   "[:^alnum:]",
16825   "[:graph:]",
16826   "[:^graph:]",
16827   "[:cased:]",
16828   "[:^cased:]",
16829   "\\s",
16830   "\\S",
16831   "[:blank:]",
16832   "[:^blank:]",
16833   "[:xdigit:]",
16834   "[:^xdigit:]",
16835   "[:cntrl:]",
16836   "[:^cntrl:]",
16837   "[:ascii:]",
16838   "[:^ascii:]",
16839   "\\v",
16840   "\\V"
16841  };
16842  RXi_GET_DECL(prog,progi);
16843  GET_RE_DEBUG_FLAGS_DECL;
16844
16845  PERL_ARGS_ASSERT_REGPROP;
16846
16847  sv_setpvn(sv, "", 0);
16848
16849  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
16850   /* It would be nice to FAIL() here, but this may be called from
16851   regexec.c, and it would be hard to supply pRExC_state. */
16852   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16853            (int)OP(o), (int)REGNODE_MAX);
16854  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16855
16856  k = PL_regkind[OP(o)];
16857
16858  if (k == EXACT) {
16859   sv_catpvs(sv, " ");
16860   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16861   * is a crude hack but it may be the best for now since
16862   * we have no flag "this EXACTish node was UTF-8"
16863   * --jhi */
16864   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16865     PERL_PV_ESCAPE_UNI_DETECT |
16866     PERL_PV_ESCAPE_NONASCII   |
16867     PERL_PV_PRETTY_ELLIPSES   |
16868     PERL_PV_PRETTY_LTGT       |
16869     PERL_PV_PRETTY_NOCLEAR
16870     );
16871  } else if (k == TRIE) {
16872   /* print the details of the trie in dumpuntil instead, as
16873   * progi->data isn't available here */
16874   const char op = OP(o);
16875   const U32 n = ARG(o);
16876   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16877    (reg_ac_data *)progi->data->data[n] :
16878    NULL;
16879   const reg_trie_data * const trie
16880    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16881
16882   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16883   DEBUG_TRIE_COMPILE_r(
16884   Perl_sv_catpvf(aTHX_ sv,
16885    "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16886    (UV)trie->startstate,
16887    (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16888    (UV)trie->wordcount,
16889    (UV)trie->minlen,
16890    (UV)trie->maxlen,
16891    (UV)TRIE_CHARCOUNT(trie),
16892    (UV)trie->uniquecharcount
16893   );
16894   );
16895   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16896    sv_catpvs(sv, "[");
16897    (void) put_charclass_bitmap_innards(sv,
16898             (IS_ANYOF_TRIE(op))
16899             ? ANYOF_BITMAP(o)
16900             : TRIE_BITMAP(trie),
16901             NULL);
16902    sv_catpvs(sv, "]");
16903   }
16904
16905  } else if (k == CURLY) {
16906   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16907    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16908   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16909  }
16910  else if (k == WHILEM && o->flags)   /* Ordinal/of */
16911   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16912  else if (k == REF || k == OPEN || k == CLOSE
16913    || k == GROUPP || OP(o)==ACCEPT)
16914  {
16915   AV *name_list= NULL;
16916   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
16917   if ( RXp_PAREN_NAMES(prog) ) {
16918    name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16919   } else if ( pRExC_state ) {
16920    name_list= RExC_paren_name_list;
16921   }
16922   if (name_list) {
16923    if ( k != REF || (OP(o) < NREF)) {
16924     SV **name= av_fetch(name_list, ARG(o), 0 );
16925     if (name)
16926      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16927    }
16928    else {
16929     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16930     I32 *nums=(I32*)SvPVX(sv_dat);
16931     SV **name= av_fetch(name_list, nums[0], 0 );
16932     I32 n;
16933     if (name) {
16934      for ( n=0; n<SvIVX(sv_dat); n++ ) {
16935       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16936          (n ? "," : ""), (IV)nums[n]);
16937      }
16938      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16939     }
16940    }
16941   }
16942   if ( k == REF && reginfo) {
16943    U32 n = ARG(o);  /* which paren pair */
16944    I32 ln = prog->offs[n].start;
16945    if (prog->lastparen < n || ln == -1)
16946     Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16947    else if (ln == prog->offs[n].end)
16948     Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16949    else {
16950     const char *s = reginfo->strbeg + ln;
16951     Perl_sv_catpvf(aTHX_ sv, ": ");
16952     Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16953      PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16954    }
16955   }
16956  } else if (k == GOSUB) {
16957   AV *name_list= NULL;
16958   if ( RXp_PAREN_NAMES(prog) ) {
16959    name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16960   } else if ( pRExC_state ) {
16961    name_list= RExC_paren_name_list;
16962   }
16963
16964   /* Paren and offset */
16965   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16966   if (name_list) {
16967    SV **name= av_fetch(name_list, ARG(o), 0 );
16968    if (name)
16969     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16970   }
16971  }
16972  else if (k == VERB) {
16973   if (!o->flags)
16974    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16975       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16976  } else if (k == LOGICAL)
16977   /* 2: embedded, otherwise 1 */
16978   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16979  else if (k == ANYOF) {
16980   const U8 flags = ANYOF_FLAGS(o);
16981   int do_sep = 0;
16982   SV* bitmap_invlist;  /* Will hold what the bit map contains */
16983
16984
16985   if (OP(o) == ANYOFL)
16986    sv_catpvs(sv, "{loc}");
16987   if (flags & ANYOF_LOC_FOLD)
16988    sv_catpvs(sv, "{i}");
16989   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16990   if (flags & ANYOF_INVERT)
16991    sv_catpvs(sv, "^");
16992
16993   /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16994   * */
16995   do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16996                &bitmap_invlist);
16997
16998   /* output any special charclass tests (used entirely under use
16999   * locale) * */
17000   if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
17001    int i;
17002    for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
17003     if (ANYOF_POSIXL_TEST(o,i)) {
17004      sv_catpv(sv, anyofs[i]);
17005      do_sep = 1;
17006     }
17007    }
17008   }
17009
17010   if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
17011      |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
17012      |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
17013      |ANYOF_LOC_FOLD)))
17014   {
17015    if (do_sep) {
17016     Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
17017     if (flags & ANYOF_INVERT)
17018      /*make sure the invert info is in each */
17019      sv_catpvs(sv, "^");
17020    }
17021
17022    if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
17023     sv_catpvs(sv, "{non-utf8-latin1-all}");
17024    }
17025
17026    if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
17027     sv_catpvs(sv, "{above_bitmap_all}");
17028
17029    if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
17030     SV *lv; /* Set if there is something outside the bit map. */
17031     bool byte_output = FALSE;   /* If something has been output */
17032     SV *only_utf8_locale;
17033
17034     /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'
17035     * is used to guarantee that nothing in the bitmap gets
17036     * returned */
17037     (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
17038              &lv, &only_utf8_locale,
17039              bitmap_invlist);
17040     if (lv && lv != &PL_sv_undef) {
17041      char *s = savesvpv(lv);
17042      char * const origs = s;
17043
17044      while (*s && *s != '\n')
17045       s++;
17046
17047      if (*s == '\n') {
17048       const char * const t = ++s;
17049
17050       if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
17051        sv_catpvs(sv, "{outside bitmap}");
17052       }
17053       else {
17054        sv_catpvs(sv, "{utf8}");
17055       }
17056
17057       if (byte_output) {
17058        sv_catpvs(sv, " ");
17059       }
17060
17061       while (*s) {
17062        if (*s == '\n') {
17063
17064         /* Truncate very long output */
17065         if (s - origs > 256) {
17066          Perl_sv_catpvf(aTHX_ sv,
17067             "%.*s...",
17068             (int) (s - origs - 1),
17069             t);
17070          goto out_dump;
17071         }
17072         *s = ' ';
17073        }
17074        else if (*s == '\t') {
17075         *s = '-';
17076        }
17077        s++;
17078       }
17079       if (s[-1] == ' ')
17080        s[-1] = 0;
17081
17082       sv_catpv(sv, t);
17083      }
17084
17085     out_dump:
17086
17087      Safefree(origs);
17088      SvREFCNT_dec_NN(lv);
17089     }
17090
17091     if ((flags & ANYOF_LOC_FOLD)
17092      && only_utf8_locale
17093      && only_utf8_locale != &PL_sv_undef)
17094     {
17095      UV start, end;
17096      int max_entries = 256;
17097
17098      sv_catpvs(sv, "{utf8 locale}");
17099      invlist_iterinit(only_utf8_locale);
17100      while (invlist_iternext(only_utf8_locale,
17101            &start, &end)) {
17102       put_range(sv, start, end, FALSE);
17103       max_entries --;
17104       if (max_entries < 0) {
17105        sv_catpvs(sv, "...");
17106        break;
17107       }
17108      }
17109      invlist_iterfinish(only_utf8_locale);
17110     }
17111    }
17112   }
17113   SvREFCNT_dec(bitmap_invlist);
17114
17115
17116   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
17117  }
17118  else if (k == POSIXD || k == NPOSIXD) {
17119   U8 index = FLAGS(o) * 2;
17120   if (index < C_ARRAY_LENGTH(anyofs)) {
17121    if (*anyofs[index] != '[')  {
17122     sv_catpv(sv, "[");
17123    }
17124    sv_catpv(sv, anyofs[index]);
17125    if (*anyofs[index] != '[')  {
17126     sv_catpv(sv, "]");
17127    }
17128   }
17129   else {
17130    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
17131   }
17132  }
17133  else if (k == BOUND || k == NBOUND) {
17134   /* Must be synced with order of 'bound_type' in regcomp.h */
17135   const char * const bounds[] = {
17136    "",      /* Traditional */
17137    "{gcb}",
17138    "{sb}",
17139    "{wb}"
17140   };
17141   sv_catpv(sv, bounds[FLAGS(o)]);
17142  }
17143  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
17144   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
17145  else if (OP(o) == SBOL)
17146   Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
17147 #else
17148  PERL_UNUSED_CONTEXT;
17149  PERL_UNUSED_ARG(sv);
17150  PERL_UNUSED_ARG(o);
17151  PERL_UNUSED_ARG(prog);
17152  PERL_UNUSED_ARG(reginfo);
17153  PERL_UNUSED_ARG(pRExC_state);
17154 #endif /* DEBUGGING */
17155 }
17156
17157
17158
17159 SV *
17160 Perl_re_intuit_string(pTHX_ REGEXP * const r)
17161 {    /* Assume that RE_INTUIT is set */
17162  struct regexp *const prog = ReANY(r);
17163  GET_RE_DEBUG_FLAGS_DECL;
17164
17165  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
17166  PERL_UNUSED_CONTEXT;
17167
17168  DEBUG_COMPILE_r(
17169   {
17170    const char * const s = SvPV_nolen_const(RX_UTF8(r)
17171      ? prog->check_utf8 : prog->check_substr);
17172
17173    if (!PL_colorset) reginitcolors();
17174    PerlIO_printf(Perl_debug_log,
17175      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
17176      PL_colors[4],
17177      RX_UTF8(r) ? "utf8 " : "",
17178      PL_colors[5],PL_colors[0],
17179      s,
17180      PL_colors[1],
17181      (strlen(s) > 60 ? "..." : ""));
17182   } );
17183
17184  /* use UTF8 check substring if regexp pattern itself is in UTF8 */
17185  return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
17186 }
17187
17188 /*
17189    pregfree()
17190
17191    handles refcounting and freeing the perl core regexp structure. When
17192    it is necessary to actually free the structure the first thing it
17193    does is call the 'free' method of the regexp_engine associated to
17194    the regexp, allowing the handling of the void *pprivate; member
17195    first. (This routine is not overridable by extensions, which is why
17196    the extensions free is called first.)
17197
17198    See regdupe and regdupe_internal if you change anything here.
17199 */
17200 #ifndef PERL_IN_XSUB_RE
17201 void
17202 Perl_pregfree(pTHX_ REGEXP *r)
17203 {
17204  SvREFCNT_dec(r);
17205 }
17206
17207 void
17208 Perl_pregfree2(pTHX_ REGEXP *rx)
17209 {
17210  struct regexp *const r = ReANY(rx);
17211  GET_RE_DEBUG_FLAGS_DECL;
17212
17213  PERL_ARGS_ASSERT_PREGFREE2;
17214
17215  if (r->mother_re) {
17216   ReREFCNT_dec(r->mother_re);
17217  } else {
17218   CALLREGFREE_PVT(rx); /* free the private data */
17219   SvREFCNT_dec(RXp_PAREN_NAMES(r));
17220   Safefree(r->xpv_len_u.xpvlenu_pv);
17221  }
17222  if (r->substrs) {
17223   SvREFCNT_dec(r->anchored_substr);
17224   SvREFCNT_dec(r->anchored_utf8);
17225   SvREFCNT_dec(r->float_substr);
17226   SvREFCNT_dec(r->float_utf8);
17227   Safefree(r->substrs);
17228  }
17229  RX_MATCH_COPY_FREE(rx);
17230 #ifdef PERL_ANY_COW
17231  SvREFCNT_dec(r->saved_copy);
17232 #endif
17233  Safefree(r->offs);
17234  SvREFCNT_dec(r->qr_anoncv);
17235  rx->sv_u.svu_rx = 0;
17236 }
17237
17238 /*  reg_temp_copy()
17239
17240  This is a hacky workaround to the structural issue of match results
17241  being stored in the regexp structure which is in turn stored in
17242  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
17243  could be PL_curpm in multiple contexts, and could require multiple
17244  result sets being associated with the pattern simultaneously, such
17245  as when doing a recursive match with (??{$qr})
17246
17247  The solution is to make a lightweight copy of the regexp structure
17248  when a qr// is returned from the code executed by (??{$qr}) this
17249  lightweight copy doesn't actually own any of its data except for
17250  the starp/end and the actual regexp structure itself.
17251
17252 */
17253
17254
17255 REGEXP *
17256 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
17257 {
17258  struct regexp *ret;
17259  struct regexp *const r = ReANY(rx);
17260  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
17261
17262  PERL_ARGS_ASSERT_REG_TEMP_COPY;
17263
17264  if (!ret_x)
17265   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
17266  else {
17267   SvOK_off((SV *)ret_x);
17268   if (islv) {
17269    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
17270    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
17271    made both spots point to the same regexp body.) */
17272    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
17273    assert(!SvPVX(ret_x));
17274    ret_x->sv_u.svu_rx = temp->sv_any;
17275    temp->sv_any = NULL;
17276    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
17277    SvREFCNT_dec_NN(temp);
17278    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
17279    ing below will not set it. */
17280    SvCUR_set(ret_x, SvCUR(rx));
17281   }
17282  }
17283  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
17284  sv_force_normal(sv) is called.  */
17285  SvFAKE_on(ret_x);
17286  ret = ReANY(ret_x);
17287
17288  SvFLAGS(ret_x) |= SvUTF8(rx);
17289  /* We share the same string buffer as the original regexp, on which we
17290  hold a reference count, incremented when mother_re is set below.
17291  The string pointer is copied here, being part of the regexp struct.
17292  */
17293  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
17294   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
17295  if (r->offs) {
17296   const I32 npar = r->nparens+1;
17297   Newx(ret->offs, npar, regexp_paren_pair);
17298   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17299  }
17300  if (r->substrs) {
17301   Newx(ret->substrs, 1, struct reg_substr_data);
17302   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17303
17304   SvREFCNT_inc_void(ret->anchored_substr);
17305   SvREFCNT_inc_void(ret->anchored_utf8);
17306   SvREFCNT_inc_void(ret->float_substr);
17307   SvREFCNT_inc_void(ret->float_utf8);
17308
17309   /* check_substr and check_utf8, if non-NULL, point to either their
17310   anchored or float namesakes, and don't hold a second reference.  */
17311  }
17312  RX_MATCH_COPIED_off(ret_x);
17313 #ifdef PERL_ANY_COW
17314  ret->saved_copy = NULL;
17315 #endif
17316  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
17317  SvREFCNT_inc_void(ret->qr_anoncv);
17318
17319  return ret_x;
17320 }
17321 #endif
17322
17323 /* regfree_internal()
17324
17325    Free the private data in a regexp. This is overloadable by
17326    extensions. Perl takes care of the regexp structure in pregfree(),
17327    this covers the *pprivate pointer which technically perl doesn't
17328    know about, however of course we have to handle the
17329    regexp_internal structure when no extension is in use.
17330
17331    Note this is called before freeing anything in the regexp
17332    structure.
17333  */
17334
17335 void
17336 Perl_regfree_internal(pTHX_ REGEXP * const rx)
17337 {
17338  struct regexp *const r = ReANY(rx);
17339  RXi_GET_DECL(r,ri);
17340  GET_RE_DEBUG_FLAGS_DECL;
17341
17342  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
17343
17344  DEBUG_COMPILE_r({
17345   if (!PL_colorset)
17346    reginitcolors();
17347   {
17348    SV *dsv= sv_newmortal();
17349    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
17350     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
17351    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
17352     PL_colors[4],PL_colors[5],s);
17353   }
17354  });
17355 #ifdef RE_TRACK_PATTERN_OFFSETS
17356  if (ri->u.offsets)
17357   Safefree(ri->u.offsets);             /* 20010421 MJD */
17358 #endif
17359  if (ri->code_blocks) {
17360   int n;
17361   for (n = 0; n < ri->num_code_blocks; n++)
17362    SvREFCNT_dec(ri->code_blocks[n].src_regex);
17363   Safefree(ri->code_blocks);
17364  }
17365
17366  if (ri->data) {
17367   int n = ri->data->count;
17368
17369   while (--n >= 0) {
17370   /* If you add a ->what type here, update the comment in regcomp.h */
17371    switch (ri->data->what[n]) {
17372    case 'a':
17373    case 'r':
17374    case 's':
17375    case 'S':
17376    case 'u':
17377     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
17378     break;
17379    case 'f':
17380     Safefree(ri->data->data[n]);
17381     break;
17382    case 'l':
17383    case 'L':
17384     break;
17385    case 'T':
17386     { /* Aho Corasick add-on structure for a trie node.
17387      Used in stclass optimization only */
17388      U32 refcount;
17389      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
17390 #ifdef USE_ITHREADS
17391      dVAR;
17392 #endif
17393      OP_REFCNT_LOCK;
17394      refcount = --aho->refcount;
17395      OP_REFCNT_UNLOCK;
17396      if ( !refcount ) {
17397       PerlMemShared_free(aho->states);
17398       PerlMemShared_free(aho->fail);
17399       /* do this last!!!! */
17400       PerlMemShared_free(ri->data->data[n]);
17401       /* we should only ever get called once, so
17402       * assert as much, and also guard the free
17403       * which /might/ happen twice. At the least
17404       * it will make code anlyzers happy and it
17405       * doesn't cost much. - Yves */
17406       assert(ri->regstclass);
17407       if (ri->regstclass) {
17408        PerlMemShared_free(ri->regstclass);
17409        ri->regstclass = 0;
17410       }
17411      }
17412     }
17413     break;
17414    case 't':
17415     {
17416      /* trie structure. */
17417      U32 refcount;
17418      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
17419 #ifdef USE_ITHREADS
17420      dVAR;
17421 #endif
17422      OP_REFCNT_LOCK;
17423      refcount = --trie->refcount;
17424      OP_REFCNT_UNLOCK;
17425      if ( !refcount ) {
17426       PerlMemShared_free(trie->charmap);
17427       PerlMemShared_free(trie->states);
17428       PerlMemShared_free(trie->trans);
17429       if (trie->bitmap)
17430        PerlMemShared_free(trie->bitmap);
17431       if (trie->jump)
17432        PerlMemShared_free(trie->jump);
17433       PerlMemShared_free(trie->wordinfo);
17434       /* do this last!!!! */
17435       PerlMemShared_free(ri->data->data[n]);
17436      }
17437     }
17438     break;
17439    default:
17440     Perl_croak(aTHX_ "panic: regfree data code '%c'",
17441              ri->data->what[n]);
17442    }
17443   }
17444   Safefree(ri->data->what);
17445   Safefree(ri->data);
17446  }
17447
17448  Safefree(ri);
17449 }
17450
17451 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
17452 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
17453 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
17454
17455 /*
17456    re_dup - duplicate a regexp.
17457
17458    This routine is expected to clone a given regexp structure. It is only
17459    compiled under USE_ITHREADS.
17460
17461    After all of the core data stored in struct regexp is duplicated
17462    the regexp_engine.dupe method is used to copy any private data
17463    stored in the *pprivate pointer. This allows extensions to handle
17464    any duplication it needs to do.
17465
17466    See pregfree() and regfree_internal() if you change anything here.
17467 */
17468 #if defined(USE_ITHREADS)
17469 #ifndef PERL_IN_XSUB_RE
17470 void
17471 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
17472 {
17473  dVAR;
17474  I32 npar;
17475  const struct regexp *r = ReANY(sstr);
17476  struct regexp *ret = ReANY(dstr);
17477
17478  PERL_ARGS_ASSERT_RE_DUP_GUTS;
17479
17480  npar = r->nparens+1;
17481  Newx(ret->offs, npar, regexp_paren_pair);
17482  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17483
17484  if (ret->substrs) {
17485   /* Do it this way to avoid reading from *r after the StructCopy().
17486   That way, if any of the sv_dup_inc()s dislodge *r from the L1
17487   cache, it doesn't matter.  */
17488   const bool anchored = r->check_substr
17489    ? r->check_substr == r->anchored_substr
17490    : r->check_utf8 == r->anchored_utf8;
17491   Newx(ret->substrs, 1, struct reg_substr_data);
17492   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17493
17494   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
17495   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
17496   ret->float_substr = sv_dup_inc(ret->float_substr, param);
17497   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
17498
17499   /* check_substr and check_utf8, if non-NULL, point to either their
17500   anchored or float namesakes, and don't hold a second reference.  */
17501
17502   if (ret->check_substr) {
17503    if (anchored) {
17504     assert(r->check_utf8 == r->anchored_utf8);
17505     ret->check_substr = ret->anchored_substr;
17506     ret->check_utf8 = ret->anchored_utf8;
17507    } else {
17508     assert(r->check_substr == r->float_substr);
17509     assert(r->check_utf8 == r->float_utf8);
17510     ret->check_substr = ret->float_substr;
17511     ret->check_utf8 = ret->float_utf8;
17512    }
17513   } else if (ret->check_utf8) {
17514    if (anchored) {
17515     ret->check_utf8 = ret->anchored_utf8;
17516    } else {
17517     ret->check_utf8 = ret->float_utf8;
17518    }
17519   }
17520  }
17521
17522  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
17523  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
17524
17525  if (ret->pprivate)
17526   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
17527
17528  if (RX_MATCH_COPIED(dstr))
17529   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
17530  else
17531   ret->subbeg = NULL;
17532 #ifdef PERL_ANY_COW
17533  ret->saved_copy = NULL;
17534 #endif
17535
17536  /* Whether mother_re be set or no, we need to copy the string.  We
17537  cannot refrain from copying it when the storage points directly to
17538  our mother regexp, because that's
17539    1: a buffer in a different thread
17540    2: something we no longer hold a reference on
17541    so we need to copy it locally.  */
17542  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
17543  ret->mother_re   = NULL;
17544 }
17545 #endif /* PERL_IN_XSUB_RE */
17546
17547 /*
17548    regdupe_internal()
17549
17550    This is the internal complement to regdupe() which is used to copy
17551    the structure pointed to by the *pprivate pointer in the regexp.
17552    This is the core version of the extension overridable cloning hook.
17553    The regexp structure being duplicated will be copied by perl prior
17554    to this and will be provided as the regexp *r argument, however
17555    with the /old/ structures pprivate pointer value. Thus this routine
17556    may override any copying normally done by perl.
17557
17558    It returns a pointer to the new regexp_internal structure.
17559 */
17560
17561 void *
17562 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
17563 {
17564  dVAR;
17565  struct regexp *const r = ReANY(rx);
17566  regexp_internal *reti;
17567  int len;
17568  RXi_GET_DECL(r,ri);
17569
17570  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
17571
17572  len = ProgLen(ri);
17573
17574  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
17575   char, regexp_internal);
17576  Copy(ri->program, reti->program, len+1, regnode);
17577
17578  reti->num_code_blocks = ri->num_code_blocks;
17579  if (ri->code_blocks) {
17580   int n;
17581   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
17582     struct reg_code_block);
17583   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
17584     struct reg_code_block);
17585   for (n = 0; n < ri->num_code_blocks; n++)
17586    reti->code_blocks[n].src_regex = (REGEXP*)
17587      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
17588  }
17589  else
17590   reti->code_blocks = NULL;
17591
17592  reti->regstclass = NULL;
17593
17594  if (ri->data) {
17595   struct reg_data *d;
17596   const int count = ri->data->count;
17597   int i;
17598
17599   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
17600     char, struct reg_data);
17601   Newx(d->what, count, U8);
17602
17603   d->count = count;
17604   for (i = 0; i < count; i++) {
17605    d->what[i] = ri->data->what[i];
17606    switch (d->what[i]) {
17607     /* see also regcomp.h and regfree_internal() */
17608    case 'a': /* actually an AV, but the dup function is identical.  */
17609    case 'r':
17610    case 's':
17611    case 'S':
17612    case 'u': /* actually an HV, but the dup function is identical.  */
17613     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
17614     break;
17615    case 'f':
17616     /* This is cheating. */
17617     Newx(d->data[i], 1, regnode_ssc);
17618     StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
17619     reti->regstclass = (regnode*)d->data[i];
17620     break;
17621    case 'T':
17622     /* Trie stclasses are readonly and can thus be shared
17623     * without duplication. We free the stclass in pregfree
17624     * when the corresponding reg_ac_data struct is freed.
17625     */
17626     reti->regstclass= ri->regstclass;
17627     /* FALLTHROUGH */
17628    case 't':
17629     OP_REFCNT_LOCK;
17630     ((reg_trie_data*)ri->data->data[i])->refcount++;
17631     OP_REFCNT_UNLOCK;
17632     /* FALLTHROUGH */
17633    case 'l':
17634    case 'L':
17635     d->data[i] = ri->data->data[i];
17636     break;
17637    default:
17638     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
17639               ri->data->what[i]);
17640    }
17641   }
17642
17643   reti->data = d;
17644  }
17645  else
17646   reti->data = NULL;
17647
17648  reti->name_list_idx = ri->name_list_idx;
17649
17650 #ifdef RE_TRACK_PATTERN_OFFSETS
17651  if (ri->u.offsets) {
17652   Newx(reti->u.offsets, 2*len+1, U32);
17653   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17654  }
17655 #else
17656  SetProgLen(reti,len);
17657 #endif
17658
17659  return (void*)reti;
17660 }
17661
17662 #endif    /* USE_ITHREADS */
17663
17664 #ifndef PERL_IN_XSUB_RE
17665
17666 /*
17667  - regnext - dig the "next" pointer out of a node
17668  */
17669 regnode *
17670 Perl_regnext(pTHX_ regnode *p)
17671 {
17672  I32 offset;
17673
17674  if (!p)
17675   return(NULL);
17676
17677  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
17678   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17679             (int)OP(p), (int)REGNODE_MAX);
17680  }
17681
17682  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17683  if (offset == 0)
17684   return(NULL);
17685
17686  return(p+offset);
17687 }
17688 #endif
17689
17690 STATIC void
17691 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17692 {
17693  va_list args;
17694  STRLEN l1 = strlen(pat1);
17695  STRLEN l2 = strlen(pat2);
17696  char buf[512];
17697  SV *msv;
17698  const char *message;
17699
17700  PERL_ARGS_ASSERT_RE_CROAK2;
17701
17702  if (l1 > 510)
17703   l1 = 510;
17704  if (l1 + l2 > 510)
17705   l2 = 510 - l1;
17706  Copy(pat1, buf, l1 , char);
17707  Copy(pat2, buf + l1, l2 , char);
17708  buf[l1 + l2] = '\n';
17709  buf[l1 + l2 + 1] = '\0';
17710  va_start(args, pat2);
17711  msv = vmess(buf, &args);
17712  va_end(args);
17713  message = SvPV_const(msv,l1);
17714  if (l1 > 512)
17715   l1 = 512;
17716  Copy(message, buf, l1 , char);
17717  /* l1-1 to avoid \n */
17718  Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17719 }
17720
17721 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
17722
17723 #ifndef PERL_IN_XSUB_RE
17724 void
17725 Perl_save_re_context(pTHX)
17726 {
17727  I32 nparens = -1;
17728  I32 i;
17729
17730  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
17731
17732  if (PL_curpm) {
17733   const REGEXP * const rx = PM_GETRE(PL_curpm);
17734   if (rx)
17735    nparens = RX_NPARENS(rx);
17736  }
17737
17738  /* RT #124109. This is a complete hack; in the SWASHNEW case we know
17739  * that PL_curpm will be null, but that utf8.pm and the modules it
17740  * loads will only use $1..$3.
17741  * The t/porting/re_context.t test file checks this assumption.
17742  */
17743  if (nparens == -1)
17744   nparens = 3;
17745
17746  for (i = 1; i <= nparens; i++) {
17747   char digits[TYPE_CHARS(long)];
17748   const STRLEN len = my_snprintf(digits, sizeof(digits),
17749          "%lu", (long)i);
17750   GV *const *const gvp
17751    = (GV**)hv_fetch(PL_defstash, digits, len, 0);
17752
17753   if (gvp) {
17754    GV * const gv = *gvp;
17755    if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
17756     save_scalar(gv);
17757   }
17758  }
17759 }
17760 #endif
17761
17762 #ifdef DEBUGGING
17763
17764 STATIC void
17765 S_put_code_point(pTHX_ SV *sv, UV c)
17766 {
17767  PERL_ARGS_ASSERT_PUT_CODE_POINT;
17768
17769  if (c > 255) {
17770   Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17771  }
17772  else if (isPRINT(c)) {
17773   const char string = (char) c;
17774   if (isBACKSLASHED_PUNCT(c))
17775    sv_catpvs(sv, "\\");
17776   sv_catpvn(sv, &string, 1);
17777  }
17778  else {
17779   const char * const mnemonic = cntrl_to_mnemonic((char) c);
17780   if (mnemonic) {
17781    Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17782   }
17783   else {
17784    Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17785   }
17786  }
17787 }
17788
17789 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17790
17791 STATIC void
17792 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17793 {
17794  /* Appends to 'sv' a displayable version of the range of code points from
17795  * 'start' to 'end'.  It assumes that only ASCII printables are displayable
17796  * as-is (though some of these will be escaped by put_code_point()). */
17797
17798  const unsigned int min_range_count = 3;
17799
17800  assert(start <= end);
17801
17802  PERL_ARGS_ASSERT_PUT_RANGE;
17803
17804  while (start <= end) {
17805   UV this_end;
17806   const char * format;
17807
17808   if (end - start < min_range_count) {
17809
17810    /* Individual chars in short ranges */
17811    for (; start <= end; start++) {
17812     put_code_point(sv, start);
17813    }
17814    break;
17815   }
17816
17817   /* If permitted by the input options, and there is a possibility that
17818   * this range contains a printable literal, look to see if there is
17819   * one.  */
17820   if (allow_literals && start <= MAX_PRINT_A) {
17821
17822    /* If the range begin isn't an ASCII printable, effectively split
17823    * the range into two parts:
17824    *  1) the portion before the first such printable,
17825    *  2) the rest
17826    * and output them separately. */
17827    if (! isPRINT_A(start)) {
17828     UV temp_end = start + 1;
17829
17830     /* There is no point looking beyond the final possible
17831     * printable, in MAX_PRINT_A */
17832     UV max = MIN(end, MAX_PRINT_A);
17833
17834     while (temp_end <= max && ! isPRINT_A(temp_end)) {
17835      temp_end++;
17836     }
17837
17838     /* Here, temp_end points to one beyond the first printable if
17839     * found, or to one beyond 'max' if not.  If none found, make
17840     * sure that we use the entire range */
17841     if (temp_end > MAX_PRINT_A) {
17842      temp_end = end + 1;
17843     }
17844
17845     /* Output the first part of the split range, the part that
17846     * doesn't have printables, with no looking for literals
17847     * (otherwise we would infinitely recurse) */
17848     put_range(sv, start, temp_end - 1, FALSE);
17849
17850     /* The 2nd part of the range (if any) starts here. */
17851     start = temp_end;
17852
17853     /* We continue instead of dropping down because even if the 2nd
17854     * part is non-empty, it could be so short that we want to
17855     * output it specially, as tested for at the top of this loop.
17856     * */
17857     continue;
17858    }
17859
17860    /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
17861    * output a sub-range of just the digits or letters, then process
17862    * the remaining portion as usual. */
17863    if (isALPHANUMERIC_A(start)) {
17864     UV mask = (isDIGIT_A(start))
17865       ? _CC_DIGIT
17866        : isUPPER_A(start)
17867        ? _CC_UPPER
17868        : _CC_LOWER;
17869     UV temp_end = start + 1;
17870
17871     /* Find the end of the sub-range that includes just the
17872     * characters in the same class as the first character in it */
17873     while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17874      temp_end++;
17875     }
17876     temp_end--;
17877
17878     /* For short ranges, don't duplicate the code above to output
17879     * them; just call recursively */
17880     if (temp_end - start < min_range_count) {
17881      put_range(sv, start, temp_end, FALSE);
17882     }
17883     else {  /* Output as a range */
17884      put_code_point(sv, start);
17885      sv_catpvs(sv, "-");
17886      put_code_point(sv, temp_end);
17887     }
17888     start = temp_end + 1;
17889     continue;
17890    }
17891
17892    /* We output any other printables as individual characters */
17893    if (isPUNCT_A(start) || isSPACE_A(start)) {
17894     while (start <= end && (isPUNCT_A(start)
17895           || isSPACE_A(start)))
17896     {
17897      put_code_point(sv, start);
17898      start++;
17899     }
17900     continue;
17901    }
17902   } /* End of looking for literals */
17903
17904   /* Here is not to output as a literal.  Some control characters have
17905   * mnemonic names.  Split off any of those at the beginning and end of
17906   * the range to print mnemonically.  It isn't possible for many of
17907   * these to be in a row, so this won't overwhelm with output */
17908   while (isMNEMONIC_CNTRL(start) && start <= end) {
17909    put_code_point(sv, start);
17910    start++;
17911   }
17912   if (start < end && isMNEMONIC_CNTRL(end)) {
17913
17914    /* Here, the final character in the range has a mnemonic name.
17915    * Work backwards from the end to find the final non-mnemonic */
17916    UV temp_end = end - 1;
17917    while (isMNEMONIC_CNTRL(temp_end)) {
17918     temp_end--;
17919    }
17920
17921    /* And separately output the range that doesn't have mnemonics */
17922    put_range(sv, start, temp_end, FALSE);
17923
17924    /* Then output the mnemonic trailing controls */
17925    start = temp_end + 1;
17926    while (start <= end) {
17927     put_code_point(sv, start);
17928     start++;
17929    }
17930    break;
17931   }
17932
17933   /* As a final resort, output the range or subrange as hex. */
17934
17935   this_end = (end < NUM_ANYOF_CODE_POINTS)
17936      ? end
17937      : NUM_ANYOF_CODE_POINTS - 1;
17938   format = (this_end < 256)
17939     ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17940     : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17941   GCC_DIAG_IGNORE(-Wformat-nonliteral);
17942   Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17943   GCC_DIAG_RESTORE;
17944   break;
17945  }
17946 }
17947
17948 STATIC bool
17949 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17950 {
17951  /* Appends to 'sv' a displayable version of the innards of the bracketed
17952  * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
17953  * output anything, and bitmap_invlist, if not NULL, will point to an
17954  * inversion list of what is in the bit map */
17955
17956  int i;
17957  UV start, end;
17958  unsigned int punct_count = 0;
17959  SV* invlist = NULL;
17960  SV** invlist_ptr;   /* Temporary, in case bitmap_invlist is NULL */
17961  bool allow_literals = TRUE;
17962
17963  PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17964
17965  invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17966
17967  /* Worst case is exactly every-other code point is in the list */
17968  *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17969
17970  /* Convert the bit map to an inversion list, keeping track of how many
17971  * ASCII puncts are set, including an extra amount for the backslashed
17972  * ones.  */
17973  for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17974   if (BITMAP_TEST(bitmap, i)) {
17975    *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17976    if (isPUNCT_A(i)) {
17977     punct_count++;
17978     if isBACKSLASHED_PUNCT(i) {
17979      punct_count++;
17980     }
17981    }
17982   }
17983  }
17984
17985  /* Nothing to output */
17986  if (_invlist_len(*invlist_ptr) == 0) {
17987   SvREFCNT_dec(invlist);
17988   return FALSE;
17989  }
17990
17991  /* Generally, it is more readable if printable characters are output as
17992  * literals, but if a range (nearly) spans all of them, it's best to output
17993  * it as a single range.  This code will use a single range if all but 2
17994  * printables are in it */
17995  invlist_iterinit(*invlist_ptr);
17996  while (invlist_iternext(*invlist_ptr, &start, &end)) {
17997
17998   /* If range starts beyond final printable, it doesn't have any in it */
17999   if (start > MAX_PRINT_A) {
18000    break;
18001   }
18002
18003   /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
18004   * all but two, the range must start and end no later than 2 from
18005   * either end */
18006   if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
18007    if (end > MAX_PRINT_A) {
18008     end = MAX_PRINT_A;
18009    }
18010    if (start < ' ') {
18011     start = ' ';
18012    }
18013    if (end - start >= MAX_PRINT_A - ' ' - 2) {
18014     allow_literals = FALSE;
18015    }
18016    break;
18017   }
18018  }
18019  invlist_iterfinish(*invlist_ptr);
18020
18021  /* The legibility of the output depends mostly on how many punctuation
18022  * characters are output.  There are 32 possible ASCII ones, and some have
18023  * an additional backslash, bringing it to currently 36, so if any more
18024  * than 18 are to be output, we can instead output it as its complement,
18025  * yielding fewer puncts, and making it more legible.  But give some weight
18026  * to the fact that outputting it as a complement is less legible than a
18027  * straight output, so don't complement unless we are somewhat over the 18
18028  * mark */
18029  if (allow_literals && punct_count > 22) {
18030   sv_catpvs(sv, "^");
18031
18032   /* Add everything remaining to the list, so when we invert it just
18033   * below, it will be excluded */
18034   _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
18035   _invlist_invert(*invlist_ptr);
18036  }
18037
18038  /* Here we have figured things out.  Output each range */
18039  invlist_iterinit(*invlist_ptr);
18040  while (invlist_iternext(*invlist_ptr, &start, &end)) {
18041   if (start >= NUM_ANYOF_CODE_POINTS) {
18042    break;
18043   }
18044   put_range(sv, start, end, allow_literals);
18045  }
18046  invlist_iterfinish(*invlist_ptr);
18047
18048  return TRUE;
18049 }
18050
18051 #define CLEAR_OPTSTART \
18052  if (optstart) STMT_START {                                               \
18053   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
18054        " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
18055   optstart=NULL;                                                       \
18056  } STMT_END
18057
18058 #define DUMPUNTIL(b,e)                                                       \
18059      CLEAR_OPTSTART;                                          \
18060      node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
18061
18062 STATIC const regnode *
18063 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
18064    const regnode *last, const regnode *plast,
18065    SV* sv, I32 indent, U32 depth)
18066 {
18067  U8 op = PSEUDO; /* Arbitrary non-END op. */
18068  const regnode *next;
18069  const regnode *optstart= NULL;
18070
18071  RXi_GET_DECL(r,ri);
18072  GET_RE_DEBUG_FLAGS_DECL;
18073
18074  PERL_ARGS_ASSERT_DUMPUNTIL;
18075
18076 #ifdef DEBUG_DUMPUNTIL
18077  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
18078   last ? last-start : 0,plast ? plast-start : 0);
18079 #endif
18080
18081  if (plast && plast < last)
18082   last= plast;
18083
18084  while (PL_regkind[op] != END && (!last || node < last)) {
18085   assert(node);
18086   /* While that wasn't END last time... */
18087   NODE_ALIGN(node);
18088   op = OP(node);
18089   if (op == CLOSE || op == WHILEM)
18090    indent--;
18091   next = regnext((regnode *)node);
18092
18093   /* Where, what. */
18094   if (OP(node) == OPTIMIZED) {
18095    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
18096     optstart = node;
18097    else
18098     goto after_print;
18099   } else
18100    CLEAR_OPTSTART;
18101
18102   regprop(r, sv, node, NULL, NULL);
18103   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
18104      (int)(2*indent + 1), "", SvPVX_const(sv));
18105
18106   if (OP(node) != OPTIMIZED) {
18107    if (next == NULL)  /* Next ptr. */
18108     PerlIO_printf(Perl_debug_log, " (0)");
18109    else if (PL_regkind[(U8)op] == BRANCH
18110      && PL_regkind[OP(next)] != BRANCH )
18111     PerlIO_printf(Perl_debug_log, " (FAIL)");
18112    else
18113     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
18114    (void)PerlIO_putc(Perl_debug_log, '\n');
18115   }
18116
18117  after_print:
18118   if (PL_regkind[(U8)op] == BRANCHJ) {
18119    assert(next);
18120    {
18121     const regnode *nnode = (OP(next) == LONGJMP
18122          ? regnext((regnode *)next)
18123          : next);
18124     if (last && nnode > last)
18125      nnode = last;
18126     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
18127    }
18128   }
18129   else if (PL_regkind[(U8)op] == BRANCH) {
18130    assert(next);
18131    DUMPUNTIL(NEXTOPER(node), next);
18132   }
18133   else if ( PL_regkind[(U8)op]  == TRIE ) {
18134    const regnode *this_trie = node;
18135    const char op = OP(node);
18136    const U32 n = ARG(node);
18137    const reg_ac_data * const ac = op>=AHOCORASICK ?
18138    (reg_ac_data *)ri->data->data[n] :
18139    NULL;
18140    const reg_trie_data * const trie =
18141     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
18142 #ifdef DEBUGGING
18143    AV *const trie_words
18144       = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
18145 #endif
18146    const regnode *nextbranch= NULL;
18147    I32 word_idx;
18148    sv_setpvs(sv, "");
18149    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
18150     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
18151
18152     PerlIO_printf(Perl_debug_log, "%*s%s ",
18153     (int)(2*(indent+3)), "",
18154      elem_ptr
18155      ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
18156         SvCUR(*elem_ptr), 60,
18157         PL_colors[0], PL_colors[1],
18158         (SvUTF8(*elem_ptr)
18159         ? PERL_PV_ESCAPE_UNI
18160         : 0)
18161         | PERL_PV_PRETTY_ELLIPSES
18162         | PERL_PV_PRETTY_LTGT
18163        )
18164      : "???"
18165     );
18166     if (trie->jump) {
18167      U16 dist= trie->jump[word_idx+1];
18168      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
18169        (UV)((dist ? this_trie + dist : next) - start));
18170      if (dist) {
18171       if (!nextbranch)
18172        nextbranch= this_trie + trie->jump[0];
18173       DUMPUNTIL(this_trie + dist, nextbranch);
18174      }
18175      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
18176       nextbranch= regnext((regnode *)nextbranch);
18177     } else {
18178      PerlIO_printf(Perl_debug_log, "\n");
18179     }
18180    }
18181    if (last && next > last)
18182     node= last;
18183    else
18184     node= next;
18185   }
18186   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
18187    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
18188      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
18189   }
18190   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
18191    assert(next);
18192    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
18193   }
18194   else if ( op == PLUS || op == STAR) {
18195    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
18196   }
18197   else if (PL_regkind[(U8)op] == ANYOF) {
18198    /* arglen 1 + class block */
18199    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
18200       ? ANYOF_POSIXL_SKIP
18201       : ANYOF_SKIP);
18202    node = NEXTOPER(node);
18203   }
18204   else if (PL_regkind[(U8)op] == EXACT) {
18205    /* Literal string, where present. */
18206    node += NODE_SZ_STR(node) - 1;
18207    node = NEXTOPER(node);
18208   }
18209   else {
18210    node = NEXTOPER(node);
18211    node += regarglen[(U8)op];
18212   }
18213   if (op == CURLYX || op == OPEN)
18214    indent++;
18215  }
18216  CLEAR_OPTSTART;
18217 #ifdef DEBUG_DUMPUNTIL
18218  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
18219 #endif
18220  return node;
18221 }
18222
18223 #endif /* DEBUGGING */
18224
18225 /*
18226  * ex: set ts=8 sts=4 sw=4 et:
18227  */