src/5014001/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108  U32  flags;   /* are we folding, multilining? */
 109  char *precomp;  /* uncompiled string. */
 110  REGEXP *rx_sv;   /* The SV that is the regexp. */
 111  regexp *rx;                    /* perl core regexp structure */
 112  regexp_internal *rxi;           /* internal data for regexp object pprivate field */
 113  char *start;   /* Start of input for compile */
 114  char *end;   /* End of input for compile */
 115  char *parse;   /* Input-scan pointer. */
 116  I32  whilem_seen;  /* number of WHILEM in this expr */
 117  regnode *emit_start;  /* Start of emitted-code area */
 118  regnode *emit_bound;  /* First regnode outside of the allocated space */
 119  regnode *emit;   /* Code-emit pointer; &regdummy = don't = compiling */
 120  I32  naughty;  /* How bad is this pattern? */
 121  I32  sawback;  /* Did we see \1, ...? */
 122  U32  seen;
 123  I32  size;   /* Code size. */
 124  I32  npar;   /* Capture buffer count, (OPEN). */
 125  I32  cpar;   /* Capture buffer count, (CLOSE). */
 126  I32  nestroot;  /* root parens we are in - used by accept */
 127  I32  extralen;
 128  I32  seen_zerolen;
 129  I32  seen_evals;
 130  regnode **open_parens;  /* pointers to open parens */
 131  regnode **close_parens;  /* pointers to close parens */
 132  regnode *opend;   /* END node in program */
 133  I32  utf8;  /* whether the pattern is utf8 or not */
 134  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 135         /* XXX use this for future optimisation of case
 136         * where pattern must be upgraded to utf8. */
 137  I32  uni_semantics; /* If a d charset modifier should use unicode
 138         rules, even if the pattern is not in
 139         utf8 */
 140  HV  *paren_names;  /* Paren names */
 141
 142  regnode **recurse;  /* Recurse regops */
 143  I32  recurse_count;  /* Number of recurse regops */
 144  I32  in_lookbehind;
 145  I32  contains_locale;
 146  I32  override_recoding;
 147 #if ADD_TO_REGEXEC
 148  char  *starttry;  /* -Dr: where regtry was called. */
 149 #define RExC_starttry (pRExC_state->starttry)
 150 #endif
 151 #ifdef DEBUGGING
 152  const char  *lastparse;
 153  I32         lastnum;
 154  AV          *paren_name_list;       /* idx -> name */
 155 #define RExC_lastparse (pRExC_state->lastparse)
 156 #define RExC_lastnum (pRExC_state->lastnum)
 157 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 158 #endif
 159 } RExC_state_t;
 160
 161 #define RExC_flags (pRExC_state->flags)
 162 #define RExC_precomp (pRExC_state->precomp)
 163 #define RExC_rx_sv (pRExC_state->rx_sv)
 164 #define RExC_rx  (pRExC_state->rx)
 165 #define RExC_rxi (pRExC_state->rxi)
 166 #define RExC_start (pRExC_state->start)
 167 #define RExC_end (pRExC_state->end)
 168 #define RExC_parse (pRExC_state->parse)
 169 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 170 #ifdef RE_TRACK_PATTERN_OFFSETS
 171 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
 172 #endif
 173 #define RExC_emit (pRExC_state->emit)
 174 #define RExC_emit_start (pRExC_state->emit_start)
 175 #define RExC_emit_bound (pRExC_state->emit_bound)
 176 #define RExC_naughty (pRExC_state->naughty)
 177 #define RExC_sawback (pRExC_state->sawback)
 178 #define RExC_seen (pRExC_state->seen)
 179 #define RExC_size (pRExC_state->size)
 180 #define RExC_npar (pRExC_state->npar)
 181 #define RExC_nestroot   (pRExC_state->nestroot)
 182 #define RExC_extralen (pRExC_state->extralen)
 183 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 184 #define RExC_seen_evals (pRExC_state->seen_evals)
 185 #define RExC_utf8 (pRExC_state->utf8)
 186 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 187 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 188 #define RExC_open_parens (pRExC_state->open_parens)
 189 #define RExC_close_parens (pRExC_state->close_parens)
 190 #define RExC_opend (pRExC_state->opend)
 191 #define RExC_paren_names (pRExC_state->paren_names)
 192 #define RExC_recurse (pRExC_state->recurse)
 193 #define RExC_recurse_count (pRExC_state->recurse_count)
 194 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 195 #define RExC_contains_locale (pRExC_state->contains_locale)
 196 #define RExC_override_recoding (pRExC_state->override_recoding)
 197
 198
 199 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 200 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 201   ((*s) == '{' && regcurly(s)))
 202
 203 #ifdef SPSTART
 204 #undef SPSTART  /* dratted cpp namespace... */
 205 #endif
 206 /*
 207  * Flags to be passed up and down.
 208  */
 209 #define WORST  0 /* Worst case. */
 210 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 211
 212 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 213  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 214 #define SIMPLE  0x02
 215 #define SPSTART  0x04 /* Starts with * or +. */
 216 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
 217 #define POSTPONED 0x10    /* (?1),(?&name), (??{...}) or similar */
 218
 219 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 220
 221 /* whether trie related optimizations are enabled */
 222 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 223 #define TRIE_STUDY_OPT
 224 #define FULL_TRIE_STUDY
 225 #define TRIE_STCLASS
 226 #endif
 227
 228
 229
 230 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 231 #define PBITVAL(paren) (1 << ((paren) & 7))
 232 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 233 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 234 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 235
 236 /* If not already in utf8, do a longjmp back to the beginning */
 237 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 238 #define REQUIRE_UTF8 STMT_START {                                       \
 239          if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 240       } STMT_END
 241
 242 /* About scan_data_t.
 243
 244   During optimisation we recurse through the regexp program performing
 245   various inplace (keyhole style) optimisations. In addition study_chunk
 246   and scan_commit populate this data structure with information about
 247   what strings MUST appear in the pattern. We look for the longest
 248   string that must appear at a fixed location, and we look for the
 249   longest string that may appear at a floating location. So for instance
 250   in the pattern:
 251
 252  /FOO[xX]A.*B[xX]BAR/
 253
 254   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 255   strings (because they follow a .* construct). study_chunk will identify
 256   both FOO and BAR as being the longest fixed and floating strings respectively.
 257
 258   The strings can be composites, for instance
 259
 260  /(f)(o)(o)/
 261
 262   will result in a composite fixed substring 'foo'.
 263
 264   For each string some basic information is maintained:
 265
 266   - offset or min_offset
 267  This is the position the string must appear at, or not before.
 268  It also implicitly (when combined with minlenp) tells us how many
 269  characters must match before the string we are searching for.
 270  Likewise when combined with minlenp and the length of the string it
 271  tells us how many characters must appear after the string we have
 272  found.
 273
 274   - max_offset
 275  Only used for floating strings. This is the rightmost point that
 276  the string can appear at. If set to I32 max it indicates that the
 277  string can occur infinitely far to the right.
 278
 279   - minlenp
 280  A pointer to the minimum length of the pattern that the string
 281  was found inside. This is important as in the case of positive
 282  lookahead or positive lookbehind we can have multiple patterns
 283  involved. Consider
 284
 285  /(?=FOO).*F/
 286
 287  The minimum length of the pattern overall is 3, the minimum length
 288  of the lookahead part is 3, but the minimum length of the part that
 289  will actually match is 1. So 'FOO's minimum length is 3, but the
 290  minimum length for the F is 1. This is important as the minimum length
 291  is used to determine offsets in front of and behind the string being
 292  looked for.  Since strings can be composites this is the length of the
 293  pattern at the time it was committed with a scan_commit. Note that
 294  the length is calculated by study_chunk, so that the minimum lengths
 295  are not known until the full pattern has been compiled, thus the
 296  pointer to the value.
 297
 298   - lookbehind
 299
 300  In the case of lookbehind the string being searched for can be
 301  offset past the start point of the final matching string.
 302  If this value was just blithely removed from the min_offset it would
 303  invalidate some of the calculations for how many chars must match
 304  before or after (as they are derived from min_offset and minlen and
 305  the length of the string being searched for).
 306  When the final pattern is compiled and the data is moved from the
 307  scan_data_t structure into the regexp structure the information
 308  about lookbehind is factored in, with the information that would
 309  have been lost precalculated in the end_shift field for the
 310  associated string.
 311
 312   The fields pos_min and pos_delta are used to store the minimum offset
 313   and the delta to the maximum offset at the current point in the pattern.
 314
 315 */
 316
 317 typedef struct scan_data_t {
 318  /*I32 len_min;      unused */
 319  /*I32 len_delta;    unused */
 320  I32 pos_min;
 321  I32 pos_delta;
 322  SV *last_found;
 323  I32 last_end;     /* min value, <0 unless valid. */
 324  I32 last_start_min;
 325  I32 last_start_max;
 326  SV **longest;     /* Either &l_fixed, or &l_float. */
 327  SV *longest_fixed;      /* longest fixed string found in pattern */
 328  I32 offset_fixed;       /* offset where it starts */
 329  I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 330  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 331  SV *longest_float;      /* longest floating string found in pattern */
 332  I32 offset_float_min;   /* earliest point in string it can appear */
 333  I32 offset_float_max;   /* latest point in string it can appear */
 334  I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 335  I32 lookbehind_float;   /* is the position of the string modified by LB */
 336  I32 flags;
 337  I32 whilem_c;
 338  I32 *last_closep;
 339  struct regnode_charclass_class *start_class;
 340 } scan_data_t;
 341
 342 /*
 343  * Forward declarations for pregcomp()'s friends.
 344  */
 345
 346 static const scan_data_t zero_scan_data =
 347   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 348
 349 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 350 #define SF_BEFORE_SEOL  0x0001
 351 #define SF_BEFORE_MEOL  0x0002
 352 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 353 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 354
 355 #ifdef NO_UNARY_PLUS
 356 #  define SF_FIX_SHIFT_EOL (0+2)
 357 #  define SF_FL_SHIFT_EOL  (0+4)
 358 #else
 359 #  define SF_FIX_SHIFT_EOL (+2)
 360 #  define SF_FL_SHIFT_EOL  (+4)
 361 #endif
 362
 363 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 364 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 365
 366 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 367 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 368 #define SF_IS_INF  0x0040
 369 #define SF_HAS_PAR  0x0080
 370 #define SF_IN_PAR  0x0100
 371 #define SF_HAS_EVAL  0x0200
 372 #define SCF_DO_SUBSTR  0x0400
 373 #define SCF_DO_STCLASS_AND 0x0800
 374 #define SCF_DO_STCLASS_OR 0x1000
 375 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 376 #define SCF_WHILEM_VISITED_POS 0x2000
 377
 378 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 379 #define SCF_SEEN_ACCEPT         0x8000
 380
 381 #define UTF cBOOL(RExC_utf8)
 382 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 383 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 384 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 385 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 386 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 387 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 388 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 389
 390 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 391
 392 #define OOB_UNICODE  12345678
 393 #define OOB_NAMEDCLASS  -1
 394
 395 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 396 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 397
 398
 399 /* length of regex to show in messages that don't mark a position within */
 400 #define RegexLengthToShowInErrorMessages 127
 401
 402 /*
 403  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 404  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 405  * op/pragma/warn/regcomp.
 406  */
 407 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 408 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 409
 410 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 411
 412 /*
 413  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 414  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 415  * "...".
 416  */
 417 #define _FAIL(code) STMT_START {     \
 418  const char *ellipses = "";      \
 419  IV len = RExC_end - RExC_precomp;     \
 420                   \
 421  if (!SIZE_ONLY)       \
 422   SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 423  if (len > RegexLengthToShowInErrorMessages) {   \
 424   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 425   len = RegexLengthToShowInErrorMessages - 10;   \
 426   ellipses = "...";      \
 427  }         \
 428  code;                                                               \
 429 } STMT_END
 430
 431 #define FAIL(msg) _FAIL(       \
 432  Perl_croak(aTHX_ "%s in regex m/%.*s%s/",     \
 433    msg, (int)len, RExC_precomp, ellipses))
 434
 435 #define FAIL2(msg,arg) _FAIL(       \
 436  Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 437    arg, (int)len, RExC_precomp, ellipses))
 438
 439 /*
 440  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 441  */
 442 #define Simple_vFAIL(m) STMT_START {     \
 443  const IV offset = RExC_parse - RExC_precomp;   \
 444  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 445    m, (int)offset, RExC_precomp, RExC_precomp + offset); \
 446 } STMT_END
 447
 448 /*
 449  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 450  */
 451 #define vFAIL(m) STMT_START {    \
 452  if (!SIZE_ONLY)     \
 453   SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
 454  Simple_vFAIL(m);     \
 455 } STMT_END
 456
 457 /*
 458  * Like Simple_vFAIL(), but accepts two arguments.
 459  */
 460 #define Simple_vFAIL2(m,a1) STMT_START {   \
 461  const IV offset = RExC_parse - RExC_precomp;   \
 462  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,   \
 463    (int)offset, RExC_precomp, RExC_precomp + offset); \
 464 } STMT_END
 465
 466 /*
 467  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 468  */
 469 #define vFAIL2(m,a1) STMT_START {   \
 470  if (!SIZE_ONLY)     \
 471   SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
 472  Simple_vFAIL2(m, a1);    \
 473 } STMT_END
 474
 475
 476 /*
 477  * Like Simple_vFAIL(), but accepts three arguments.
 478  */
 479 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 480  const IV offset = RExC_parse - RExC_precomp;  \
 481  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,  \
 482    (int)offset, RExC_precomp, RExC_precomp + offset); \
 483 } STMT_END
 484
 485 /*
 486  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 487  */
 488 #define vFAIL3(m,a1,a2) STMT_START {   \
 489  if (!SIZE_ONLY)     \
 490   SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
 491  Simple_vFAIL3(m, a1, a2);    \
 492 } STMT_END
 493
 494 /*
 495  * Like Simple_vFAIL(), but accepts four arguments.
 496  */
 497 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 498  const IV offset = RExC_parse - RExC_precomp;  \
 499  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,  \
 500    (int)offset, RExC_precomp, RExC_precomp + offset); \
 501 } STMT_END
 502
 503 #define ckWARNreg(loc,m) STMT_START {     \
 504  const IV offset = loc - RExC_precomp;    \
 505  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 506    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 507 } STMT_END
 508
 509 #define ckWARNregdep(loc,m) STMT_START {    \
 510  const IV offset = loc - RExC_precomp;    \
 511  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 512    m REPORT_LOCATION,      \
 513    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 514 } STMT_END
 515
 516 #define ckWARN2regdep(loc,m, a1) STMT_START {    \
 517  const IV offset = loc - RExC_precomp;    \
 518  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 519    m REPORT_LOCATION,      \
 520    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 521 } STMT_END
 522
 523 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 524  const IV offset = loc - RExC_precomp;    \
 525  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 526    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 527 } STMT_END
 528
 529 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 530  const IV offset = loc - RExC_precomp;    \
 531  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 532    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 533 } STMT_END
 534
 535 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 536  const IV offset = loc - RExC_precomp;    \
 537  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 538    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 539 } STMT_END
 540
 541 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 542  const IV offset = loc - RExC_precomp;    \
 543  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 544    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 545 } STMT_END
 546
 547 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 548  const IV offset = loc - RExC_precomp;    \
 549  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 550    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 551 } STMT_END
 552
 553 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 554  const IV offset = loc - RExC_precomp;    \
 555  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 556    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 557 } STMT_END
 558
 559
 560 /* Allow for side effects in s */
 561 #define REGC(c,s) STMT_START {   \
 562  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 563 } STMT_END
 564
 565 /* Macros for recording node offsets.   20001227 mjd@plover.com
 566  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 567  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 568  * Element 0 holds the number n.
 569  * Position is 1 indexed.
 570  */
 571 #ifndef RE_TRACK_PATTERN_OFFSETS
 572 #define Set_Node_Offset_To_R(node,byte)
 573 #define Set_Node_Offset(node,byte)
 574 #define Set_Cur_Node_Offset
 575 #define Set_Node_Length_To_R(node,len)
 576 #define Set_Node_Length(node,len)
 577 #define Set_Node_Cur_Length(node)
 578 #define Node_Offset(n)
 579 #define Node_Length(n)
 580 #define Set_Node_Offset_Length(node,offset,len)
 581 #define ProgLen(ri) ri->u.proglen
 582 #define SetProgLen(ri,x) ri->u.proglen = x
 583 #else
 584 #define ProgLen(ri) ri->u.offsets[0]
 585 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 586 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 587  if (! SIZE_ONLY) {       \
 588   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 589      __LINE__, (int)(node), (int)(byte)));  \
 590   if((node) < 0) {      \
 591    Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 592   } else {       \
 593    RExC_offsets[2*(node)-1] = (byte);    \
 594   }        \
 595  }         \
 596 } STMT_END
 597
 598 #define Set_Node_Offset(node,byte) \
 599  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 600 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 601
 602 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 603  if (! SIZE_ONLY) {       \
 604   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 605     __LINE__, (int)(node), (int)(len)));   \
 606   if((node) < 0) {      \
 607    Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 608   } else {       \
 609    RExC_offsets[2*(node)] = (len);    \
 610   }        \
 611  }         \
 612 } STMT_END
 613
 614 #define Set_Node_Length(node,len) \
 615  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 616 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 617 #define Set_Node_Cur_Length(node) \
 618  Set_Node_Length(node, RExC_parse - parse_start)
 619
 620 /* Get offsets and lengths */
 621 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 622 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 623
 624 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 625  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 626  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 627 } STMT_END
 628 #endif
 629
 630 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 631 #define EXPERIMENTAL_INPLACESCAN
 632 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 633
 634 #define DEBUG_STUDYDATA(str,data,depth)                              \
 635 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 636  PerlIO_printf(Perl_debug_log,                                    \
 637   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 638   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 639   (int)(depth)*2, "",                                          \
 640   (IV)((data)->pos_min),                                       \
 641   (IV)((data)->pos_delta),                                     \
 642   (UV)((data)->flags),                                         \
 643   (IV)((data)->whilem_c),                                      \
 644   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 645   is_inf ? "INF " : ""                                         \
 646  );                                                               \
 647  if ((data)->last_found)                                          \
 648   PerlIO_printf(Perl_debug_log,                                \
 649    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 650    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 651    SvPVX_const((data)->last_found),                         \
 652    (IV)((data)->last_end),                                  \
 653    (IV)((data)->last_start_min),                            \
 654    (IV)((data)->last_start_max),                            \
 655    ((data)->longest &&                                      \
 656    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 657    SvPVX_const((data)->longest_fixed),                      \
 658    (IV)((data)->offset_fixed),                              \
 659    ((data)->longest &&                                      \
 660    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 661    SvPVX_const((data)->longest_float),                      \
 662    (IV)((data)->offset_float_min),                          \
 663    (IV)((data)->offset_float_max)                           \
 664   );                                                           \
 665  PerlIO_printf(Perl_debug_log,"\n");                              \
 666 });
 667
 668 static void clear_re(pTHX_ void *r);
 669
 670 /* Mark that we cannot extend a found fixed substring at this point.
 671    Update the longest found anchored substring and the longest found
 672    floating substrings if needed. */
 673
 674 STATIC void
 675 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 676 {
 677  const STRLEN l = CHR_SVLEN(data->last_found);
 678  const STRLEN old_l = CHR_SVLEN(*data->longest);
 679  GET_RE_DEBUG_FLAGS_DECL;
 680
 681  PERL_ARGS_ASSERT_SCAN_COMMIT;
 682
 683  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 684   SvSetMagicSV(*data->longest, data->last_found);
 685   if (*data->longest == data->longest_fixed) {
 686    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 687    if (data->flags & SF_BEFORE_EOL)
 688     data->flags
 689      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 690    else
 691     data->flags &= ~SF_FIX_BEFORE_EOL;
 692    data->minlen_fixed=minlenp;
 693    data->lookbehind_fixed=0;
 694   }
 695   else { /* *data->longest == data->longest_float */
 696    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 697    data->offset_float_max = (l
 698          ? data->last_start_max
 699          : data->pos_min + data->pos_delta);
 700    if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 701     data->offset_float_max = I32_MAX;
 702    if (data->flags & SF_BEFORE_EOL)
 703     data->flags
 704      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 705    else
 706     data->flags &= ~SF_FL_BEFORE_EOL;
 707    data->minlen_float=minlenp;
 708    data->lookbehind_float=0;
 709   }
 710  }
 711  SvCUR_set(data->last_found, 0);
 712  {
 713   SV * const sv = data->last_found;
 714   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 715    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 716    if (mg)
 717     mg->mg_len = 0;
 718   }
 719  }
 720  data->last_end = -1;
 721  data->flags &= ~SF_BEFORE_EOL;
 722  DEBUG_STUDYDATA("commit: ",data,0);
 723 }
 724
 725 /* Can match anything (initialization) */
 726 STATIC void
 727 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 728 {
 729  PERL_ARGS_ASSERT_CL_ANYTHING;
 730
 731  ANYOF_BITMAP_SETALL(cl);
 732  cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 733     |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 734
 735  /* If any portion of the regex is to operate under locale rules,
 736  * initialization includes it.  The reason this isn't done for all regexes
 737  * is that the optimizer was written under the assumption that locale was
 738  * all-or-nothing.  Given the complexity and lack of documentation in the
 739  * optimizer, and that there are inadequate test cases for locale, so many
 740  * parts of it may not work properly, it is safest to avoid locale unless
 741  * necessary. */
 742  if (RExC_contains_locale) {
 743   ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 744   cl->flags |= ANYOF_LOCALE;
 745  }
 746  else {
 747   ANYOF_CLASS_ZERO(cl);     /* Only /l uses class now */
 748  }
 749 }
 750
 751 /* Can match anything (initialization) */
 752 STATIC int
 753 S_cl_is_anything(const struct regnode_charclass_class *cl)
 754 {
 755  int value;
 756
 757  PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 758
 759  for (value = 0; value <= ANYOF_MAX; value += 2)
 760   if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 761    return 1;
 762  if (!(cl->flags & ANYOF_UNICODE_ALL))
 763   return 0;
 764  if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 765   return 0;
 766  return 1;
 767 }
 768
 769 /* Can match anything (initialization) */
 770 STATIC void
 771 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 772 {
 773  PERL_ARGS_ASSERT_CL_INIT;
 774
 775  Zero(cl, 1, struct regnode_charclass_class);
 776  cl->type = ANYOF;
 777  cl_anything(pRExC_state, cl);
 778  ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 779 }
 780
 781 /* These two functions currently do the exact same thing */
 782 #define cl_init_zero  S_cl_init
 783
 784 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 785  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 786  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 787 STATIC void
 788 S_cl_and(struct regnode_charclass_class *cl,
 789   const struct regnode_charclass_class *and_with)
 790 {
 791  PERL_ARGS_ASSERT_CL_AND;
 792
 793  assert(and_with->type == ANYOF);
 794
 795  /* I (khw) am not sure all these restrictions are necessary XXX */
 796  if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 797   && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 798   && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 799   && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 800   && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 801   int i;
 802
 803   if (and_with->flags & ANYOF_INVERT)
 804    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 805     cl->bitmap[i] &= ~and_with->bitmap[i];
 806   else
 807    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 808     cl->bitmap[i] &= and_with->bitmap[i];
 809  } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 810
 811  if (and_with->flags & ANYOF_INVERT) {
 812
 813   /* Here, the and'ed node is inverted.  Get the AND of the flags that
 814   * aren't affected by the inversion.  Those that are affected are
 815   * handled individually below */
 816   U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 817   cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 818   cl->flags |= affected_flags;
 819
 820   /* We currently don't know how to deal with things that aren't in the
 821   * bitmap, but we know that the intersection is no greater than what
 822   * is already in cl, so let there be false positives that get sorted
 823   * out after the synthetic start class succeeds, and the node is
 824   * matched for real. */
 825
 826   /* The inversion of these two flags indicate that the resulting
 827   * intersection doesn't have them */
 828   if (and_with->flags & ANYOF_UNICODE_ALL) {
 829    cl->flags &= ~ANYOF_UNICODE_ALL;
 830   }
 831   if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 832    cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 833   }
 834  }
 835  else {   /* and'd node is not inverted */
 836   U8 outside_bitmap_but_not_utf8; /* Temp variable */
 837
 838   if (! ANYOF_NONBITMAP(and_with)) {
 839
 840    /* Here 'and_with' doesn't match anything outside the bitmap
 841    * (except possibly ANYOF_UNICODE_ALL), which means the
 842    * intersection can't either, except for ANYOF_UNICODE_ALL, in
 843    * which case we don't know what the intersection is, but it's no
 844    * greater than what cl already has, so can just leave it alone,
 845    * with possible false positives */
 846    if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 847     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 848     cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 849    }
 850   }
 851   else if (! ANYOF_NONBITMAP(cl)) {
 852
 853    /* Here, 'and_with' does match something outside the bitmap, and cl
 854    * doesn't have a list of things to match outside the bitmap.  If
 855    * cl can match all code points above 255, the intersection will
 856    * be those above-255 code points that 'and_with' matches.  If cl
 857    * can't match all Unicode code points, it means that it can't
 858    * match anything outside the bitmap (since the 'if' that got us
 859    * into this block tested for that), so we leave the bitmap empty.
 860    */
 861    if (cl->flags & ANYOF_UNICODE_ALL) {
 862     ARG_SET(cl, ARG(and_with));
 863
 864     /* and_with's ARG may match things that don't require UTF8.
 865     * And now cl's will too, in spite of this being an 'and'.  See
 866     * the comments below about the kludge */
 867     cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 868    }
 869   }
 870   else {
 871    /* Here, both 'and_with' and cl match something outside the
 872    * bitmap.  Currently we do not do the intersection, so just match
 873    * whatever cl had at the beginning.  */
 874   }
 875
 876
 877   /* Take the intersection of the two sets of flags.  However, the
 878   * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 879   * kludge around the fact that this flag is not treated like the others
 880   * which are initialized in cl_anything().  The way the optimizer works
 881   * is that the synthetic start class (SSC) is initialized to match
 882   * anything, and then the first time a real node is encountered, its
 883   * values are AND'd with the SSC's with the result being the values of
 884   * the real node.  However, there are paths through the optimizer where
 885   * the AND never gets called, so those initialized bits are set
 886   * inappropriately, which is not usually a big deal, as they just cause
 887   * false positives in the SSC, which will just mean a probably
 888   * imperceptible slow down in execution.  However this bit has a
 889   * higher false positive consequence in that it can cause utf8.pm,
 890   * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 891   * bigger slowdown and also causes significant extra memory to be used.
 892   * In order to prevent this, the code now takes a different tack.  The
 893   * bit isn't set unless some part of the regular expression needs it,
 894   * but once set it won't get cleared.  This means that these extra
 895   * modules won't get loaded unless there was some path through the
 896   * pattern that would have required them anyway, and  so any false
 897   * positives that occur by not ANDing them out when they could be
 898   * aren't as severe as they would be if we treated this bit like all
 899   * the others */
 900   outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 901          & ANYOF_NONBITMAP_NON_UTF8;
 902   cl->flags &= and_with->flags;
 903   cl->flags |= outside_bitmap_but_not_utf8;
 904  }
 905 }
 906
 907 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 908  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 909  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 910 STATIC void
 911 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 912 {
 913  PERL_ARGS_ASSERT_CL_OR;
 914
 915  if (or_with->flags & ANYOF_INVERT) {
 916
 917   /* Here, the or'd node is to be inverted.  This means we take the
 918   * complement of everything not in the bitmap, but currently we don't
 919   * know what that is, so give up and match anything */
 920   if (ANYOF_NONBITMAP(or_with)) {
 921    cl_anything(pRExC_state, cl);
 922   }
 923   /* We do not use
 924   * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 925   *   <= (B1 | !B2) | (CL1 | !CL2)
 926   * which is wasteful if CL2 is small, but we ignore CL2:
 927   *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 928   * XXXX Can we handle case-fold?  Unclear:
 929   *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 930   *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 931   */
 932   else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 933    && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 934    && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 935    int i;
 936
 937    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 938     cl->bitmap[i] |= ~or_with->bitmap[i];
 939   } /* XXXX: logic is complicated otherwise */
 940   else {
 941    cl_anything(pRExC_state, cl);
 942   }
 943
 944   /* And, we can just take the union of the flags that aren't affected
 945   * by the inversion */
 946   cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 947
 948   /* For the remaining flags:
 949    ANYOF_UNICODE_ALL and inverted means to not match anything above
 950      255, which means that the union with cl should just be
 951      what cl has in it, so can ignore this flag
 952    ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 953      is 127-255 to match them, but then invert that, so the
 954      union with cl should just be what cl has in it, so can
 955      ignore this flag
 956   */
 957  } else {    /* 'or_with' is not inverted */
 958   /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 959   if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 960    && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 961     || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 962    int i;
 963
 964    /* OR char bitmap and class bitmap separately */
 965    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 966     cl->bitmap[i] |= or_with->bitmap[i];
 967    if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 968     for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 969      cl->classflags[i] |= or_with->classflags[i];
 970     cl->flags |= ANYOF_CLASS;
 971    }
 972   }
 973   else { /* XXXX: logic is complicated, leave it along for a moment. */
 974    cl_anything(pRExC_state, cl);
 975   }
 976
 977   if (ANYOF_NONBITMAP(or_with)) {
 978
 979    /* Use the added node's outside-the-bit-map match if there isn't a
 980    * conflict.  If there is a conflict (both nodes match something
 981    * outside the bitmap, but what they match outside is not the same
 982    * pointer, and hence not easily compared until XXX we extend
 983    * inversion lists this far), give up and allow the start class to
 984    * match everything outside the bitmap.  If that stuff is all above
 985    * 255, can just set UNICODE_ALL, otherwise caould be anything. */
 986    if (! ANYOF_NONBITMAP(cl)) {
 987     ARG_SET(cl, ARG(or_with));
 988    }
 989    else if (ARG(cl) != ARG(or_with)) {
 990
 991     if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
 992      cl_anything(pRExC_state, cl);
 993     }
 994     else {
 995      cl->flags |= ANYOF_UNICODE_ALL;
 996     }
 997    }
 998   }
 999
1000   /* Take the union */
1001   cl->flags |= or_with->flags;
1002  }
1003 }
1004
1005 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1006 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1007 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1008 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1009
1010
1011 #ifdef DEBUGGING
1012 /*
1013    dump_trie(trie,widecharmap,revcharmap)
1014    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1015    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1016
1017    These routines dump out a trie in a somewhat readable format.
1018    The _interim_ variants are used for debugging the interim
1019    tables that are used to generate the final compressed
1020    representation which is what dump_trie expects.
1021
1022    Part of the reason for their existence is to provide a form
1023    of documentation as to how the different representations function.
1024
1025 */
1026
1027 /*
1028   Dumps the final compressed table form of the trie to Perl_debug_log.
1029   Used for debugging make_trie().
1030 */
1031
1032 STATIC void
1033 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1034    AV *revcharmap, U32 depth)
1035 {
1036  U32 state;
1037  SV *sv=sv_newmortal();
1038  int colwidth= widecharmap ? 6 : 4;
1039  U16 word;
1040  GET_RE_DEBUG_FLAGS_DECL;
1041
1042  PERL_ARGS_ASSERT_DUMP_TRIE;
1043
1044  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1045   (int)depth * 2 + 2,"",
1046   "Match","Base","Ofs" );
1047
1048  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1049   SV ** const tmp = av_fetch( revcharmap, state, 0);
1050   if ( tmp ) {
1051    PerlIO_printf( Perl_debug_log, "%*s",
1052     colwidth,
1053     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1054        PL_colors[0], PL_colors[1],
1055        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1056        PERL_PV_ESCAPE_FIRSTCHAR
1057     )
1058    );
1059   }
1060  }
1061  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1062   (int)depth * 2 + 2,"");
1063
1064  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1065   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1066  PerlIO_printf( Perl_debug_log, "\n");
1067
1068  for( state = 1 ; state < trie->statecount ; state++ ) {
1069   const U32 base = trie->states[ state ].trans.base;
1070
1071   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1072
1073   if ( trie->states[ state ].wordnum ) {
1074    PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1075   } else {
1076    PerlIO_printf( Perl_debug_log, "%6s", "" );
1077   }
1078
1079   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1080
1081   if ( base ) {
1082    U32 ofs = 0;
1083
1084    while( ( base + ofs  < trie->uniquecharcount ) ||
1085     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1086      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1087      ofs++;
1088
1089    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1090
1091    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1092     if ( ( base + ofs >= trie->uniquecharcount ) &&
1093      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1094      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1095     {
1096     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1097      colwidth,
1098      (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1099     } else {
1100      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1101     }
1102    }
1103
1104    PerlIO_printf( Perl_debug_log, "]");
1105
1106   }
1107   PerlIO_printf( Perl_debug_log, "\n" );
1108  }
1109  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1110  for (word=1; word <= trie->wordcount; word++) {
1111   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1112    (int)word, (int)(trie->wordinfo[word].prev),
1113    (int)(trie->wordinfo[word].len));
1114  }
1115  PerlIO_printf(Perl_debug_log, "\n" );
1116 }
1117 /*
1118   Dumps a fully constructed but uncompressed trie in list form.
1119   List tries normally only are used for construction when the number of
1120   possible chars (trie->uniquecharcount) is very high.
1121   Used for debugging make_trie().
1122 */
1123 STATIC void
1124 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1125       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1126       U32 depth)
1127 {
1128  U32 state;
1129  SV *sv=sv_newmortal();
1130  int colwidth= widecharmap ? 6 : 4;
1131  GET_RE_DEBUG_FLAGS_DECL;
1132
1133  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1134
1135  /* print out the table precompression.  */
1136  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1137   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1138   "------:-----+-----------------\n" );
1139
1140  for( state=1 ; state < next_alloc ; state ++ ) {
1141   U16 charid;
1142
1143   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1144    (int)depth * 2 + 2,"", (UV)state  );
1145   if ( ! trie->states[ state ].wordnum ) {
1146    PerlIO_printf( Perl_debug_log, "%5s| ","");
1147   } else {
1148    PerlIO_printf( Perl_debug_log, "W%4x| ",
1149     trie->states[ state ].wordnum
1150    );
1151   }
1152   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1153    SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1154    if ( tmp ) {
1155     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1156      colwidth,
1157      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1158        PL_colors[0], PL_colors[1],
1159        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1160        PERL_PV_ESCAPE_FIRSTCHAR
1161      ) ,
1162      TRIE_LIST_ITEM(state,charid).forid,
1163      (UV)TRIE_LIST_ITEM(state,charid).newstate
1164     );
1165     if (!(charid % 10))
1166      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1167       (int)((depth * 2) + 14), "");
1168    }
1169   }
1170   PerlIO_printf( Perl_debug_log, "\n");
1171  }
1172 }
1173
1174 /*
1175   Dumps a fully constructed but uncompressed trie in table form.
1176   This is the normal DFA style state transition table, with a few
1177   twists to facilitate compression later.
1178   Used for debugging make_trie().
1179 */
1180 STATIC void
1181 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1182       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1183       U32 depth)
1184 {
1185  U32 state;
1186  U16 charid;
1187  SV *sv=sv_newmortal();
1188  int colwidth= widecharmap ? 6 : 4;
1189  GET_RE_DEBUG_FLAGS_DECL;
1190
1191  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1192
1193  /*
1194  print out the table precompression so that we can do a visual check
1195  that they are identical.
1196  */
1197
1198  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1199
1200  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1201   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1202   if ( tmp ) {
1203    PerlIO_printf( Perl_debug_log, "%*s",
1204     colwidth,
1205     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1206        PL_colors[0], PL_colors[1],
1207        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1208        PERL_PV_ESCAPE_FIRSTCHAR
1209     )
1210    );
1211   }
1212  }
1213
1214  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1215
1216  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1217   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1218  }
1219
1220  PerlIO_printf( Perl_debug_log, "\n" );
1221
1222  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1223
1224   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1225    (int)depth * 2 + 2,"",
1226    (UV)TRIE_NODENUM( state ) );
1227
1228   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1229    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1230    if (v)
1231     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1232    else
1233     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1234   }
1235   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1236    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1237   } else {
1238    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1239    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1240   }
1241  }
1242 }
1243
1244 #endif
1245
1246
1247 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1248   startbranch: the first branch in the whole branch sequence
1249   first      : start branch of sequence of branch-exact nodes.
1250    May be the same as startbranch
1251   last       : Thing following the last branch.
1252    May be the same as tail.
1253   tail       : item following the branch sequence
1254   count      : words in the sequence
1255   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1256   depth      : indent depth
1257
1258 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1259
1260 A trie is an N'ary tree where the branches are determined by digital
1261 decomposition of the key. IE, at the root node you look up the 1st character and
1262 follow that branch repeat until you find the end of the branches. Nodes can be
1263 marked as "accepting" meaning they represent a complete word. Eg:
1264
1265   /he|she|his|hers/
1266
1267 would convert into the following structure. Numbers represent states, letters
1268 following numbers represent valid transitions on the letter from that state, if
1269 the number is in square brackets it represents an accepting state, otherwise it
1270 will be in parenthesis.
1271
1272  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1273  |    |
1274  |   (2)
1275  |    |
1276  (1)   +-i->(6)-+-s->[7]
1277  |
1278  +-s->(3)-+-h->(4)-+-e->[5]
1279
1280  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1281
1282 This shows that when matching against the string 'hers' we will begin at state 1
1283 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1284 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1285 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1286 single traverse. We store a mapping from accepting to state to which word was
1287 matched, and then when we have multiple possibilities we try to complete the
1288 rest of the regex in the order in which they occured in the alternation.
1289
1290 The only prior NFA like behaviour that would be changed by the TRIE support is
1291 the silent ignoring of duplicate alternations which are of the form:
1292
1293  / (DUPE|DUPE) X? (?{ ... }) Y /x
1294
1295 Thus EVAL blocks following a trie may be called a different number of times with
1296 and without the optimisation. With the optimisations dupes will be silently
1297 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1298 the following demonstrates:
1299
1300  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1301
1302 which prints out 'word' three times, but
1303
1304  'words'=~/(word|word|word)(?{ print $1 })S/
1305
1306 which doesnt print it out at all. This is due to other optimisations kicking in.
1307
1308 Example of what happens on a structural level:
1309
1310 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1311
1312    1: CURLYM[1] {1,32767}(18)
1313    5:   BRANCH(8)
1314    6:     EXACT <ac>(16)
1315    8:   BRANCH(11)
1316    9:     EXACT <ad>(16)
1317   11:   BRANCH(14)
1318   12:     EXACT <ab>(16)
1319   16:   SUCCEED(0)
1320   17:   NOTHING(18)
1321   18: END(0)
1322
1323 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1324 and should turn into:
1325
1326    1: CURLYM[1] {1,32767}(18)
1327    5:   TRIE(16)
1328   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1329   <ac>
1330   <ad>
1331   <ab>
1332   16:   SUCCEED(0)
1333   17:   NOTHING(18)
1334   18: END(0)
1335
1336 Cases where tail != last would be like /(?foo|bar)baz/:
1337
1338    1: BRANCH(4)
1339    2:   EXACT <foo>(8)
1340    4: BRANCH(7)
1341    5:   EXACT <bar>(8)
1342    7: TAIL(8)
1343    8: EXACT <baz>(10)
1344   10: END(0)
1345
1346 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1347 and would end up looking like:
1348
1349  1: TRIE(8)
1350  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1351   <foo>
1352   <bar>
1353    7: TAIL(8)
1354    8: EXACT <baz>(10)
1355   10: END(0)
1356
1357  d = uvuni_to_utf8_flags(d, uv, 0);
1358
1359 is the recommended Unicode-aware way of saying
1360
1361  *(d++) = uv;
1362 */
1363
1364 #define TRIE_STORE_REVCHAR                                                 \
1365  STMT_START {                                                           \
1366   if (UTF) {          \
1367    SV *zlopp = newSV(2);        \
1368    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1369    unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1370    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1371    SvPOK_on(zlopp);         \
1372    SvUTF8_on(zlopp);         \
1373    av_push(revcharmap, zlopp);        \
1374   } else {          \
1375    char ooooff = (char)uvc;            \
1376    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1377   }           \
1378   } STMT_END
1379
1380 #define TRIE_READ_CHAR STMT_START {                                           \
1381  wordlen++;                                                                \
1382  if ( UTF ) {                                                              \
1383   if ( folder ) {                                                       \
1384    if ( foldlen > 0 ) {                                              \
1385    uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1386    foldlen -= len;                                                \
1387    scan += len;                                                   \
1388    len = 0;                                                       \
1389    } else {                                                          \
1390     uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1391     uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1392     foldlen -= UNISKIP( uvc );                                    \
1393     scan = foldbuf + UNISKIP( uvc );                              \
1394    }                                                                 \
1395   } else {                                                              \
1396    uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1397   }                                                                     \
1398  } else {                                                                  \
1399   uvc = (U32)*uc;                                                       \
1400   len = 1;                                                              \
1401  }                                                                         \
1402 } STMT_END
1403
1404
1405
1406 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1407  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1408   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1409   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1410  }                                                           \
1411  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1412  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1413  TRIE_LIST_CUR( state )++;                                   \
1414 } STMT_END
1415
1416 #define TRIE_LIST_NEW(state) STMT_START {                       \
1417  Newxz( trie->states[ state ].trans.list,               \
1418   4, reg_trie_trans_le );                                 \
1419  TRIE_LIST_CUR( state ) = 1;                                \
1420  TRIE_LIST_LEN( state ) = 4;                                \
1421 } STMT_END
1422
1423 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1424  U16 dupe= trie->states[ state ].wordnum;                    \
1425  regnode * const noper_next = regnext( noper );              \
1426                 \
1427  DEBUG_r({                                                   \
1428   /* store the word for dumping */                        \
1429   SV* tmp;                                                \
1430   if (OP(noper) != NOTHING)                               \
1431    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1432   else                                                    \
1433    tmp = newSVpvn_utf8( "", 0, UTF );   \
1434   av_push( trie_words, tmp );                             \
1435  });                                                         \
1436                 \
1437  curword++;                                                  \
1438  trie->wordinfo[curword].prev   = 0;                         \
1439  trie->wordinfo[curword].len    = wordlen;                   \
1440  trie->wordinfo[curword].accept = state;                     \
1441                 \
1442  if ( noper_next < tail ) {                                  \
1443   if (!trie->jump)                                        \
1444    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1445   trie->jump[curword] = (U16)(noper_next - convert);      \
1446   if (!jumper)                                            \
1447    jumper = noper_next;                                \
1448   if (!nextbranch)                                        \
1449    nextbranch= regnext(cur);                           \
1450  }                                                           \
1451                 \
1452  if ( dupe ) {                                               \
1453   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1454   /* chain, so that when the bits of chain are later    */\
1455   /* linked together, the dups appear in the chain      */\
1456   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1457   trie->wordinfo[dupe].prev = curword;                    \
1458  } else {                                                    \
1459   /* we haven't inserted this word yet.                */ \
1460   trie->states[ state ].wordnum = curword;                \
1461  }                                                           \
1462 } STMT_END
1463
1464
1465 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1466  ( ( base + charid >=  ucharcount     \
1467   && base + charid < ubound     \
1468   && state == trie->trans[ base - ucharcount + charid ].check \
1469   && trie->trans[ base - ucharcount + charid ].next )  \
1470   ? trie->trans[ base - ucharcount + charid ].next  \
1471   : ( state==1 ? special : 0 )     \
1472  )
1473
1474 #define MADE_TRIE       1
1475 #define MADE_JUMP_TRIE  2
1476 #define MADE_EXACT_TRIE 4
1477
1478 STATIC I32
1479 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1480 {
1481  dVAR;
1482  /* first pass, loop through and scan words */
1483  reg_trie_data *trie;
1484  HV *widecharmap = NULL;
1485  AV *revcharmap = newAV();
1486  regnode *cur;
1487  const U32 uniflags = UTF8_ALLOW_DEFAULT;
1488  STRLEN len = 0;
1489  UV uvc = 0;
1490  U16 curword = 0;
1491  U32 next_alloc = 0;
1492  regnode *jumper = NULL;
1493  regnode *nextbranch = NULL;
1494  regnode *convert = NULL;
1495  U32 *prev_states; /* temp array mapping each state to previous one */
1496  /* we just use folder as a flag in utf8 */
1497  const U8 * folder = NULL;
1498
1499 #ifdef DEBUGGING
1500  const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1501  AV *trie_words = NULL;
1502  /* along with revcharmap, this only used during construction but both are
1503  * useful during debugging so we store them in the struct when debugging.
1504  */
1505 #else
1506  const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1507  STRLEN trie_charcount=0;
1508 #endif
1509  SV *re_trie_maxbuff;
1510  GET_RE_DEBUG_FLAGS_DECL;
1511
1512  PERL_ARGS_ASSERT_MAKE_TRIE;
1513 #ifndef DEBUGGING
1514  PERL_UNUSED_ARG(depth);
1515 #endif
1516
1517  switch (flags) {
1518   case EXACTFA:
1519   case EXACTFU: folder = PL_fold_latin1; break;
1520   case EXACTF:  folder = PL_fold; break;
1521   case EXACTFL: folder = PL_fold_locale; break;
1522  }
1523
1524  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1525  trie->refcount = 1;
1526  trie->startstate = 1;
1527  trie->wordcount = word_count;
1528  RExC_rxi->data->data[ data_slot ] = (void*)trie;
1529  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1530  if (!(UTF && folder))
1531   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1532  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1533      trie->wordcount+1, sizeof(reg_trie_wordinfo));
1534
1535  DEBUG_r({
1536   trie_words = newAV();
1537  });
1538
1539  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1540  if (!SvIOK(re_trie_maxbuff)) {
1541   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1542  }
1543  DEBUG_OPTIMISE_r({
1544     PerlIO_printf( Perl_debug_log,
1545     "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1546     (int)depth * 2 + 2, "",
1547     REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1548     REG_NODE_NUM(last), REG_NODE_NUM(tail),
1549     (int)depth);
1550  });
1551
1552    /* Find the node we are going to overwrite */
1553  if ( first == startbranch && OP( last ) != BRANCH ) {
1554   /* whole branch chain */
1555   convert = first;
1556  } else {
1557   /* branch sub-chain */
1558   convert = NEXTOPER( first );
1559  }
1560
1561  /*  -- First loop and Setup --
1562
1563  We first traverse the branches and scan each word to determine if it
1564  contains widechars, and how many unique chars there are, this is
1565  important as we have to build a table with at least as many columns as we
1566  have unique chars.
1567
1568  We use an array of integers to represent the character codes 0..255
1569  (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1570  native representation of the character value as the key and IV's for the
1571  coded index.
1572
1573  *TODO* If we keep track of how many times each character is used we can
1574  remap the columns so that the table compression later on is more
1575  efficient in terms of memory by ensuring the most common value is in the
1576  middle and the least common are on the outside.  IMO this would be better
1577  than a most to least common mapping as theres a decent chance the most
1578  common letter will share a node with the least common, meaning the node
1579  will not be compressible. With a middle is most common approach the worst
1580  case is when we have the least common nodes twice.
1581
1582  */
1583
1584  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1585   regnode * const noper = NEXTOPER( cur );
1586   const U8 *uc = (U8*)STRING( noper );
1587   const U8 * const e  = uc + STR_LEN( noper );
1588   STRLEN foldlen = 0;
1589   U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1590   const U8 *scan = (U8*)NULL;
1591   U32 wordlen      = 0;         /* required init */
1592   STRLEN chars = 0;
1593   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1594
1595   if (OP(noper) == NOTHING) {
1596    trie->minlen= 0;
1597    continue;
1598   }
1599   if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1600    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1601           regardless of encoding */
1602
1603   for ( ; uc < e ; uc += len ) {
1604    TRIE_CHARCOUNT(trie)++;
1605    TRIE_READ_CHAR;
1606    chars++;
1607    if ( uvc < 256 ) {
1608     if ( !trie->charmap[ uvc ] ) {
1609      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1610      if ( folder )
1611       trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1612      TRIE_STORE_REVCHAR;
1613     }
1614     if ( set_bit ) {
1615      /* store the codepoint in the bitmap, and its folded
1616      * equivalent. */
1617      TRIE_BITMAP_SET(trie,uvc);
1618
1619      /* store the folded codepoint */
1620      if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1621
1622      if ( !UTF ) {
1623       /* store first byte of utf8 representation of
1624       variant codepoints */
1625       if (! UNI_IS_INVARIANT(uvc)) {
1626        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1627       }
1628      }
1629      set_bit = 0; /* We've done our bit :-) */
1630     }
1631    } else {
1632     SV** svpp;
1633     if ( !widecharmap )
1634      widecharmap = newHV();
1635
1636     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1637
1638     if ( !svpp )
1639      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1640
1641     if ( !SvTRUE( *svpp ) ) {
1642      sv_setiv( *svpp, ++trie->uniquecharcount );
1643      TRIE_STORE_REVCHAR;
1644     }
1645    }
1646   }
1647   if( cur == first ) {
1648    trie->minlen=chars;
1649    trie->maxlen=chars;
1650   } else if (chars < trie->minlen) {
1651    trie->minlen=chars;
1652   } else if (chars > trie->maxlen) {
1653    trie->maxlen=chars;
1654   }
1655
1656  } /* end first pass */
1657  DEBUG_TRIE_COMPILE_r(
1658   PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1659     (int)depth * 2 + 2,"",
1660     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1661     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1662     (int)trie->minlen, (int)trie->maxlen )
1663  );
1664
1665  /*
1666   We now know what we are dealing with in terms of unique chars and
1667   string sizes so we can calculate how much memory a naive
1668   representation using a flat table  will take. If it's over a reasonable
1669   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1670   conservative but potentially much slower representation using an array
1671   of lists.
1672
1673   At the end we convert both representations into the same compressed
1674   form that will be used in regexec.c for matching with. The latter
1675   is a form that cannot be used to construct with but has memory
1676   properties similar to the list form and access properties similar
1677   to the table form making it both suitable for fast searches and
1678   small enough that its feasable to store for the duration of a program.
1679
1680   See the comment in the code where the compressed table is produced
1681   inplace from the flat tabe representation for an explanation of how
1682   the compression works.
1683
1684  */
1685
1686
1687  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1688  prev_states[1] = 0;
1689
1690  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1691   /*
1692    Second Pass -- Array Of Lists Representation
1693
1694    Each state will be represented by a list of charid:state records
1695    (reg_trie_trans_le) the first such element holds the CUR and LEN
1696    points of the allocated array. (See defines above).
1697
1698    We build the initial structure using the lists, and then convert
1699    it into the compressed table form which allows faster lookups
1700    (but cant be modified once converted).
1701   */
1702
1703   STRLEN transcount = 1;
1704
1705   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1706    "%*sCompiling trie using list compiler\n",
1707    (int)depth * 2 + 2, ""));
1708
1709   trie->states = (reg_trie_state *)
1710    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1711         sizeof(reg_trie_state) );
1712   TRIE_LIST_NEW(1);
1713   next_alloc = 2;
1714
1715   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1716
1717    regnode * const noper = NEXTOPER( cur );
1718    U8 *uc           = (U8*)STRING( noper );
1719    const U8 * const e = uc + STR_LEN( noper );
1720    U32 state        = 1;         /* required init */
1721    U16 charid       = 0;         /* sanity init */
1722    U8 *scan         = (U8*)NULL; /* sanity init */
1723    STRLEN foldlen   = 0;         /* required init */
1724    U32 wordlen      = 0;         /* required init */
1725    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1726
1727    if (OP(noper) != NOTHING) {
1728     for ( ; uc < e ; uc += len ) {
1729
1730      TRIE_READ_CHAR;
1731
1732      if ( uvc < 256 ) {
1733       charid = trie->charmap[ uvc ];
1734      } else {
1735       SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1736       if ( !svpp ) {
1737        charid = 0;
1738       } else {
1739        charid=(U16)SvIV( *svpp );
1740       }
1741      }
1742      /* charid is now 0 if we dont know the char read, or nonzero if we do */
1743      if ( charid ) {
1744
1745       U16 check;
1746       U32 newstate = 0;
1747
1748       charid--;
1749       if ( !trie->states[ state ].trans.list ) {
1750        TRIE_LIST_NEW( state );
1751       }
1752       for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1753        if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1754         newstate = TRIE_LIST_ITEM( state, check ).newstate;
1755         break;
1756        }
1757       }
1758       if ( ! newstate ) {
1759        newstate = next_alloc++;
1760        prev_states[newstate] = state;
1761        TRIE_LIST_PUSH( state, charid, newstate );
1762        transcount++;
1763       }
1764       state = newstate;
1765      } else {
1766       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1767      }
1768     }
1769    }
1770    TRIE_HANDLE_WORD(state);
1771
1772   } /* end second pass */
1773
1774   /* next alloc is the NEXT state to be allocated */
1775   trie->statecount = next_alloc;
1776   trie->states = (reg_trie_state *)
1777    PerlMemShared_realloc( trie->states,
1778         next_alloc
1779         * sizeof(reg_trie_state) );
1780
1781   /* and now dump it out before we compress it */
1782   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1783               revcharmap, next_alloc,
1784               depth+1)
1785   );
1786
1787   trie->trans = (reg_trie_trans *)
1788    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1789   {
1790    U32 state;
1791    U32 tp = 0;
1792    U32 zp = 0;
1793
1794
1795    for( state=1 ; state < next_alloc ; state ++ ) {
1796     U32 base=0;
1797
1798     /*
1799     DEBUG_TRIE_COMPILE_MORE_r(
1800      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1801     );
1802     */
1803
1804     if (trie->states[state].trans.list) {
1805      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1806      U16 maxid=minid;
1807      U16 idx;
1808
1809      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1810       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1811       if ( forid < minid ) {
1812        minid=forid;
1813       } else if ( forid > maxid ) {
1814        maxid=forid;
1815       }
1816      }
1817      if ( transcount < tp + maxid - minid + 1) {
1818       transcount *= 2;
1819       trie->trans = (reg_trie_trans *)
1820        PerlMemShared_realloc( trie->trans,
1821              transcount
1822              * sizeof(reg_trie_trans) );
1823       Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1824      }
1825      base = trie->uniquecharcount + tp - minid;
1826      if ( maxid == minid ) {
1827       U32 set = 0;
1828       for ( ; zp < tp ; zp++ ) {
1829        if ( ! trie->trans[ zp ].next ) {
1830         base = trie->uniquecharcount + zp - minid;
1831         trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1832         trie->trans[ zp ].check = state;
1833         set = 1;
1834         break;
1835        }
1836       }
1837       if ( !set ) {
1838        trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1839        trie->trans[ tp ].check = state;
1840        tp++;
1841        zp = tp;
1842       }
1843      } else {
1844       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1845        const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1846        trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1847        trie->trans[ tid ].check = state;
1848       }
1849       tp += ( maxid - minid + 1 );
1850      }
1851      Safefree(trie->states[ state ].trans.list);
1852     }
1853     /*
1854     DEBUG_TRIE_COMPILE_MORE_r(
1855      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1856     );
1857     */
1858     trie->states[ state ].trans.base=base;
1859    }
1860    trie->lasttrans = tp + 1;
1861   }
1862  } else {
1863   /*
1864   Second Pass -- Flat Table Representation.
1865
1866   we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1867   We know that we will need Charcount+1 trans at most to store the data
1868   (one row per char at worst case) So we preallocate both structures
1869   assuming worst case.
1870
1871   We then construct the trie using only the .next slots of the entry
1872   structs.
1873
1874   We use the .check field of the first entry of the node temporarily to
1875   make compression both faster and easier by keeping track of how many non
1876   zero fields are in the node.
1877
1878   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1879   transition.
1880
1881   There are two terms at use here: state as a TRIE_NODEIDX() which is a
1882   number representing the first entry of the node, and state as a
1883   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1884   TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1885   are 2 entrys per node. eg:
1886
1887    A B       A B
1888   1. 2 4    1. 3 7
1889   2. 0 3    3. 0 5
1890   3. 0 0    5. 0 0
1891   4. 0 0    7. 0 0
1892
1893   The table is internally in the right hand, idx form. However as we also
1894   have to deal with the states array which is indexed by nodenum we have to
1895   use TRIE_NODENUM() to convert.
1896
1897   */
1898   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1899    "%*sCompiling trie using table compiler\n",
1900    (int)depth * 2 + 2, ""));
1901
1902   trie->trans = (reg_trie_trans *)
1903    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1904         * trie->uniquecharcount + 1,
1905         sizeof(reg_trie_trans) );
1906   trie->states = (reg_trie_state *)
1907    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1908         sizeof(reg_trie_state) );
1909   next_alloc = trie->uniquecharcount + 1;
1910
1911
1912   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1913
1914    regnode * const noper   = NEXTOPER( cur );
1915    const U8 *uc     = (U8*)STRING( noper );
1916    const U8 * const e = uc + STR_LEN( noper );
1917
1918    U32 state        = 1;         /* required init */
1919
1920    U16 charid       = 0;         /* sanity init */
1921    U32 accept_state = 0;         /* sanity init */
1922    U8 *scan         = (U8*)NULL; /* sanity init */
1923
1924    STRLEN foldlen   = 0;         /* required init */
1925    U32 wordlen      = 0;         /* required init */
1926    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1927
1928    if ( OP(noper) != NOTHING ) {
1929     for ( ; uc < e ; uc += len ) {
1930
1931      TRIE_READ_CHAR;
1932
1933      if ( uvc < 256 ) {
1934       charid = trie->charmap[ uvc ];
1935      } else {
1936       SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1937       charid = svpp ? (U16)SvIV(*svpp) : 0;
1938      }
1939      if ( charid ) {
1940       charid--;
1941       if ( !trie->trans[ state + charid ].next ) {
1942        trie->trans[ state + charid ].next = next_alloc;
1943        trie->trans[ state ].check++;
1944        prev_states[TRIE_NODENUM(next_alloc)]
1945          = TRIE_NODENUM(state);
1946        next_alloc += trie->uniquecharcount;
1947       }
1948       state = trie->trans[ state + charid ].next;
1949      } else {
1950       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1951      }
1952      /* charid is now 0 if we dont know the char read, or nonzero if we do */
1953     }
1954    }
1955    accept_state = TRIE_NODENUM( state );
1956    TRIE_HANDLE_WORD(accept_state);
1957
1958   } /* end second pass */
1959
1960   /* and now dump it out before we compress it */
1961   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1962               revcharmap,
1963               next_alloc, depth+1));
1964
1965   {
1966   /*
1967   * Inplace compress the table.*
1968
1969   For sparse data sets the table constructed by the trie algorithm will
1970   be mostly 0/FAIL transitions or to put it another way mostly empty.
1971   (Note that leaf nodes will not contain any transitions.)
1972
1973   This algorithm compresses the tables by eliminating most such
1974   transitions, at the cost of a modest bit of extra work during lookup:
1975
1976   - Each states[] entry contains a .base field which indicates the
1977   index in the state[] array wheres its transition data is stored.
1978
1979   - If .base is 0 there are no valid transitions from that node.
1980
1981   - If .base is nonzero then charid is added to it to find an entry in
1982   the trans array.
1983
1984   -If trans[states[state].base+charid].check!=state then the
1985   transition is taken to be a 0/Fail transition. Thus if there are fail
1986   transitions at the front of the node then the .base offset will point
1987   somewhere inside the previous nodes data (or maybe even into a node
1988   even earlier), but the .check field determines if the transition is
1989   valid.
1990
1991   XXX - wrong maybe?
1992   The following process inplace converts the table to the compressed
1993   table: We first do not compress the root node 1,and mark all its
1994   .check pointers as 1 and set its .base pointer as 1 as well. This
1995   allows us to do a DFA construction from the compressed table later,
1996   and ensures that any .base pointers we calculate later are greater
1997   than 0.
1998
1999   - We set 'pos' to indicate the first entry of the second node.
2000
2001   - We then iterate over the columns of the node, finding the first and
2002   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2003   and set the .check pointers accordingly, and advance pos
2004   appropriately and repreat for the next node. Note that when we copy
2005   the next pointers we have to convert them from the original
2006   NODEIDX form to NODENUM form as the former is not valid post
2007   compression.
2008
2009   - If a node has no transitions used we mark its base as 0 and do not
2010   advance the pos pointer.
2011
2012   - If a node only has one transition we use a second pointer into the
2013   structure to fill in allocated fail transitions from other states.
2014   This pointer is independent of the main pointer and scans forward
2015   looking for null transitions that are allocated to a state. When it
2016   finds one it writes the single transition into the "hole".  If the
2017   pointer doesnt find one the single transition is appended as normal.
2018
2019   - Once compressed we can Renew/realloc the structures to release the
2020   excess space.
2021
2022   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2023   specifically Fig 3.47 and the associated pseudocode.
2024
2025   demq
2026   */
2027   const U32 laststate = TRIE_NODENUM( next_alloc );
2028   U32 state, charid;
2029   U32 pos = 0, zp=0;
2030   trie->statecount = laststate;
2031
2032   for ( state = 1 ; state < laststate ; state++ ) {
2033    U8 flag = 0;
2034    const U32 stateidx = TRIE_NODEIDX( state );
2035    const U32 o_used = trie->trans[ stateidx ].check;
2036    U32 used = trie->trans[ stateidx ].check;
2037    trie->trans[ stateidx ].check = 0;
2038
2039    for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2040     if ( flag || trie->trans[ stateidx + charid ].next ) {
2041      if ( trie->trans[ stateidx + charid ].next ) {
2042       if (o_used == 1) {
2043        for ( ; zp < pos ; zp++ ) {
2044         if ( ! trie->trans[ zp ].next ) {
2045          break;
2046         }
2047        }
2048        trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2049        trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2050        trie->trans[ zp ].check = state;
2051        if ( ++zp > pos ) pos = zp;
2052        break;
2053       }
2054       used--;
2055      }
2056      if ( !flag ) {
2057       flag = 1;
2058       trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2059      }
2060      trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2061      trie->trans[ pos ].check = state;
2062      pos++;
2063     }
2064    }
2065   }
2066   trie->lasttrans = pos + 1;
2067   trie->states = (reg_trie_state *)
2068    PerlMemShared_realloc( trie->states, laststate
2069         * sizeof(reg_trie_state) );
2070   DEBUG_TRIE_COMPILE_MORE_r(
2071     PerlIO_printf( Perl_debug_log,
2072      "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2073      (int)depth * 2 + 2,"",
2074      (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2075      (IV)next_alloc,
2076      (IV)pos,
2077      ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2078    );
2079
2080   } /* end table compress */
2081  }
2082  DEBUG_TRIE_COMPILE_MORE_r(
2083    PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2084     (int)depth * 2 + 2, "",
2085     (UV)trie->statecount,
2086     (UV)trie->lasttrans)
2087  );
2088  /* resize the trans array to remove unused space */
2089  trie->trans = (reg_trie_trans *)
2090   PerlMemShared_realloc( trie->trans, trie->lasttrans
2091        * sizeof(reg_trie_trans) );
2092
2093  {   /* Modify the program and insert the new TRIE node */
2094   U8 nodetype =(U8)(flags & 0xFF);
2095   char *str=NULL;
2096
2097 #ifdef DEBUGGING
2098   regnode *optimize = NULL;
2099 #ifdef RE_TRACK_PATTERN_OFFSETS
2100
2101   U32 mjd_offset = 0;
2102   U32 mjd_nodelen = 0;
2103 #endif /* RE_TRACK_PATTERN_OFFSETS */
2104 #endif /* DEBUGGING */
2105   /*
2106   This means we convert either the first branch or the first Exact,
2107   depending on whether the thing following (in 'last') is a branch
2108   or not and whther first is the startbranch (ie is it a sub part of
2109   the alternation or is it the whole thing.)
2110   Assuming its a sub part we convert the EXACT otherwise we convert
2111   the whole branch sequence, including the first.
2112   */
2113   /* Find the node we are going to overwrite */
2114   if ( first != startbranch || OP( last ) == BRANCH ) {
2115    /* branch sub-chain */
2116    NEXT_OFF( first ) = (U16)(last - first);
2117 #ifdef RE_TRACK_PATTERN_OFFSETS
2118    DEBUG_r({
2119     mjd_offset= Node_Offset((convert));
2120     mjd_nodelen= Node_Length((convert));
2121    });
2122 #endif
2123    /* whole branch chain */
2124   }
2125 #ifdef RE_TRACK_PATTERN_OFFSETS
2126   else {
2127    DEBUG_r({
2128     const  regnode *nop = NEXTOPER( convert );
2129     mjd_offset= Node_Offset((nop));
2130     mjd_nodelen= Node_Length((nop));
2131    });
2132   }
2133   DEBUG_OPTIMISE_r(
2134    PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2135     (int)depth * 2 + 2, "",
2136     (UV)mjd_offset, (UV)mjd_nodelen)
2137   );
2138 #endif
2139   /* But first we check to see if there is a common prefix we can
2140   split out as an EXACT and put in front of the TRIE node.  */
2141   trie->startstate= 1;
2142   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2143    U32 state;
2144    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2145     U32 ofs = 0;
2146     I32 idx = -1;
2147     U32 count = 0;
2148     const U32 base = trie->states[ state ].trans.base;
2149
2150     if ( trie->states[state].wordnum )
2151       count = 1;
2152
2153     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2154      if ( ( base + ofs >= trie->uniquecharcount ) &&
2155       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2156       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2157      {
2158       if ( ++count > 1 ) {
2159        SV **tmp = av_fetch( revcharmap, ofs, 0);
2160        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2161        if ( state == 1 ) break;
2162        if ( count == 2 ) {
2163         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2164         DEBUG_OPTIMISE_r(
2165          PerlIO_printf(Perl_debug_log,
2166           "%*sNew Start State=%"UVuf" Class: [",
2167           (int)depth * 2 + 2, "",
2168           (UV)state));
2169         if (idx >= 0) {
2170          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2171          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2172
2173          TRIE_BITMAP_SET(trie,*ch);
2174          if ( folder )
2175           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2176          DEBUG_OPTIMISE_r(
2177           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2178          );
2179         }
2180        }
2181        TRIE_BITMAP_SET(trie,*ch);
2182        if ( folder )
2183         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2184        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2185       }
2186       idx = ofs;
2187      }
2188     }
2189     if ( count == 1 ) {
2190      SV **tmp = av_fetch( revcharmap, idx, 0);
2191      STRLEN len;
2192      char *ch = SvPV( *tmp, len );
2193      DEBUG_OPTIMISE_r({
2194       SV *sv=sv_newmortal();
2195       PerlIO_printf( Perl_debug_log,
2196        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2197        (int)depth * 2 + 2, "",
2198        (UV)state, (UV)idx,
2199        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2200         PL_colors[0], PL_colors[1],
2201         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2202         PERL_PV_ESCAPE_FIRSTCHAR
2203        )
2204       );
2205      });
2206      if ( state==1 ) {
2207       OP( convert ) = nodetype;
2208       str=STRING(convert);
2209       STR_LEN(convert)=0;
2210      }
2211      STR_LEN(convert) += len;
2212      while (len--)
2213       *str++ = *ch++;
2214     } else {
2215 #ifdef DEBUGGING
2216      if (state>1)
2217       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2218 #endif
2219      break;
2220     }
2221    }
2222    trie->prefixlen = (state-1);
2223    if (str) {
2224     regnode *n = convert+NODE_SZ_STR(convert);
2225     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2226     trie->startstate = state;
2227     trie->minlen -= (state - 1);
2228     trie->maxlen -= (state - 1);
2229 #ifdef DEBUGGING
2230    /* At least the UNICOS C compiler choked on this
2231     * being argument to DEBUG_r(), so let's just have
2232     * it right here. */
2233    if (
2234 #ifdef PERL_EXT_RE_BUILD
2235     1
2236 #else
2237     DEBUG_r_TEST
2238 #endif
2239     ) {
2240     regnode *fix = convert;
2241     U32 word = trie->wordcount;
2242     mjd_nodelen++;
2243     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2244     while( ++fix < n ) {
2245      Set_Node_Offset_Length(fix, 0, 0);
2246     }
2247     while (word--) {
2248      SV ** const tmp = av_fetch( trie_words, word, 0 );
2249      if (tmp) {
2250       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2251        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2252       else
2253        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2254      }
2255     }
2256    }
2257 #endif
2258     if (trie->maxlen) {
2259      convert = n;
2260     } else {
2261      NEXT_OFF(convert) = (U16)(tail - convert);
2262      DEBUG_r(optimize= n);
2263     }
2264    }
2265   }
2266   if (!jumper)
2267    jumper = last;
2268   if ( trie->maxlen ) {
2269    NEXT_OFF( convert ) = (U16)(tail - convert);
2270    ARG_SET( convert, data_slot );
2271    /* Store the offset to the first unabsorbed branch in
2272    jump[0], which is otherwise unused by the jump logic.
2273    We use this when dumping a trie and during optimisation. */
2274    if (trie->jump)
2275     trie->jump[0] = (U16)(nextbranch - convert);
2276
2277    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2278    *   and there is a bitmap
2279    *   and the first "jump target" node we found leaves enough room
2280    * then convert the TRIE node into a TRIEC node, with the bitmap
2281    * embedded inline in the opcode - this is hypothetically faster.
2282    */
2283    if ( !trie->states[trie->startstate].wordnum
2284     && trie->bitmap
2285     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2286    {
2287     OP( convert ) = TRIEC;
2288     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2289     PerlMemShared_free(trie->bitmap);
2290     trie->bitmap= NULL;
2291    } else
2292     OP( convert ) = TRIE;
2293
2294    /* store the type in the flags */
2295    convert->flags = nodetype;
2296    DEBUG_r({
2297    optimize = convert
2298      + NODE_STEP_REGNODE
2299      + regarglen[ OP( convert ) ];
2300    });
2301    /* XXX We really should free up the resource in trie now,
2302     as we won't use them - (which resources?) dmq */
2303   }
2304   /* needed for dumping*/
2305   DEBUG_r(if (optimize) {
2306    regnode *opt = convert;
2307
2308    while ( ++opt < optimize) {
2309     Set_Node_Offset_Length(opt,0,0);
2310    }
2311    /*
2312     Try to clean up some of the debris left after the
2313     optimisation.
2314    */
2315    while( optimize < jumper ) {
2316     mjd_nodelen += Node_Length((optimize));
2317     OP( optimize ) = OPTIMIZED;
2318     Set_Node_Offset_Length(optimize,0,0);
2319     optimize++;
2320    }
2321    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2322   });
2323  } /* end node insert */
2324  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2325
2326  /*  Finish populating the prev field of the wordinfo array.  Walk back
2327  *  from each accept state until we find another accept state, and if
2328  *  so, point the first word's .prev field at the second word. If the
2329  *  second already has a .prev field set, stop now. This will be the
2330  *  case either if we've already processed that word's accept state,
2331  *  or that state had multiple words, and the overspill words were
2332  *  already linked up earlier.
2333  */
2334  {
2335   U16 word;
2336   U32 state;
2337   U16 prev;
2338
2339   for (word=1; word <= trie->wordcount; word++) {
2340    prev = 0;
2341    if (trie->wordinfo[word].prev)
2342     continue;
2343    state = trie->wordinfo[word].accept;
2344    while (state) {
2345     state = prev_states[state];
2346     if (!state)
2347      break;
2348     prev = trie->states[state].wordnum;
2349     if (prev)
2350      break;
2351    }
2352    trie->wordinfo[word].prev = prev;
2353   }
2354   Safefree(prev_states);
2355  }
2356
2357
2358  /* and now dump out the compressed format */
2359  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2360
2361  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2362 #ifdef DEBUGGING
2363  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2364  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2365 #else
2366  SvREFCNT_dec(revcharmap);
2367 #endif
2368  return trie->jump
2369   ? MADE_JUMP_TRIE
2370   : trie->startstate>1
2371    ? MADE_EXACT_TRIE
2372    : MADE_TRIE;
2373 }
2374
2375 STATIC void
2376 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2377 {
2378 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2379
2380    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2381    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2382    ISBN 0-201-10088-6
2383
2384    We find the fail state for each state in the trie, this state is the longest proper
2385    suffix of the current state's 'word' that is also a proper prefix of another word in our
2386    trie. State 1 represents the word '' and is thus the default fail state. This allows
2387    the DFA not to have to restart after its tried and failed a word at a given point, it
2388    simply continues as though it had been matching the other word in the first place.
2389    Consider
2390  'abcdgu'=~/abcdefg|cdgu/
2391    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2392    fail, which would bring us to the state representing 'd' in the second word where we would
2393    try 'g' and succeed, proceeding to match 'cdgu'.
2394  */
2395  /* add a fail transition */
2396  const U32 trie_offset = ARG(source);
2397  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2398  U32 *q;
2399  const U32 ucharcount = trie->uniquecharcount;
2400  const U32 numstates = trie->statecount;
2401  const U32 ubound = trie->lasttrans + ucharcount;
2402  U32 q_read = 0;
2403  U32 q_write = 0;
2404  U32 charid;
2405  U32 base = trie->states[ 1 ].trans.base;
2406  U32 *fail;
2407  reg_ac_data *aho;
2408  const U32 data_slot = add_data( pRExC_state, 1, "T" );
2409  GET_RE_DEBUG_FLAGS_DECL;
2410
2411  PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2412 #ifndef DEBUGGING
2413  PERL_UNUSED_ARG(depth);
2414 #endif
2415
2416
2417  ARG_SET( stclass, data_slot );
2418  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2419  RExC_rxi->data->data[ data_slot ] = (void*)aho;
2420  aho->trie=trie_offset;
2421  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2422  Copy( trie->states, aho->states, numstates, reg_trie_state );
2423  Newxz( q, numstates, U32);
2424  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2425  aho->refcount = 1;
2426  fail = aho->fail;
2427  /* initialize fail[0..1] to be 1 so that we always have
2428  a valid final fail state */
2429  fail[ 0 ] = fail[ 1 ] = 1;
2430
2431  for ( charid = 0; charid < ucharcount ; charid++ ) {
2432   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2433   if ( newstate ) {
2434    q[ q_write ] = newstate;
2435    /* set to point at the root */
2436    fail[ q[ q_write++ ] ]=1;
2437   }
2438  }
2439  while ( q_read < q_write) {
2440   const U32 cur = q[ q_read++ % numstates ];
2441   base = trie->states[ cur ].trans.base;
2442
2443   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2444    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2445    if (ch_state) {
2446     U32 fail_state = cur;
2447     U32 fail_base;
2448     do {
2449      fail_state = fail[ fail_state ];
2450      fail_base = aho->states[ fail_state ].trans.base;
2451     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2452
2453     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2454     fail[ ch_state ] = fail_state;
2455     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2456     {
2457       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2458     }
2459     q[ q_write++ % numstates] = ch_state;
2460    }
2461   }
2462  }
2463  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2464  when we fail in state 1, this allows us to use the
2465  charclass scan to find a valid start char. This is based on the principle
2466  that theres a good chance the string being searched contains lots of stuff
2467  that cant be a start char.
2468  */
2469  fail[ 0 ] = fail[ 1 ] = 0;
2470  DEBUG_TRIE_COMPILE_r({
2471   PerlIO_printf(Perl_debug_log,
2472      "%*sStclass Failtable (%"UVuf" states): 0",
2473      (int)(depth * 2), "", (UV)numstates
2474   );
2475   for( q_read=1; q_read<numstates; q_read++ ) {
2476    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2477   }
2478   PerlIO_printf(Perl_debug_log, "\n");
2479  });
2480  Safefree(q);
2481  /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2482 }
2483
2484
2485 /*
2486  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2487  * These need to be revisited when a newer toolchain becomes available.
2488  */
2489 #if defined(__sparc64__) && defined(__GNUC__)
2490 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2491 #       undef  SPARC64_GCC_WORKAROUND
2492 #       define SPARC64_GCC_WORKAROUND 1
2493 #   endif
2494 #endif
2495
2496 #define DEBUG_PEEP(str,scan,depth) \
2497  DEBUG_OPTIMISE_r({if (scan){ \
2498  SV * const mysv=sv_newmortal(); \
2499  regnode *Next = regnext(scan); \
2500  regprop(RExC_rx, mysv, scan); \
2501  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2502  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2503  Next ? (REG_NODE_NUM(Next)) : 0 ); \
2504    }});
2505
2506
2507
2508
2509
2510 #define JOIN_EXACT(scan,min,flags) \
2511  if (PL_regkind[OP(scan)] == EXACT) \
2512   join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2513
2514 STATIC U32
2515 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2516  /* Merge several consecutive EXACTish nodes into one. */
2517  regnode *n = regnext(scan);
2518  U32 stringok = 1;
2519  regnode *next = scan + NODE_SZ_STR(scan);
2520  U32 merged = 0;
2521  U32 stopnow = 0;
2522 #ifdef DEBUGGING
2523  regnode *stop = scan;
2524  GET_RE_DEBUG_FLAGS_DECL;
2525 #else
2526  PERL_UNUSED_ARG(depth);
2527 #endif
2528
2529  PERL_ARGS_ASSERT_JOIN_EXACT;
2530 #ifndef EXPERIMENTAL_INPLACESCAN
2531  PERL_UNUSED_ARG(flags);
2532  PERL_UNUSED_ARG(val);
2533 #endif
2534  DEBUG_PEEP("join",scan,depth);
2535
2536  /* Skip NOTHING, merge EXACT*. */
2537  while (n &&
2538   ( PL_regkind[OP(n)] == NOTHING ||
2539    (stringok && (OP(n) == OP(scan))))
2540   && NEXT_OFF(n)
2541   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2542
2543   if (OP(n) == TAIL || n > next)
2544    stringok = 0;
2545   if (PL_regkind[OP(n)] == NOTHING) {
2546    DEBUG_PEEP("skip:",n,depth);
2547    NEXT_OFF(scan) += NEXT_OFF(n);
2548    next = n + NODE_STEP_REGNODE;
2549 #ifdef DEBUGGING
2550    if (stringok)
2551     stop = n;
2552 #endif
2553    n = regnext(n);
2554   }
2555   else if (stringok) {
2556    const unsigned int oldl = STR_LEN(scan);
2557    regnode * const nnext = regnext(n);
2558
2559    DEBUG_PEEP("merg",n,depth);
2560
2561    merged++;
2562    if (oldl + STR_LEN(n) > U8_MAX)
2563     break;
2564    NEXT_OFF(scan) += NEXT_OFF(n);
2565    STR_LEN(scan) += STR_LEN(n);
2566    next = n + NODE_SZ_STR(n);
2567    /* Now we can overwrite *n : */
2568    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2569 #ifdef DEBUGGING
2570    stop = next - 1;
2571 #endif
2572    n = nnext;
2573    if (stopnow) break;
2574   }
2575
2576 #ifdef EXPERIMENTAL_INPLACESCAN
2577   if (flags && !NEXT_OFF(n)) {
2578    DEBUG_PEEP("atch", val, depth);
2579    if (reg_off_by_arg[OP(n)]) {
2580     ARG_SET(n, val - n);
2581    }
2582    else {
2583     NEXT_OFF(n) = val - n;
2584    }
2585    stopnow = 1;
2586   }
2587 #endif
2588  }
2589 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2590 #define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2591 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2592 #define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2593
2594  if (UTF
2595   && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2596   && ( STR_LEN(scan) >= 6 ) )
2597  {
2598  /*
2599  Two problematic code points in Unicode casefolding of EXACT nodes:
2600
2601  U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2602  U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2603
2604  which casefold to
2605
2606  Unicode                      UTF-8
2607
2608  U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2609  U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2610
2611  This means that in case-insensitive matching (or "loose matching",
2612  as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2613  length of the above casefolded versions) can match a target string
2614  of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2615  This would rather mess up the minimum length computation.
2616
2617  What we'll do is to look for the tail four bytes, and then peek
2618  at the preceding two bytes to see whether we need to decrease
2619  the minimum length by four (six minus two).
2620
2621  Thanks to the design of UTF-8, there cannot be false matches:
2622  A sequence of valid UTF-8 bytes cannot be a subsequence of
2623  another valid sequence of UTF-8 bytes.
2624
2625  */
2626   char * const s0 = STRING(scan), *s, *t;
2627   char * const s1 = s0 + STR_LEN(scan) - 1;
2628   char * const s2 = s1 - 4;
2629 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2630   const char t0[] = "\xaf\x49\xaf\x42";
2631 #else
2632   const char t0[] = "\xcc\x88\xcc\x81";
2633 #endif
2634   const char * const t1 = t0 + 3;
2635
2636   for (s = s0 + 2;
2637    s < s2 && (t = ninstr(s, s1, t0, t1));
2638    s = t + 4) {
2639 #ifdef EBCDIC
2640    if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2641     ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2642 #else
2643    if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2644     ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2645 #endif
2646     *min -= 4;
2647   }
2648  }
2649
2650 #ifdef DEBUGGING
2651  /* Allow dumping */
2652  n = scan + NODE_SZ_STR(scan);
2653  while (n <= stop) {
2654   if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2655    OP(n) = OPTIMIZED;
2656    NEXT_OFF(n) = 0;
2657   }
2658   n++;
2659  }
2660 #endif
2661  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2662  return stopnow;
2663 }
2664
2665 /* REx optimizer.  Converts nodes into quicker variants "in place".
2666    Finds fixed substrings.  */
2667
2668 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2669    to the position after last scanned or to NULL. */
2670
2671 #define INIT_AND_WITHP \
2672  assert(!and_withp); \
2673  Newx(and_withp,1,struct regnode_charclass_class); \
2674  SAVEFREEPV(and_withp)
2675
2676 /* this is a chain of data about sub patterns we are processing that
2677    need to be handled separately/specially in study_chunk. Its so
2678    we can simulate recursion without losing state.  */
2679 struct scan_frame;
2680 typedef struct scan_frame {
2681  regnode *last;  /* last node to process in this frame */
2682  regnode *next;  /* next node to process when last is reached */
2683  struct scan_frame *prev; /*previous frame*/
2684  I32 stop; /* what stopparen do we use */
2685 } scan_frame;
2686
2687
2688 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2689
2690 #define CASE_SYNST_FNC(nAmE)                                       \
2691 case nAmE:                                                         \
2692  if (flags & SCF_DO_STCLASS_AND) {                              \
2693    for (value = 0; value < 256; value++)                  \
2694     if (!is_ ## nAmE ## _cp(value))                       \
2695      ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2696  }                                                              \
2697  else {                                                         \
2698    for (value = 0; value < 256; value++)                  \
2699     if (is_ ## nAmE ## _cp(value))                        \
2700      ANYOF_BITMAP_SET(data->start_class, value);    \
2701  }                                                              \
2702  break;                                                         \
2703 case N ## nAmE:                                                    \
2704  if (flags & SCF_DO_STCLASS_AND) {                              \
2705    for (value = 0; value < 256; value++)                   \
2706     if (is_ ## nAmE ## _cp(value))                         \
2707      ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2708  }                                                               \
2709  else {                                                          \
2710    for (value = 0; value < 256; value++)                   \
2711     if (!is_ ## nAmE ## _cp(value))                        \
2712      ANYOF_BITMAP_SET(data->start_class, value);     \
2713  }                                                               \
2714  break
2715
2716
2717
2718 STATIC I32
2719 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2720       I32 *minlenp, I32 *deltap,
2721       regnode *last,
2722       scan_data_t *data,
2723       I32 stopparen,
2724       U8* recursed,
2725       struct regnode_charclass_class *and_withp,
2726       U32 flags, U32 depth)
2727       /* scanp: Start here (read-write). */
2728       /* deltap: Write maxlen-minlen here. */
2729       /* last: Stop before this one. */
2730       /* data: string data about the pattern */
2731       /* stopparen: treat close N as END */
2732       /* recursed: which subroutines have we recursed into */
2733       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2734 {
2735  dVAR;
2736  I32 min = 0, pars = 0, code;
2737  regnode *scan = *scanp, *next;
2738  I32 delta = 0;
2739  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2740  int is_inf_internal = 0;  /* The studied chunk is infinite */
2741  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2742  scan_data_t data_fake;
2743  SV *re_trie_maxbuff = NULL;
2744  regnode *first_non_open = scan;
2745  I32 stopmin = I32_MAX;
2746  scan_frame *frame = NULL;
2747  GET_RE_DEBUG_FLAGS_DECL;
2748
2749  PERL_ARGS_ASSERT_STUDY_CHUNK;
2750
2751 #ifdef DEBUGGING
2752  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2753 #endif
2754
2755  if ( depth == 0 ) {
2756   while (first_non_open && OP(first_non_open) == OPEN)
2757    first_non_open=regnext(first_non_open);
2758  }
2759
2760
2761   fake_study_recurse:
2762  while ( scan && OP(scan) != END && scan < last ){
2763   /* Peephole optimizer: */
2764   DEBUG_STUDYDATA("Peep:", data,depth);
2765   DEBUG_PEEP("Peep",scan,depth);
2766   JOIN_EXACT(scan,&min,0);
2767
2768   /* Follow the next-chain of the current node and optimize
2769   away all the NOTHINGs from it.  */
2770   if (OP(scan) != CURLYX) {
2771    const int max = (reg_off_by_arg[OP(scan)]
2772      ? I32_MAX
2773      /* I32 may be smaller than U16 on CRAYs! */
2774      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2775    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2776    int noff;
2777    regnode *n = scan;
2778
2779    /* Skip NOTHING and LONGJMP. */
2780    while ((n = regnext(n))
2781     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2782      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2783     && off + noff < max)
2784     off += noff;
2785    if (reg_off_by_arg[OP(scan)])
2786     ARG(scan) = off;
2787    else
2788     NEXT_OFF(scan) = off;
2789   }
2790
2791
2792
2793   /* The principal pseudo-switch.  Cannot be a switch, since we
2794   look into several different things.  */
2795   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2796     || OP(scan) == IFTHEN) {
2797    next = regnext(scan);
2798    code = OP(scan);
2799    /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2800
2801    if (OP(next) == code || code == IFTHEN) {
2802     /* NOTE - There is similar code to this block below for handling
2803     TRIE nodes on a re-study.  If you change stuff here check there
2804     too. */
2805     I32 max1 = 0, min1 = I32_MAX, num = 0;
2806     struct regnode_charclass_class accum;
2807     regnode * const startbranch=scan;
2808
2809     if (flags & SCF_DO_SUBSTR)
2810      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2811     if (flags & SCF_DO_STCLASS)
2812      cl_init_zero(pRExC_state, &accum);
2813
2814     while (OP(scan) == code) {
2815      I32 deltanext, minnext, f = 0, fake;
2816      struct regnode_charclass_class this_class;
2817
2818      num++;
2819      data_fake.flags = 0;
2820      if (data) {
2821       data_fake.whilem_c = data->whilem_c;
2822       data_fake.last_closep = data->last_closep;
2823      }
2824      else
2825       data_fake.last_closep = &fake;
2826
2827      data_fake.pos_delta = delta;
2828      next = regnext(scan);
2829      scan = NEXTOPER(scan);
2830      if (code != BRANCH)
2831       scan = NEXTOPER(scan);
2832      if (flags & SCF_DO_STCLASS) {
2833       cl_init(pRExC_state, &this_class);
2834       data_fake.start_class = &this_class;
2835       f = SCF_DO_STCLASS_AND;
2836      }
2837      if (flags & SCF_WHILEM_VISITED_POS)
2838       f |= SCF_WHILEM_VISITED_POS;
2839
2840      /* we suppose the run is continuous, last=next...*/
2841      minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2842           next, &data_fake,
2843           stopparen, recursed, NULL, f,depth+1);
2844      if (min1 > minnext)
2845       min1 = minnext;
2846      if (max1 < minnext + deltanext)
2847       max1 = minnext + deltanext;
2848      if (deltanext == I32_MAX)
2849       is_inf = is_inf_internal = 1;
2850      scan = next;
2851      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2852       pars++;
2853      if (data_fake.flags & SCF_SEEN_ACCEPT) {
2854       if ( stopmin > minnext)
2855        stopmin = min + min1;
2856       flags &= ~SCF_DO_SUBSTR;
2857       if (data)
2858        data->flags |= SCF_SEEN_ACCEPT;
2859      }
2860      if (data) {
2861       if (data_fake.flags & SF_HAS_EVAL)
2862        data->flags |= SF_HAS_EVAL;
2863       data->whilem_c = data_fake.whilem_c;
2864      }
2865      if (flags & SCF_DO_STCLASS)
2866       cl_or(pRExC_state, &accum, &this_class);
2867     }
2868     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2869      min1 = 0;
2870     if (flags & SCF_DO_SUBSTR) {
2871      data->pos_min += min1;
2872      data->pos_delta += max1 - min1;
2873      if (max1 != min1 || is_inf)
2874       data->longest = &(data->longest_float);
2875     }
2876     min += min1;
2877     delta += max1 - min1;
2878     if (flags & SCF_DO_STCLASS_OR) {
2879      cl_or(pRExC_state, data->start_class, &accum);
2880      if (min1) {
2881       cl_and(data->start_class, and_withp);
2882       flags &= ~SCF_DO_STCLASS;
2883      }
2884     }
2885     else if (flags & SCF_DO_STCLASS_AND) {
2886      if (min1) {
2887       cl_and(data->start_class, &accum);
2888       flags &= ~SCF_DO_STCLASS;
2889      }
2890      else {
2891       /* Switch to OR mode: cache the old value of
2892       * data->start_class */
2893       INIT_AND_WITHP;
2894       StructCopy(data->start_class, and_withp,
2895         struct regnode_charclass_class);
2896       flags &= ~SCF_DO_STCLASS_AND;
2897       StructCopy(&accum, data->start_class,
2898         struct regnode_charclass_class);
2899       flags |= SCF_DO_STCLASS_OR;
2900       data->start_class->flags |= ANYOF_EOS;
2901      }
2902     }
2903
2904     if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2905     /* demq.
2906
2907     Assuming this was/is a branch we are dealing with: 'scan' now
2908     points at the item that follows the branch sequence, whatever
2909     it is. We now start at the beginning of the sequence and look
2910     for subsequences of
2911
2912     BRANCH->EXACT=>x1
2913     BRANCH->EXACT=>x2
2914     tail
2915
2916     which would be constructed from a pattern like /A|LIST|OF|WORDS/
2917
2918     If we can find such a subsequence we need to turn the first
2919     element into a trie and then add the subsequent branch exact
2920     strings to the trie.
2921
2922     We have two cases
2923
2924      1. patterns where the whole set of branches can be converted.
2925
2926      2. patterns where only a subset can be converted.
2927
2928     In case 1 we can replace the whole set with a single regop
2929     for the trie. In case 2 we need to keep the start and end
2930     branches so
2931
2932      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2933      becomes BRANCH TRIE; BRANCH X;
2934
2935     There is an additional case, that being where there is a
2936     common prefix, which gets split out into an EXACT like node
2937     preceding the TRIE node.
2938
2939     If x(1..n)==tail then we can do a simple trie, if not we make
2940     a "jump" trie, such that when we match the appropriate word
2941     we "jump" to the appropriate tail node. Essentially we turn
2942     a nested if into a case structure of sorts.
2943
2944     */
2945
2946      int made=0;
2947      if (!re_trie_maxbuff) {
2948       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2949       if (!SvIOK(re_trie_maxbuff))
2950        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2951      }
2952      if ( SvIV(re_trie_maxbuff)>=0  ) {
2953       regnode *cur;
2954       regnode *first = (regnode *)NULL;
2955       regnode *last = (regnode *)NULL;
2956       regnode *tail = scan;
2957       U8 optype = 0;
2958       U32 count=0;
2959
2960 #ifdef DEBUGGING
2961       SV * const mysv = sv_newmortal();       /* for dumping */
2962 #endif
2963       /* var tail is used because there may be a TAIL
2964       regop in the way. Ie, the exacts will point to the
2965       thing following the TAIL, but the last branch will
2966       point at the TAIL. So we advance tail. If we
2967       have nested (?:) we may have to move through several
2968       tails.
2969       */
2970
2971       while ( OP( tail ) == TAIL ) {
2972        /* this is the TAIL generated by (?:) */
2973        tail = regnext( tail );
2974       }
2975
2976
2977       DEBUG_OPTIMISE_r({
2978        regprop(RExC_rx, mysv, tail );
2979        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2980         (int)depth * 2 + 2, "",
2981         "Looking for TRIE'able sequences. Tail node is: ",
2982         SvPV_nolen_const( mysv )
2983        );
2984       });
2985
2986       /*
2987
2988       step through the branches, cur represents each
2989       branch, noper is the first thing to be matched
2990       as part of that branch and noper_next is the
2991       regnext() of that node. if noper is an EXACT
2992       and noper_next is the same as scan (our current
2993       position in the regex) then the EXACT branch is
2994       a possible optimization target. Once we have
2995       two or more consecutive such branches we can
2996       create a trie of the EXACT's contents and stich
2997       it in place. If the sequence represents all of
2998       the branches we eliminate the whole thing and
2999       replace it with a single TRIE. If it is a
3000       subsequence then we need to stitch it in. This
3001       means the first branch has to remain, and needs
3002       to be repointed at the item on the branch chain
3003       following the last branch optimized. This could
3004       be either a BRANCH, in which case the
3005       subsequence is internal, or it could be the
3006       item following the branch sequence in which
3007       case the subsequence is at the end.
3008
3009       */
3010
3011       /* dont use tail as the end marker for this traverse */
3012       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3013        regnode * const noper = NEXTOPER( cur );
3014 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3015        regnode * const noper_next = regnext( noper );
3016 #endif
3017
3018        DEBUG_OPTIMISE_r({
3019         regprop(RExC_rx, mysv, cur);
3020         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3021         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3022
3023         regprop(RExC_rx, mysv, noper);
3024         PerlIO_printf( Perl_debug_log, " -> %s",
3025          SvPV_nolen_const(mysv));
3026
3027         if ( noper_next ) {
3028         regprop(RExC_rx, mysv, noper_next );
3029         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3030          SvPV_nolen_const(mysv));
3031         }
3032         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3033         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3034        });
3035        if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3036           : PL_regkind[ OP( noper ) ] == EXACT )
3037         || OP(noper) == NOTHING )
3038 #ifdef NOJUMPTRIE
3039         && noper_next == tail
3040 #endif
3041         && count < U16_MAX)
3042        {
3043         count++;
3044         if ( !first || optype == NOTHING ) {
3045          if (!first) first = cur;
3046          optype = OP( noper );
3047         } else {
3048          last = cur;
3049         }
3050        } else {
3051 /*
3052  Currently the trie logic handles case insensitive matching properly only
3053  when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
3054  semantics).
3055
3056  If/when this is fixed the following define can be swapped
3057  in below to fully enable trie logic.
3058
3059 #define TRIE_TYPE_IS_SAFE 1
3060
3061 */
3062 #define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
3063
3064         if ( last && TRIE_TYPE_IS_SAFE ) {
3065          make_trie( pRExC_state,
3066            startbranch, first, cur, tail, count,
3067            optype, depth+1 );
3068         }
3069         if ( PL_regkind[ OP( noper ) ] == EXACT
3070 #ifdef NOJUMPTRIE
3071          && noper_next == tail
3072 #endif
3073         ){
3074          count = 1;
3075          first = cur;
3076          optype = OP( noper );
3077         } else {
3078          count = 0;
3079          first = NULL;
3080          optype = 0;
3081         }
3082         last = NULL;
3083        }
3084       }
3085       DEBUG_OPTIMISE_r({
3086        regprop(RExC_rx, mysv, cur);
3087        PerlIO_printf( Perl_debug_log,
3088        "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3089        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3090
3091       });
3092
3093       if ( last && TRIE_TYPE_IS_SAFE ) {
3094        made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3095 #ifdef TRIE_STUDY_OPT
3096        if ( ((made == MADE_EXACT_TRIE &&
3097         startbranch == first)
3098         || ( first_non_open == first )) &&
3099         depth==0 ) {
3100         flags |= SCF_TRIE_RESTUDY;
3101         if ( startbranch == first
3102          && scan == tail )
3103         {
3104          RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3105         }
3106        }
3107 #endif
3108       }
3109      }
3110
3111     } /* do trie */
3112
3113    }
3114    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3115     scan = NEXTOPER(NEXTOPER(scan));
3116    } else   /* single branch is optimized. */
3117     scan = NEXTOPER(scan);
3118    continue;
3119   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3120    scan_frame *newframe = NULL;
3121    I32 paren;
3122    regnode *start;
3123    regnode *end;
3124
3125    if (OP(scan) != SUSPEND) {
3126    /* set the pointer */
3127     if (OP(scan) == GOSUB) {
3128      paren = ARG(scan);
3129      RExC_recurse[ARG2L(scan)] = scan;
3130      start = RExC_open_parens[paren-1];
3131      end   = RExC_close_parens[paren-1];
3132     } else {
3133      paren = 0;
3134      start = RExC_rxi->program + 1;
3135      end   = RExC_opend;
3136     }
3137     if (!recursed) {
3138      Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3139      SAVEFREEPV(recursed);
3140     }
3141     if (!PAREN_TEST(recursed,paren+1)) {
3142      PAREN_SET(recursed,paren+1);
3143      Newx(newframe,1,scan_frame);
3144     } else {
3145      if (flags & SCF_DO_SUBSTR) {
3146       SCAN_COMMIT(pRExC_state,data,minlenp);
3147       data->longest = &(data->longest_float);
3148      }
3149      is_inf = is_inf_internal = 1;
3150      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3151       cl_anything(pRExC_state, data->start_class);
3152      flags &= ~SCF_DO_STCLASS;
3153     }
3154    } else {
3155     Newx(newframe,1,scan_frame);
3156     paren = stopparen;
3157     start = scan+2;
3158     end = regnext(scan);
3159    }
3160    if (newframe) {
3161     assert(start);
3162     assert(end);
3163     SAVEFREEPV(newframe);
3164     newframe->next = regnext(scan);
3165     newframe->last = last;
3166     newframe->stop = stopparen;
3167     newframe->prev = frame;
3168
3169     frame = newframe;
3170     scan =  start;
3171     stopparen = paren;
3172     last = end;
3173
3174     continue;
3175    }
3176   }
3177   else if (OP(scan) == EXACT) {
3178    I32 l = STR_LEN(scan);
3179    UV uc;
3180    if (UTF) {
3181     const U8 * const s = (U8*)STRING(scan);
3182     l = utf8_length(s, s + l);
3183     uc = utf8_to_uvchr(s, NULL);
3184    } else {
3185     uc = *((U8*)STRING(scan));
3186    }
3187    min += l;
3188    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3189     /* The code below prefers earlier match for fixed
3190     offset, later match for variable offset.  */
3191     if (data->last_end == -1) { /* Update the start info. */
3192      data->last_start_min = data->pos_min;
3193      data->last_start_max = is_inf
3194       ? I32_MAX : data->pos_min + data->pos_delta;
3195     }
3196     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3197     if (UTF)
3198      SvUTF8_on(data->last_found);
3199     {
3200      SV * const sv = data->last_found;
3201      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3202       mg_find(sv, PERL_MAGIC_utf8) : NULL;
3203      if (mg && mg->mg_len >= 0)
3204       mg->mg_len += utf8_length((U8*)STRING(scan),
3205             (U8*)STRING(scan)+STR_LEN(scan));
3206     }
3207     data->last_end = data->pos_min + l;
3208     data->pos_min += l; /* As in the first entry. */
3209     data->flags &= ~SF_BEFORE_EOL;
3210    }
3211    if (flags & SCF_DO_STCLASS_AND) {
3212     /* Check whether it is compatible with what we know already! */
3213     int compat = 1;
3214
3215
3216     /* If compatible, we or it in below.  It is compatible if is
3217     * in the bitmp and either 1) its bit or its fold is set, or 2)
3218     * it's for a locale.  Even if there isn't unicode semantics
3219     * here, at runtime there may be because of matching against a
3220     * utf8 string, so accept a possible false positive for
3221     * latin1-range folds */
3222     if (uc >= 0x100 ||
3223      (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3224      && !ANYOF_BITMAP_TEST(data->start_class, uc)
3225      && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3226       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3227      )
3228     {
3229      compat = 0;
3230     }
3231     ANYOF_CLASS_ZERO(data->start_class);
3232     ANYOF_BITMAP_ZERO(data->start_class);
3233     if (compat)
3234      ANYOF_BITMAP_SET(data->start_class, uc);
3235     else if (uc >= 0x100) {
3236      int i;
3237
3238      /* Some Unicode code points fold to the Latin1 range; as
3239      * XXX temporary code, instead of figuring out if this is
3240      * one, just assume it is and set all the start class bits
3241      * that could be some such above 255 code point's fold
3242      * which will generate fals positives.  As the code
3243      * elsewhere that does compute the fold settles down, it
3244      * can be extracted out and re-used here */
3245      for (i = 0; i < 256; i++){
3246       if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3247        ANYOF_BITMAP_SET(data->start_class, i);
3248       }
3249      }
3250     }
3251     data->start_class->flags &= ~ANYOF_EOS;
3252     if (uc < 0x100)
3253     data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3254    }
3255    else if (flags & SCF_DO_STCLASS_OR) {
3256     /* false positive possible if the class is case-folded */
3257     if (uc < 0x100)
3258      ANYOF_BITMAP_SET(data->start_class, uc);
3259     else
3260      data->start_class->flags |= ANYOF_UNICODE_ALL;
3261     data->start_class->flags &= ~ANYOF_EOS;
3262     cl_and(data->start_class, and_withp);
3263    }
3264    flags &= ~SCF_DO_STCLASS;
3265   }
3266   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3267    I32 l = STR_LEN(scan);
3268    UV uc = *((U8*)STRING(scan));
3269
3270    /* Search for fixed substrings supports EXACT only. */
3271    if (flags & SCF_DO_SUBSTR) {
3272     assert(data);
3273     SCAN_COMMIT(pRExC_state, data, minlenp);
3274    }
3275    if (UTF) {
3276     const U8 * const s = (U8 *)STRING(scan);
3277     l = utf8_length(s, s + l);
3278     uc = utf8_to_uvchr(s, NULL);
3279    }
3280    min += l;
3281    if (flags & SCF_DO_SUBSTR)
3282     data->pos_min += l;
3283    if (flags & SCF_DO_STCLASS_AND) {
3284     /* Check whether it is compatible with what we know already! */
3285     int compat = 1;
3286     if (uc >= 0x100 ||
3287     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3288     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3289     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3290     {
3291      compat = 0;
3292     }
3293     ANYOF_CLASS_ZERO(data->start_class);
3294     ANYOF_BITMAP_ZERO(data->start_class);
3295     if (compat) {
3296      ANYOF_BITMAP_SET(data->start_class, uc);
3297      data->start_class->flags &= ~ANYOF_EOS;
3298      data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3299      if (OP(scan) == EXACTFL) {
3300       /* XXX This set is probably no longer necessary, and
3301       * probably wrong as LOCALE now is on in the initial
3302       * state */
3303       data->start_class->flags |= ANYOF_LOCALE;
3304      }
3305      else {
3306
3307       /* Also set the other member of the fold pair.  In case
3308       * that unicode semantics is called for at runtime, use
3309       * the full latin1 fold.  (Can't do this for locale,
3310       * because not known until runtime */
3311       ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3312      }
3313     }
3314     else if (uc >= 0x100) {
3315      int i;
3316      for (i = 0; i < 256; i++){
3317       if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3318        ANYOF_BITMAP_SET(data->start_class, i);
3319       }
3320      }
3321     }
3322    }
3323    else if (flags & SCF_DO_STCLASS_OR) {
3324     if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3325      /* false positive possible if the class is case-folded.
3326      Assume that the locale settings are the same... */
3327      if (uc < 0x100) {
3328       ANYOF_BITMAP_SET(data->start_class, uc);
3329       if (OP(scan) != EXACTFL) {
3330
3331        /* And set the other member of the fold pair, but
3332        * can't do that in locale because not known until
3333        * run-time */
3334        ANYOF_BITMAP_SET(data->start_class,
3335            PL_fold_latin1[uc]);
3336       }
3337      }
3338      data->start_class->flags &= ~ANYOF_EOS;
3339     }
3340     cl_and(data->start_class, and_withp);
3341    }
3342    flags &= ~SCF_DO_STCLASS;
3343   }
3344   else if (REGNODE_VARIES(OP(scan))) {
3345    I32 mincount, maxcount, minnext, deltanext, fl = 0;
3346    I32 f = flags, pos_before = 0;
3347    regnode * const oscan = scan;
3348    struct regnode_charclass_class this_class;
3349    struct regnode_charclass_class *oclass = NULL;
3350    I32 next_is_eval = 0;
3351
3352    switch (PL_regkind[OP(scan)]) {
3353    case WHILEM:  /* End of (?:...)* . */
3354     scan = NEXTOPER(scan);
3355     goto finish;
3356    case PLUS:
3357     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3358      next = NEXTOPER(scan);
3359      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3360       mincount = 1;
3361       maxcount = REG_INFTY;
3362       next = regnext(scan);
3363       scan = NEXTOPER(scan);
3364       goto do_curly;
3365      }
3366     }
3367     if (flags & SCF_DO_SUBSTR)
3368      data->pos_min++;
3369     min++;
3370     /* Fall through. */
3371    case STAR:
3372     if (flags & SCF_DO_STCLASS) {
3373      mincount = 0;
3374      maxcount = REG_INFTY;
3375      next = regnext(scan);
3376      scan = NEXTOPER(scan);
3377      goto do_curly;
3378     }
3379     is_inf = is_inf_internal = 1;
3380     scan = regnext(scan);
3381     if (flags & SCF_DO_SUBSTR) {
3382      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3383      data->longest = &(data->longest_float);
3384     }
3385     goto optimize_curly_tail;
3386    case CURLY:
3387     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3388      && (scan->flags == stopparen))
3389     {
3390      mincount = 1;
3391      maxcount = 1;
3392     } else {
3393      mincount = ARG1(scan);
3394      maxcount = ARG2(scan);
3395     }
3396     next = regnext(scan);
3397     if (OP(scan) == CURLYX) {
3398      I32 lp = (data ? *(data->last_closep) : 0);
3399      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3400     }
3401     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3402     next_is_eval = (OP(scan) == EVAL);
3403    do_curly:
3404     if (flags & SCF_DO_SUBSTR) {
3405      if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3406      pos_before = data->pos_min;
3407     }
3408     if (data) {
3409      fl = data->flags;
3410      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3411      if (is_inf)
3412       data->flags |= SF_IS_INF;
3413     }
3414     if (flags & SCF_DO_STCLASS) {
3415      cl_init(pRExC_state, &this_class);
3416      oclass = data->start_class;
3417      data->start_class = &this_class;
3418      f |= SCF_DO_STCLASS_AND;
3419      f &= ~SCF_DO_STCLASS_OR;
3420     }
3421     /* Exclude from super-linear cache processing any {n,m}
3422     regops for which the combination of input pos and regex
3423     pos is not enough information to determine if a match
3424     will be possible.
3425
3426     For example, in the regex /foo(bar\s*){4,8}baz/ with the
3427     regex pos at the \s*, the prospects for a match depend not
3428     only on the input position but also on how many (bar\s*)
3429     repeats into the {4,8} we are. */
3430    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3431      f &= ~SCF_WHILEM_VISITED_POS;
3432
3433     /* This will finish on WHILEM, setting scan, or on NULL: */
3434     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3435          last, data, stopparen, recursed, NULL,
3436          (mincount == 0
3437           ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3438
3439     if (flags & SCF_DO_STCLASS)
3440      data->start_class = oclass;
3441     if (mincount == 0 || minnext == 0) {
3442      if (flags & SCF_DO_STCLASS_OR) {
3443       cl_or(pRExC_state, data->start_class, &this_class);
3444      }
3445      else if (flags & SCF_DO_STCLASS_AND) {
3446       /* Switch to OR mode: cache the old value of
3447       * data->start_class */
3448       INIT_AND_WITHP;
3449       StructCopy(data->start_class, and_withp,
3450         struct regnode_charclass_class);
3451       flags &= ~SCF_DO_STCLASS_AND;
3452       StructCopy(&this_class, data->start_class,
3453         struct regnode_charclass_class);
3454       flags |= SCF_DO_STCLASS_OR;
3455       data->start_class->flags |= ANYOF_EOS;
3456      }
3457     } else {  /* Non-zero len */
3458      if (flags & SCF_DO_STCLASS_OR) {
3459       cl_or(pRExC_state, data->start_class, &this_class);
3460       cl_and(data->start_class, and_withp);
3461      }
3462      else if (flags & SCF_DO_STCLASS_AND)
3463       cl_and(data->start_class, &this_class);
3464      flags &= ~SCF_DO_STCLASS;
3465     }
3466     if (!scan)   /* It was not CURLYX, but CURLY. */
3467      scan = next;
3468     if ( /* ? quantifier ok, except for (?{ ... }) */
3469      (next_is_eval || !(mincount == 0 && maxcount == 1))
3470      && (minnext == 0) && (deltanext == 0)
3471      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3472      && maxcount <= REG_INFTY/3) /* Complement check for big count */
3473     {
3474      ckWARNreg(RExC_parse,
3475        "Quantifier unexpected on zero-length expression");
3476     }
3477
3478     min += minnext * mincount;
3479     is_inf_internal |= ((maxcount == REG_INFTY
3480          && (minnext + deltanext) > 0)
3481          || deltanext == I32_MAX);
3482     is_inf |= is_inf_internal;
3483     delta += (minnext + deltanext) * maxcount - minnext * mincount;
3484
3485     /* Try powerful optimization CURLYX => CURLYN. */
3486     if (  OP(oscan) == CURLYX && data
3487      && data->flags & SF_IN_PAR
3488      && !(data->flags & SF_HAS_EVAL)
3489      && !deltanext && minnext == 1 ) {
3490      /* Try to optimize to CURLYN.  */
3491      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3492      regnode * const nxt1 = nxt;
3493 #ifdef DEBUGGING
3494      regnode *nxt2;
3495 #endif
3496
3497      /* Skip open. */
3498      nxt = regnext(nxt);
3499      if (!REGNODE_SIMPLE(OP(nxt))
3500       && !(PL_regkind[OP(nxt)] == EXACT
3501        && STR_LEN(nxt) == 1))
3502       goto nogo;
3503 #ifdef DEBUGGING
3504      nxt2 = nxt;
3505 #endif
3506      nxt = regnext(nxt);
3507      if (OP(nxt) != CLOSE)
3508       goto nogo;
3509      if (RExC_open_parens) {
3510       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3511       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3512      }
3513      /* Now we know that nxt2 is the only contents: */
3514      oscan->flags = (U8)ARG(nxt);
3515      OP(oscan) = CURLYN;
3516      OP(nxt1) = NOTHING; /* was OPEN. */
3517
3518 #ifdef DEBUGGING
3519      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3520      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3521      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3522      OP(nxt) = OPTIMIZED; /* was CLOSE. */
3523      OP(nxt + 1) = OPTIMIZED; /* was count. */
3524      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3525 #endif
3526     }
3527    nogo:
3528
3529     /* Try optimization CURLYX => CURLYM. */
3530     if (  OP(oscan) == CURLYX && data
3531      && !(data->flags & SF_HAS_PAR)
3532      && !(data->flags & SF_HAS_EVAL)
3533      && !deltanext /* atom is fixed width */
3534      && minnext != 0 /* CURLYM can't handle zero width */
3535     ) {
3536      /* XXXX How to optimize if data == 0? */
3537      /* Optimize to a simpler form.  */
3538      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3539      regnode *nxt2;
3540
3541      OP(oscan) = CURLYM;
3542      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3543        && (OP(nxt2) != WHILEM))
3544       nxt = nxt2;
3545      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3546      /* Need to optimize away parenths. */
3547      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3548       /* Set the parenth number.  */
3549       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3550
3551       oscan->flags = (U8)ARG(nxt);
3552       if (RExC_open_parens) {
3553        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3554        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3555       }
3556       OP(nxt1) = OPTIMIZED; /* was OPEN. */
3557       OP(nxt) = OPTIMIZED; /* was CLOSE. */
3558
3559 #ifdef DEBUGGING
3560       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3561       OP(nxt + 1) = OPTIMIZED; /* was count. */
3562       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3563       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3564 #endif
3565 #if 0
3566       while ( nxt1 && (OP(nxt1) != WHILEM)) {
3567        regnode *nnxt = regnext(nxt1);
3568        if (nnxt == nxt) {
3569         if (reg_off_by_arg[OP(nxt1)])
3570          ARG_SET(nxt1, nxt2 - nxt1);
3571         else if (nxt2 - nxt1 < U16_MAX)
3572          NEXT_OFF(nxt1) = nxt2 - nxt1;
3573         else
3574          OP(nxt) = NOTHING; /* Cannot beautify */
3575        }
3576        nxt1 = nnxt;
3577       }
3578 #endif
3579       /* Optimize again: */
3580       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3581          NULL, stopparen, recursed, NULL, 0,depth+1);
3582      }
3583      else
3584       oscan->flags = 0;
3585     }
3586     else if ((OP(oscan) == CURLYX)
3587       && (flags & SCF_WHILEM_VISITED_POS)
3588       /* See the comment on a similar expression above.
3589        However, this time it's not a subexpression
3590        we care about, but the expression itself. */
3591       && (maxcount == REG_INFTY)
3592       && data && ++data->whilem_c < 16) {
3593      /* This stays as CURLYX, we can put the count/of pair. */
3594      /* Find WHILEM (as in regexec.c) */
3595      regnode *nxt = oscan + NEXT_OFF(oscan);
3596
3597      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3598       nxt += ARG(nxt);
3599      PREVOPER(nxt)->flags = (U8)(data->whilem_c
3600       | (RExC_whilem_seen << 4)); /* On WHILEM */
3601     }
3602     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3603      pars++;
3604     if (flags & SCF_DO_SUBSTR) {
3605      SV *last_str = NULL;
3606      int counted = mincount != 0;
3607
3608      if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3609 #if defined(SPARC64_GCC_WORKAROUND)
3610       I32 b = 0;
3611       STRLEN l = 0;
3612       const char *s = NULL;
3613       I32 old = 0;
3614
3615       if (pos_before >= data->last_start_min)
3616        b = pos_before;
3617       else
3618        b = data->last_start_min;
3619
3620       l = 0;
3621       s = SvPV_const(data->last_found, l);
3622       old = b - data->last_start_min;
3623
3624 #else
3625       I32 b = pos_before >= data->last_start_min
3626        ? pos_before : data->last_start_min;
3627       STRLEN l;
3628       const char * const s = SvPV_const(data->last_found, l);
3629       I32 old = b - data->last_start_min;
3630 #endif
3631
3632       if (UTF)
3633        old = utf8_hop((U8*)s, old) - (U8*)s;
3634       l -= old;
3635       /* Get the added string: */
3636       last_str = newSVpvn_utf8(s  + old, l, UTF);
3637       if (deltanext == 0 && pos_before == b) {
3638        /* What was added is a constant string */
3639        if (mincount > 1) {
3640         SvGROW(last_str, (mincount * l) + 1);
3641         repeatcpy(SvPVX(last_str) + l,
3642           SvPVX_const(last_str), l, mincount - 1);
3643         SvCUR_set(last_str, SvCUR(last_str) * mincount);
3644         /* Add additional parts. */
3645         SvCUR_set(data->last_found,
3646           SvCUR(data->last_found) - l);
3647         sv_catsv(data->last_found, last_str);
3648         {
3649          SV * sv = data->last_found;
3650          MAGIC *mg =
3651           SvUTF8(sv) && SvMAGICAL(sv) ?
3652           mg_find(sv, PERL_MAGIC_utf8) : NULL;
3653          if (mg && mg->mg_len >= 0)
3654           mg->mg_len += CHR_SVLEN(last_str) - l;
3655         }
3656         data->last_end += l * (mincount - 1);
3657        }
3658       } else {
3659        /* start offset must point into the last copy */
3660        data->last_start_min += minnext * (mincount - 1);
3661        data->last_start_max += is_inf ? I32_MAX
3662         : (maxcount - 1) * (minnext + data->pos_delta);
3663       }
3664      }
3665      /* It is counted once already... */
3666      data->pos_min += minnext * (mincount - counted);
3667      data->pos_delta += - counted * deltanext +
3668       (minnext + deltanext) * maxcount - minnext * mincount;
3669      if (mincount != maxcount) {
3670       /* Cannot extend fixed substrings found inside
3671        the group.  */
3672       SCAN_COMMIT(pRExC_state,data,minlenp);
3673       if (mincount && last_str) {
3674        SV * const sv = data->last_found;
3675        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3676         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3677
3678        if (mg)
3679         mg->mg_len = -1;
3680        sv_setsv(sv, last_str);
3681        data->last_end = data->pos_min;
3682        data->last_start_min =
3683         data->pos_min - CHR_SVLEN(last_str);
3684        data->last_start_max = is_inf
3685         ? I32_MAX
3686         : data->pos_min + data->pos_delta
3687         - CHR_SVLEN(last_str);
3688       }
3689       data->longest = &(data->longest_float);
3690      }
3691      SvREFCNT_dec(last_str);
3692     }
3693     if (data && (fl & SF_HAS_EVAL))
3694      data->flags |= SF_HAS_EVAL;
3695    optimize_curly_tail:
3696     if (OP(oscan) != CURLYX) {
3697      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3698       && NEXT_OFF(next))
3699       NEXT_OFF(oscan) += NEXT_OFF(next);
3700     }
3701     continue;
3702    default:   /* REF, ANYOFV, and CLUMP only? */
3703     if (flags & SCF_DO_SUBSTR) {
3704      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3705      data->longest = &(data->longest_float);
3706     }
3707     is_inf = is_inf_internal = 1;
3708     if (flags & SCF_DO_STCLASS_OR)
3709      cl_anything(pRExC_state, data->start_class);
3710     flags &= ~SCF_DO_STCLASS;
3711     break;
3712    }
3713   }
3714   else if (OP(scan) == LNBREAK) {
3715    if (flags & SCF_DO_STCLASS) {
3716     int value = 0;
3717     data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3718      if (flags & SCF_DO_STCLASS_AND) {
3719      for (value = 0; value < 256; value++)
3720       if (!is_VERTWS_cp(value))
3721        ANYOF_BITMAP_CLEAR(data->start_class, value);
3722     }
3723     else {
3724      for (value = 0; value < 256; value++)
3725       if (is_VERTWS_cp(value))
3726        ANYOF_BITMAP_SET(data->start_class, value);
3727     }
3728     if (flags & SCF_DO_STCLASS_OR)
3729      cl_and(data->start_class, and_withp);
3730     flags &= ~SCF_DO_STCLASS;
3731    }
3732    min += 1;
3733    delta += 1;
3734    if (flags & SCF_DO_SUBSTR) {
3735      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3736      data->pos_min += 1;
3737     data->pos_delta += 1;
3738     data->longest = &(data->longest_float);
3739     }
3740   }
3741   else if (OP(scan) == FOLDCHAR) {
3742    int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3743    flags &= ~SCF_DO_STCLASS;
3744    min += 1;
3745    delta += d;
3746    if (flags & SCF_DO_SUBSTR) {
3747     SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3748     data->pos_min += 1;
3749     data->pos_delta += d;
3750     data->longest = &(data->longest_float);
3751    }
3752   }
3753   else if (REGNODE_SIMPLE(OP(scan))) {
3754    int value = 0;
3755
3756    if (flags & SCF_DO_SUBSTR) {
3757     SCAN_COMMIT(pRExC_state,data,minlenp);
3758     data->pos_min++;
3759    }
3760    min++;
3761    if (flags & SCF_DO_STCLASS) {
3762     data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3763
3764     /* Some of the logic below assumes that switching
3765     locale on will only add false positives. */
3766     switch (PL_regkind[OP(scan)]) {
3767     case SANY:
3768     default:
3769     do_default:
3770      /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3771      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3772       cl_anything(pRExC_state, data->start_class);
3773      break;
3774     case REG_ANY:
3775      if (OP(scan) == SANY)
3776       goto do_default;
3777      if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3778       value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3779         || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3780       cl_anything(pRExC_state, data->start_class);
3781      }
3782      if (flags & SCF_DO_STCLASS_AND || !value)
3783       ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3784      break;
3785     case ANYOF:
3786      if (flags & SCF_DO_STCLASS_AND)
3787       cl_and(data->start_class,
3788        (struct regnode_charclass_class*)scan);
3789      else
3790       cl_or(pRExC_state, data->start_class,
3791        (struct regnode_charclass_class*)scan);
3792      break;
3793     case ALNUM:
3794      if (flags & SCF_DO_STCLASS_AND) {
3795       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3796        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3797        if (OP(scan) == ALNUMU) {
3798         for (value = 0; value < 256; value++) {
3799          if (!isWORDCHAR_L1(value)) {
3800           ANYOF_BITMAP_CLEAR(data->start_class, value);
3801          }
3802         }
3803        } else {
3804         for (value = 0; value < 256; value++) {
3805          if (!isALNUM(value)) {
3806           ANYOF_BITMAP_CLEAR(data->start_class, value);
3807          }
3808         }
3809        }
3810       }
3811      }
3812      else {
3813       if (data->start_class->flags & ANYOF_LOCALE)
3814        ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3815
3816       /* Even if under locale, set the bits for non-locale
3817       * in case it isn't a true locale-node.  This will
3818       * create false positives if it truly is locale */
3819       if (OP(scan) == ALNUMU) {
3820        for (value = 0; value < 256; value++) {
3821         if (isWORDCHAR_L1(value)) {
3822          ANYOF_BITMAP_SET(data->start_class, value);
3823         }
3824        }
3825       } else {
3826        for (value = 0; value < 256; value++) {
3827         if (isALNUM(value)) {
3828          ANYOF_BITMAP_SET(data->start_class, value);
3829         }
3830        }
3831       }
3832      }
3833      break;
3834     case NALNUM:
3835      if (flags & SCF_DO_STCLASS_AND) {
3836       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3837        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3838        if (OP(scan) == NALNUMU) {
3839         for (value = 0; value < 256; value++) {
3840          if (isWORDCHAR_L1(value)) {
3841           ANYOF_BITMAP_CLEAR(data->start_class, value);
3842          }
3843         }
3844        } else {
3845         for (value = 0; value < 256; value++) {
3846          if (isALNUM(value)) {
3847           ANYOF_BITMAP_CLEAR(data->start_class, value);
3848          }
3849         }
3850        }
3851       }
3852      }
3853      else {
3854       if (data->start_class->flags & ANYOF_LOCALE)
3855        ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3856
3857       /* Even if under locale, set the bits for non-locale in
3858       * case it isn't a true locale-node.  This will create
3859       * false positives if it truly is locale */
3860       if (OP(scan) == NALNUMU) {
3861        for (value = 0; value < 256; value++) {
3862         if (! isWORDCHAR_L1(value)) {
3863          ANYOF_BITMAP_SET(data->start_class, value);
3864         }
3865        }
3866       } else {
3867        for (value = 0; value < 256; value++) {
3868         if (! isALNUM(value)) {
3869          ANYOF_BITMAP_SET(data->start_class, value);
3870         }
3871        }
3872       }
3873      }
3874      break;
3875     case SPACE:
3876      if (flags & SCF_DO_STCLASS_AND) {
3877       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3878        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3879        if (OP(scan) == SPACEU) {
3880         for (value = 0; value < 256; value++) {
3881          if (!isSPACE_L1(value)) {
3882           ANYOF_BITMAP_CLEAR(data->start_class, value);
3883          }
3884         }
3885        } else {
3886         for (value = 0; value < 256; value++) {
3887          if (!isSPACE(value)) {
3888           ANYOF_BITMAP_CLEAR(data->start_class, value);
3889          }
3890         }
3891        }
3892       }
3893      }
3894      else {
3895       if (data->start_class->flags & ANYOF_LOCALE) {
3896        ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3897       }
3898       if (OP(scan) == SPACEU) {
3899        for (value = 0; value < 256; value++) {
3900         if (isSPACE_L1(value)) {
3901          ANYOF_BITMAP_SET(data->start_class, value);
3902         }
3903        }
3904       } else {
3905        for (value = 0; value < 256; value++) {
3906         if (isSPACE(value)) {
3907          ANYOF_BITMAP_SET(data->start_class, value);
3908         }
3909        }
3910       }
3911      }
3912      break;
3913     case NSPACE:
3914      if (flags & SCF_DO_STCLASS_AND) {
3915       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3916        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3917        if (OP(scan) == NSPACEU) {
3918         for (value = 0; value < 256; value++) {
3919          if (isSPACE_L1(value)) {
3920           ANYOF_BITMAP_CLEAR(data->start_class, value);
3921          }
3922         }
3923        } else {
3924         for (value = 0; value < 256; value++) {
3925          if (isSPACE(value)) {
3926           ANYOF_BITMAP_CLEAR(data->start_class, value);
3927          }
3928         }
3929        }
3930       }
3931      }
3932      else {
3933       if (data->start_class->flags & ANYOF_LOCALE)
3934        ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3935       if (OP(scan) == NSPACEU) {
3936        for (value = 0; value < 256; value++) {
3937         if (!isSPACE_L1(value)) {
3938          ANYOF_BITMAP_SET(data->start_class, value);
3939         }
3940        }
3941       }
3942       else {
3943        for (value = 0; value < 256; value++) {
3944         if (!isSPACE(value)) {
3945          ANYOF_BITMAP_SET(data->start_class, value);
3946         }
3947        }
3948       }
3949      }
3950      break;
3951     case DIGIT:
3952      if (flags & SCF_DO_STCLASS_AND) {
3953       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3954        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3955        for (value = 0; value < 256; value++)
3956         if (!isDIGIT(value))
3957          ANYOF_BITMAP_CLEAR(data->start_class, value);
3958       }
3959      }
3960      else {
3961       if (data->start_class->flags & ANYOF_LOCALE)
3962        ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3963       for (value = 0; value < 256; value++)
3964        if (isDIGIT(value))
3965         ANYOF_BITMAP_SET(data->start_class, value);
3966      }
3967      break;
3968     case NDIGIT:
3969      if (flags & SCF_DO_STCLASS_AND) {
3970       if (!(data->start_class->flags & ANYOF_LOCALE))
3971        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3972       for (value = 0; value < 256; value++)
3973        if (isDIGIT(value))
3974         ANYOF_BITMAP_CLEAR(data->start_class, value);
3975      }
3976      else {
3977       if (data->start_class->flags & ANYOF_LOCALE)
3978        ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3979       for (value = 0; value < 256; value++)
3980        if (!isDIGIT(value))
3981         ANYOF_BITMAP_SET(data->start_class, value);
3982      }
3983      break;
3984     CASE_SYNST_FNC(VERTWS);
3985     CASE_SYNST_FNC(HORIZWS);
3986
3987     }
3988     if (flags & SCF_DO_STCLASS_OR)
3989      cl_and(data->start_class, and_withp);
3990     flags &= ~SCF_DO_STCLASS;
3991    }
3992   }
3993   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3994    data->flags |= (OP(scan) == MEOL
3995        ? SF_BEFORE_MEOL
3996        : SF_BEFORE_SEOL);
3997   }
3998   else if (  PL_regkind[OP(scan)] == BRANCHJ
3999     /* Lookbehind, or need to calculate parens/evals/stclass: */
4000     && (scan->flags || data || (flags & SCF_DO_STCLASS))
4001     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4002    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4003     || OP(scan) == UNLESSM )
4004    {
4005     /* Negative Lookahead/lookbehind
4006     In this case we can't do fixed string optimisation.
4007     */
4008
4009     I32 deltanext, minnext, fake = 0;
4010     regnode *nscan;
4011     struct regnode_charclass_class intrnl;
4012     int f = 0;
4013
4014     data_fake.flags = 0;
4015     if (data) {
4016      data_fake.whilem_c = data->whilem_c;
4017      data_fake.last_closep = data->last_closep;
4018     }
4019     else
4020      data_fake.last_closep = &fake;
4021     data_fake.pos_delta = delta;
4022     if ( flags & SCF_DO_STCLASS && !scan->flags
4023      && OP(scan) == IFMATCH ) { /* Lookahead */
4024      cl_init(pRExC_state, &intrnl);
4025      data_fake.start_class = &intrnl;
4026      f |= SCF_DO_STCLASS_AND;
4027     }
4028     if (flags & SCF_WHILEM_VISITED_POS)
4029      f |= SCF_WHILEM_VISITED_POS;
4030     next = regnext(scan);
4031     nscan = NEXTOPER(NEXTOPER(scan));
4032     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4033      last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4034     if (scan->flags) {
4035      if (deltanext) {
4036       FAIL("Variable length lookbehind not implemented");
4037      }
4038      else if (minnext > (I32)U8_MAX) {
4039       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4040      }
4041      scan->flags = (U8)minnext;
4042     }
4043     if (data) {
4044      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4045       pars++;
4046      if (data_fake.flags & SF_HAS_EVAL)
4047       data->flags |= SF_HAS_EVAL;
4048      data->whilem_c = data_fake.whilem_c;
4049     }
4050     if (f & SCF_DO_STCLASS_AND) {
4051      if (flags & SCF_DO_STCLASS_OR) {
4052       /* OR before, AND after: ideally we would recurse with
4053       * data_fake to get the AND applied by study of the
4054       * remainder of the pattern, and then derecurse;
4055       * *** HACK *** for now just treat as "no information".
4056       * See [perl #56690].
4057       */
4058       cl_init(pRExC_state, data->start_class);
4059      }  else {
4060       /* AND before and after: combine and continue */
4061       const int was = (data->start_class->flags & ANYOF_EOS);
4062
4063       cl_and(data->start_class, &intrnl);
4064       if (was)
4065        data->start_class->flags |= ANYOF_EOS;
4066      }
4067     }
4068    }
4069 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4070    else {
4071     /* Positive Lookahead/lookbehind
4072     In this case we can do fixed string optimisation,
4073     but we must be careful about it. Note in the case of
4074     lookbehind the positions will be offset by the minimum
4075     length of the pattern, something we won't know about
4076     until after the recurse.
4077     */
4078     I32 deltanext, fake = 0;
4079     regnode *nscan;
4080     struct regnode_charclass_class intrnl;
4081     int f = 0;
4082     /* We use SAVEFREEPV so that when the full compile
4083      is finished perl will clean up the allocated
4084      minlens when it's all done. This way we don't
4085      have to worry about freeing them when we know
4086      they wont be used, which would be a pain.
4087     */
4088     I32 *minnextp;
4089     Newx( minnextp, 1, I32 );
4090     SAVEFREEPV(minnextp);
4091
4092     if (data) {
4093      StructCopy(data, &data_fake, scan_data_t);
4094      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4095       f |= SCF_DO_SUBSTR;
4096       if (scan->flags)
4097        SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4098       data_fake.last_found=newSVsv(data->last_found);
4099      }
4100     }
4101     else
4102      data_fake.last_closep = &fake;
4103     data_fake.flags = 0;
4104     data_fake.pos_delta = delta;
4105     if (is_inf)
4106      data_fake.flags |= SF_IS_INF;
4107     if ( flags & SCF_DO_STCLASS && !scan->flags
4108      && OP(scan) == IFMATCH ) { /* Lookahead */
4109      cl_init(pRExC_state, &intrnl);
4110      data_fake.start_class = &intrnl;
4111      f |= SCF_DO_STCLASS_AND;
4112     }
4113     if (flags & SCF_WHILEM_VISITED_POS)
4114      f |= SCF_WHILEM_VISITED_POS;
4115     next = regnext(scan);
4116     nscan = NEXTOPER(NEXTOPER(scan));
4117
4118     *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4119      last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4120     if (scan->flags) {
4121      if (deltanext) {
4122       FAIL("Variable length lookbehind not implemented");
4123      }
4124      else if (*minnextp > (I32)U8_MAX) {
4125       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4126      }
4127      scan->flags = (U8)*minnextp;
4128     }
4129
4130     *minnextp += min;
4131
4132     if (f & SCF_DO_STCLASS_AND) {
4133      const int was = (data->start_class->flags & ANYOF_EOS);
4134
4135      cl_and(data->start_class, &intrnl);
4136      if (was)
4137       data->start_class->flags |= ANYOF_EOS;
4138     }
4139     if (data) {
4140      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4141       pars++;
4142      if (data_fake.flags & SF_HAS_EVAL)
4143       data->flags |= SF_HAS_EVAL;
4144      data->whilem_c = data_fake.whilem_c;
4145      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4146       if (RExC_rx->minlen<*minnextp)
4147        RExC_rx->minlen=*minnextp;
4148       SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4149       SvREFCNT_dec(data_fake.last_found);
4150
4151       if ( data_fake.minlen_fixed != minlenp )
4152       {
4153        data->offset_fixed= data_fake.offset_fixed;
4154        data->minlen_fixed= data_fake.minlen_fixed;
4155        data->lookbehind_fixed+= scan->flags;
4156       }
4157       if ( data_fake.minlen_float != minlenp )
4158       {
4159        data->minlen_float= data_fake.minlen_float;
4160        data->offset_float_min=data_fake.offset_float_min;
4161        data->offset_float_max=data_fake.offset_float_max;
4162        data->lookbehind_float+= scan->flags;
4163       }
4164      }
4165     }
4166
4167
4168    }
4169 #endif
4170   }
4171   else if (OP(scan) == OPEN) {
4172    if (stopparen != (I32)ARG(scan))
4173     pars++;
4174   }
4175   else if (OP(scan) == CLOSE) {
4176    if (stopparen == (I32)ARG(scan)) {
4177     break;
4178    }
4179    if ((I32)ARG(scan) == is_par) {
4180     next = regnext(scan);
4181
4182     if ( next && (OP(next) != WHILEM) && next < last)
4183      is_par = 0;  /* Disable optimization */
4184    }
4185    if (data)
4186     *(data->last_closep) = ARG(scan);
4187   }
4188   else if (OP(scan) == EVAL) {
4189     if (data)
4190      data->flags |= SF_HAS_EVAL;
4191   }
4192   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4193    if (flags & SCF_DO_SUBSTR) {
4194     SCAN_COMMIT(pRExC_state,data,minlenp);
4195     flags &= ~SCF_DO_SUBSTR;
4196    }
4197    if (data && OP(scan)==ACCEPT) {
4198     data->flags |= SCF_SEEN_ACCEPT;
4199     if (stopmin > min)
4200      stopmin = min;
4201    }
4202   }
4203   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4204   {
4205     if (flags & SCF_DO_SUBSTR) {
4206      SCAN_COMMIT(pRExC_state,data,minlenp);
4207      data->longest = &(data->longest_float);
4208     }
4209     is_inf = is_inf_internal = 1;
4210     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4211      cl_anything(pRExC_state, data->start_class);
4212     flags &= ~SCF_DO_STCLASS;
4213   }
4214   else if (OP(scan) == GPOS) {
4215    if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4216     !(delta || is_inf || (data && data->pos_delta)))
4217    {
4218     if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4219      RExC_rx->extflags |= RXf_ANCH_GPOS;
4220     if (RExC_rx->gofs < (U32)min)
4221      RExC_rx->gofs = min;
4222    } else {
4223     RExC_rx->extflags |= RXf_GPOS_FLOAT;
4224     RExC_rx->gofs = 0;
4225    }
4226   }
4227 #ifdef TRIE_STUDY_OPT
4228 #ifdef FULL_TRIE_STUDY
4229   else if (PL_regkind[OP(scan)] == TRIE) {
4230    /* NOTE - There is similar code to this block above for handling
4231    BRANCH nodes on the initial study.  If you change stuff here
4232    check there too. */
4233    regnode *trie_node= scan;
4234    regnode *tail= regnext(scan);
4235    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4236    I32 max1 = 0, min1 = I32_MAX;
4237    struct regnode_charclass_class accum;
4238
4239    if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4240     SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4241    if (flags & SCF_DO_STCLASS)
4242     cl_init_zero(pRExC_state, &accum);
4243
4244    if (!trie->jump) {
4245     min1= trie->minlen;
4246     max1= trie->maxlen;
4247    } else {
4248     const regnode *nextbranch= NULL;
4249     U32 word;
4250
4251     for ( word=1 ; word <= trie->wordcount ; word++)
4252     {
4253      I32 deltanext=0, minnext=0, f = 0, fake;
4254      struct regnode_charclass_class this_class;
4255
4256      data_fake.flags = 0;
4257      if (data) {
4258       data_fake.whilem_c = data->whilem_c;
4259       data_fake.last_closep = data->last_closep;
4260      }
4261      else
4262       data_fake.last_closep = &fake;
4263      data_fake.pos_delta = delta;
4264      if (flags & SCF_DO_STCLASS) {
4265       cl_init(pRExC_state, &this_class);
4266       data_fake.start_class = &this_class;
4267       f = SCF_DO_STCLASS_AND;
4268      }
4269      if (flags & SCF_WHILEM_VISITED_POS)
4270       f |= SCF_WHILEM_VISITED_POS;
4271
4272      if (trie->jump[word]) {
4273       if (!nextbranch)
4274        nextbranch = trie_node + trie->jump[0];
4275       scan= trie_node + trie->jump[word];
4276       /* We go from the jump point to the branch that follows
4277       it. Note this means we need the vestigal unused branches
4278       even though they arent otherwise used.
4279       */
4280       minnext = study_chunk(pRExC_state, &scan, minlenp,
4281        &deltanext, (regnode *)nextbranch, &data_fake,
4282        stopparen, recursed, NULL, f,depth+1);
4283      }
4284      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4285       nextbranch= regnext((regnode*)nextbranch);
4286
4287      if (min1 > (I32)(minnext + trie->minlen))
4288       min1 = minnext + trie->minlen;
4289      if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4290       max1 = minnext + deltanext + trie->maxlen;
4291      if (deltanext == I32_MAX)
4292       is_inf = is_inf_internal = 1;
4293
4294      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4295       pars++;
4296      if (data_fake.flags & SCF_SEEN_ACCEPT) {
4297       if ( stopmin > min + min1)
4298        stopmin = min + min1;
4299       flags &= ~SCF_DO_SUBSTR;
4300       if (data)
4301        data->flags |= SCF_SEEN_ACCEPT;
4302      }
4303      if (data) {
4304       if (data_fake.flags & SF_HAS_EVAL)
4305        data->flags |= SF_HAS_EVAL;
4306       data->whilem_c = data_fake.whilem_c;
4307      }
4308      if (flags & SCF_DO_STCLASS)
4309       cl_or(pRExC_state, &accum, &this_class);
4310     }
4311    }
4312    if (flags & SCF_DO_SUBSTR) {
4313     data->pos_min += min1;
4314     data->pos_delta += max1 - min1;
4315     if (max1 != min1 || is_inf)
4316      data->longest = &(data->longest_float);
4317    }
4318    min += min1;
4319    delta += max1 - min1;
4320    if (flags & SCF_DO_STCLASS_OR) {
4321     cl_or(pRExC_state, data->start_class, &accum);
4322     if (min1) {
4323      cl_and(data->start_class, and_withp);
4324      flags &= ~SCF_DO_STCLASS;
4325     }
4326    }
4327    else if (flags & SCF_DO_STCLASS_AND) {
4328     if (min1) {
4329      cl_and(data->start_class, &accum);
4330      flags &= ~SCF_DO_STCLASS;
4331     }
4332     else {
4333      /* Switch to OR mode: cache the old value of
4334      * data->start_class */
4335      INIT_AND_WITHP;
4336      StructCopy(data->start_class, and_withp,
4337        struct regnode_charclass_class);
4338      flags &= ~SCF_DO_STCLASS_AND;
4339      StructCopy(&accum, data->start_class,
4340        struct regnode_charclass_class);
4341      flags |= SCF_DO_STCLASS_OR;
4342      data->start_class->flags |= ANYOF_EOS;
4343     }
4344    }
4345    scan= tail;
4346    continue;
4347   }
4348 #else
4349   else if (PL_regkind[OP(scan)] == TRIE) {
4350    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4351    U8*bang=NULL;
4352
4353    min += trie->minlen;
4354    delta += (trie->maxlen - trie->minlen);
4355    flags &= ~SCF_DO_STCLASS; /* xxx */
4356    if (flags & SCF_DO_SUBSTR) {
4357      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4358      data->pos_min += trie->minlen;
4359      data->pos_delta += (trie->maxlen - trie->minlen);
4360     if (trie->maxlen != trie->minlen)
4361      data->longest = &(data->longest_float);
4362     }
4363     if (trie->jump) /* no more substrings -- for now /grr*/
4364      flags &= ~SCF_DO_SUBSTR;
4365   }
4366 #endif /* old or new */
4367 #endif /* TRIE_STUDY_OPT */
4368
4369   /* Else: zero-length, ignore. */
4370   scan = regnext(scan);
4371  }
4372  if (frame) {
4373   last = frame->last;
4374   scan = frame->next;
4375   stopparen = frame->stop;
4376   frame = frame->prev;
4377   goto fake_study_recurse;
4378  }
4379
4380   finish:
4381  assert(!frame);
4382  DEBUG_STUDYDATA("pre-fin:",data,depth);
4383
4384  *scanp = scan;
4385  *deltap = is_inf_internal ? I32_MAX : delta;
4386  if (flags & SCF_DO_SUBSTR && is_inf)
4387   data->pos_delta = I32_MAX - data->pos_min;
4388  if (is_par > (I32)U8_MAX)
4389   is_par = 0;
4390  if (is_par && pars==1 && data) {
4391   data->flags |= SF_IN_PAR;
4392   data->flags &= ~SF_HAS_PAR;
4393  }
4394  else if (pars && data) {
4395   data->flags |= SF_HAS_PAR;
4396   data->flags &= ~SF_IN_PAR;
4397  }
4398  if (flags & SCF_DO_STCLASS_OR)
4399   cl_and(data->start_class, and_withp);
4400  if (flags & SCF_TRIE_RESTUDY)
4401   data->flags |=  SCF_TRIE_RESTUDY;
4402
4403  DEBUG_STUDYDATA("post-fin:",data,depth);
4404
4405  return min < stopmin ? min : stopmin;
4406 }
4407
4408 STATIC U32
4409 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4410 {
4411  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4412
4413  PERL_ARGS_ASSERT_ADD_DATA;
4414
4415  Renewc(RExC_rxi->data,
4416   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4417   char, struct reg_data);
4418  if(count)
4419   Renew(RExC_rxi->data->what, count + n, U8);
4420  else
4421   Newx(RExC_rxi->data->what, n, U8);
4422  RExC_rxi->data->count = count + n;
4423  Copy(s, RExC_rxi->data->what + count, n, U8);
4424  return count;
4425 }
4426
4427 /*XXX: todo make this not included in a non debugging perl */
4428 #ifndef PERL_IN_XSUB_RE
4429 void
4430 Perl_reginitcolors(pTHX)
4431 {
4432  dVAR;
4433  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4434  if (s) {
4435   char *t = savepv(s);
4436   int i = 0;
4437   PL_colors[0] = t;
4438   while (++i < 6) {
4439    t = strchr(t, '\t');
4440    if (t) {
4441     *t = '\0';
4442     PL_colors[i] = ++t;
4443    }
4444    else
4445     PL_colors[i] = t = (char *)"";
4446   }
4447  } else {
4448   int i = 0;
4449   while (i < 6)
4450    PL_colors[i++] = (char *)"";
4451  }
4452  PL_colorset = 1;
4453 }
4454 #endif
4455
4456
4457 #ifdef TRIE_STUDY_OPT
4458 #define CHECK_RESTUDY_GOTO                                  \
4459   if (                                                \
4460    (data.flags & SCF_TRIE_RESTUDY)               \
4461    && ! restudied++                              \
4462   )     goto reStudy
4463 #else
4464 #define CHECK_RESTUDY_GOTO
4465 #endif
4466
4467 /*
4468  - pregcomp - compile a regular expression into internal code
4469  *
4470  * We can't allocate space until we know how big the compiled form will be,
4471  * but we can't compile it (and thus know how big it is) until we've got a
4472  * place to put the code.  So we cheat:  we compile it twice, once with code
4473  * generation turned off and size counting turned on, and once "for real".
4474  * This also means that we don't allocate space until we are sure that the
4475  * thing really will compile successfully, and we never have to move the
4476  * code and thus invalidate pointers into it.  (Note that it has to be in
4477  * one piece because free() must be able to free it all.) [NB: not true in perl]
4478  *
4479  * Beware that the optimization-preparation code in here knows about some
4480  * of the structure of the compiled regexp.  [I'll say.]
4481  */
4482
4483
4484
4485 #ifndef PERL_IN_XSUB_RE
4486 #define RE_ENGINE_PTR &reh_regexp_engine
4487 #else
4488 extern const struct regexp_engine my_reg_engine;
4489 #define RE_ENGINE_PTR &my_reg_engine
4490 #endif
4491
4492 #ifndef PERL_IN_XSUB_RE
4493 REGEXP *
4494 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4495 {
4496  dVAR;
4497  HV * const table = GvHV(PL_hintgv);
4498
4499  PERL_ARGS_ASSERT_PREGCOMP;
4500
4501  /* Dispatch a request to compile a regexp to correct
4502  regexp engine. */
4503  if (table) {
4504   SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4505   GET_RE_DEBUG_FLAGS_DECL;
4506   if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4507    const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4508    DEBUG_COMPILE_r({
4509     PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4510      SvIV(*ptr));
4511    });
4512    return CALLREGCOMP_ENG(eng, pattern, flags);
4513   }
4514  }
4515  return Perl_re_compile(aTHX_ pattern, flags);
4516 }
4517 #endif
4518
4519 REGEXP *
4520 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4521 {
4522  dVAR;
4523  REGEXP *rx;
4524  struct regexp *r;
4525  register regexp_internal *ri;
4526  STRLEN plen;
4527  char  *exp;
4528  char* xend;
4529  regnode *scan;
4530  I32 flags;
4531  I32 minlen = 0;
4532  U32 pm_flags;
4533
4534  /* these are all flags - maybe they should be turned
4535  * into a single int with different bit masks */
4536  I32 sawlookahead = 0;
4537  I32 sawplus = 0;
4538  I32 sawopen = 0;
4539  bool used_setjump = FALSE;
4540  regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4541
4542  U8 jump_ret = 0;
4543  dJMPENV;
4544  scan_data_t data;
4545  RExC_state_t RExC_state;
4546  RExC_state_t * const pRExC_state = &RExC_state;
4547 #ifdef TRIE_STUDY_OPT
4548  int restudied;
4549  RExC_state_t copyRExC_state;
4550 #endif
4551  GET_RE_DEBUG_FLAGS_DECL;
4552
4553  PERL_ARGS_ASSERT_RE_COMPILE;
4554
4555  DEBUG_r(if (!PL_colorset) reginitcolors());
4556
4557  RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4558  RExC_uni_semantics = 0;
4559  RExC_contains_locale = 0;
4560
4561  /****************** LONG JUMP TARGET HERE***********************/
4562  /* Longjmp back to here if have to switch in midstream to utf8 */
4563  if (! RExC_orig_utf8) {
4564   JMPENV_PUSH(jump_ret);
4565   used_setjump = TRUE;
4566  }
4567
4568  if (jump_ret == 0) {    /* First time through */
4569   exp = SvPV(pattern, plen);
4570   xend = exp + plen;
4571   /* ignore the utf8ness if the pattern is 0 length */
4572   if (plen == 0) {
4573    RExC_utf8 = RExC_orig_utf8 = 0;
4574   }
4575
4576   DEBUG_COMPILE_r({
4577    SV *dsv= sv_newmortal();
4578    RE_PV_QUOTED_DECL(s, RExC_utf8,
4579     dsv, exp, plen, 60);
4580    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4581       PL_colors[4],PL_colors[5],s);
4582   });
4583  }
4584  else {  /* longjumped back */
4585   STRLEN len = plen;
4586
4587   /* If the cause for the longjmp was other than changing to utf8, pop
4588   * our own setjmp, and longjmp to the correct handler */
4589   if (jump_ret != UTF8_LONGJMP) {
4590    JMPENV_POP;
4591    JMPENV_JUMP(jump_ret);
4592   }
4593
4594   GET_RE_DEBUG_FLAGS;
4595
4596   /* It's possible to write a regexp in ascii that represents Unicode
4597   codepoints outside of the byte range, such as via \x{100}. If we
4598   detect such a sequence we have to convert the entire pattern to utf8
4599   and then recompile, as our sizing calculation will have been based
4600   on 1 byte == 1 character, but we will need to use utf8 to encode
4601   at least some part of the pattern, and therefore must convert the whole
4602   thing.
4603   -- dmq */
4604   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4605    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4606   exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4607   xend = exp + len;
4608   RExC_orig_utf8 = RExC_utf8 = 1;
4609   SAVEFREEPV(exp);
4610  }
4611
4612 #ifdef TRIE_STUDY_OPT
4613  restudied = 0;
4614 #endif
4615
4616  pm_flags = orig_pm_flags;
4617
4618  if (initial_charset == REGEX_LOCALE_CHARSET) {
4619   RExC_contains_locale = 1;
4620  }
4621  else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4622
4623   /* Set to use unicode semantics if the pattern is in utf8 and has the
4624   * 'depends' charset specified, as it means unicode when utf8  */
4625   set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4626  }
4627
4628  RExC_precomp = exp;
4629  RExC_flags = pm_flags;
4630  RExC_sawback = 0;
4631
4632  RExC_seen = 0;
4633  RExC_in_lookbehind = 0;
4634  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4635  RExC_seen_evals = 0;
4636  RExC_extralen = 0;
4637  RExC_override_recoding = 0;
4638
4639  /* First pass: determine size, legality. */
4640  RExC_parse = exp;
4641  RExC_start = exp;
4642  RExC_end = xend;
4643  RExC_naughty = 0;
4644  RExC_npar = 1;
4645  RExC_nestroot = 0;
4646  RExC_size = 0L;
4647  RExC_emit = &PL_regdummy;
4648  RExC_whilem_seen = 0;
4649  RExC_open_parens = NULL;
4650  RExC_close_parens = NULL;
4651  RExC_opend = NULL;
4652  RExC_paren_names = NULL;
4653 #ifdef DEBUGGING
4654  RExC_paren_name_list = NULL;
4655 #endif
4656  RExC_recurse = NULL;
4657  RExC_recurse_count = 0;
4658
4659 #if 0 /* REGC() is (currently) a NOP at the first pass.
4660  * Clever compilers notice this and complain. --jhi */
4661  REGC((U8)REG_MAGIC, (char*)RExC_emit);
4662 #endif
4663  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4664  if (reg(pRExC_state, 0, &flags,1) == NULL) {
4665   RExC_precomp = NULL;
4666   return(NULL);
4667  }
4668
4669  /* Here, finished first pass.  Get rid of any added setjmp */
4670  if (used_setjump) {
4671   JMPENV_POP;
4672  }
4673
4674  DEBUG_PARSE_r({
4675   PerlIO_printf(Perl_debug_log,
4676    "Required size %"IVdf" nodes\n"
4677    "Starting second pass (creation)\n",
4678    (IV)RExC_size);
4679   RExC_lastnum=0;
4680   RExC_lastparse=NULL;
4681  });
4682
4683  /* The first pass could have found things that force Unicode semantics */
4684  if ((RExC_utf8 || RExC_uni_semantics)
4685   && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4686  {
4687   set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4688  }
4689
4690  /* Small enough for pointer-storage convention?
4691  If extralen==0, this means that we will not need long jumps. */
4692  if (RExC_size >= 0x10000L && RExC_extralen)
4693   RExC_size += RExC_extralen;
4694  else
4695   RExC_extralen = 0;
4696  if (RExC_whilem_seen > 15)
4697   RExC_whilem_seen = 15;
4698
4699  /* Allocate space and zero-initialize. Note, the two step process
4700  of zeroing when in debug mode, thus anything assigned has to
4701  happen after that */
4702  rx = (REGEXP*) newSV_type(SVt_REGEXP);
4703  r = (struct regexp*)SvANY(rx);
4704  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4705   char, regexp_internal);
4706  if ( r == NULL || ri == NULL )
4707   FAIL("Regexp out of space");
4708 #ifdef DEBUGGING
4709  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4710  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4711 #else
4712  /* bulk initialize base fields with 0. */
4713  Zero(ri, sizeof(regexp_internal), char);
4714 #endif
4715
4716  /* non-zero initialization begins here */
4717  RXi_SET( r, ri );
4718  r->engine= RE_ENGINE_PTR;
4719  r->extflags = pm_flags;
4720  {
4721   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4722   bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4723
4724   /* The caret is output if there are any defaults: if not all the STD
4725   * flags are set, or if no character set specifier is needed */
4726   bool has_default =
4727      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4728      || ! has_charset);
4729   bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4730   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4731        >> RXf_PMf_STD_PMMOD_SHIFT);
4732   const char *fptr = STD_PAT_MODS;        /*"msix"*/
4733   char *p;
4734   /* Allocate for the worst case, which is all the std flags are turned
4735   * on.  If more precision is desired, we could do a population count of
4736   * the flags set.  This could be done with a small lookup table, or by
4737   * shifting, masking and adding, or even, when available, assembly
4738   * language for a machine-language population count.
4739   * We never output a minus, as all those are defaults, so are
4740   * covered by the caret */
4741   const STRLEN wraplen = plen + has_p + has_runon
4742    + has_default       /* If needs a caret */
4743
4744     /* If needs a character set specifier */
4745    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4746    + (sizeof(STD_PAT_MODS) - 1)
4747    + (sizeof("(?:)") - 1);
4748
4749   p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4750   SvPOK_on(rx);
4751   SvFLAGS(rx) |= SvUTF8(pattern);
4752   *p++='('; *p++='?';
4753
4754   /* If a default, cover it using the caret */
4755   if (has_default) {
4756    *p++= DEFAULT_PAT_MOD;
4757   }
4758   if (has_charset) {
4759    STRLEN len;
4760    const char* const name = get_regex_charset_name(r->extflags, &len);
4761    Copy(name, p, len, char);
4762    p += len;
4763   }
4764   if (has_p)
4765    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4766   {
4767    char ch;
4768    while((ch = *fptr++)) {
4769     if(reganch & 1)
4770      *p++ = ch;
4771     reganch >>= 1;
4772    }
4773   }
4774
4775   *p++ = ':';
4776   Copy(RExC_precomp, p, plen, char);
4777   assert ((RX_WRAPPED(rx) - p) < 16);
4778   r->pre_prefix = p - RX_WRAPPED(rx);
4779   p += plen;
4780   if (has_runon)
4781    *p++ = '\n';
4782   *p++ = ')';
4783   *p = 0;
4784   SvCUR_set(rx, p - SvPVX_const(rx));
4785  }
4786
4787  r->intflags = 0;
4788  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4789
4790  if (RExC_seen & REG_SEEN_RECURSE) {
4791   Newxz(RExC_open_parens, RExC_npar,regnode *);
4792   SAVEFREEPV(RExC_open_parens);
4793   Newxz(RExC_close_parens,RExC_npar,regnode *);
4794   SAVEFREEPV(RExC_close_parens);
4795  }
4796
4797  /* Useful during FAIL. */
4798 #ifdef RE_TRACK_PATTERN_OFFSETS
4799  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4800  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4801       "%s %"UVuf" bytes for offset annotations.\n",
4802       ri->u.offsets ? "Got" : "Couldn't get",
4803       (UV)((2*RExC_size+1) * sizeof(U32))));
4804 #endif
4805  SetProgLen(ri,RExC_size);
4806  RExC_rx_sv = rx;
4807  RExC_rx = r;
4808  RExC_rxi = ri;
4809  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4810
4811  /* Second pass: emit code. */
4812  RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4813  RExC_parse = exp;
4814  RExC_end = xend;
4815  RExC_naughty = 0;
4816  RExC_npar = 1;
4817  RExC_emit_start = ri->program;
4818  RExC_emit = ri->program;
4819  RExC_emit_bound = ri->program + RExC_size + 1;
4820
4821  /* Store the count of eval-groups for security checks: */
4822  RExC_rx->seen_evals = RExC_seen_evals;
4823  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4824  if (reg(pRExC_state, 0, &flags,1) == NULL) {
4825   ReREFCNT_dec(rx);
4826   return(NULL);
4827  }
4828  /* XXXX To minimize changes to RE engine we always allocate
4829  3-units-long substrs field. */
4830  Newx(r->substrs, 1, struct reg_substr_data);
4831  if (RExC_recurse_count) {
4832   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4833   SAVEFREEPV(RExC_recurse);
4834  }
4835
4836 reStudy:
4837  r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4838  Zero(r->substrs, 1, struct reg_substr_data);
4839
4840 #ifdef TRIE_STUDY_OPT
4841  if (!restudied) {
4842   StructCopy(&zero_scan_data, &data, scan_data_t);
4843   copyRExC_state = RExC_state;
4844  } else {
4845   U32 seen=RExC_seen;
4846   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4847
4848   RExC_state = copyRExC_state;
4849   if (seen & REG_TOP_LEVEL_BRANCHES)
4850    RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4851   else
4852    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4853   if (data.last_found) {
4854    SvREFCNT_dec(data.longest_fixed);
4855    SvREFCNT_dec(data.longest_float);
4856    SvREFCNT_dec(data.last_found);
4857   }
4858   StructCopy(&zero_scan_data, &data, scan_data_t);
4859  }
4860 #else
4861  StructCopy(&zero_scan_data, &data, scan_data_t);
4862 #endif
4863
4864  /* Dig out information for optimizations. */
4865  r->extflags = RExC_flags; /* was pm_op */
4866  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4867
4868  if (UTF)
4869   SvUTF8_on(rx); /* Unicode in it? */
4870  ri->regstclass = NULL;
4871  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4872   r->intflags |= PREGf_NAUGHTY;
4873  scan = ri->program + 1;  /* First BRANCH. */
4874
4875  /* testing for BRANCH here tells us whether there is "must appear"
4876  data in the pattern. If there is then we can use it for optimisations */
4877  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4878   I32 fake;
4879   STRLEN longest_float_length, longest_fixed_length;
4880   struct regnode_charclass_class ch_class; /* pointed to by data */
4881   int stclass_flag;
4882   I32 last_close = 0; /* pointed to by data */
4883   regnode *first= scan;
4884   regnode *first_next= regnext(first);
4885   /*
4886   * Skip introductions and multiplicators >= 1
4887   * so that we can extract the 'meat' of the pattern that must
4888   * match in the large if() sequence following.
4889   * NOTE that EXACT is NOT covered here, as it is normally
4890   * picked up by the optimiser separately.
4891   *
4892   * This is unfortunate as the optimiser isnt handling lookahead
4893   * properly currently.
4894   *
4895   */
4896   while ((OP(first) == OPEN && (sawopen = 1)) ||
4897    /* An OR of *one* alternative - should not happen now. */
4898    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4899    /* for now we can't handle lookbehind IFMATCH*/
4900    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4901    (OP(first) == PLUS) ||
4902    (OP(first) == MINMOD) ||
4903    /* An {n,m} with n>0 */
4904    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4905    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4906   {
4907     /*
4908     * the only op that could be a regnode is PLUS, all the rest
4909     * will be regnode_1 or regnode_2.
4910     *
4911     */
4912     if (OP(first) == PLUS)
4913      sawplus = 1;
4914     else
4915      first += regarglen[OP(first)];
4916
4917     first = NEXTOPER(first);
4918     first_next= regnext(first);
4919   }
4920
4921   /* Starting-point info. */
4922  again:
4923   DEBUG_PEEP("first:",first,0);
4924   /* Ignore EXACT as we deal with it later. */
4925   if (PL_regkind[OP(first)] == EXACT) {
4926    if (OP(first) == EXACT)
4927     NOOP; /* Empty, get anchored substr later. */
4928    else
4929     ri->regstclass = first;
4930   }
4931 #ifdef TRIE_STCLASS
4932   else if (PL_regkind[OP(first)] == TRIE &&
4933     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4934   {
4935    regnode *trie_op;
4936    /* this can happen only on restudy */
4937    if ( OP(first) == TRIE ) {
4938     struct regnode_1 *trieop = (struct regnode_1 *)
4939      PerlMemShared_calloc(1, sizeof(struct regnode_1));
4940     StructCopy(first,trieop,struct regnode_1);
4941     trie_op=(regnode *)trieop;
4942    } else {
4943     struct regnode_charclass *trieop = (struct regnode_charclass *)
4944      PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4945     StructCopy(first,trieop,struct regnode_charclass);
4946     trie_op=(regnode *)trieop;
4947    }
4948    OP(trie_op)+=2;
4949    make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4950    ri->regstclass = trie_op;
4951   }
4952 #endif
4953   else if (REGNODE_SIMPLE(OP(first)))
4954    ri->regstclass = first;
4955   else if (PL_regkind[OP(first)] == BOUND ||
4956     PL_regkind[OP(first)] == NBOUND)
4957    ri->regstclass = first;
4958   else if (PL_regkind[OP(first)] == BOL) {
4959    r->extflags |= (OP(first) == MBOL
4960       ? RXf_ANCH_MBOL
4961       : (OP(first) == SBOL
4962        ? RXf_ANCH_SBOL
4963        : RXf_ANCH_BOL));
4964    first = NEXTOPER(first);
4965    goto again;
4966   }
4967   else if (OP(first) == GPOS) {
4968    r->extflags |= RXf_ANCH_GPOS;
4969    first = NEXTOPER(first);
4970    goto again;
4971   }
4972   else if ((!sawopen || !RExC_sawback) &&
4973    (OP(first) == STAR &&
4974    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4975    !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4976   {
4977    /* turn .* into ^.* with an implied $*=1 */
4978    const int type =
4979     (OP(NEXTOPER(first)) == REG_ANY)
4980      ? RXf_ANCH_MBOL
4981      : RXf_ANCH_SBOL;
4982    r->extflags |= type;
4983    r->intflags |= PREGf_IMPLICIT;
4984    first = NEXTOPER(first);
4985    goto again;
4986   }
4987   if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4988    && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4989    /* x+ must match at the 1st pos of run of x's */
4990    r->intflags |= PREGf_SKIP;
4991
4992   /* Scan is after the zeroth branch, first is atomic matcher. */
4993 #ifdef TRIE_STUDY_OPT
4994   DEBUG_PARSE_r(
4995    if (!restudied)
4996     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4997        (IV)(first - scan + 1))
4998   );
4999 #else
5000   DEBUG_PARSE_r(
5001    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5002     (IV)(first - scan + 1))
5003   );
5004 #endif
5005
5006
5007   /*
5008   * If there's something expensive in the r.e., find the
5009   * longest literal string that must appear and make it the
5010   * regmust.  Resolve ties in favor of later strings, since
5011   * the regstart check works with the beginning of the r.e.
5012   * and avoiding duplication strengthens checking.  Not a
5013   * strong reason, but sufficient in the absence of others.
5014   * [Now we resolve ties in favor of the earlier string if
5015   * it happens that c_offset_min has been invalidated, since the
5016   * earlier string may buy us something the later one won't.]
5017   */
5018
5019   data.longest_fixed = newSVpvs("");
5020   data.longest_float = newSVpvs("");
5021   data.last_found = newSVpvs("");
5022   data.longest = &(data.longest_fixed);
5023   first = scan;
5024   if (!ri->regstclass) {
5025    cl_init(pRExC_state, &ch_class);
5026    data.start_class = &ch_class;
5027    stclass_flag = SCF_DO_STCLASS_AND;
5028   } else    /* XXXX Check for BOUND? */
5029    stclass_flag = 0;
5030   data.last_closep = &last_close;
5031
5032   minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5033    &data, -1, NULL, NULL,
5034    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5035
5036
5037   CHECK_RESTUDY_GOTO;
5038
5039
5040   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5041    && data.last_start_min == 0 && data.last_end > 0
5042    && !RExC_seen_zerolen
5043    && !(RExC_seen & REG_SEEN_VERBARG)
5044    && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5045    r->extflags |= RXf_CHECK_ALL;
5046   scan_commit(pRExC_state, &data,&minlen,0);
5047   SvREFCNT_dec(data.last_found);
5048
5049   /* Note that code very similar to this but for anchored string
5050   follows immediately below, changes may need to be made to both.
5051   Be careful.
5052   */
5053   longest_float_length = CHR_SVLEN(data.longest_float);
5054   if (longest_float_length
5055    || (data.flags & SF_FL_BEFORE_EOL
5056     && (!(data.flags & SF_FL_BEFORE_MEOL)
5057      || (RExC_flags & RXf_PMf_MULTILINE))))
5058   {
5059    I32 t,ml;
5060
5061    if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5062     && data.offset_fixed == data.offset_float_min
5063     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
5064      goto remove_float;  /* As in (a)+. */
5065
5066    /* copy the information about the longest float from the reg_scan_data
5067    over to the program. */
5068    if (SvUTF8(data.longest_float)) {
5069     r->float_utf8 = data.longest_float;
5070     r->float_substr = NULL;
5071    } else {
5072     r->float_substr = data.longest_float;
5073     r->float_utf8 = NULL;
5074    }
5075    /* float_end_shift is how many chars that must be matched that
5076    follow this item. We calculate it ahead of time as once the
5077    lookbehind offset is added in we lose the ability to correctly
5078    calculate it.*/
5079    ml = data.minlen_float ? *(data.minlen_float)
5080         : (I32)longest_float_length;
5081    r->float_end_shift = ml - data.offset_float_min
5082     - longest_float_length + (SvTAIL(data.longest_float) != 0)
5083     + data.lookbehind_float;
5084    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5085    r->float_max_offset = data.offset_float_max;
5086    if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5087     r->float_max_offset -= data.lookbehind_float;
5088
5089    t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5090      && (!(data.flags & SF_FL_BEFORE_MEOL)
5091       || (RExC_flags & RXf_PMf_MULTILINE)));
5092    fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5093   }
5094   else {
5095   remove_float:
5096    r->float_substr = r->float_utf8 = NULL;
5097    SvREFCNT_dec(data.longest_float);
5098    longest_float_length = 0;
5099   }
5100
5101   /* Note that code very similar to this but for floating string
5102   is immediately above, changes may need to be made to both.
5103   Be careful.
5104   */
5105   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5106   if (longest_fixed_length
5107    || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5108     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5109      || (RExC_flags & RXf_PMf_MULTILINE))))
5110   {
5111    I32 t,ml;
5112
5113    /* copy the information about the longest fixed
5114    from the reg_scan_data over to the program. */
5115    if (SvUTF8(data.longest_fixed)) {
5116     r->anchored_utf8 = data.longest_fixed;
5117     r->anchored_substr = NULL;
5118    } else {
5119     r->anchored_substr = data.longest_fixed;
5120     r->anchored_utf8 = NULL;
5121    }
5122    /* fixed_end_shift is how many chars that must be matched that
5123    follow this item. We calculate it ahead of time as once the
5124    lookbehind offset is added in we lose the ability to correctly
5125    calculate it.*/
5126    ml = data.minlen_fixed ? *(data.minlen_fixed)
5127         : (I32)longest_fixed_length;
5128    r->anchored_end_shift = ml - data.offset_fixed
5129     - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5130     + data.lookbehind_fixed;
5131    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5132
5133    t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5134     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5135      || (RExC_flags & RXf_PMf_MULTILINE)));
5136    fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5137   }
5138   else {
5139    r->anchored_substr = r->anchored_utf8 = NULL;
5140    SvREFCNT_dec(data.longest_fixed);
5141    longest_fixed_length = 0;
5142   }
5143   if (ri->regstclass
5144    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5145    ri->regstclass = NULL;
5146
5147   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5148    && stclass_flag
5149    && !(data.start_class->flags & ANYOF_EOS)
5150    && !cl_is_anything(data.start_class))
5151   {
5152    const U32 n = add_data(pRExC_state, 1, "f");
5153    data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5154
5155    Newx(RExC_rxi->data->data[n], 1,
5156     struct regnode_charclass_class);
5157    StructCopy(data.start_class,
5158      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5159      struct regnode_charclass_class);
5160    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5161    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5162    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5163      regprop(r, sv, (regnode*)data.start_class);
5164      PerlIO_printf(Perl_debug_log,
5165          "synthetic stclass \"%s\".\n",
5166          SvPVX_const(sv));});
5167   }
5168
5169   /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5170   if (longest_fixed_length > longest_float_length) {
5171    r->check_end_shift = r->anchored_end_shift;
5172    r->check_substr = r->anchored_substr;
5173    r->check_utf8 = r->anchored_utf8;
5174    r->check_offset_min = r->check_offset_max = r->anchored_offset;
5175    if (r->extflags & RXf_ANCH_SINGLE)
5176     r->extflags |= RXf_NOSCAN;
5177   }
5178   else {
5179    r->check_end_shift = r->float_end_shift;
5180    r->check_substr = r->float_substr;
5181    r->check_utf8 = r->float_utf8;
5182    r->check_offset_min = r->float_min_offset;
5183    r->check_offset_max = r->float_max_offset;
5184   }
5185   /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5186   This should be changed ASAP!  */
5187   if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5188    r->extflags |= RXf_USE_INTUIT;
5189    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5190     r->extflags |= RXf_INTUIT_TAIL;
5191   }
5192   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5193   if ( (STRLEN)minlen < longest_float_length )
5194    minlen= longest_float_length;
5195   if ( (STRLEN)minlen < longest_fixed_length )
5196    minlen= longest_fixed_length;
5197   */
5198  }
5199  else {
5200   /* Several toplevels. Best we can is to set minlen. */
5201   I32 fake;
5202   struct regnode_charclass_class ch_class;
5203   I32 last_close = 0;
5204
5205   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5206
5207   scan = ri->program + 1;
5208   cl_init(pRExC_state, &ch_class);
5209   data.start_class = &ch_class;
5210   data.last_closep = &last_close;
5211
5212
5213   minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5214    &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5215
5216   CHECK_RESTUDY_GOTO;
5217
5218   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5219     = r->float_substr = r->float_utf8 = NULL;
5220
5221   if (!(data.start_class->flags & ANYOF_EOS)
5222    && !cl_is_anything(data.start_class))
5223   {
5224    const U32 n = add_data(pRExC_state, 1, "f");
5225    data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5226
5227    Newx(RExC_rxi->data->data[n], 1,
5228     struct regnode_charclass_class);
5229    StructCopy(data.start_class,
5230      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5231      struct regnode_charclass_class);
5232    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5233    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5234    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5235      regprop(r, sv, (regnode*)data.start_class);
5236      PerlIO_printf(Perl_debug_log,
5237          "synthetic stclass \"%s\".\n",
5238          SvPVX_const(sv));});
5239   }
5240  }
5241
5242  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5243  the "real" pattern. */
5244  DEBUG_OPTIMISE_r({
5245   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5246      (IV)minlen, (IV)r->minlen);
5247  });
5248  r->minlenret = minlen;
5249  if (r->minlen < minlen)
5250   r->minlen = minlen;
5251
5252  if (RExC_seen & REG_SEEN_GPOS)
5253   r->extflags |= RXf_GPOS_SEEN;
5254  if (RExC_seen & REG_SEEN_LOOKBEHIND)
5255   r->extflags |= RXf_LOOKBEHIND_SEEN;
5256  if (RExC_seen & REG_SEEN_EVAL)
5257   r->extflags |= RXf_EVAL_SEEN;
5258  if (RExC_seen & REG_SEEN_CANY)
5259   r->extflags |= RXf_CANY_SEEN;
5260  if (RExC_seen & REG_SEEN_VERBARG)
5261   r->intflags |= PREGf_VERBARG_SEEN;
5262  if (RExC_seen & REG_SEEN_CUTGROUP)
5263   r->intflags |= PREGf_CUTGROUP_SEEN;
5264  if (RExC_paren_names)
5265   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5266  else
5267   RXp_PAREN_NAMES(r) = NULL;
5268
5269 #ifdef STUPID_PATTERN_CHECKS
5270  if (RX_PRELEN(rx) == 0)
5271   r->extflags |= RXf_NULL;
5272  if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5273   /* XXX: this should happen BEFORE we compile */
5274   r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5275  else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5276   r->extflags |= RXf_WHITE;
5277  else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5278   r->extflags |= RXf_START_ONLY;
5279 #else
5280  if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5281    /* XXX: this should happen BEFORE we compile */
5282    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5283  else {
5284   regnode *first = ri->program + 1;
5285   U8 fop = OP(first);
5286
5287   if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5288    r->extflags |= RXf_NULL;
5289   else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5290    r->extflags |= RXf_START_ONLY;
5291   else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5292        && OP(regnext(first)) == END)
5293    r->extflags |= RXf_WHITE;
5294  }
5295 #endif
5296 #ifdef DEBUGGING
5297  if (RExC_paren_names) {
5298   ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5299   ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5300  } else
5301 #endif
5302   ri->name_list_idx = 0;
5303
5304  if (RExC_recurse_count) {
5305   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5306    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5307    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5308   }
5309  }
5310  Newxz(r->offs, RExC_npar, regexp_paren_pair);
5311  /* assume we don't need to swap parens around before we match */
5312
5313  DEBUG_DUMP_r({
5314   PerlIO_printf(Perl_debug_log,"Final program:\n");
5315   regdump(r);
5316  });
5317 #ifdef RE_TRACK_PATTERN_OFFSETS
5318  DEBUG_OFFSETS_r(if (ri->u.offsets) {
5319   const U32 len = ri->u.offsets[0];
5320   U32 i;
5321   GET_RE_DEBUG_FLAGS_DECL;
5322   PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5323   for (i = 1; i <= len; i++) {
5324    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5325     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5326     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5327    }
5328   PerlIO_printf(Perl_debug_log, "\n");
5329  });
5330 #endif
5331  return rx;
5332 }
5333
5334 #undef RE_ENGINE_PTR
5335
5336
5337 SV*
5338 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5339      const U32 flags)
5340 {
5341  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5342
5343  PERL_UNUSED_ARG(value);
5344
5345  if (flags & RXapif_FETCH) {
5346   return reg_named_buff_fetch(rx, key, flags);
5347  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5348   Perl_croak_no_modify(aTHX);
5349   return NULL;
5350  } else if (flags & RXapif_EXISTS) {
5351   return reg_named_buff_exists(rx, key, flags)
5352    ? &PL_sv_yes
5353    : &PL_sv_no;
5354  } else if (flags & RXapif_REGNAMES) {
5355   return reg_named_buff_all(rx, flags);
5356  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5357   return reg_named_buff_scalar(rx, flags);
5358  } else {
5359   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5360   return NULL;
5361  }
5362 }
5363
5364 SV*
5365 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5366       const U32 flags)
5367 {
5368  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5369  PERL_UNUSED_ARG(lastkey);
5370
5371  if (flags & RXapif_FIRSTKEY)
5372   return reg_named_buff_firstkey(rx, flags);
5373  else if (flags & RXapif_NEXTKEY)
5374   return reg_named_buff_nextkey(rx, flags);
5375  else {
5376   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5377   return NULL;
5378  }
5379 }
5380
5381 SV*
5382 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5383       const U32 flags)
5384 {
5385  AV *retarray = NULL;
5386  SV *ret;
5387  struct regexp *const rx = (struct regexp *)SvANY(r);
5388
5389  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5390
5391  if (flags & RXapif_ALL)
5392   retarray=newAV();
5393
5394  if (rx && RXp_PAREN_NAMES(rx)) {
5395   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5396   if (he_str) {
5397    IV i;
5398    SV* sv_dat=HeVAL(he_str);
5399    I32 *nums=(I32*)SvPVX(sv_dat);
5400    for ( i=0; i<SvIVX(sv_dat); i++ ) {
5401     if ((I32)(rx->nparens) >= nums[i]
5402      && rx->offs[nums[i]].start != -1
5403      && rx->offs[nums[i]].end != -1)
5404     {
5405      ret = newSVpvs("");
5406      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5407      if (!retarray)
5408       return ret;
5409     } else {
5410      ret = newSVsv(&PL_sv_undef);
5411     }
5412     if (retarray)
5413      av_push(retarray, ret);
5414    }
5415    if (retarray)
5416     return newRV_noinc(MUTABLE_SV(retarray));
5417   }
5418  }
5419  return NULL;
5420 }
5421
5422 bool
5423 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5424       const U32 flags)
5425 {
5426  struct regexp *const rx = (struct regexp *)SvANY(r);
5427
5428  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5429
5430  if (rx && RXp_PAREN_NAMES(rx)) {
5431   if (flags & RXapif_ALL) {
5432    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5433   } else {
5434    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5435    if (sv) {
5436     SvREFCNT_dec(sv);
5437     return TRUE;
5438    } else {
5439     return FALSE;
5440    }
5441   }
5442  } else {
5443   return FALSE;
5444  }
5445 }
5446
5447 SV*
5448 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5449 {
5450  struct regexp *const rx = (struct regexp *)SvANY(r);
5451
5452  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5453
5454  if ( rx && RXp_PAREN_NAMES(rx) ) {
5455   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5456
5457   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5458  } else {
5459   return FALSE;
5460  }
5461 }
5462
5463 SV*
5464 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5465 {
5466  struct regexp *const rx = (struct regexp *)SvANY(r);
5467  GET_RE_DEBUG_FLAGS_DECL;
5468
5469  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5470
5471  if (rx && RXp_PAREN_NAMES(rx)) {
5472   HV *hv = RXp_PAREN_NAMES(rx);
5473   HE *temphe;
5474   while ( (temphe = hv_iternext_flags(hv,0)) ) {
5475    IV i;
5476    IV parno = 0;
5477    SV* sv_dat = HeVAL(temphe);
5478    I32 *nums = (I32*)SvPVX(sv_dat);
5479    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5480     if ((I32)(rx->lastparen) >= nums[i] &&
5481      rx->offs[nums[i]].start != -1 &&
5482      rx->offs[nums[i]].end != -1)
5483     {
5484      parno = nums[i];
5485      break;
5486     }
5487    }
5488    if (parno || flags & RXapif_ALL) {
5489     return newSVhek(HeKEY_hek(temphe));
5490    }
5491   }
5492  }
5493  return NULL;
5494 }
5495
5496 SV*
5497 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5498 {
5499  SV *ret;
5500  AV *av;
5501  I32 length;
5502  struct regexp *const rx = (struct regexp *)SvANY(r);
5503
5504  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5505
5506  if (rx && RXp_PAREN_NAMES(rx)) {
5507   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5508    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5509   } else if (flags & RXapif_ONE) {
5510    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5511    av = MUTABLE_AV(SvRV(ret));
5512    length = av_len(av);
5513    SvREFCNT_dec(ret);
5514    return newSViv(length + 1);
5515   } else {
5516    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5517    return NULL;
5518   }
5519  }
5520  return &PL_sv_undef;
5521 }
5522
5523 SV*
5524 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5525 {
5526  struct regexp *const rx = (struct regexp *)SvANY(r);
5527  AV *av = newAV();
5528
5529  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5530
5531  if (rx && RXp_PAREN_NAMES(rx)) {
5532   HV *hv= RXp_PAREN_NAMES(rx);
5533   HE *temphe;
5534   (void)hv_iterinit(hv);
5535   while ( (temphe = hv_iternext_flags(hv,0)) ) {
5536    IV i;
5537    IV parno = 0;
5538    SV* sv_dat = HeVAL(temphe);
5539    I32 *nums = (I32*)SvPVX(sv_dat);
5540    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5541     if ((I32)(rx->lastparen) >= nums[i] &&
5542      rx->offs[nums[i]].start != -1 &&
5543      rx->offs[nums[i]].end != -1)
5544     {
5545      parno = nums[i];
5546      break;
5547     }
5548    }
5549    if (parno || flags & RXapif_ALL) {
5550     av_push(av, newSVhek(HeKEY_hek(temphe)));
5551    }
5552   }
5553  }
5554
5555  return newRV_noinc(MUTABLE_SV(av));
5556 }
5557
5558 void
5559 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5560        SV * const sv)
5561 {
5562  struct regexp *const rx = (struct regexp *)SvANY(r);
5563  char *s = NULL;
5564  I32 i = 0;
5565  I32 s1, t1;
5566
5567  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5568
5569  if (!rx->subbeg) {
5570   sv_setsv(sv,&PL_sv_undef);
5571   return;
5572  }
5573  else
5574  if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5575   /* $` */
5576   i = rx->offs[0].start;
5577   s = rx->subbeg;
5578  }
5579  else
5580  if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5581   /* $' */
5582   s = rx->subbeg + rx->offs[0].end;
5583   i = rx->sublen - rx->offs[0].end;
5584  }
5585  else
5586  if ( 0 <= paren && paren <= (I32)rx->nparens &&
5587   (s1 = rx->offs[paren].start) != -1 &&
5588   (t1 = rx->offs[paren].end) != -1)
5589  {
5590   /* $& $1 ... */
5591   i = t1 - s1;
5592   s = rx->subbeg + s1;
5593  } else {
5594   sv_setsv(sv,&PL_sv_undef);
5595   return;
5596  }
5597  assert(rx->sublen >= (s - rx->subbeg) + i );
5598  if (i >= 0) {
5599   const int oldtainted = PL_tainted;
5600   TAINT_NOT;
5601   sv_setpvn(sv, s, i);
5602   PL_tainted = oldtainted;
5603   if ( (rx->extflags & RXf_CANY_SEEN)
5604    ? (RXp_MATCH_UTF8(rx)
5605       && (!i || is_utf8_string((U8*)s, i)))
5606    : (RXp_MATCH_UTF8(rx)) )
5607   {
5608    SvUTF8_on(sv);
5609   }
5610   else
5611    SvUTF8_off(sv);
5612   if (PL_tainting) {
5613    if (RXp_MATCH_TAINTED(rx)) {
5614     if (SvTYPE(sv) >= SVt_PVMG) {
5615      MAGIC* const mg = SvMAGIC(sv);
5616      MAGIC* mgt;
5617      PL_tainted = 1;
5618      SvMAGIC_set(sv, mg->mg_moremagic);
5619      SvTAINT(sv);
5620      if ((mgt = SvMAGIC(sv))) {
5621       mg->mg_moremagic = mgt;
5622       SvMAGIC_set(sv, mg);
5623      }
5624     } else {
5625      PL_tainted = 1;
5626      SvTAINT(sv);
5627     }
5628    } else
5629     SvTAINTED_off(sv);
5630   }
5631  } else {
5632   sv_setsv(sv,&PL_sv_undef);
5633   return;
5634  }
5635 }
5636
5637 void
5638 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5639               SV const * const value)
5640 {
5641  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5642
5643  PERL_UNUSED_ARG(rx);
5644  PERL_UNUSED_ARG(paren);
5645  PERL_UNUSED_ARG(value);
5646
5647  if (!PL_localizing)
5648   Perl_croak_no_modify(aTHX);
5649 }
5650
5651 I32
5652 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5653        const I32 paren)
5654 {
5655  struct regexp *const rx = (struct regexp *)SvANY(r);
5656  I32 i;
5657  I32 s1, t1;
5658
5659  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5660
5661  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5662   switch (paren) {
5663  /* $` / ${^PREMATCH} */
5664  case RX_BUFF_IDX_PREMATCH:
5665   if (rx->offs[0].start != -1) {
5666       i = rx->offs[0].start;
5667       if (i > 0) {
5668         s1 = 0;
5669         t1 = i;
5670         goto getlen;
5671       }
5672    }
5673   return 0;
5674  /* $' / ${^POSTMATCH} */
5675  case RX_BUFF_IDX_POSTMATCH:
5676    if (rx->offs[0].end != -1) {
5677       i = rx->sublen - rx->offs[0].end;
5678       if (i > 0) {
5679         s1 = rx->offs[0].end;
5680         t1 = rx->sublen;
5681         goto getlen;
5682       }
5683    }
5684   return 0;
5685  /* $& / ${^MATCH}, $1, $2, ... */
5686  default:
5687    if (paren <= (I32)rx->nparens &&
5688    (s1 = rx->offs[paren].start) != -1 &&
5689    (t1 = rx->offs[paren].end) != -1)
5690    {
5691    i = t1 - s1;
5692    goto getlen;
5693   } else {
5694    if (ckWARN(WARN_UNINITIALIZED))
5695     report_uninit((const SV *)sv);
5696    return 0;
5697   }
5698  }
5699   getlen:
5700  if (i > 0 && RXp_MATCH_UTF8(rx)) {
5701   const char * const s = rx->subbeg + s1;
5702   const U8 *ep;
5703   STRLEN el;
5704
5705   i = t1 - s1;
5706   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5707       i = el;
5708  }
5709  return i;
5710 }
5711
5712 SV*
5713 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5714 {
5715  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5716   PERL_UNUSED_ARG(rx);
5717   if (0)
5718    return NULL;
5719   else
5720    return newSVpvs("Regexp");
5721 }
5722
5723 /* Scans the name of a named buffer from the pattern.
5724  * If flags is REG_RSN_RETURN_NULL returns null.
5725  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5726  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5727  * to the parsed name as looked up in the RExC_paren_names hash.
5728  * If there is an error throws a vFAIL().. type exception.
5729  */
5730
5731 #define REG_RSN_RETURN_NULL    0
5732 #define REG_RSN_RETURN_NAME    1
5733 #define REG_RSN_RETURN_DATA    2
5734
5735 STATIC SV*
5736 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5737 {
5738  char *name_start = RExC_parse;
5739
5740  PERL_ARGS_ASSERT_REG_SCAN_NAME;
5741
5742  if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5743   /* skip IDFIRST by using do...while */
5744   if (UTF)
5745    do {
5746     RExC_parse += UTF8SKIP(RExC_parse);
5747    } while (isALNUM_utf8((U8*)RExC_parse));
5748   else
5749    do {
5750     RExC_parse++;
5751    } while (isALNUM(*RExC_parse));
5752  }
5753
5754  if ( flags ) {
5755   SV* sv_name
5756    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5757        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5758   if ( flags == REG_RSN_RETURN_NAME)
5759    return sv_name;
5760   else if (flags==REG_RSN_RETURN_DATA) {
5761    HE *he_str = NULL;
5762    SV *sv_dat = NULL;
5763    if ( ! sv_name )      /* should not happen*/
5764     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5765    if (RExC_paren_names)
5766     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5767    if ( he_str )
5768     sv_dat = HeVAL(he_str);
5769    if ( ! sv_dat )
5770     vFAIL("Reference to nonexistent named group");
5771    return sv_dat;
5772   }
5773   else {
5774    Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5775   }
5776   /* NOT REACHED */
5777  }
5778  return NULL;
5779 }
5780
5781 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5782  int rem=(int)(RExC_end - RExC_parse);                       \
5783  int cut;                                                    \
5784  int num;                                                    \
5785  int iscut=0;                                                \
5786  if (rem>10) {                                               \
5787   rem=10;                                                 \
5788   iscut=1;                                                \
5789  }                                                           \
5790  cut=10-rem;                                                 \
5791  if (RExC_lastparse!=RExC_parse)                             \
5792   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5793    rem, RExC_parse,                                    \
5794    cut + 4,                                            \
5795    iscut ? "..." : "<"                                 \
5796   );                                                      \
5797  else                                                        \
5798   PerlIO_printf(Perl_debug_log,"%16s","");                \
5799                 \
5800  if (SIZE_ONLY)                                              \
5801  num = RExC_size + 1;                                     \
5802  else                                                        \
5803  num=REG_NODE_NUM(RExC_emit);                             \
5804  if (RExC_lastnum!=num)                                      \
5805  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5806  else                                                        \
5807  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5808  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5809   (int)((depth*2)), "",                                   \
5810   (funcname)                                              \
5811  );                                                          \
5812  RExC_lastnum=num;                                           \
5813  RExC_lastparse=RExC_parse;                                  \
5814 })
5815
5816
5817
5818 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5819  DEBUG_PARSE_MSG((funcname));                            \
5820  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5821 })
5822 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5823  DEBUG_PARSE_MSG((funcname));                            \
5824  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5825 })
5826
5827 /* This section of code defines the inversion list object and its methods.  The
5828  * interfaces are highly subject to change, so as much as possible is static to
5829  * this file.  An inversion list is here implemented as a malloc'd C array with
5830  * some added info.  More will be coming when functionality is added later.
5831  *
5832  * It is currently implemented as an HV to the outside world, but is actually
5833  * an SV pointing to an array of UVs that the SV thinks are bytes.  This allows
5834  * us to have an array of UV whose memory management is automatically handled
5835  * by the existing facilities for SV's.
5836  *
5837  * Some of the methods should always be private to the implementation, and some
5838  * should eventually be made public */
5839
5840 #define INVLIST_INITIAL_LEN 10
5841
5842 PERL_STATIC_INLINE UV*
5843 S_invlist_array(pTHX_ HV* const invlist)
5844 {
5845  /* Returns the pointer to the inversion list's array.  Every time the
5846  * length changes, this needs to be called in case malloc or realloc moved
5847  * it */
5848
5849  PERL_ARGS_ASSERT_INVLIST_ARRAY;
5850
5851  return (UV *) SvPVX(invlist);
5852 }
5853
5854 PERL_STATIC_INLINE UV
5855 S_invlist_len(pTHX_ HV* const invlist)
5856 {
5857  /* Returns the current number of elements in the inversion list's array */
5858
5859  PERL_ARGS_ASSERT_INVLIST_LEN;
5860
5861  return SvCUR(invlist) / sizeof(UV);
5862 }
5863
5864 PERL_STATIC_INLINE UV
5865 S_invlist_max(pTHX_ HV* const invlist)
5866 {
5867  /* Returns the maximum number of elements storable in the inversion list's
5868  * array, without having to realloc() */
5869
5870  PERL_ARGS_ASSERT_INVLIST_MAX;
5871
5872  return SvLEN(invlist) / sizeof(UV);
5873 }
5874
5875 PERL_STATIC_INLINE void
5876 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5877 {
5878  /* Sets the current number of elements stored in the inversion list */
5879
5880  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5881
5882  SvCUR_set(invlist, len * sizeof(UV));
5883 }
5884
5885 PERL_STATIC_INLINE void
5886 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5887 {
5888
5889  /* Sets the maximum number of elements storable in the inversion list
5890  * without having to realloc() */
5891
5892  PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5893
5894  if (max < invlist_len(invlist)) {
5895   Perl_croak(aTHX_ "panic: Can't make max size '%"UVuf"' less than current length %"UVuf" in inversion list", invlist_max(invlist), invlist_len(invlist));
5896  }
5897
5898  SvLEN_set(invlist, max * sizeof(UV));
5899 }
5900
5901 #ifndef PERL_IN_XSUB_RE
5902 HV*
5903 Perl__new_invlist(pTHX_ IV initial_size)
5904 {
5905
5906  /* Return a pointer to a newly constructed inversion list, with enough
5907  * space to store 'initial_size' elements.  If that number is negative, a
5908  * system default is used instead */
5909
5910  if (initial_size < 0) {
5911   initial_size = INVLIST_INITIAL_LEN;
5912  }
5913
5914  /* Allocate the initial space */
5915  return (HV *) newSV(initial_size * sizeof(UV));
5916 }
5917 #endif
5918
5919 PERL_STATIC_INLINE void
5920 S_invlist_destroy(pTHX_ HV* const invlist)
5921 {
5922    /* Inversion list destructor */
5923
5924  PERL_ARGS_ASSERT_INVLIST_DESTROY;
5925
5926  SvREFCNT_dec(invlist);
5927 }
5928
5929 STATIC void
5930 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5931 {
5932  /* Grow the maximum size of an inversion list */
5933
5934  PERL_ARGS_ASSERT_INVLIST_EXTEND;
5935
5936  SvGROW((SV *)invlist, new_max * sizeof(UV));
5937 }
5938
5939 PERL_STATIC_INLINE void
5940 S_invlist_trim(pTHX_ HV* const invlist)
5941 {
5942  PERL_ARGS_ASSERT_INVLIST_TRIM;
5943
5944  /* Change the length of the inversion list to how many entries it currently
5945  * has */
5946
5947  SvPV_shrink_to_cur((SV *) invlist);
5948 }
5949
5950 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
5951  * etc */
5952
5953 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
5954
5955 #ifndef PERL_IN_XSUB_RE
5956 void
5957 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
5958 {
5959    /* Subject to change or removal.  Append the range from 'start' to 'end' at
5960  * the end of the inversion list.  The range must be above any existing
5961  * ones. */
5962
5963  UV* array = invlist_array(invlist);
5964  UV max = invlist_max(invlist);
5965  UV len = invlist_len(invlist);
5966
5967  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
5968
5969  if (len > 0) {
5970
5971   /* Here, the existing list is non-empty. The current max entry in the
5972   * list is generally the first value not in the set, except when the
5973   * set extends to the end of permissible values, in which case it is
5974   * the first entry in that final set, and so this call is an attempt to
5975   * append out-of-order */
5976
5977   UV final_element = len - 1;
5978   if (array[final_element] > start
5979    || ELEMENT_IN_INVLIST_SET(final_element))
5980   {
5981    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
5982   }
5983
5984   /* Here, it is a legal append.  If the new range begins with the first
5985   * value not in the set, it is extending the set, so the new first
5986   * value not in the set is one greater than the newly extended range.
5987   * */
5988   if (array[final_element] == start) {
5989    if (end != UV_MAX) {
5990     array[final_element] = end + 1;
5991    }
5992    else {
5993     /* But if the end is the maximum representable on the machine,
5994     * just let the range that this would extend have no end */
5995     invlist_set_len(invlist, len - 1);
5996    }
5997    return;
5998   }
5999  }
6000
6001  /* Here the new range doesn't extend any existing set.  Add it */
6002
6003  len += 2; /* Includes an element each for the start and end of range */
6004
6005  /* If overflows the existing space, extend, which may cause the array to be
6006  * moved */
6007  if (max < len) {
6008   invlist_extend(invlist, len);
6009   array = invlist_array(invlist);
6010  }
6011
6012  invlist_set_len(invlist, len);
6013
6014  /* The next item on the list starts the range, the one after that is
6015  * one past the new range.  */
6016  array[len - 2] = start;
6017  if (end != UV_MAX) {
6018   array[len - 1] = end + 1;
6019  }
6020  else {
6021   /* But if the end is the maximum representable on the machine, just let
6022   * the range have no end */
6023   invlist_set_len(invlist, len - 1);
6024  }
6025 }
6026 #endif
6027
6028 STATIC HV*
6029 S_invlist_union(pTHX_ HV* const a, HV* const b)
6030 {
6031  /* Return a new inversion list which is the union of two inversion lists.
6032  * The basis for this comes from "Unicode Demystified" Chapter 13 by
6033  * Richard Gillam, published by Addison-Wesley, and explained at some
6034  * length there.  The preface says to incorporate its examples into your
6035  * code at your own risk.
6036  *
6037  * The algorithm is like a merge sort.
6038  *
6039  * XXX A potential performance improvement is to keep track as we go along
6040  * if only one of the inputs contributes to the result, meaning the other
6041  * is a subset of that one.  In that case, we can skip the final copy and
6042  * return the larger of the input lists */
6043
6044  UV* array_a = invlist_array(a);   /* a's array */
6045  UV* array_b = invlist_array(b);
6046  UV len_a = invlist_len(a); /* length of a's array */
6047  UV len_b = invlist_len(b);
6048
6049  HV* u;   /* the resulting union */
6050  UV* array_u;
6051  UV len_u;
6052
6053  UV i_a = 0;      /* current index into a's array */
6054  UV i_b = 0;
6055  UV i_u = 0;
6056
6057  /* running count, as explained in the algorithm source book; items are
6058  * stopped accumulating and are output when the count changes to/from 0.
6059  * The count is incremented when we start a range that's in the set, and
6060  * decremented when we start a range that's not in the set.  So its range
6061  * is 0 to 2.  Only when the count is zero is something not in the set.
6062  */
6063  UV count = 0;
6064
6065  PERL_ARGS_ASSERT_INVLIST_UNION;
6066
6067  /* Size the union for the worst case: that the sets are completely
6068  * disjoint */
6069  u = _new_invlist(len_a + len_b);
6070  array_u = invlist_array(u);
6071
6072  /* Go through each list item by item, stopping when exhausted one of
6073  * them */
6074  while (i_a < len_a && i_b < len_b) {
6075   UV cp;     /* The element to potentially add to the union's array */
6076   bool cp_in_set;   /* is it in the the input list's set or not */
6077
6078   /* We need to take one or the other of the two inputs for the union.
6079   * Since we are merging two sorted lists, we take the smaller of the
6080   * next items.  In case of a tie, we take the one that is in its set
6081   * first.  If we took one not in the set first, it would decrement the
6082   * count, possibly to 0 which would cause it to be output as ending the
6083   * range, and the next time through we would take the same number, and
6084   * output it again as beginning the next range.  By doing it the
6085   * opposite way, there is no possibility that the count will be
6086   * momentarily decremented to 0, and thus the two adjoining ranges will
6087   * be seamlessly merged.  (In a tie and both are in the set or both not
6088   * in the set, it doesn't matter which we take first.) */
6089   if (array_a[i_a] < array_b[i_b]
6090    || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6091   {
6092    cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6093    cp= array_a[i_a++];
6094   }
6095   else {
6096    cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6097    cp= array_b[i_b++];
6098   }
6099
6100   /* Here, have chosen which of the two inputs to look at.  Only output
6101   * if the running count changes to/from 0, which marks the
6102   * beginning/end of a range in that's in the set */
6103   if (cp_in_set) {
6104    if (count == 0) {
6105     array_u[i_u++] = cp;
6106    }
6107    count++;
6108   }
6109   else {
6110    count--;
6111    if (count == 0) {
6112     array_u[i_u++] = cp;
6113    }
6114   }
6115  }
6116
6117  /* Here, we are finished going through at least one of the lists, which
6118  * means there is something remaining in at most one.  We check if the list
6119  * that hasn't been exhausted is positioned such that we are in the middle
6120  * of a range in its set or not.  (We are in the set if the next item in
6121  * the array marks the beginning of something not in the set)   If in the
6122  * set, we decrement 'count'; if 0, there is potentially more to output.
6123  * There are four cases:
6124  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
6125  *    in the union is entirely from the non-exhausted set.
6126  * 2) Both were in their sets, count is 2.  Nothing further should
6127  *    be output, as everything that remains will be in the exhausted
6128  *    list's set, hence in the union; decrementing to 1 but not 0 insures
6129  *    that
6130  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6131  *    Nothing further should be output because the union includes
6132  *    everything from the exhausted set.  Not decrementing insures that.
6133  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6134  *    decrementing to 0 insures that we look at the remainder of the
6135  *    non-exhausted set */
6136  if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6137   || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6138  {
6139   count--;
6140  }
6141
6142  /* The final length is what we've output so far, plus what else is about to
6143  * be output.  (If 'count' is non-zero, then the input list we exhausted
6144  * has everything remaining up to the machine's limit in its set, and hence
6145  * in the union, so there will be no further output. */
6146  len_u = i_u;
6147  if (count == 0) {
6148   /* At most one of the subexpressions will be non-zero */
6149   len_u += (len_a - i_a) + (len_b - i_b);
6150  }
6151
6152  /* Set result to final length, which can change the pointer to array_u, so
6153  * re-find it */
6154  if (len_u != invlist_len(u)) {
6155   invlist_set_len(u, len_u);
6156   invlist_trim(u);
6157   array_u = invlist_array(u);
6158  }
6159
6160  /* When 'count' is 0, the list that was exhausted (if one was shorter than
6161  * the other) ended with everything above it not in its set.  That means
6162  * that the remaining part of the union is precisely the same as the
6163  * non-exhausted list, so can just copy it unchanged.  (If both list were
6164  * exhausted at the same time, then the operations below will be both 0.)
6165  */
6166  if (count == 0) {
6167   IV copy_count; /* At most one will have a non-zero copy count */
6168   if ((copy_count = len_a - i_a) > 0) {
6169    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6170   }
6171   else if ((copy_count = len_b - i_b) > 0) {
6172    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6173   }
6174  }
6175
6176  return u;
6177 }
6178
6179 STATIC HV*
6180 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6181 {
6182  /* Return the intersection of two inversion lists.  The basis for this
6183  * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6184  * by Addison-Wesley, and explained at some length there.  The preface says
6185  * to incorporate its examples into your code at your own risk.
6186  *
6187  * The algorithm is like a merge sort, and is essentially the same as the
6188  * union above
6189  */
6190
6191  UV* array_a = invlist_array(a);   /* a's array */
6192  UV* array_b = invlist_array(b);
6193  UV len_a = invlist_len(a); /* length of a's array */
6194  UV len_b = invlist_len(b);
6195
6196  HV* r;       /* the resulting intersection */
6197  UV* array_r;
6198  UV len_r;
6199
6200  UV i_a = 0;      /* current index into a's array */
6201  UV i_b = 0;
6202  UV i_r = 0;
6203
6204  /* running count, as explained in the algorithm source book; items are
6205  * stopped accumulating and are output when the count changes to/from 2.
6206  * The count is incremented when we start a range that's in the set, and
6207  * decremented when we start a range that's not in the set.  So its range
6208  * is 0 to 2.  Only when the count is 2 is something in the intersection.
6209  */
6210  UV count = 0;
6211
6212  PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6213
6214  /* Size the intersection for the worst case: that the intersection ends up
6215  * fragmenting everything to be completely disjoint */
6216  r= _new_invlist(len_a + len_b);
6217  array_r = invlist_array(r);
6218
6219  /* Go through each list item by item, stopping when exhausted one of
6220  * them */
6221  while (i_a < len_a && i_b < len_b) {
6222   UV cp;     /* The element to potentially add to the intersection's
6223      array */
6224   bool cp_in_set; /* Is it in the input list's set or not */
6225
6226   /* We need to take one or the other of the two inputs for the union.
6227   * Since we are merging two sorted lists, we take the smaller of the
6228   * next items.  In case of a tie, we take the one that is not in its
6229   * set first (a difference from the union algorithm).  If we took one
6230   * in the set first, it would increment the count, possibly to 2 which
6231   * would cause it to be output as starting a range in the intersection,
6232   * and the next time through we would take that same number, and output
6233   * it again as ending the set.  By doing it the opposite of this, we
6234   * there is no possibility that the count will be momentarily
6235   * incremented to 2.  (In a tie and both are in the set or both not in
6236   * the set, it doesn't matter which we take first.) */
6237   if (array_a[i_a] < array_b[i_b]
6238    || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6239   {
6240    cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6241    cp= array_a[i_a++];
6242   }
6243   else {
6244    cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6245    cp= array_b[i_b++];
6246   }
6247
6248   /* Here, have chosen which of the two inputs to look at.  Only output
6249   * if the running count changes to/from 2, which marks the
6250   * beginning/end of a range that's in the intersection */
6251   if (cp_in_set) {
6252    count++;
6253    if (count == 2) {
6254     array_r[i_r++] = cp;
6255    }
6256   }
6257   else {
6258    if (count == 2) {
6259     array_r[i_r++] = cp;
6260    }
6261    count--;
6262   }
6263  }
6264
6265  /* Here, we are finished going through at least one of the sets, which
6266  * means there is something remaining in at most one.  See the comments in
6267  * the union code */
6268  if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6269   || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6270  {
6271   count--;
6272  }
6273
6274  /* The final length is what we've output so far plus what else is in the
6275  * intersection.  Only one of the subexpressions below will be non-zero */
6276  len_r = i_r;
6277  if (count == 2) {
6278   len_r += (len_a - i_a) + (len_b - i_b);
6279  }
6280
6281  /* Set result to final length, which can change the pointer to array_r, so
6282  * re-find it */
6283  if (len_r != invlist_len(r)) {
6284   invlist_set_len(r, len_r);
6285   invlist_trim(r);
6286   array_r = invlist_array(r);
6287  }
6288
6289  /* Finish outputting any remaining */
6290  if (count == 2) { /* Only one of will have a non-zero copy count */
6291   IV copy_count;
6292   if ((copy_count = len_a - i_a) > 0) {
6293    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6294   }
6295   else if ((copy_count = len_b - i_b) > 0) {
6296    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6297   }
6298  }
6299
6300  return r;
6301 }
6302
6303 STATIC HV*
6304 S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
6305 {
6306  /* Add the range from 'start' to 'end' inclusive to the inversion list's
6307  * set.  A pointer to the inversion list is returned.  This may actually be
6308  * a new list, in which case the passed in one has been destroyed.  The
6309  * passed in inversion list can be NULL, in which case a new one is created
6310  * with just the one range in it */
6311
6312  HV* range_invlist;
6313  HV* added_invlist;
6314  UV len;
6315
6316  if (invlist == NULL) {
6317   invlist = _new_invlist(2);
6318   len = 0;
6319  }
6320  else {
6321   len = invlist_len(invlist);
6322  }
6323
6324  /* If comes after the final entry, can just append it to the end */
6325  if (len == 0
6326   || start >= invlist_array(invlist)
6327          [invlist_len(invlist) - 1])
6328  {
6329   _append_range_to_invlist(invlist, start, end);
6330   return invlist;
6331  }
6332
6333  /* Here, can't just append things, create and return a new inversion list
6334  * which is the union of this range and the existing inversion list */
6335  range_invlist = _new_invlist(2);
6336  _append_range_to_invlist(range_invlist, start, end);
6337
6338  added_invlist = invlist_union(invlist, range_invlist);
6339
6340  /* The passed in list can be freed, as well as our temporary */
6341  invlist_destroy(range_invlist);
6342  if (invlist != added_invlist) {
6343   invlist_destroy(invlist);
6344  }
6345
6346  return added_invlist;
6347 }
6348
6349 PERL_STATIC_INLINE HV*
6350 S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6351  return add_range_to_invlist(invlist, cp, cp);
6352 }
6353
6354 /* End of inversion list object */
6355
6356 /*
6357  - reg - regular expression, i.e. main body or parenthesized thing
6358  *
6359  * Caller must absorb opening parenthesis.
6360  *
6361  * Combining parenthesis handling with the base level of regular expression
6362  * is a trifle forced, but the need to tie the tails of the branches to what
6363  * follows makes it hard to avoid.
6364  */
6365 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6366 #ifdef DEBUGGING
6367 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6368 #else
6369 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6370 #endif
6371
6372 STATIC regnode *
6373 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6374  /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6375 {
6376  dVAR;
6377  register regnode *ret;  /* Will be the head of the group. */
6378  register regnode *br;
6379  register regnode *lastbr;
6380  register regnode *ender = NULL;
6381  register I32 parno = 0;
6382  I32 flags;
6383  U32 oregflags = RExC_flags;
6384  bool have_branch = 0;
6385  bool is_open = 0;
6386  I32 freeze_paren = 0;
6387  I32 after_freeze = 0;
6388
6389  /* for (?g), (?gc), and (?o) warnings; warning
6390  about (?c) will warn about (?g) -- japhy    */
6391
6392 #define WASTED_O  0x01
6393 #define WASTED_G  0x02
6394 #define WASTED_C  0x04
6395 #define WASTED_GC (0x02|0x04)
6396  I32 wastedflags = 0x00;
6397
6398  char * parse_start = RExC_parse; /* MJD */
6399  char * const oregcomp_parse = RExC_parse;
6400
6401  GET_RE_DEBUG_FLAGS_DECL;
6402
6403  PERL_ARGS_ASSERT_REG;
6404  DEBUG_PARSE("reg ");
6405
6406  *flagp = 0;    /* Tentatively. */
6407
6408
6409  /* Make an OPEN node, if parenthesized. */
6410  if (paren) {
6411   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6412    char *start_verb = RExC_parse;
6413    STRLEN verb_len = 0;
6414    char *start_arg = NULL;
6415    unsigned char op = 0;
6416    int argok = 1;
6417    int internal_argval = 0; /* internal_argval is only useful if !argok */
6418    while ( *RExC_parse && *RExC_parse != ')' ) {
6419     if ( *RExC_parse == ':' ) {
6420      start_arg = RExC_parse + 1;
6421      break;
6422     }
6423     RExC_parse++;
6424    }
6425    ++start_verb;
6426    verb_len = RExC_parse - start_verb;
6427    if ( start_arg ) {
6428     RExC_parse++;
6429     while ( *RExC_parse && *RExC_parse != ')' )
6430      RExC_parse++;
6431     if ( *RExC_parse != ')' )
6432      vFAIL("Unterminated verb pattern argument");
6433     if ( RExC_parse == start_arg )
6434      start_arg = NULL;
6435    } else {
6436     if ( *RExC_parse != ')' )
6437      vFAIL("Unterminated verb pattern");
6438    }
6439
6440    switch ( *start_verb ) {
6441    case 'A':  /* (*ACCEPT) */
6442     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6443      op = ACCEPT;
6444      internal_argval = RExC_nestroot;
6445     }
6446     break;
6447    case 'C':  /* (*COMMIT) */
6448     if ( memEQs(start_verb,verb_len,"COMMIT") )
6449      op = COMMIT;
6450     break;
6451    case 'F':  /* (*FAIL) */
6452     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6453      op = OPFAIL;
6454      argok = 0;
6455     }
6456     break;
6457    case ':':  /* (*:NAME) */
6458    case 'M':  /* (*MARK:NAME) */
6459     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6460      op = MARKPOINT;
6461      argok = -1;
6462     }
6463     break;
6464    case 'P':  /* (*PRUNE) */
6465     if ( memEQs(start_verb,verb_len,"PRUNE") )
6466      op = PRUNE;
6467     break;
6468    case 'S':   /* (*SKIP) */
6469     if ( memEQs(start_verb,verb_len,"SKIP") )
6470      op = SKIP;
6471     break;
6472    case 'T':  /* (*THEN) */
6473     /* [19:06] <TimToady> :: is then */
6474     if ( memEQs(start_verb,verb_len,"THEN") ) {
6475      op = CUTGROUP;
6476      RExC_seen |= REG_SEEN_CUTGROUP;
6477     }
6478     break;
6479    }
6480    if ( ! op ) {
6481     RExC_parse++;
6482     vFAIL3("Unknown verb pattern '%.*s'",
6483      verb_len, start_verb);
6484    }
6485    if ( argok ) {
6486     if ( start_arg && internal_argval ) {
6487      vFAIL3("Verb pattern '%.*s' may not have an argument",
6488       verb_len, start_verb);
6489     } else if ( argok < 0 && !start_arg ) {
6490      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6491       verb_len, start_verb);
6492     } else {
6493      ret = reganode(pRExC_state, op, internal_argval);
6494      if ( ! internal_argval && ! SIZE_ONLY ) {
6495       if (start_arg) {
6496        SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6497        ARG(ret) = add_data( pRExC_state, 1, "S" );
6498        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6499        ret->flags = 0;
6500       } else {
6501        ret->flags = 1;
6502       }
6503      }
6504     }
6505     if (!internal_argval)
6506      RExC_seen |= REG_SEEN_VERBARG;
6507    } else if ( start_arg ) {
6508     vFAIL3("Verb pattern '%.*s' may not have an argument",
6509       verb_len, start_verb);
6510    } else {
6511     ret = reg_node(pRExC_state, op);
6512    }
6513    nextchar(pRExC_state);
6514    return ret;
6515   } else
6516   if (*RExC_parse == '?') { /* (?...) */
6517    bool is_logical = 0;
6518    const char * const seqstart = RExC_parse;
6519    bool has_use_defaults = FALSE;
6520
6521    RExC_parse++;
6522    paren = *RExC_parse++;
6523    ret = NULL;   /* For look-ahead/behind. */
6524    switch (paren) {
6525
6526    case 'P': /* (?P...) variants for those used to PCRE/Python */
6527     paren = *RExC_parse++;
6528     if ( paren == '<')         /* (?P<...>) named capture */
6529      goto named_capture;
6530     else if (paren == '>') {   /* (?P>name) named recursion */
6531      goto named_recursion;
6532     }
6533     else if (paren == '=') {   /* (?P=...)  named backref */
6534      /* this pretty much dupes the code for \k<NAME> in regatom(), if
6535      you change this make sure you change that */
6536      char* name_start = RExC_parse;
6537      U32 num = 0;
6538      SV *sv_dat = reg_scan_name(pRExC_state,
6539       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6540      if (RExC_parse == name_start || *RExC_parse != ')')
6541       vFAIL2("Sequence %.3s... not terminated",parse_start);
6542
6543      if (!SIZE_ONLY) {
6544       num = add_data( pRExC_state, 1, "S" );
6545       RExC_rxi->data->data[num]=(void*)sv_dat;
6546       SvREFCNT_inc_simple_void(sv_dat);
6547      }
6548      RExC_sawback = 1;
6549      ret = reganode(pRExC_state,
6550         ((! FOLD)
6551          ? NREF
6552          : (MORE_ASCII_RESTRICTED)
6553          ? NREFFA
6554          : (AT_LEAST_UNI_SEMANTICS)
6555           ? NREFFU
6556           : (LOC)
6557           ? NREFFL
6558           : NREFF),
6559          num);
6560      *flagp |= HASWIDTH;
6561
6562      Set_Node_Offset(ret, parse_start+1);
6563      Set_Node_Cur_Length(ret); /* MJD */
6564
6565      nextchar(pRExC_state);
6566      return ret;
6567     }
6568     RExC_parse++;
6569     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6570     /*NOTREACHED*/
6571    case '<':           /* (?<...) */
6572     if (*RExC_parse == '!')
6573      paren = ',';
6574     else if (*RExC_parse != '=')
6575    named_capture:
6576     {               /* (?<...>) */
6577      char *name_start;
6578      SV *svname;
6579      paren= '>';
6580    case '\'':          /* (?'...') */
6581       name_start= RExC_parse;
6582       svname = reg_scan_name(pRExC_state,
6583        SIZE_ONLY ?  /* reverse test from the others */
6584        REG_RSN_RETURN_NAME :
6585        REG_RSN_RETURN_NULL);
6586      if (RExC_parse == name_start) {
6587       RExC_parse++;
6588       vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6589       /*NOTREACHED*/
6590      }
6591      if (*RExC_parse != paren)
6592       vFAIL2("Sequence (?%c... not terminated",
6593        paren=='>' ? '<' : paren);
6594      if (SIZE_ONLY) {
6595       HE *he_str;
6596       SV *sv_dat = NULL;
6597       if (!svname) /* shouldn't happen */
6598        Perl_croak(aTHX_
6599         "panic: reg_scan_name returned NULL");
6600       if (!RExC_paren_names) {
6601        RExC_paren_names= newHV();
6602        sv_2mortal(MUTABLE_SV(RExC_paren_names));
6603 #ifdef DEBUGGING
6604        RExC_paren_name_list= newAV();
6605        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6606 #endif
6607       }
6608       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6609       if ( he_str )
6610        sv_dat = HeVAL(he_str);
6611       if ( ! sv_dat ) {
6612        /* croak baby croak */
6613        Perl_croak(aTHX_
6614         "panic: paren_name hash element allocation failed");
6615       } else if ( SvPOK(sv_dat) ) {
6616        /* (?|...) can mean we have dupes so scan to check
6617        its already been stored. Maybe a flag indicating
6618        we are inside such a construct would be useful,
6619        but the arrays are likely to be quite small, so
6620        for now we punt -- dmq */
6621        IV count = SvIV(sv_dat);
6622        I32 *pv = (I32*)SvPVX(sv_dat);
6623        IV i;
6624        for ( i = 0 ; i < count ; i++ ) {
6625         if ( pv[i] == RExC_npar ) {
6626          count = 0;
6627          break;
6628         }
6629        }
6630        if ( count ) {
6631         pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6632         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6633         pv[count] = RExC_npar;
6634         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6635        }
6636       } else {
6637        (void)SvUPGRADE(sv_dat,SVt_PVNV);
6638        sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6639        SvIOK_on(sv_dat);
6640        SvIV_set(sv_dat, 1);
6641       }
6642 #ifdef DEBUGGING
6643       if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6644        SvREFCNT_dec(svname);
6645 #endif
6646
6647       /*sv_dump(sv_dat);*/
6648      }
6649      nextchar(pRExC_state);
6650      paren = 1;
6651      goto capturing_parens;
6652     }
6653     RExC_seen |= REG_SEEN_LOOKBEHIND;
6654     RExC_in_lookbehind++;
6655     RExC_parse++;
6656    case '=':           /* (?=...) */
6657     RExC_seen_zerolen++;
6658     break;
6659    case '!':           /* (?!...) */
6660     RExC_seen_zerolen++;
6661     if (*RExC_parse == ')') {
6662      ret=reg_node(pRExC_state, OPFAIL);
6663      nextchar(pRExC_state);
6664      return ret;
6665     }
6666     break;
6667    case '|':           /* (?|...) */
6668     /* branch reset, behave like a (?:...) except that
6669     buffers in alternations share the same numbers */
6670     paren = ':';
6671     after_freeze = freeze_paren = RExC_npar;
6672     break;
6673    case ':':           /* (?:...) */
6674    case '>':           /* (?>...) */
6675     break;
6676    case '$':           /* (?$...) */
6677    case '@':           /* (?@...) */
6678     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6679     break;
6680    case '#':           /* (?#...) */
6681     while (*RExC_parse && *RExC_parse != ')')
6682      RExC_parse++;
6683     if (*RExC_parse != ')')
6684      FAIL("Sequence (?#... not terminated");
6685     nextchar(pRExC_state);
6686     *flagp = TRYAGAIN;
6687     return NULL;
6688    case '0' :           /* (?0) */
6689    case 'R' :           /* (?R) */
6690     if (*RExC_parse != ')')
6691      FAIL("Sequence (?R) not terminated");
6692     ret = reg_node(pRExC_state, GOSTART);
6693     *flagp |= POSTPONED;
6694     nextchar(pRExC_state);
6695     return ret;
6696     /*notreached*/
6697    { /* named and numeric backreferences */
6698     I32 num;
6699    case '&':            /* (?&NAME) */
6700     parse_start = RExC_parse - 1;
6701    named_recursion:
6702     {
6703       SV *sv_dat = reg_scan_name(pRExC_state,
6704        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6705       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6706     }
6707     goto gen_recurse_regop;
6708     /* NOT REACHED */
6709    case '+':
6710     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6711      RExC_parse++;
6712      vFAIL("Illegal pattern");
6713     }
6714     goto parse_recursion;
6715     /* NOT REACHED*/
6716    case '-': /* (?-1) */
6717     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6718      RExC_parse--; /* rewind to let it be handled later */
6719      goto parse_flags;
6720     }
6721     /*FALLTHROUGH */
6722    case '1': case '2': case '3': case '4': /* (?1) */
6723    case '5': case '6': case '7': case '8': case '9':
6724     RExC_parse--;
6725    parse_recursion:
6726     num = atoi(RExC_parse);
6727     parse_start = RExC_parse - 1; /* MJD */
6728     if (*RExC_parse == '-')
6729      RExC_parse++;
6730     while (isDIGIT(*RExC_parse))
6731       RExC_parse++;
6732     if (*RExC_parse!=')')
6733      vFAIL("Expecting close bracket");
6734
6735    gen_recurse_regop:
6736     if ( paren == '-' ) {
6737      /*
6738      Diagram of capture buffer numbering.
6739      Top line is the normal capture buffer numbers
6740      Bottom line is the negative indexing as from
6741      the X (the (?-2))
6742
6743      +   1 2    3 4 5 X          6 7
6744      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6745      -   5 4    3 2 1 X          x x
6746
6747      */
6748      num = RExC_npar + num;
6749      if (num < 1)  {
6750       RExC_parse++;
6751       vFAIL("Reference to nonexistent group");
6752      }
6753     } else if ( paren == '+' ) {
6754      num = RExC_npar + num - 1;
6755     }
6756
6757     ret = reganode(pRExC_state, GOSUB, num);
6758     if (!SIZE_ONLY) {
6759      if (num > (I32)RExC_rx->nparens) {
6760       RExC_parse++;
6761       vFAIL("Reference to nonexistent group");
6762      }
6763      ARG2L_SET( ret, RExC_recurse_count++);
6764      RExC_emit++;
6765      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6766       "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6767     } else {
6768      RExC_size++;
6769      }
6770      RExC_seen |= REG_SEEN_RECURSE;
6771     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6772     Set_Node_Offset(ret, parse_start); /* MJD */
6773
6774     *flagp |= POSTPONED;
6775     nextchar(pRExC_state);
6776     return ret;
6777    } /* named and numeric backreferences */
6778    /* NOT REACHED */
6779
6780    case '?':           /* (??...) */
6781     is_logical = 1;
6782     if (*RExC_parse != '{') {
6783      RExC_parse++;
6784      vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6785      /*NOTREACHED*/
6786     }
6787     *flagp |= POSTPONED;
6788     paren = *RExC_parse++;
6789     /* FALL THROUGH */
6790    case '{':           /* (?{...}) */
6791    {
6792     I32 count = 1;
6793     U32 n = 0;
6794     char c;
6795     char *s = RExC_parse;
6796
6797     RExC_seen_zerolen++;
6798     RExC_seen |= REG_SEEN_EVAL;
6799     while (count && (c = *RExC_parse)) {
6800      if (c == '\\') {
6801       if (RExC_parse[1])
6802        RExC_parse++;
6803      }
6804      else if (c == '{')
6805       count++;
6806      else if (c == '}')
6807       count--;
6808      RExC_parse++;
6809     }
6810     if (*RExC_parse != ')') {
6811      RExC_parse = s;
6812      vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6813     }
6814     if (!SIZE_ONLY) {
6815      PAD *pad;
6816      OP_4tree *sop, *rop;
6817      SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6818
6819      ENTER;
6820      Perl_save_re_context(aTHX);
6821      rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6822      sop->op_private |= OPpREFCOUNTED;
6823      /* re_dup will OpREFCNT_inc */
6824      OpREFCNT_set(sop, 1);
6825      LEAVE;
6826
6827      n = add_data(pRExC_state, 3, "nop");
6828      RExC_rxi->data->data[n] = (void*)rop;
6829      RExC_rxi->data->data[n+1] = (void*)sop;
6830      RExC_rxi->data->data[n+2] = (void*)pad;
6831      SvREFCNT_dec(sv);
6832     }
6833     else {      /* First pass */
6834      if (PL_reginterp_cnt < ++RExC_seen_evals
6835       && IN_PERL_RUNTIME)
6836       /* No compiled RE interpolated, has runtime
6837       components ===> unsafe.  */
6838       FAIL("Eval-group not allowed at runtime, use re 'eval'");
6839      if (PL_tainting && PL_tainted)
6840       FAIL("Eval-group in insecure regular expression");
6841 #if PERL_VERSION > 8
6842      if (IN_PERL_COMPILETIME)
6843       PL_cv_has_eval = 1;
6844 #endif
6845     }
6846
6847     nextchar(pRExC_state);
6848     if (is_logical) {
6849      ret = reg_node(pRExC_state, LOGICAL);
6850      if (!SIZE_ONLY)
6851       ret->flags = 2;
6852      REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6853      /* deal with the length of this later - MJD */
6854      return ret;
6855     }
6856     ret = reganode(pRExC_state, EVAL, n);
6857     Set_Node_Length(ret, RExC_parse - parse_start + 1);
6858     Set_Node_Offset(ret, parse_start);
6859     return ret;
6860    }
6861    case '(':           /* (?(?{...})...) and (?(?=...)...) */
6862    {
6863     int is_define= 0;
6864     if (RExC_parse[0] == '?') {        /* (?(?...)) */
6865      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6866       || RExC_parse[1] == '<'
6867       || RExC_parse[1] == '{') { /* Lookahead or eval. */
6868       I32 flag;
6869
6870       ret = reg_node(pRExC_state, LOGICAL);
6871       if (!SIZE_ONLY)
6872        ret->flags = 1;
6873       REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6874       goto insert_if;
6875      }
6876     }
6877     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6878       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6879     {
6880      char ch = RExC_parse[0] == '<' ? '>' : '\'';
6881      char *name_start= RExC_parse++;
6882      U32 num = 0;
6883      SV *sv_dat=reg_scan_name(pRExC_state,
6884       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6885      if (RExC_parse == name_start || *RExC_parse != ch)
6886       vFAIL2("Sequence (?(%c... not terminated",
6887        (ch == '>' ? '<' : ch));
6888      RExC_parse++;
6889      if (!SIZE_ONLY) {
6890       num = add_data( pRExC_state, 1, "S" );
6891       RExC_rxi->data->data[num]=(void*)sv_dat;
6892       SvREFCNT_inc_simple_void(sv_dat);
6893      }
6894      ret = reganode(pRExC_state,NGROUPP,num);
6895      goto insert_if_check_paren;
6896     }
6897     else if (RExC_parse[0] == 'D' &&
6898       RExC_parse[1] == 'E' &&
6899       RExC_parse[2] == 'F' &&
6900       RExC_parse[3] == 'I' &&
6901       RExC_parse[4] == 'N' &&
6902       RExC_parse[5] == 'E')
6903     {
6904      ret = reganode(pRExC_state,DEFINEP,0);
6905      RExC_parse +=6 ;
6906      is_define = 1;
6907      goto insert_if_check_paren;
6908     }
6909     else if (RExC_parse[0] == 'R') {
6910      RExC_parse++;
6911      parno = 0;
6912      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6913       parno = atoi(RExC_parse++);
6914       while (isDIGIT(*RExC_parse))
6915        RExC_parse++;
6916      } else if (RExC_parse[0] == '&') {
6917       SV *sv_dat;
6918       RExC_parse++;
6919       sv_dat = reg_scan_name(pRExC_state,
6920         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6921        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6922      }
6923      ret = reganode(pRExC_state,INSUBP,parno);
6924      goto insert_if_check_paren;
6925     }
6926     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6927      /* (?(1)...) */
6928      char c;
6929      parno = atoi(RExC_parse++);
6930
6931      while (isDIGIT(*RExC_parse))
6932       RExC_parse++;
6933      ret = reganode(pRExC_state, GROUPP, parno);
6934
6935     insert_if_check_paren:
6936      if ((c = *nextchar(pRExC_state)) != ')')
6937       vFAIL("Switch condition not recognized");
6938     insert_if:
6939      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6940      br = regbranch(pRExC_state, &flags, 1,depth+1);
6941      if (br == NULL)
6942       br = reganode(pRExC_state, LONGJMP, 0);
6943      else
6944       REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6945      c = *nextchar(pRExC_state);
6946      if (flags&HASWIDTH)
6947       *flagp |= HASWIDTH;
6948      if (c == '|') {
6949       if (is_define)
6950        vFAIL("(?(DEFINE)....) does not allow branches");
6951       lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6952       regbranch(pRExC_state, &flags, 1,depth+1);
6953       REGTAIL(pRExC_state, ret, lastbr);
6954       if (flags&HASWIDTH)
6955        *flagp |= HASWIDTH;
6956       c = *nextchar(pRExC_state);
6957      }
6958      else
6959       lastbr = NULL;
6960      if (c != ')')
6961       vFAIL("Switch (?(condition)... contains too many branches");
6962      ender = reg_node(pRExC_state, TAIL);
6963      REGTAIL(pRExC_state, br, ender);
6964      if (lastbr) {
6965       REGTAIL(pRExC_state, lastbr, ender);
6966       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6967      }
6968      else
6969       REGTAIL(pRExC_state, ret, ender);
6970      RExC_size++; /* XXX WHY do we need this?!!
6971          For large programs it seems to be required
6972          but I can't figure out why. -- dmq*/
6973      return ret;
6974     }
6975     else {
6976      vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6977     }
6978    }
6979    case 0:
6980     RExC_parse--; /* for vFAIL to print correctly */
6981     vFAIL("Sequence (? incomplete");
6982     break;
6983    case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6984          that follow */
6985     has_use_defaults = TRUE;
6986     STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6987     set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
6988             ? REGEX_UNICODE_CHARSET
6989             : REGEX_DEPENDS_CHARSET);
6990     goto parse_flags;
6991    default:
6992     --RExC_parse;
6993     parse_flags:      /* (?i) */
6994    {
6995     U32 posflags = 0, negflags = 0;
6996     U32 *flagsp = &posflags;
6997     char has_charset_modifier = '\0';
6998     regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
6999          ? REGEX_UNICODE_CHARSET
7000          : REGEX_DEPENDS_CHARSET;
7001
7002     while (*RExC_parse) {
7003      /* && strchr("iogcmsx", *RExC_parse) */
7004      /* (?g), (?gc) and (?o) are useless here
7005      and must be globally applied -- japhy */
7006      switch (*RExC_parse) {
7007      CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7008      case LOCALE_PAT_MOD:
7009       if (has_charset_modifier) {
7010        goto excess_modifier;
7011       }
7012       else if (flagsp == &negflags) {
7013        goto neg_modifier;
7014       }
7015       cs = REGEX_LOCALE_CHARSET;
7016       has_charset_modifier = LOCALE_PAT_MOD;
7017       RExC_contains_locale = 1;
7018       break;
7019      case UNICODE_PAT_MOD:
7020       if (has_charset_modifier) {
7021        goto excess_modifier;
7022       }
7023       else if (flagsp == &negflags) {
7024        goto neg_modifier;
7025       }
7026       cs = REGEX_UNICODE_CHARSET;
7027       has_charset_modifier = UNICODE_PAT_MOD;
7028       break;
7029      case ASCII_RESTRICT_PAT_MOD:
7030       if (flagsp == &negflags) {
7031        goto neg_modifier;
7032       }
7033       if (has_charset_modifier) {
7034        if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
7035         goto excess_modifier;
7036        }
7037        /* Doubled modifier implies more restricted */
7038        cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7039       }
7040       else {
7041        cs = REGEX_ASCII_RESTRICTED_CHARSET;
7042       }
7043       has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
7044       break;
7045      case DEPENDS_PAT_MOD:
7046       if (has_use_defaults) {
7047        goto fail_modifiers;
7048       }
7049       else if (flagsp == &negflags) {
7050        goto neg_modifier;
7051       }
7052       else if (has_charset_modifier) {
7053        goto excess_modifier;
7054       }
7055
7056       /* The dual charset means unicode semantics if the
7057       * pattern (or target, not known until runtime) are
7058       * utf8, or something in the pattern indicates unicode
7059       * semantics */
7060       cs = (RExC_utf8 || RExC_uni_semantics)
7061        ? REGEX_UNICODE_CHARSET
7062        : REGEX_DEPENDS_CHARSET;
7063       has_charset_modifier = DEPENDS_PAT_MOD;
7064       break;
7065      excess_modifier:
7066       RExC_parse++;
7067       if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
7068        vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
7069       }
7070       else if (has_charset_modifier == *(RExC_parse - 1)) {
7071        vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
7072       }
7073       else {
7074        vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
7075       }
7076       /*NOTREACHED*/
7077      neg_modifier:
7078       RExC_parse++;
7079       vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
7080       /*NOTREACHED*/
7081      case ONCE_PAT_MOD: /* 'o' */
7082      case GLOBAL_PAT_MOD: /* 'g' */
7083       if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7084        const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7085        if (! (wastedflags & wflagbit) ) {
7086         wastedflags |= wflagbit;
7087         vWARN5(
7088          RExC_parse + 1,
7089          "Useless (%s%c) - %suse /%c modifier",
7090          flagsp == &negflags ? "?-" : "?",
7091          *RExC_parse,
7092          flagsp == &negflags ? "don't " : "",
7093          *RExC_parse
7094         );
7095        }
7096       }
7097       break;
7098
7099      case CONTINUE_PAT_MOD: /* 'c' */
7100       if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7101        if (! (wastedflags & WASTED_C) ) {
7102         wastedflags |= WASTED_GC;
7103         vWARN3(
7104          RExC_parse + 1,
7105          "Useless (%sc) - %suse /gc modifier",
7106          flagsp == &negflags ? "?-" : "?",
7107          flagsp == &negflags ? "don't " : ""
7108         );
7109        }
7110       }
7111       break;
7112      case KEEPCOPY_PAT_MOD: /* 'p' */
7113       if (flagsp == &negflags) {
7114        if (SIZE_ONLY)
7115         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7116       } else {
7117        *flagsp |= RXf_PMf_KEEPCOPY;
7118       }
7119       break;
7120      case '-':
7121       /* A flag is a default iff it is following a minus, so
7122       * if there is a minus, it means will be trying to
7123       * re-specify a default which is an error */
7124       if (has_use_defaults || flagsp == &negflags) {
7125    fail_modifiers:
7126        RExC_parse++;
7127        vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7128        /*NOTREACHED*/
7129       }
7130       flagsp = &negflags;
7131       wastedflags = 0;  /* reset so (?g-c) warns twice */
7132       break;
7133      case ':':
7134       paren = ':';
7135       /*FALLTHROUGH*/
7136      case ')':
7137       RExC_flags |= posflags;
7138       RExC_flags &= ~negflags;
7139       set_regex_charset(&RExC_flags, cs);
7140       if (paren != ':') {
7141        oregflags |= posflags;
7142        oregflags &= ~negflags;
7143        set_regex_charset(&oregflags, cs);
7144       }
7145       nextchar(pRExC_state);
7146       if (paren != ':') {
7147        *flagp = TRYAGAIN;
7148        return NULL;
7149       } else {
7150        ret = NULL;
7151        goto parse_rest;
7152       }
7153       /*NOTREACHED*/
7154      default:
7155       RExC_parse++;
7156       vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7157       /*NOTREACHED*/
7158      }
7159      ++RExC_parse;
7160     }
7161    }} /* one for the default block, one for the switch */
7162   }
7163   else {                  /* (...) */
7164   capturing_parens:
7165    parno = RExC_npar;
7166    RExC_npar++;
7167
7168    ret = reganode(pRExC_state, OPEN, parno);
7169    if (!SIZE_ONLY ){
7170     if (!RExC_nestroot)
7171      RExC_nestroot = parno;
7172     if (RExC_seen & REG_SEEN_RECURSE
7173      && !RExC_open_parens[parno-1])
7174     {
7175      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7176       "Setting open paren #%"IVdf" to %d\n",
7177       (IV)parno, REG_NODE_NUM(ret)));
7178      RExC_open_parens[parno-1]= ret;
7179     }
7180    }
7181    Set_Node_Length(ret, 1); /* MJD */
7182    Set_Node_Offset(ret, RExC_parse); /* MJD */
7183    is_open = 1;
7184   }
7185  }
7186  else                        /* ! paren */
7187   ret = NULL;
7188
7189    parse_rest:
7190  /* Pick up the branches, linking them together. */
7191  parse_start = RExC_parse;   /* MJD */
7192  br = regbranch(pRExC_state, &flags, 1,depth+1);
7193
7194  /*     branch_len = (paren != 0); */
7195
7196  if (br == NULL)
7197   return(NULL);
7198  if (*RExC_parse == '|') {
7199   if (!SIZE_ONLY && RExC_extralen) {
7200    reginsert(pRExC_state, BRANCHJ, br, depth+1);
7201   }
7202   else {                  /* MJD */
7203    reginsert(pRExC_state, BRANCH, br, depth+1);
7204    Set_Node_Length(br, paren != 0);
7205    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7206   }
7207   have_branch = 1;
7208   if (SIZE_ONLY)
7209    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
7210  }
7211  else if (paren == ':') {
7212   *flagp |= flags&SIMPLE;
7213  }
7214  if (is_open) {    /* Starts with OPEN. */
7215   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
7216  }
7217  else if (paren != '?')  /* Not Conditional */
7218   ret = br;
7219  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7220  lastbr = br;
7221  while (*RExC_parse == '|') {
7222   if (!SIZE_ONLY && RExC_extralen) {
7223    ender = reganode(pRExC_state, LONGJMP,0);
7224    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7225   }
7226   if (SIZE_ONLY)
7227    RExC_extralen += 2;  /* Account for LONGJMP. */
7228   nextchar(pRExC_state);
7229   if (freeze_paren) {
7230    if (RExC_npar > after_freeze)
7231     after_freeze = RExC_npar;
7232    RExC_npar = freeze_paren;
7233   }
7234   br = regbranch(pRExC_state, &flags, 0, depth+1);
7235
7236   if (br == NULL)
7237    return(NULL);
7238   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
7239   lastbr = br;
7240   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7241  }
7242
7243  if (have_branch || paren != ':') {
7244   /* Make a closing node, and hook it on the end. */
7245   switch (paren) {
7246   case ':':
7247    ender = reg_node(pRExC_state, TAIL);
7248    break;
7249   case 1:
7250    ender = reganode(pRExC_state, CLOSE, parno);
7251    if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7252     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7253       "Setting close paren #%"IVdf" to %d\n",
7254       (IV)parno, REG_NODE_NUM(ender)));
7255     RExC_close_parens[parno-1]= ender;
7256     if (RExC_nestroot == parno)
7257      RExC_nestroot = 0;
7258    }
7259    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7260    Set_Node_Length(ender,1); /* MJD */
7261    break;
7262   case '<':
7263   case ',':
7264   case '=':
7265   case '!':
7266    *flagp &= ~HASWIDTH;
7267    /* FALL THROUGH */
7268   case '>':
7269    ender = reg_node(pRExC_state, SUCCEED);
7270    break;
7271   case 0:
7272    ender = reg_node(pRExC_state, END);
7273    if (!SIZE_ONLY) {
7274     assert(!RExC_opend); /* there can only be one! */
7275     RExC_opend = ender;
7276    }
7277    break;
7278   }
7279   REGTAIL(pRExC_state, lastbr, ender);
7280
7281   if (have_branch && !SIZE_ONLY) {
7282    if (depth==1)
7283     RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7284
7285    /* Hook the tails of the branches to the closing node. */
7286    for (br = ret; br; br = regnext(br)) {
7287     const U8 op = PL_regkind[OP(br)];
7288     if (op == BRANCH) {
7289      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7290     }
7291     else if (op == BRANCHJ) {
7292      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7293     }
7294    }
7295   }
7296  }
7297
7298  {
7299   const char *p;
7300   static const char parens[] = "=!<,>";
7301
7302   if (paren && (p = strchr(parens, paren))) {
7303    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7304    int flag = (p - parens) > 1;
7305
7306    if (paren == '>')
7307     node = SUSPEND, flag = 0;
7308    reginsert(pRExC_state, node,ret, depth+1);
7309    Set_Node_Cur_Length(ret);
7310    Set_Node_Offset(ret, parse_start + 1);
7311    ret->flags = flag;
7312    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7313   }
7314  }
7315
7316  /* Check for proper termination. */
7317  if (paren) {
7318   RExC_flags = oregflags;
7319   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7320    RExC_parse = oregcomp_parse;
7321    vFAIL("Unmatched (");
7322   }
7323  }
7324  else if (!paren && RExC_parse < RExC_end) {
7325   if (*RExC_parse == ')') {
7326    RExC_parse++;
7327    vFAIL("Unmatched )");
7328   }
7329   else
7330    FAIL("Junk on end of regexp"); /* "Can't happen". */
7331   /* NOTREACHED */
7332  }
7333
7334  if (RExC_in_lookbehind) {
7335   RExC_in_lookbehind--;
7336  }
7337  if (after_freeze > RExC_npar)
7338   RExC_npar = after_freeze;
7339  return(ret);
7340 }
7341
7342 /*
7343  - regbranch - one alternative of an | operator
7344  *
7345  * Implements the concatenation operator.
7346  */
7347 STATIC regnode *
7348 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7349 {
7350  dVAR;
7351  register regnode *ret;
7352  register regnode *chain = NULL;
7353  register regnode *latest;
7354  I32 flags = 0, c = 0;
7355  GET_RE_DEBUG_FLAGS_DECL;
7356
7357  PERL_ARGS_ASSERT_REGBRANCH;
7358
7359  DEBUG_PARSE("brnc");
7360
7361  if (first)
7362   ret = NULL;
7363  else {
7364   if (!SIZE_ONLY && RExC_extralen)
7365    ret = reganode(pRExC_state, BRANCHJ,0);
7366   else {
7367    ret = reg_node(pRExC_state, BRANCH);
7368    Set_Node_Length(ret, 1);
7369   }
7370  }
7371
7372  if (!first && SIZE_ONLY)
7373   RExC_extralen += 1;   /* BRANCHJ */
7374
7375  *flagp = WORST;   /* Tentatively. */
7376
7377  RExC_parse--;
7378  nextchar(pRExC_state);
7379  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7380   flags &= ~TRYAGAIN;
7381   latest = regpiece(pRExC_state, &flags,depth+1);
7382   if (latest == NULL) {
7383    if (flags & TRYAGAIN)
7384     continue;
7385    return(NULL);
7386   }
7387   else if (ret == NULL)
7388    ret = latest;
7389   *flagp |= flags&(HASWIDTH|POSTPONED);
7390   if (chain == NULL)  /* First piece. */
7391    *flagp |= flags&SPSTART;
7392   else {
7393    RExC_naughty++;
7394    REGTAIL(pRExC_state, chain, latest);
7395   }
7396   chain = latest;
7397   c++;
7398  }
7399  if (chain == NULL) { /* Loop ran zero times. */
7400   chain = reg_node(pRExC_state, NOTHING);
7401   if (ret == NULL)
7402    ret = chain;
7403  }
7404  if (c == 1) {
7405   *flagp |= flags&SIMPLE;
7406  }
7407
7408  return ret;
7409 }
7410
7411 /*
7412  - regpiece - something followed by possible [*+?]
7413  *
7414  * Note that the branching code sequences used for ? and the general cases
7415  * of * and + are somewhat optimized:  they use the same NOTHING node as
7416  * both the endmarker for their branch list and the body of the last branch.
7417  * It might seem that this node could be dispensed with entirely, but the
7418  * endmarker role is not redundant.
7419  */
7420 STATIC regnode *
7421 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7422 {
7423  dVAR;
7424  register regnode *ret;
7425  register char op;
7426  register char *next;
7427  I32 flags;
7428  const char * const origparse = RExC_parse;
7429  I32 min;
7430  I32 max = REG_INFTY;
7431  char *parse_start;
7432  const char *maxpos = NULL;
7433  GET_RE_DEBUG_FLAGS_DECL;
7434
7435  PERL_ARGS_ASSERT_REGPIECE;
7436
7437  DEBUG_PARSE("piec");
7438
7439  ret = regatom(pRExC_state, &flags,depth+1);
7440  if (ret == NULL) {
7441   if (flags & TRYAGAIN)
7442    *flagp |= TRYAGAIN;
7443   return(NULL);
7444  }
7445
7446  op = *RExC_parse;
7447
7448  if (op == '{' && regcurly(RExC_parse)) {
7449   maxpos = NULL;
7450   parse_start = RExC_parse; /* MJD */
7451   next = RExC_parse + 1;
7452   while (isDIGIT(*next) || *next == ',') {
7453    if (*next == ',') {
7454     if (maxpos)
7455      break;
7456     else
7457      maxpos = next;
7458    }
7459    next++;
7460   }
7461   if (*next == '}') {  /* got one */
7462    if (!maxpos)
7463     maxpos = next;
7464    RExC_parse++;
7465    min = atoi(RExC_parse);
7466    if (*maxpos == ',')
7467     maxpos++;
7468    else
7469     maxpos = RExC_parse;
7470    max = atoi(maxpos);
7471    if (!max && *maxpos != '0')
7472     max = REG_INFTY;  /* meaning "infinity" */
7473    else if (max >= REG_INFTY)
7474     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7475    RExC_parse = next;
7476    nextchar(pRExC_state);
7477
7478   do_curly:
7479    if ((flags&SIMPLE)) {
7480     RExC_naughty += 2 + RExC_naughty / 2;
7481     reginsert(pRExC_state, CURLY, ret, depth+1);
7482     Set_Node_Offset(ret, parse_start+1); /* MJD */
7483     Set_Node_Cur_Length(ret);
7484    }
7485    else {
7486     regnode * const w = reg_node(pRExC_state, WHILEM);
7487
7488     w->flags = 0;
7489     REGTAIL(pRExC_state, ret, w);
7490     if (!SIZE_ONLY && RExC_extralen) {
7491      reginsert(pRExC_state, LONGJMP,ret, depth+1);
7492      reginsert(pRExC_state, NOTHING,ret, depth+1);
7493      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
7494     }
7495     reginsert(pRExC_state, CURLYX,ret, depth+1);
7496         /* MJD hk */
7497     Set_Node_Offset(ret, parse_start+1);
7498     Set_Node_Length(ret,
7499         op == '{' ? (RExC_parse - parse_start) : 1);
7500
7501     if (!SIZE_ONLY && RExC_extralen)
7502      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
7503     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7504     if (SIZE_ONLY)
7505      RExC_whilem_seen++, RExC_extralen += 3;
7506     RExC_naughty += 4 + RExC_naughty; /* compound interest */
7507    }
7508    ret->flags = 0;
7509
7510    if (min > 0)
7511     *flagp = WORST;
7512    if (max > 0)
7513     *flagp |= HASWIDTH;
7514    if (max < min)
7515     vFAIL("Can't do {n,m} with n > m");
7516    if (!SIZE_ONLY) {
7517     ARG1_SET(ret, (U16)min);
7518     ARG2_SET(ret, (U16)max);
7519    }
7520
7521    goto nest_check;
7522   }
7523  }
7524
7525  if (!ISMULT1(op)) {
7526   *flagp = flags;
7527   return(ret);
7528  }
7529
7530 #if 0    /* Now runtime fix should be reliable. */
7531
7532  /* if this is reinstated, don't forget to put this back into perldiag:
7533
7534    =item Regexp *+ operand could be empty at {#} in regex m/%s/
7535
7536   (F) The part of the regexp subject to either the * or + quantifier
7537   could match an empty string. The {#} shows in the regular
7538   expression about where the problem was discovered.
7539
7540  */
7541
7542  if (!(flags&HASWIDTH) && op != '?')
7543  vFAIL("Regexp *+ operand could be empty");
7544 #endif
7545
7546  parse_start = RExC_parse;
7547  nextchar(pRExC_state);
7548
7549  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7550
7551  if (op == '*' && (flags&SIMPLE)) {
7552   reginsert(pRExC_state, STAR, ret, depth+1);
7553   ret->flags = 0;
7554   RExC_naughty += 4;
7555  }
7556  else if (op == '*') {
7557   min = 0;
7558   goto do_curly;
7559  }
7560  else if (op == '+' && (flags&SIMPLE)) {
7561   reginsert(pRExC_state, PLUS, ret, depth+1);
7562   ret->flags = 0;
7563   RExC_naughty += 3;
7564  }
7565  else if (op == '+') {
7566   min = 1;
7567   goto do_curly;
7568  }
7569  else if (op == '?') {
7570   min = 0; max = 1;
7571   goto do_curly;
7572  }
7573   nest_check:
7574  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7575   ckWARN3reg(RExC_parse,
7576     "%.*s matches null string many times",
7577     (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7578     origparse);
7579  }
7580
7581  if (RExC_parse < RExC_end && *RExC_parse == '?') {
7582   nextchar(pRExC_state);
7583   reginsert(pRExC_state, MINMOD, ret, depth+1);
7584   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7585  }
7586 #ifndef REG_ALLOW_MINMOD_SUSPEND
7587  else
7588 #endif
7589  if (RExC_parse < RExC_end && *RExC_parse == '+') {
7590   regnode *ender;
7591   nextchar(pRExC_state);
7592   ender = reg_node(pRExC_state, SUCCEED);
7593   REGTAIL(pRExC_state, ret, ender);
7594   reginsert(pRExC_state, SUSPEND, ret, depth+1);
7595   ret->flags = 0;
7596   ender = reg_node(pRExC_state, TAIL);
7597   REGTAIL(pRExC_state, ret, ender);
7598   /*ret= ender;*/
7599  }
7600
7601  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7602   RExC_parse++;
7603   vFAIL("Nested quantifiers");
7604  }
7605
7606  return(ret);
7607 }
7608
7609
7610 /* reg_namedseq(pRExC_state,UVp, UV depth)
7611
7612    This is expected to be called by a parser routine that has
7613    recognized '\N' and needs to handle the rest. RExC_parse is
7614    expected to point at the first char following the N at the time
7615    of the call.
7616
7617    The \N may be inside (indicated by valuep not being NULL) or outside a
7618    character class.
7619
7620    \N may begin either a named sequence, or if outside a character class, mean
7621    to match a non-newline.  For non single-quoted regexes, the tokenizer has
7622    attempted to decide which, and in the case of a named sequence converted it
7623    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7624    where c1... are the characters in the sequence.  For single-quoted regexes,
7625    the tokenizer passes the \N sequence through unchanged; this code will not
7626    attempt to determine this nor expand those.  The net effect is that if the
7627    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7628    signals that this \N occurrence means to match a non-newline.
7629
7630    Only the \N{U+...} form should occur in a character class, for the same
7631    reason that '.' inside a character class means to just match a period: it
7632    just doesn't make sense.
7633
7634    If valuep is non-null then it is assumed that we are parsing inside
7635    of a charclass definition and the first codepoint in the resolved
7636    string is returned via *valuep and the routine will return NULL.
7637    In this mode if a multichar string is returned from the charnames
7638    handler, a warning will be issued, and only the first char in the
7639    sequence will be examined. If the string returned is zero length
7640    then the value of *valuep is undefined and NON-NULL will
7641    be returned to indicate failure. (This will NOT be a valid pointer
7642    to a regnode.)
7643
7644    If valuep is null then it is assumed that we are parsing normal text and a
7645    new EXACT node is inserted into the program containing the resolved string,
7646    and a pointer to the new node is returned.  But if the string is zero length
7647    a NOTHING node is emitted instead.
7648
7649    On success RExC_parse is set to the char following the endbrace.
7650    Parsing failures will generate a fatal error via vFAIL(...)
7651  */
7652 STATIC regnode *
7653 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
7654 {
7655  char * endbrace;    /* '}' following the name */
7656  regnode *ret = NULL;
7657  char* p;
7658
7659  GET_RE_DEBUG_FLAGS_DECL;
7660
7661  PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7662
7663  GET_RE_DEBUG_FLAGS;
7664
7665  /* The [^\n] meaning of \N ignores spaces and comments under the /x
7666  * modifier.  The other meaning does not */
7667  p = (RExC_flags & RXf_PMf_EXTENDED)
7668   ? regwhite( pRExC_state, RExC_parse )
7669   : RExC_parse;
7670
7671  /* Disambiguate between \N meaning a named character versus \N meaning
7672  * [^\n].  The former is assumed when it can't be the latter. */
7673  if (*p != '{' || regcurly(p)) {
7674   RExC_parse = p;
7675   if (valuep) {
7676    /* no bare \N in a charclass */
7677    vFAIL("\\N in a character class must be a named character: \\N{...}");
7678   }
7679   nextchar(pRExC_state);
7680   ret = reg_node(pRExC_state, REG_ANY);
7681   *flagp |= HASWIDTH|SIMPLE;
7682   RExC_naughty++;
7683   RExC_parse--;
7684   Set_Node_Length(ret, 1); /* MJD */
7685   return ret;
7686  }
7687
7688  /* Here, we have decided it should be a named sequence */
7689
7690  /* The test above made sure that the next real character is a '{', but
7691  * under the /x modifier, it could be separated by space (or a comment and
7692  * \n) and this is not allowed (for consistency with \x{...} and the
7693  * tokenizer handling of \N{NAME}). */
7694  if (*RExC_parse != '{') {
7695   vFAIL("Missing braces on \\N{}");
7696  }
7697
7698  RExC_parse++; /* Skip past the '{' */
7699
7700  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7701   || ! (endbrace == RExC_parse  /* nothing between the {} */
7702    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
7703     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7704  {
7705   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
7706   vFAIL("\\N{NAME} must be resolved by the lexer");
7707  }
7708
7709  if (endbrace == RExC_parse) {   /* empty: \N{} */
7710   if (! valuep) {
7711    RExC_parse = endbrace + 1;
7712    return reg_node(pRExC_state,NOTHING);
7713   }
7714
7715   if (SIZE_ONLY) {
7716    ckWARNreg(RExC_parse,
7717      "Ignoring zero length \\N{} in character class"
7718    );
7719    RExC_parse = endbrace + 1;
7720   }
7721   *valuep = 0;
7722   return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7723  }
7724
7725  REQUIRE_UTF8; /* named sequences imply Unicode semantics */
7726  RExC_parse += 2; /* Skip past the 'U+' */
7727
7728  if (valuep) {   /* In a bracketed char class */
7729   /* We only pay attention to the first char of
7730   multichar strings being returned. I kinda wonder
7731   if this makes sense as it does change the behaviour
7732   from earlier versions, OTOH that behaviour was broken
7733   as well. XXX Solution is to recharacterize as
7734   [rest-of-class]|multi1|multi2... */
7735
7736   STRLEN length_of_hex;
7737   I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7738    | PERL_SCAN_DISALLOW_PREFIX
7739    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7740
7741   char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7742   if (endchar < endbrace) {
7743    ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7744   }
7745
7746   length_of_hex = (STRLEN)(endchar - RExC_parse);
7747   *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7748
7749   /* The tokenizer should have guaranteed validity, but it's possible to
7750   * bypass it by using single quoting, so check */
7751   if (length_of_hex == 0
7752    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7753   {
7754    RExC_parse += length_of_hex; /* Includes all the valid */
7755    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7756        ? UTF8SKIP(RExC_parse)
7757        : 1;
7758    /* Guard against malformed utf8 */
7759    if (RExC_parse >= endchar) RExC_parse = endchar;
7760    vFAIL("Invalid hexadecimal number in \\N{U+...}");
7761   }
7762
7763   RExC_parse = endbrace + 1;
7764   if (endchar == endbrace) return NULL;
7765
7766   ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
7767  }
7768  else { /* Not a char class */
7769
7770   /* What is done here is to convert this to a sub-pattern of the form
7771   * (?:\x{char1}\x{char2}...)
7772   * and then call reg recursively.  That way, it retains its atomicness,
7773   * while not having to worry about special handling that some code
7774   * points may have.  toke.c has converted the original Unicode values
7775   * to native, so that we can just pass on the hex values unchanged.  We
7776   * do have to set a flag to keep recoding from happening in the
7777   * recursion */
7778
7779   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
7780   STRLEN len;
7781   char *endchar;     /* Points to '.' or '}' ending cur char in the input
7782        stream */
7783   char *orig_end = RExC_end;
7784
7785   while (RExC_parse < endbrace) {
7786
7787    /* Code points are separated by dots.  If none, there is only one
7788    * code point, and is terminated by the brace */
7789    endchar = RExC_parse + strcspn(RExC_parse, ".}");
7790
7791    /* Convert to notation the rest of the code understands */
7792    sv_catpv(substitute_parse, "\\x{");
7793    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
7794    sv_catpv(substitute_parse, "}");
7795
7796    /* Point to the beginning of the next character in the sequence. */
7797    RExC_parse = endchar + 1;
7798   }
7799   sv_catpv(substitute_parse, ")");
7800
7801   RExC_parse = SvPV(substitute_parse, len);
7802
7803   /* Don't allow empty number */
7804   if (len < 8) {
7805    vFAIL("Invalid hexadecimal number in \\N{U+...}");
7806   }
7807   RExC_end = RExC_parse + len;
7808
7809   /* The values are Unicode, and therefore not subject to recoding */
7810   RExC_override_recoding = 1;
7811
7812   ret = reg(pRExC_state, 1, flagp, depth+1);
7813
7814   RExC_parse = endbrace;
7815   RExC_end = orig_end;
7816   RExC_override_recoding = 0;
7817
7818   nextchar(pRExC_state);
7819  }
7820
7821  return ret;
7822 }
7823
7824
7825 /*
7826  * reg_recode
7827  *
7828  * It returns the code point in utf8 for the value in *encp.
7829  *    value: a code value in the source encoding
7830  *    encp:  a pointer to an Encode object
7831  *
7832  * If the result from Encode is not a single character,
7833  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7834  */
7835 STATIC UV
7836 S_reg_recode(pTHX_ const char value, SV **encp)
7837 {
7838  STRLEN numlen = 1;
7839  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7840  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7841  const STRLEN newlen = SvCUR(sv);
7842  UV uv = UNICODE_REPLACEMENT;
7843
7844  PERL_ARGS_ASSERT_REG_RECODE;
7845
7846  if (newlen)
7847   uv = SvUTF8(sv)
7848    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7849    : *(U8*)s;
7850
7851  if (!newlen || numlen != newlen) {
7852   uv = UNICODE_REPLACEMENT;
7853   *encp = NULL;
7854  }
7855  return uv;
7856 }
7857
7858
7859 /*
7860  - regatom - the lowest level
7861
7862    Try to identify anything special at the start of the pattern. If there
7863    is, then handle it as required. This may involve generating a single regop,
7864    such as for an assertion; or it may involve recursing, such as to
7865    handle a () structure.
7866
7867    If the string doesn't start with something special then we gobble up
7868    as much literal text as we can.
7869
7870    Once we have been able to handle whatever type of thing started the
7871    sequence, we return.
7872
7873    Note: we have to be careful with escapes, as they can be both literal
7874    and special, and in the case of \10 and friends can either, depending
7875    on context. Specifically there are two separate switches for handling
7876    escape sequences, with the one for handling literal escapes requiring
7877    a dummy entry for all of the special escapes that are actually handled
7878    by the other.
7879 */
7880
7881 STATIC regnode *
7882 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7883 {
7884  dVAR;
7885  register regnode *ret = NULL;
7886  I32 flags;
7887  char *parse_start = RExC_parse;
7888  U8 op;
7889  GET_RE_DEBUG_FLAGS_DECL;
7890  DEBUG_PARSE("atom");
7891  *flagp = WORST;  /* Tentatively. */
7892
7893  PERL_ARGS_ASSERT_REGATOM;
7894
7895 tryagain:
7896  switch ((U8)*RExC_parse) {
7897  case '^':
7898   RExC_seen_zerolen++;
7899   nextchar(pRExC_state);
7900   if (RExC_flags & RXf_PMf_MULTILINE)
7901    ret = reg_node(pRExC_state, MBOL);
7902   else if (RExC_flags & RXf_PMf_SINGLELINE)
7903    ret = reg_node(pRExC_state, SBOL);
7904   else
7905    ret = reg_node(pRExC_state, BOL);
7906   Set_Node_Length(ret, 1); /* MJD */
7907   break;
7908  case '$':
7909   nextchar(pRExC_state);
7910   if (*RExC_parse)
7911    RExC_seen_zerolen++;
7912   if (RExC_flags & RXf_PMf_MULTILINE)
7913    ret = reg_node(pRExC_state, MEOL);
7914   else if (RExC_flags & RXf_PMf_SINGLELINE)
7915    ret = reg_node(pRExC_state, SEOL);
7916   else
7917    ret = reg_node(pRExC_state, EOL);
7918   Set_Node_Length(ret, 1); /* MJD */
7919   break;
7920  case '.':
7921   nextchar(pRExC_state);
7922   if (RExC_flags & RXf_PMf_SINGLELINE)
7923    ret = reg_node(pRExC_state, SANY);
7924   else
7925    ret = reg_node(pRExC_state, REG_ANY);
7926   *flagp |= HASWIDTH|SIMPLE;
7927   RExC_naughty++;
7928   Set_Node_Length(ret, 1); /* MJD */
7929   break;
7930  case '[':
7931  {
7932   char * const oregcomp_parse = ++RExC_parse;
7933   ret = regclass(pRExC_state,depth+1);
7934   if (*RExC_parse != ']') {
7935    RExC_parse = oregcomp_parse;
7936    vFAIL("Unmatched [");
7937   }
7938   nextchar(pRExC_state);
7939   *flagp |= HASWIDTH|SIMPLE;
7940   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7941   break;
7942  }
7943  case '(':
7944   nextchar(pRExC_state);
7945   ret = reg(pRExC_state, 1, &flags,depth+1);
7946   if (ret == NULL) {
7947     if (flags & TRYAGAIN) {
7948      if (RExC_parse == RExC_end) {
7949       /* Make parent create an empty node if needed. */
7950       *flagp |= TRYAGAIN;
7951       return(NULL);
7952      }
7953      goto tryagain;
7954     }
7955     return(NULL);
7956   }
7957   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7958   break;
7959  case '|':
7960  case ')':
7961   if (flags & TRYAGAIN) {
7962    *flagp |= TRYAGAIN;
7963    return NULL;
7964   }
7965   vFAIL("Internal urp");
7966         /* Supposed to be caught earlier. */
7967   break;
7968  case '{':
7969   if (!regcurly(RExC_parse)) {
7970    RExC_parse++;
7971    goto defchar;
7972   }
7973   /* FALL THROUGH */
7974  case '?':
7975  case '+':
7976  case '*':
7977   RExC_parse++;
7978   vFAIL("Quantifier follows nothing");
7979   break;
7980  case '\\':
7981   /* Special Escapes
7982
7983   This switch handles escape sequences that resolve to some kind
7984   of special regop and not to literal text. Escape sequnces that
7985   resolve to literal text are handled below in the switch marked
7986   "Literal Escapes".
7987
7988   Every entry in this switch *must* have a corresponding entry
7989   in the literal escape switch. However, the opposite is not
7990   required, as the default for this switch is to jump to the
7991   literal text handling code.
7992   */
7993   switch ((U8)*++RExC_parse) {
7994   /* Special Escapes */
7995   case 'A':
7996    RExC_seen_zerolen++;
7997    ret = reg_node(pRExC_state, SBOL);
7998    *flagp |= SIMPLE;
7999    goto finish_meta_pat;
8000   case 'G':
8001    ret = reg_node(pRExC_state, GPOS);
8002    RExC_seen |= REG_SEEN_GPOS;
8003    *flagp |= SIMPLE;
8004    goto finish_meta_pat;
8005   case 'K':
8006    RExC_seen_zerolen++;
8007    ret = reg_node(pRExC_state, KEEPS);
8008    *flagp |= SIMPLE;
8009    /* XXX:dmq : disabling in-place substitution seems to
8010    * be necessary here to avoid cases of memory corruption, as
8011    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8012    */
8013    RExC_seen |= REG_SEEN_LOOKBEHIND;
8014    goto finish_meta_pat;
8015   case 'Z':
8016    ret = reg_node(pRExC_state, SEOL);
8017    *flagp |= SIMPLE;
8018    RExC_seen_zerolen++;  /* Do not optimize RE away */
8019    goto finish_meta_pat;
8020   case 'z':
8021    ret = reg_node(pRExC_state, EOS);
8022    *flagp |= SIMPLE;
8023    RExC_seen_zerolen++;  /* Do not optimize RE away */
8024    goto finish_meta_pat;
8025   case 'C':
8026    ret = reg_node(pRExC_state, CANY);
8027    RExC_seen |= REG_SEEN_CANY;
8028    *flagp |= HASWIDTH|SIMPLE;
8029    goto finish_meta_pat;
8030   case 'X':
8031    ret = reg_node(pRExC_state, CLUMP);
8032    *flagp |= HASWIDTH;
8033    goto finish_meta_pat;
8034   case 'w':
8035    switch (get_regex_charset(RExC_flags)) {
8036     case REGEX_LOCALE_CHARSET:
8037      op = ALNUML;
8038      break;
8039     case REGEX_UNICODE_CHARSET:
8040      op = ALNUMU;
8041      break;
8042     case REGEX_ASCII_RESTRICTED_CHARSET:
8043     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8044      op = ALNUMA;
8045      break;
8046     case REGEX_DEPENDS_CHARSET:
8047      op = ALNUM;
8048      break;
8049     default:
8050      goto bad_charset;
8051    }
8052    ret = reg_node(pRExC_state, op);
8053    *flagp |= HASWIDTH|SIMPLE;
8054    goto finish_meta_pat;
8055   case 'W':
8056    switch (get_regex_charset(RExC_flags)) {
8057     case REGEX_LOCALE_CHARSET:
8058      op = NALNUML;
8059      break;
8060     case REGEX_UNICODE_CHARSET:
8061      op = NALNUMU;
8062      break;
8063     case REGEX_ASCII_RESTRICTED_CHARSET:
8064     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8065      op = NALNUMA;
8066      break;
8067     case REGEX_DEPENDS_CHARSET:
8068      op = NALNUM;
8069      break;
8070     default:
8071      goto bad_charset;
8072    }
8073    ret = reg_node(pRExC_state, op);
8074    *flagp |= HASWIDTH|SIMPLE;
8075    goto finish_meta_pat;
8076   case 'b':
8077    RExC_seen_zerolen++;
8078    RExC_seen |= REG_SEEN_LOOKBEHIND;
8079    switch (get_regex_charset(RExC_flags)) {
8080     case REGEX_LOCALE_CHARSET:
8081      op = BOUNDL;
8082      break;
8083     case REGEX_UNICODE_CHARSET:
8084      op = BOUNDU;
8085      break;
8086     case REGEX_ASCII_RESTRICTED_CHARSET:
8087     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8088      op = BOUNDA;
8089      break;
8090     case REGEX_DEPENDS_CHARSET:
8091      op = BOUND;
8092      break;
8093     default:
8094      goto bad_charset;
8095    }
8096    ret = reg_node(pRExC_state, op);
8097    FLAGS(ret) = get_regex_charset(RExC_flags);
8098    *flagp |= SIMPLE;
8099    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8100     ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8101    }
8102    goto finish_meta_pat;
8103   case 'B':
8104    RExC_seen_zerolen++;
8105    RExC_seen |= REG_SEEN_LOOKBEHIND;
8106    switch (get_regex_charset(RExC_flags)) {
8107     case REGEX_LOCALE_CHARSET:
8108      op = NBOUNDL;
8109      break;
8110     case REGEX_UNICODE_CHARSET:
8111      op = NBOUNDU;
8112      break;
8113     case REGEX_ASCII_RESTRICTED_CHARSET:
8114     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8115      op = NBOUNDA;
8116      break;
8117     case REGEX_DEPENDS_CHARSET:
8118      op = NBOUND;
8119      break;
8120     default:
8121      goto bad_charset;
8122    }
8123    ret = reg_node(pRExC_state, op);
8124    FLAGS(ret) = get_regex_charset(RExC_flags);
8125    *flagp |= SIMPLE;
8126    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8127     ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8128    }
8129    goto finish_meta_pat;
8130   case 's':
8131    switch (get_regex_charset(RExC_flags)) {
8132     case REGEX_LOCALE_CHARSET:
8133      op = SPACEL;
8134      break;
8135     case REGEX_UNICODE_CHARSET:
8136      op = SPACEU;
8137      break;
8138     case REGEX_ASCII_RESTRICTED_CHARSET:
8139     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8140      op = SPACEA;
8141      break;
8142     case REGEX_DEPENDS_CHARSET:
8143      op = SPACE;
8144      break;
8145     default:
8146      goto bad_charset;
8147    }
8148    ret = reg_node(pRExC_state, op);
8149    *flagp |= HASWIDTH|SIMPLE;
8150    goto finish_meta_pat;
8151   case 'S':
8152    switch (get_regex_charset(RExC_flags)) {
8153     case REGEX_LOCALE_CHARSET:
8154      op = NSPACEL;
8155      break;
8156     case REGEX_UNICODE_CHARSET:
8157      op = NSPACEU;
8158      break;
8159     case REGEX_ASCII_RESTRICTED_CHARSET:
8160     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8161      op = NSPACEA;
8162      break;
8163     case REGEX_DEPENDS_CHARSET:
8164      op = NSPACE;
8165      break;
8166     default:
8167      goto bad_charset;
8168    }
8169    ret = reg_node(pRExC_state, op);
8170    *flagp |= HASWIDTH|SIMPLE;
8171    goto finish_meta_pat;
8172   case 'd':
8173    switch (get_regex_charset(RExC_flags)) {
8174     case REGEX_LOCALE_CHARSET:
8175      op = DIGITL;
8176      break;
8177     case REGEX_ASCII_RESTRICTED_CHARSET:
8178     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8179      op = DIGITA;
8180      break;
8181     case REGEX_DEPENDS_CHARSET: /* No difference between these */
8182     case REGEX_UNICODE_CHARSET:
8183      op = DIGIT;
8184      break;
8185     default:
8186      goto bad_charset;
8187    }
8188    ret = reg_node(pRExC_state, op);
8189    *flagp |= HASWIDTH|SIMPLE;
8190    goto finish_meta_pat;
8191   case 'D':
8192    switch (get_regex_charset(RExC_flags)) {
8193     case REGEX_LOCALE_CHARSET:
8194      op = NDIGITL;
8195      break;
8196     case REGEX_ASCII_RESTRICTED_CHARSET:
8197     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8198      op = NDIGITA;
8199      break;
8200     case REGEX_DEPENDS_CHARSET: /* No difference between these */
8201     case REGEX_UNICODE_CHARSET:
8202      op = NDIGIT;
8203      break;
8204     default:
8205      goto bad_charset;
8206    }
8207    ret = reg_node(pRExC_state, op);
8208    *flagp |= HASWIDTH|SIMPLE;
8209    goto finish_meta_pat;
8210   case 'R':
8211    ret = reg_node(pRExC_state, LNBREAK);
8212    *flagp |= HASWIDTH|SIMPLE;
8213    goto finish_meta_pat;
8214   case 'h':
8215    ret = reg_node(pRExC_state, HORIZWS);
8216    *flagp |= HASWIDTH|SIMPLE;
8217    goto finish_meta_pat;
8218   case 'H':
8219    ret = reg_node(pRExC_state, NHORIZWS);
8220    *flagp |= HASWIDTH|SIMPLE;
8221    goto finish_meta_pat;
8222   case 'v':
8223    ret = reg_node(pRExC_state, VERTWS);
8224    *flagp |= HASWIDTH|SIMPLE;
8225    goto finish_meta_pat;
8226   case 'V':
8227    ret = reg_node(pRExC_state, NVERTWS);
8228    *flagp |= HASWIDTH|SIMPLE;
8229   finish_meta_pat:
8230    nextchar(pRExC_state);
8231    Set_Node_Length(ret, 2); /* MJD */
8232    break;
8233   case 'p':
8234   case 'P':
8235    {
8236     char* const oldregxend = RExC_end;
8237 #ifdef DEBUGGING
8238     char* parse_start = RExC_parse - 2;
8239 #endif
8240
8241     if (RExC_parse[1] == '{') {
8242     /* a lovely hack--pretend we saw [\pX] instead */
8243      RExC_end = strchr(RExC_parse, '}');
8244      if (!RExC_end) {
8245       const U8 c = (U8)*RExC_parse;
8246       RExC_parse += 2;
8247       RExC_end = oldregxend;
8248       vFAIL2("Missing right brace on \\%c{}", c);
8249      }
8250      RExC_end++;
8251     }
8252     else {
8253      RExC_end = RExC_parse + 2;
8254      if (RExC_end > oldregxend)
8255       RExC_end = oldregxend;
8256     }
8257     RExC_parse--;
8258
8259     ret = regclass(pRExC_state,depth+1);
8260
8261     RExC_end = oldregxend;
8262     RExC_parse--;
8263
8264     Set_Node_Offset(ret, parse_start + 2);
8265     Set_Node_Cur_Length(ret);
8266     nextchar(pRExC_state);
8267     *flagp |= HASWIDTH|SIMPLE;
8268    }
8269    break;
8270   case 'N':
8271    /* Handle \N and \N{NAME} here and not below because it can be
8272    multicharacter. join_exact() will join them up later on.
8273    Also this makes sure that things like /\N{BLAH}+/ and
8274    \N{BLAH} being multi char Just Happen. dmq*/
8275    ++RExC_parse;
8276    ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
8277    break;
8278   case 'k':    /* Handle \k<NAME> and \k'NAME' */
8279   parse_named_seq:
8280   {
8281    char ch= RExC_parse[1];
8282    if (ch != '<' && ch != '\'' && ch != '{') {
8283     RExC_parse++;
8284     vFAIL2("Sequence %.2s... not terminated",parse_start);
8285    } else {
8286     /* this pretty much dupes the code for (?P=...) in reg(), if
8287     you change this make sure you change that */
8288     char* name_start = (RExC_parse += 2);
8289     U32 num = 0;
8290     SV *sv_dat = reg_scan_name(pRExC_state,
8291      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8292     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8293     if (RExC_parse == name_start || *RExC_parse != ch)
8294      vFAIL2("Sequence %.3s... not terminated",parse_start);
8295
8296     if (!SIZE_ONLY) {
8297      num = add_data( pRExC_state, 1, "S" );
8298      RExC_rxi->data->data[num]=(void*)sv_dat;
8299      SvREFCNT_inc_simple_void(sv_dat);
8300     }
8301
8302     RExC_sawback = 1;
8303     ret = reganode(pRExC_state,
8304        ((! FOLD)
8305         ? NREF
8306         : (MORE_ASCII_RESTRICTED)
8307         ? NREFFA
8308         : (AT_LEAST_UNI_SEMANTICS)
8309          ? NREFFU
8310          : (LOC)
8311          ? NREFFL
8312          : NREFF),
8313         num);
8314     *flagp |= HASWIDTH;
8315
8316     /* override incorrect value set in reganode MJD */
8317     Set_Node_Offset(ret, parse_start+1);
8318     Set_Node_Cur_Length(ret); /* MJD */
8319     nextchar(pRExC_state);
8320
8321    }
8322    break;
8323   }
8324   case 'g':
8325   case '1': case '2': case '3': case '4':
8326   case '5': case '6': case '7': case '8': case '9':
8327    {
8328     I32 num;
8329     bool isg = *RExC_parse == 'g';
8330     bool isrel = 0;
8331     bool hasbrace = 0;
8332     if (isg) {
8333      RExC_parse++;
8334      if (*RExC_parse == '{') {
8335       RExC_parse++;
8336       hasbrace = 1;
8337      }
8338      if (*RExC_parse == '-') {
8339       RExC_parse++;
8340       isrel = 1;
8341      }
8342      if (hasbrace && !isDIGIT(*RExC_parse)) {
8343       if (isrel) RExC_parse--;
8344       RExC_parse -= 2;
8345       goto parse_named_seq;
8346     }   }
8347     num = atoi(RExC_parse);
8348     if (isg && num == 0)
8349      vFAIL("Reference to invalid group 0");
8350     if (isrel) {
8351      num = RExC_npar - num;
8352      if (num < 1)
8353       vFAIL("Reference to nonexistent or unclosed group");
8354     }
8355     if (!isg && num > 9 && num >= RExC_npar)
8356      goto defchar;
8357     else {
8358      char * const parse_start = RExC_parse - 1; /* MJD */
8359      while (isDIGIT(*RExC_parse))
8360       RExC_parse++;
8361      if (parse_start == RExC_parse - 1)
8362       vFAIL("Unterminated \\g... pattern");
8363      if (hasbrace) {
8364       if (*RExC_parse != '}')
8365        vFAIL("Unterminated \\g{...} pattern");
8366       RExC_parse++;
8367      }
8368      if (!SIZE_ONLY) {
8369       if (num > (I32)RExC_rx->nparens)
8370        vFAIL("Reference to nonexistent group");
8371      }
8372      RExC_sawback = 1;
8373      ret = reganode(pRExC_state,
8374         ((! FOLD)
8375          ? REF
8376          : (MORE_ASCII_RESTRICTED)
8377          ? REFFA
8378          : (AT_LEAST_UNI_SEMANTICS)
8379           ? REFFU
8380           : (LOC)
8381           ? REFFL
8382           : REFF),
8383          num);
8384      *flagp |= HASWIDTH;
8385
8386      /* override incorrect value set in reganode MJD */
8387      Set_Node_Offset(ret, parse_start+1);
8388      Set_Node_Cur_Length(ret); /* MJD */
8389      RExC_parse--;
8390      nextchar(pRExC_state);
8391     }
8392    }
8393    break;
8394   case '\0':
8395    if (RExC_parse >= RExC_end)
8396     FAIL("Trailing \\");
8397    /* FALL THROUGH */
8398   default:
8399    /* Do not generate "unrecognized" warnings here, we fall
8400    back into the quick-grab loop below */
8401    parse_start--;
8402    goto defchar;
8403   }
8404   break;
8405
8406  case '#':
8407   if (RExC_flags & RXf_PMf_EXTENDED) {
8408    if ( reg_skipcomment( pRExC_state ) )
8409     goto tryagain;
8410   }
8411   /* FALL THROUGH */
8412
8413  default:
8414
8415    parse_start = RExC_parse - 1;
8416
8417    RExC_parse++;
8418
8419   defchar: {
8420    typedef enum {
8421     generic_char = 0,
8422     char_s,
8423     upsilon_1,
8424     upsilon_2,
8425     iota_1,
8426     iota_2,
8427    } char_state;
8428    char_state latest_char_state = generic_char;
8429    register STRLEN len;
8430    register UV ender;
8431    register char *p;
8432    char *s;
8433    STRLEN foldlen;
8434    U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8435    regnode * orig_emit;
8436
8437    ender = 0;
8438    orig_emit = RExC_emit; /* Save the original output node position in
8439          case we need to output a different node
8440          type */
8441    ret = reg_node(pRExC_state,
8442       (U8) ((! FOLD) ? EXACT
8443           : (LOC)
8444            ? EXACTFL
8445            : (MORE_ASCII_RESTRICTED)
8446            ? EXACTFA
8447            : (AT_LEAST_UNI_SEMANTICS)
8448             ? EXACTFU
8449             : EXACTF)
8450      );
8451    s = STRING(ret);
8452    for (len = 0, p = RExC_parse - 1;
8453    len < 127 && p < RExC_end;
8454    len++)
8455    {
8456     char * const oldp = p;
8457
8458     if (RExC_flags & RXf_PMf_EXTENDED)
8459      p = regwhite( pRExC_state, p );
8460     switch ((U8)*p) {
8461     case '^':
8462     case '$':
8463     case '.':
8464     case '[':
8465     case '(':
8466     case ')':
8467     case '|':
8468      goto loopdone;
8469     case '\\':
8470      /* Literal Escapes Switch
8471
8472      This switch is meant to handle escape sequences that
8473      resolve to a literal character.
8474
8475      Every escape sequence that represents something
8476      else, like an assertion or a char class, is handled
8477      in the switch marked 'Special Escapes' above in this
8478      routine, but also has an entry here as anything that
8479      isn't explicitly mentioned here will be treated as
8480      an unescaped equivalent literal.
8481      */
8482
8483      switch ((U8)*++p) {
8484      /* These are all the special escapes. */
8485      case 'A':             /* Start assertion */
8486      case 'b': case 'B':   /* Word-boundary assertion*/
8487      case 'C':             /* Single char !DANGEROUS! */
8488      case 'd': case 'D':   /* digit class */
8489      case 'g': case 'G':   /* generic-backref, pos assertion */
8490      case 'h': case 'H':   /* HORIZWS */
8491      case 'k': case 'K':   /* named backref, keep marker */
8492      case 'N':             /* named char sequence */
8493      case 'p': case 'P':   /* Unicode property */
8494        case 'R':   /* LNBREAK */
8495      case 's': case 'S':   /* space class */
8496      case 'v': case 'V':   /* VERTWS */
8497      case 'w': case 'W':   /* word class */
8498      case 'X':             /* eXtended Unicode "combining character sequence" */
8499      case 'z': case 'Z':   /* End of line/string assertion */
8500       --p;
8501       goto loopdone;
8502
8503      /* Anything after here is an escape that resolves to a
8504      literal. (Except digits, which may or may not)
8505      */
8506      case 'n':
8507       ender = '\n';
8508       p++;
8509       break;
8510      case 'r':
8511       ender = '\r';
8512       p++;
8513       break;
8514      case 't':
8515       ender = '\t';
8516       p++;
8517       break;
8518      case 'f':
8519       ender = '\f';
8520       p++;
8521       break;
8522      case 'e':
8523       ender = ASCII_TO_NATIVE('\033');
8524       p++;
8525       break;
8526      case 'a':
8527       ender = ASCII_TO_NATIVE('\007');
8528       p++;
8529       break;
8530      case 'o':
8531       {
8532        STRLEN brace_len = len;
8533        UV result;
8534        const char* error_msg;
8535
8536        bool valid = grok_bslash_o(p,
8537              &result,
8538              &brace_len,
8539              &error_msg,
8540              1);
8541        p += brace_len;
8542        if (! valid) {
8543         RExC_parse = p; /* going to die anyway; point
8544             to exact spot of failure */
8545         vFAIL(error_msg);
8546        }
8547        else
8548        {
8549         ender = result;
8550        }
8551        if (PL_encoding && ender < 0x100) {
8552         goto recode_encoding;
8553        }
8554        if (ender > 0xff) {
8555         REQUIRE_UTF8;
8556        }
8557        break;
8558       }
8559      case 'x':
8560       if (*++p == '{') {
8561        char* const e = strchr(p, '}');
8562
8563        if (!e) {
8564         RExC_parse = p + 1;
8565         vFAIL("Missing right brace on \\x{}");
8566        }
8567        else {
8568         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8569          | PERL_SCAN_DISALLOW_PREFIX;
8570         STRLEN numlen = e - p - 1;
8571         ender = grok_hex(p + 1, &numlen, &flags, NULL);
8572         if (ender > 0xff)
8573          REQUIRE_UTF8;
8574         p = e + 1;
8575        }
8576       }
8577       else {
8578        I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8579        STRLEN numlen = 2;
8580        ender = grok_hex(p, &numlen, &flags, NULL);
8581        p += numlen;
8582       }
8583       if (PL_encoding && ender < 0x100)
8584        goto recode_encoding;
8585       break;
8586      case 'c':
8587       p++;
8588       ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8589       break;
8590      case '0': case '1': case '2': case '3':case '4':
8591      case '5': case '6': case '7': case '8':case '9':
8592       if (*p == '0' ||
8593        (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8594       {
8595        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8596        STRLEN numlen = 3;
8597        ender = grok_oct(p, &numlen, &flags, NULL);
8598        if (ender > 0xff) {
8599         REQUIRE_UTF8;
8600        }
8601        p += numlen;
8602       }
8603       else {
8604        --p;
8605        goto loopdone;
8606       }
8607       if (PL_encoding && ender < 0x100)
8608        goto recode_encoding;
8609       break;
8610      recode_encoding:
8611       if (! RExC_override_recoding) {
8612        SV* enc = PL_encoding;
8613        ender = reg_recode((const char)(U8)ender, &enc);
8614        if (!enc && SIZE_ONLY)
8615         ckWARNreg(p, "Invalid escape in the specified encoding");
8616        REQUIRE_UTF8;
8617       }
8618       break;
8619      case '\0':
8620       if (p >= RExC_end)
8621        FAIL("Trailing \\");
8622       /* FALL THROUGH */
8623      default:
8624       if (!SIZE_ONLY&& isALPHA(*p)) {
8625        /* Include any { following the alpha to emphasize
8626        * that it could be part of an escape at some point
8627        * in the future */
8628        int len = (*(p + 1) == '{') ? 2 : 1;
8629        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8630       }
8631       goto normal_default;
8632      }
8633      break;
8634     default:
8635     normal_default:
8636      if (UTF8_IS_START(*p) && UTF) {
8637       STRLEN numlen;
8638       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8639            &numlen, UTF8_ALLOW_DEFAULT);
8640       p += numlen;
8641      }
8642      else
8643       ender = (U8) *p++;
8644      break;
8645     } /* End of switch on the literal */
8646
8647     /* Certain characters are problematic because their folded
8648     * length is so different from their original length that it
8649     * isn't handleable by the optimizer.  They are therefore not
8650     * placed in an EXACTish node; and are here handled specially.
8651     * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8652     * putting it in a special node keeps regexec from having to
8653     * deal with a non-utf8 multi-char fold */
8654     if (FOLD
8655      && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)))
8656     {
8657      /* We look for either side of the fold.  For example \xDF
8658      * folds to 'ss'.  We look for both the single character
8659      * \xDF and the sequence 'ss'.  When we find something that
8660      * could be one of those, we stop and flush whatever we
8661      * have output so far into the EXACTish node that was being
8662      * built.  Then restore the input pointer to what it was.
8663      * regatom will return that EXACT node, and will be called
8664      * again, positioned so the first character is the one in
8665      * question, which we return in a different node type.
8666      * The multi-char folds are a sequence, so the occurrence
8667      * of the first character in that sequence doesn't
8668      * necessarily mean that what follows is the rest of the
8669      * sequence.  We keep track of that with a state machine,
8670      * with the state being set to the latest character
8671      * processed before the current one.  Most characters will
8672      * set the state to 0, but if one occurs that is part of a
8673      * potential tricky fold sequence, the state is set to that
8674      * character, and the next loop iteration sees if the state
8675      * should progress towards the final folded-from character,
8676      * or if it was a false alarm.  If it turns out to be a
8677      * false alarm, the character(s) will be output in a new
8678      * EXACTish node, and join_exact() will later combine them.
8679      * In the case of the 'ss' sequence, which is more common
8680      * and more easily checked, some look-ahead is done to
8681      * save time by ruling-out some false alarms */
8682      switch (ender) {
8683       default:
8684        latest_char_state = generic_char;
8685        break;
8686       case 's':
8687       case 'S':
8688       case 0x17F: /* LATIN SMALL LETTER LONG S */
8689        if (AT_LEAST_UNI_SEMANTICS) {
8690         if (latest_char_state == char_s) {  /* 'ss' */
8691          ender = LATIN_SMALL_LETTER_SHARP_S;
8692          goto do_tricky;
8693         }
8694         else if (p < RExC_end) {
8695
8696          /* Look-ahead at the next character.  If it
8697          * is also an s, we handle as a sharp s
8698          * tricky regnode.  */
8699          if (*p == 's' || *p == 'S') {
8700
8701           /* But first flush anything in the
8702           * EXACTish buffer */
8703           if (len != 0) {
8704            p = oldp;
8705            goto loopdone;
8706           }
8707           p++; /* Account for swallowing this
8708             's' up */
8709           ender = LATIN_SMALL_LETTER_SHARP_S;
8710           goto do_tricky;
8711          }
8712           /* Here, the next character is not a
8713           * literal 's', but still could
8714           * evaluate to one if part of a \o{},
8715           * \x or \OCTAL-DIGIT.  The minimum
8716           * length required for that is 4, eg
8717           * \x53 or \123 */
8718          else if (*p == '\\'
8719            && p < RExC_end - 4
8720            && (isDIGIT(*(p + 1))
8721             || *(p + 1) == 'x'
8722             || *(p + 1) == 'o' ))
8723          {
8724
8725           /* Here, it could be an 's', too much
8726           * bother to figure it out here.  Flush
8727           * the buffer if any; when come back
8728           * here, set the state so know that the
8729           * previous char was an 's' */
8730           if (len != 0) {
8731            latest_char_state = generic_char;
8732            p = oldp;
8733            goto loopdone;
8734           }
8735           latest_char_state = char_s;
8736           break;
8737          }
8738         }
8739        }
8740
8741        /* Here, can't be an 'ss' sequence, or at least not
8742        * one that could fold to/from the sharp ss */
8743        latest_char_state = generic_char;
8744        break;
8745       case 0x03C5: /* First char in upsilon series */
8746        if (p < RExC_end - 4) { /* Need >= 4 bytes left */
8747         latest_char_state = upsilon_1;
8748         if (len != 0) {
8749          p = oldp;
8750          goto loopdone;
8751         }
8752        }
8753        else {
8754         latest_char_state = generic_char;
8755        }
8756        break;
8757       case 0x03B9: /* First char in iota series */
8758        if (p < RExC_end - 4) {
8759         latest_char_state = iota_1;
8760         if (len != 0) {
8761          p = oldp;
8762          goto loopdone;
8763         }
8764        }
8765        else {
8766         latest_char_state = generic_char;
8767        }
8768        break;
8769       case 0x0308:
8770        if (latest_char_state == upsilon_1) {
8771         latest_char_state = upsilon_2;
8772        }
8773        else if (latest_char_state == iota_1) {
8774         latest_char_state = iota_2;
8775        }
8776        else {
8777         latest_char_state = generic_char;
8778        }
8779        break;
8780       case 0x301:
8781        if (latest_char_state == upsilon_2) {
8782         ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
8783         goto do_tricky;
8784        }
8785        else if (latest_char_state == iota_2) {
8786         ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
8787         goto do_tricky;
8788        }
8789        latest_char_state = generic_char;
8790        break;
8791
8792       /* These are the tricky fold characters.  Flush any
8793       * buffer first. */
8794       case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
8795       case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
8796       case LATIN_SMALL_LETTER_SHARP_S:
8797       case LATIN_CAPITAL_LETTER_SHARP_S:
8798       case 0x1FD3:
8799       case 0x1FE3:
8800        if (len != 0) {
8801         p = oldp;
8802         goto loopdone;
8803        }
8804        /* FALL THROUGH */
8805       do_tricky: {
8806        char* const oldregxend = RExC_end;
8807        U8 tmpbuf[UTF8_MAXBYTES+1];
8808
8809        /* Here, we know we need to generate a special
8810        * regnode, and 'ender' contains the tricky
8811        * character.  What's done is to pretend it's in a
8812        * [bracketed] class, and let the code that deals
8813        * with those handle it, as that code has all the
8814        * intelligence necessary.  First save the current
8815        * parse state, get rid of the already allocated
8816        * but empty EXACT node that the ANYOFV node will
8817        * replace, and point the parse to a buffer which
8818        * we fill with the character we want the regclass
8819        * code to think is being parsed */
8820        RExC_emit = orig_emit;
8821        RExC_parse = (char *) tmpbuf;
8822        if (UTF) {
8823         U8 *d = uvchr_to_utf8(tmpbuf, ender);
8824         *d = '\0';
8825         RExC_end = (char *) d;
8826        }
8827        else {  /* ender above 255 already excluded */
8828         tmpbuf[0] = (U8) ender;
8829         tmpbuf[1] = '\0';
8830         RExC_end = RExC_parse + 1;
8831        }
8832
8833        ret = regclass(pRExC_state,depth+1);
8834
8835        /* Here, have parsed the buffer.  Reset the parse to
8836        * the actual input, and return */
8837        RExC_end = oldregxend;
8838        RExC_parse = p - 1;
8839
8840        Set_Node_Offset(ret, RExC_parse);
8841        Set_Node_Cur_Length(ret);
8842        nextchar(pRExC_state);
8843        *flagp |= HASWIDTH|SIMPLE;
8844        return ret;
8845       }
8846      }
8847     }
8848
8849     if ( RExC_flags & RXf_PMf_EXTENDED)
8850      p = regwhite( pRExC_state, p );
8851     if (UTF && FOLD) {
8852      /* Prime the casefolded buffer.  Locale rules, which apply
8853      * only to code points < 256, aren't known until execution,
8854      * so for them, just output the original character using
8855      * utf8 */
8856      if (LOC && ender < 256) {
8857       if (UNI_IS_INVARIANT(ender)) {
8858        *tmpbuf = (U8) ender;
8859        foldlen = 1;
8860       } else {
8861        *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8862        *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8863        foldlen = 2;
8864       }
8865      }
8866      else if (isASCII(ender)) { /* Note: Here can't also be LOC
8867             */
8868       ender = toLOWER(ender);
8869       *tmpbuf = (U8) ender;
8870       foldlen = 1;
8871      }
8872      else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8873
8874       /* Locale and /aa require more selectivity about the
8875       * fold, so are handled below.  Otherwise, here, just
8876       * use the fold */
8877       ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8878      }
8879      else {
8880       /* Under locale rules or /aa we are not to mix,
8881       * respectively, ords < 256 or ASCII with non-.  So
8882       * reject folds that mix them, using only the
8883       * non-folded code point.  So do the fold to a
8884       * temporary, and inspect each character in it. */
8885       U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8886       U8* s = trialbuf;
8887       UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8888       U8* e = s + foldlen;
8889       bool fold_ok = TRUE;
8890
8891       while (s < e) {
8892        if (isASCII(*s)
8893         || (LOC && (UTF8_IS_INVARIANT(*s)
8894           || UTF8_IS_DOWNGRADEABLE_START(*s))))
8895        {
8896         fold_ok = FALSE;
8897         break;
8898        }
8899        s += UTF8SKIP(s);
8900       }
8901       if (fold_ok) {
8902        Copy(trialbuf, tmpbuf, foldlen, U8);
8903        ender = tmpender;
8904       }
8905       else {
8906        uvuni_to_utf8(tmpbuf, ender);
8907        foldlen = UNISKIP(ender);
8908       }
8909      }
8910     }
8911     if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8912      if (len)
8913       p = oldp;
8914      else if (UTF) {
8915       if (FOLD) {
8916        /* Emit all the Unicode characters. */
8917        STRLEN numlen;
8918        for (foldbuf = tmpbuf;
8919         foldlen;
8920         foldlen -= numlen) {
8921         ender = utf8_to_uvchr(foldbuf, &numlen);
8922         if (numlen > 0) {
8923           const STRLEN unilen = reguni(pRExC_state, ender, s);
8924           s       += unilen;
8925           len     += unilen;
8926           /* In EBCDIC the numlen
8927           * and unilen can differ. */
8928           foldbuf += numlen;
8929           if (numlen >= foldlen)
8930            break;
8931         }
8932         else
8933           break; /* "Can't happen." */
8934        }
8935       }
8936       else {
8937        const STRLEN unilen = reguni(pRExC_state, ender, s);
8938        if (unilen > 0) {
8939         s   += unilen;
8940         len += unilen;
8941        }
8942       }
8943      }
8944      else {
8945       len++;
8946       REGC((char)ender, s++);
8947      }
8948      break;
8949     }
8950     if (UTF) {
8951      if (FOLD) {
8952       /* Emit all the Unicode characters. */
8953       STRLEN numlen;
8954       for (foldbuf = tmpbuf;
8955        foldlen;
8956        foldlen -= numlen) {
8957        ender = utf8_to_uvchr(foldbuf, &numlen);
8958        if (numlen > 0) {
8959          const STRLEN unilen = reguni(pRExC_state, ender, s);
8960          len     += unilen;
8961          s       += unilen;
8962          /* In EBCDIC the numlen
8963          * and unilen can differ. */
8964          foldbuf += numlen;
8965          if (numlen >= foldlen)
8966           break;
8967        }
8968        else
8969          break;
8970       }
8971      }
8972      else {
8973       const STRLEN unilen = reguni(pRExC_state, ender, s);
8974       if (unilen > 0) {
8975        s   += unilen;
8976        len += unilen;
8977       }
8978      }
8979      len--;
8980     }
8981     else {
8982      REGC((char)ender, s++);
8983     }
8984    }
8985   loopdone:   /* Jumped to when encounters something that shouldn't be in
8986      the node */
8987    RExC_parse = p - 1;
8988    Set_Node_Cur_Length(ret); /* MJD */
8989    nextchar(pRExC_state);
8990    {
8991     /* len is STRLEN which is unsigned, need to copy to signed */
8992     IV iv = len;
8993     if (iv < 0)
8994      vFAIL("Internal disaster");
8995    }
8996    if (len > 0)
8997     *flagp |= HASWIDTH;
8998    if (len == 1 && UNI_IS_INVARIANT(ender))
8999     *flagp |= SIMPLE;
9000
9001    if (SIZE_ONLY)
9002     RExC_size += STR_SZ(len);
9003    else {
9004     STR_LEN(ret) = len;
9005     RExC_emit += STR_SZ(len);
9006    }
9007   }
9008   break;
9009  }
9010
9011  return(ret);
9012
9013 /* Jumped to when an unrecognized character set is encountered */
9014 bad_charset:
9015  Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9016  return(NULL);
9017 }
9018
9019 STATIC char *
9020 S_regwhite( RExC_state_t *pRExC_state, char *p )
9021 {
9022  const char *e = RExC_end;
9023
9024  PERL_ARGS_ASSERT_REGWHITE;
9025
9026  while (p < e) {
9027   if (isSPACE(*p))
9028    ++p;
9029   else if (*p == '#') {
9030    bool ended = 0;
9031    do {
9032     if (*p++ == '\n') {
9033      ended = 1;
9034      break;
9035     }
9036    } while (p < e);
9037    if (!ended)
9038     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9039   }
9040   else
9041    break;
9042  }
9043  return p;
9044 }
9045
9046 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9047    Character classes ([:foo:]) can also be negated ([:^foo:]).
9048    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9049    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9050    but trigger failures because they are currently unimplemented. */
9051
9052 #define POSIXCC_DONE(c)   ((c) == ':')
9053 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9054 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9055
9056 STATIC I32
9057 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9058 {
9059  dVAR;
9060  I32 namedclass = OOB_NAMEDCLASS;
9061
9062  PERL_ARGS_ASSERT_REGPPOSIXCC;
9063
9064  if (value == '[' && RExC_parse + 1 < RExC_end &&
9065   /* I smell either [: or [= or [. -- POSIX has been here, right? */
9066   POSIXCC(UCHARAT(RExC_parse))) {
9067   const char c = UCHARAT(RExC_parse);
9068   char* const s = RExC_parse++;
9069
9070   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9071    RExC_parse++;
9072   if (RExC_parse == RExC_end)
9073    /* Grandfather lone [:, [=, [. */
9074    RExC_parse = s;
9075   else {
9076    const char* const t = RExC_parse++; /* skip over the c */
9077    assert(*t == c);
9078
9079    if (UCHARAT(RExC_parse) == ']') {
9080     const char *posixcc = s + 1;
9081     RExC_parse++; /* skip over the ending ] */
9082
9083     if (*s == ':') {
9084      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9085      const I32 skip = t - posixcc;
9086
9087      /* Initially switch on the length of the name.  */
9088      switch (skip) {
9089      case 4:
9090       if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9091        namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9092       break;
9093      case 5:
9094       /* Names all of length 5.  */
9095       /* alnum alpha ascii blank cntrl digit graph lower
9096       print punct space upper  */
9097       /* Offset 4 gives the best switch position.  */
9098       switch (posixcc[4]) {
9099       case 'a':
9100        if (memEQ(posixcc, "alph", 4)) /* alpha */
9101         namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9102        break;
9103       case 'e':
9104        if (memEQ(posixcc, "spac", 4)) /* space */
9105         namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9106        break;
9107       case 'h':
9108        if (memEQ(posixcc, "grap", 4)) /* graph */
9109         namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9110        break;
9111       case 'i':
9112        if (memEQ(posixcc, "asci", 4)) /* ascii */
9113         namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9114        break;
9115       case 'k':
9116        if (memEQ(posixcc, "blan", 4)) /* blank */
9117         namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9118        break;
9119       case 'l':
9120        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9121         namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9122        break;
9123       case 'm':
9124        if (memEQ(posixcc, "alnu", 4)) /* alnum */
9125         namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9126        break;
9127       case 'r':
9128        if (memEQ(posixcc, "lowe", 4)) /* lower */
9129         namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9130        else if (memEQ(posixcc, "uppe", 4)) /* upper */
9131         namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9132        break;
9133       case 't':
9134        if (memEQ(posixcc, "digi", 4)) /* digit */
9135         namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9136        else if (memEQ(posixcc, "prin", 4)) /* print */
9137         namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9138        else if (memEQ(posixcc, "punc", 4)) /* punct */
9139         namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9140        break;
9141       }
9142       break;
9143      case 6:
9144       if (memEQ(posixcc, "xdigit", 6))
9145        namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9146       break;
9147      }
9148
9149      if (namedclass == OOB_NAMEDCLASS)
9150       Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9151          t - s - 1, s + 1);
9152      assert (posixcc[skip] == ':');
9153      assert (posixcc[skip+1] == ']');
9154     } else if (!SIZE_ONLY) {
9155      /* [[=foo=]] and [[.foo.]] are still future. */
9156
9157      /* adjust RExC_parse so the warning shows after
9158      the class closes */
9159      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9160       RExC_parse++;
9161      Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9162     }
9163    } else {
9164     /* Maternal grandfather:
9165     * "[:" ending in ":" but not in ":]" */
9166     RExC_parse = s;
9167    }
9168   }
9169  }
9170
9171  return namedclass;
9172 }
9173
9174 STATIC void
9175 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9176 {
9177  dVAR;
9178
9179  PERL_ARGS_ASSERT_CHECKPOSIXCC;
9180
9181  if (POSIXCC(UCHARAT(RExC_parse))) {
9182   const char *s = RExC_parse;
9183   const char  c = *s++;
9184
9185   while (isALNUM(*s))
9186    s++;
9187   if (*s && c == *s && s[1] == ']') {
9188    ckWARN3reg(s+2,
9189      "POSIX syntax [%c %c] belongs inside character classes",
9190      c, c);
9191
9192    /* [[=foo=]] and [[.foo.]] are still future. */
9193    if (POSIXCC_NOTYET(c)) {
9194     /* adjust RExC_parse so the error shows after
9195     the class closes */
9196     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9197      NOOP;
9198     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9199    }
9200   }
9201  }
9202 }
9203
9204 /* No locale test, and always Unicode semantics */
9205 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
9206 ANYOF_##NAME:                                                                  \
9207   for (value = 0; value < 256; value++)                                  \
9208    if (TEST)                                                          \
9209    stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9210  yesno = '+';                                                               \
9211  what = WORD;                                                               \
9212  break;                                                                     \
9213 case ANYOF_N##NAME:                                                            \
9214   for (value = 0; value < 256; value++)                                  \
9215    if (!TEST)                                                         \
9216    stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9217  yesno = '!';                                                               \
9218  what = WORD;                                                               \
9219  break
9220
9221 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9222  * there are two tests passed in, to use depending on that. There aren't any
9223  * cases where the label is different from the name, so no need for that
9224  * parameter */
9225 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD)                                    \
9226 ANYOF_##NAME:                                                                  \
9227  if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
9228  else if (UNI_SEMANTICS) {                                                  \
9229   for (value = 0; value < 256; value++) {                                \
9230    if (TEST_8(value)) stored +=                                       \
9231      set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9232   }                                                                      \
9233  }                                                                          \
9234  else {                                                                     \
9235   for (value = 0; value < 128; value++) {                                \
9236    if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
9237     set_regclass_bit(pRExC_state, ret,                     \
9238         (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);                 \
9239   }                                                                      \
9240  }                                                                          \
9241  yesno = '+';                                                               \
9242  what = WORD;                                                               \
9243  break;                                                                     \
9244 case ANYOF_N##NAME:                                                            \
9245  if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
9246  else if (UNI_SEMANTICS) {                                                  \
9247   for (value = 0; value < 256; value++) {                                \
9248    if (! TEST_8(value)) stored +=                                     \
9249      set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);    \
9250   }                                                                      \
9251  }                                                                          \
9252  else {                                                                     \
9253   for (value = 0; value < 128; value++) {                                \
9254    if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
9255       pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);    \
9256   }                                                                      \
9257   if (AT_LEAST_ASCII_RESTRICTED) {                                       \
9258    for (value = 128; value < 256; value++) {                          \
9259    stored += set_regclass_bit(                                     \
9260       pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9261    }                                                                  \
9262    ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;                             \
9263   }                                                                      \
9264   else {                                                                 \
9265    /* For a non-ut8 target string with DEPENDS semantics, all above   \
9266    * ASCII Latin1 code points match the complement of any of the     \
9267    * classes.  But in utf8, they have their Unicode semantics, so    \
9268    * can't just set them in the bitmap, or else regexec.c will think \
9269    * they matched when they shouldn't. */                            \
9270    ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;                     \
9271   }                                                                      \
9272  }                                                                          \
9273  yesno = '!';                                                               \
9274  what = WORD;                                                               \
9275  break
9276
9277 STATIC U8
9278 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9279 {
9280
9281  /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9282  * Locale folding is done at run-time, so this function should not be
9283  * called for nodes that are for locales.
9284  *
9285  * This function sets the bit corresponding to the fold of the input
9286  * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
9287  * 'F' is 'f'.
9288  *
9289  * It also knows about the characters that are in the bitmap that have
9290  * folds that are matchable only outside it, and sets the appropriate lists
9291  * and flags.
9292  *
9293  * It returns the number of bits that actually changed from 0 to 1 */
9294
9295  U8 stored = 0;
9296  U8 fold;
9297
9298  PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9299
9300  fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9301          : PL_fold[value];
9302
9303  /* It assumes the bit for 'value' has already been set */
9304  if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9305   ANYOF_BITMAP_SET(node, fold);
9306   stored++;
9307  }
9308  if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9309   /* Certain Latin1 characters have matches outside the bitmap.  To get
9310   * here, 'value' is one of those characters.   None of these matches is
9311   * valid for ASCII characters under /aa, which have been excluded by
9312   * the 'if' above.  The matches fall into three categories:
9313   * 1) They are singly folded-to or -from an above 255 character, as
9314   *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9315   *    WITH DIAERESIS;
9316   * 2) They are part of a multi-char fold with another character in the
9317   *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9318   * 3) They are part of a multi-char fold with a character not in the
9319   *    bitmap, such as various ligatures.
9320   * We aren't dealing fully with multi-char folds, except we do deal
9321   * with the pattern containing a character that has a multi-char fold
9322   * (not so much the inverse).
9323   * For types 1) and 3), the matches only happen when the target string
9324   * is utf8; that's not true for 2), and we set a flag for it.
9325   *
9326   * The code below adds to the passed in inversion list the single fold
9327   * closures for 'value'.  The values are hard-coded here so that an
9328   * innocent-looking character class, like /[ks]/i won't have to go out
9329   * to disk to find the possible matches.  XXX It would be better to
9330   * generate these via regen, in case a new version of the Unicode
9331   * standard adds new mappings, though that is not really likely. */
9332   switch (value) {
9333    case 'k':
9334    case 'K':
9335     /* KELVIN SIGN */
9336     *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9337     break;
9338    case 's':
9339    case 'S':
9340     /* LATIN SMALL LETTER LONG S */
9341     *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9342     break;
9343    case MICRO_SIGN:
9344     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9345             GREEK_SMALL_LETTER_MU);
9346     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9347             GREEK_CAPITAL_LETTER_MU);
9348     break;
9349    case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9350    case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9351     /* ANGSTROM SIGN */
9352     *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9353     if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
9354      *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9355              PL_fold_latin1[value]);
9356     }
9357     break;
9358    case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9359     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9360           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9361     break;
9362    case LATIN_SMALL_LETTER_SHARP_S:
9363     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9364           LATIN_CAPITAL_LETTER_SHARP_S);
9365
9366     /* Under /a, /d, and /u, this can match the two chars "ss" */
9367     if (! MORE_ASCII_RESTRICTED) {
9368      add_alternate(alternate_ptr, (U8 *) "ss", 2);
9369
9370      /* And under /u or /a, it can match even if the target is
9371      * not utf8 */
9372      if (AT_LEAST_UNI_SEMANTICS) {
9373       ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9374      }
9375     }
9376     break;
9377    case 'F': case 'f':
9378    case 'I': case 'i':
9379    case 'L': case 'l':
9380    case 'T': case 't':
9381    case 'A': case 'a':
9382    case 'H': case 'h':
9383    case 'J': case 'j':
9384    case 'N': case 'n':
9385    case 'W': case 'w':
9386    case 'Y': case 'y':
9387     /* These all are targets of multi-character folds from code
9388     * points that require UTF8 to express, so they can't match
9389     * unless the target string is in UTF-8, so no action here is
9390     * necessary, as regexec.c properly handles the general case
9391     * for UTF-8 matching */
9392     break;
9393    default:
9394     /* Use deprecated warning to increase the chances of this
9395     * being output */
9396     ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9397     break;
9398   }
9399  }
9400  else if (DEPENDS_SEMANTICS
9401    && ! isASCII(value)
9402    && PL_fold_latin1[value] != value)
9403  {
9404   /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9405    * folds only when the target string is in UTF-8.  We add the fold
9406    * here to the list of things to match outside the bitmap, which
9407    * won't be looked at unless it is UTF8 (or else if something else
9408    * says to look even if not utf8, but those things better not happen
9409    * under DEPENDS semantics. */
9410   *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
9411  }
9412
9413  return stored;
9414 }
9415
9416
9417 PERL_STATIC_INLINE U8
9418 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9419 {
9420  /* This inline function sets a bit in the bitmap if not already set, and if
9421  * appropriate, its fold, returning the number of bits that actually
9422  * changed from 0 to 1 */
9423
9424  U8 stored;
9425
9426  PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9427
9428  if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
9429   return 0;
9430  }
9431
9432  ANYOF_BITMAP_SET(node, value);
9433  stored = 1;
9434
9435  if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
9436   stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
9437  }
9438
9439  return stored;
9440 }
9441
9442 STATIC void
9443 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9444 {
9445  /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9446  * alternate list, pointed to by 'alternate_ptr'.  This is an array of
9447  * the multi-character folds of characters in the node */
9448  SV *sv;
9449
9450  PERL_ARGS_ASSERT_ADD_ALTERNATE;
9451
9452  if (! *alternate_ptr) {
9453   *alternate_ptr = newAV();
9454  }
9455  sv = newSVpvn_utf8((char*)string, len, TRUE);
9456  av_push(*alternate_ptr, sv);
9457  return;
9458 }
9459
9460 /*
9461    parse a class specification and produce either an ANYOF node that
9462    matches the pattern or perhaps will be optimized into an EXACTish node
9463    instead. The node contains a bit map for the first 256 characters, with the
9464    corresponding bit set if that character is in the list.  For characters
9465    above 255, a range list is used */
9466
9467 STATIC regnode *
9468 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9469 {
9470  dVAR;
9471  register UV nextvalue;
9472  register IV prevvalue = OOB_UNICODE;
9473  register IV range = 0;
9474  UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9475  register regnode *ret;
9476  STRLEN numlen;
9477  IV namedclass;
9478  char *rangebegin = NULL;
9479  bool need_class = 0;
9480  bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
9481  SV *listsv = NULL;
9482  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9483          than just initialized.  */
9484  UV n;
9485
9486  /* code points this node matches that can't be stored in the bitmap */
9487  HV* nonbitmap = NULL;
9488
9489  /* The items that are to match that aren't stored in the bitmap, but are a
9490  * result of things that are stored there.  This is the fold closure of
9491  * such a character, either because it has DEPENDS semantics and shouldn't
9492  * be matched unless the target string is utf8, or is a code point that is
9493  * too large for the bit map, as for example, the fold of the MICRO SIGN is
9494  * above 255.  This all is solely for performance reasons.  By having this
9495  * code know the outside-the-bitmap folds that the bitmapped characters are
9496  * involved with, we don't have to go out to disk to find the list of
9497  * matches, unless the character class includes code points that aren't
9498  * storable in the bit map.  That means that a character class with an 's'
9499  * in it, for example, doesn't need to go out to disk to find everything
9500  * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
9501  * empty unless there is something whose fold we don't know about, and will
9502  * have to go out to the disk to find. */
9503  HV* l1_fold_invlist = NULL;
9504
9505  /* List of multi-character folds that are matched by this node */
9506  AV* unicode_alternate  = NULL;
9507 #ifdef EBCDIC
9508  UV literal_endpoint = 0;
9509 #endif
9510  UV stored = 0;  /* how many chars stored in the bitmap */
9511
9512  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9513   case we need to change the emitted regop to an EXACT. */
9514  const char * orig_parse = RExC_parse;
9515  GET_RE_DEBUG_FLAGS_DECL;
9516
9517  PERL_ARGS_ASSERT_REGCLASS;
9518 #ifndef DEBUGGING
9519  PERL_UNUSED_ARG(depth);
9520 #endif
9521
9522  DEBUG_PARSE("clas");
9523
9524  /* Assume we are going to generate an ANYOF node. */
9525  ret = reganode(pRExC_state, ANYOF, 0);
9526
9527
9528  if (!SIZE_ONLY) {
9529   ANYOF_FLAGS(ret) = 0;
9530  }
9531
9532  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
9533   RExC_naughty++;
9534   RExC_parse++;
9535   if (!SIZE_ONLY)
9536    ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9537
9538   /* We have decided to not allow multi-char folds in inverted character
9539   * classes, due to the confusion that can happen, even with classes
9540   * that are designed for a non-Unicode world:  You have the peculiar
9541   * case that:
9542    "s s" =~ /^[^\xDF]+$/i => Y
9543    "ss"  =~ /^[^\xDF]+$/i => N
9544   *
9545   * See [perl #89750] */
9546   allow_full_fold = FALSE;
9547  }
9548
9549  if (SIZE_ONLY) {
9550   RExC_size += ANYOF_SKIP;
9551   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9552  }
9553  else {
9554   RExC_emit += ANYOF_SKIP;
9555   if (LOC) {
9556    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9557   }
9558   ANYOF_BITMAP_ZERO(ret);
9559   listsv = newSVpvs("# comment\n");
9560   initial_listsv_len = SvCUR(listsv);
9561  }
9562
9563  nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9564
9565  if (!SIZE_ONLY && POSIXCC(nextvalue))
9566   checkposixcc(pRExC_state);
9567
9568  /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9569  if (UCHARAT(RExC_parse) == ']')
9570   goto charclassloop;
9571
9572 parseit:
9573  while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9574
9575  charclassloop:
9576
9577   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9578
9579   if (!range)
9580    rangebegin = RExC_parse;
9581   if (UTF) {
9582    value = utf8n_to_uvchr((U8*)RExC_parse,
9583         RExC_end - RExC_parse,
9584         &numlen, UTF8_ALLOW_DEFAULT);
9585    RExC_parse += numlen;
9586   }
9587   else
9588    value = UCHARAT(RExC_parse++);
9589
9590   nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9591   if (value == '[' && POSIXCC(nextvalue))
9592    namedclass = regpposixcc(pRExC_state, value);
9593   else if (value == '\\') {
9594    if (UTF) {
9595     value = utf8n_to_uvchr((U8*)RExC_parse,
9596         RExC_end - RExC_parse,
9597         &numlen, UTF8_ALLOW_DEFAULT);
9598     RExC_parse += numlen;
9599    }
9600    else
9601     value = UCHARAT(RExC_parse++);
9602    /* Some compilers cannot handle switching on 64-bit integer
9603    * values, therefore value cannot be an UV.  Yes, this will
9604    * be a problem later if we want switch on Unicode.
9605    * A similar issue a little bit later when switching on
9606    * namedclass. --jhi */
9607    switch ((I32)value) {
9608    case 'w': namedclass = ANYOF_ALNUM; break;
9609    case 'W': namedclass = ANYOF_NALNUM; break;
9610    case 's': namedclass = ANYOF_SPACE; break;
9611    case 'S': namedclass = ANYOF_NSPACE; break;
9612    case 'd': namedclass = ANYOF_DIGIT; break;
9613    case 'D': namedclass = ANYOF_NDIGIT; break;
9614    case 'v': namedclass = ANYOF_VERTWS; break;
9615    case 'V': namedclass = ANYOF_NVERTWS; break;
9616    case 'h': namedclass = ANYOF_HORIZWS; break;
9617    case 'H': namedclass = ANYOF_NHORIZWS; break;
9618    case 'N':  /* Handle \N{NAME} in class */
9619     {
9620      /* We only pay attention to the first char of
9621      multichar strings being returned. I kinda wonder
9622      if this makes sense as it does change the behaviour
9623      from earlier versions, OTOH that behaviour was broken
9624      as well. */
9625      UV v; /* value is register so we cant & it /grrr */
9626      if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
9627       goto parseit;
9628      }
9629      value= v;
9630     }
9631     break;
9632    case 'p':
9633    case 'P':
9634     {
9635     char *e;
9636     if (RExC_parse >= RExC_end)
9637      vFAIL2("Empty \\%c{}", (U8)value);
9638     if (*RExC_parse == '{') {
9639      const U8 c = (U8)value;
9640      e = strchr(RExC_parse++, '}');
9641      if (!e)
9642       vFAIL2("Missing right brace on \\%c{}", c);
9643      while (isSPACE(UCHARAT(RExC_parse)))
9644       RExC_parse++;
9645      if (e == RExC_parse)
9646       vFAIL2("Empty \\%c{}", c);
9647      n = e - RExC_parse;
9648      while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9649       n--;
9650     }
9651     else {
9652      e = RExC_parse;
9653      n = 1;
9654     }
9655     if (!SIZE_ONLY) {
9656      if (UCHARAT(RExC_parse) == '^') {
9657       RExC_parse++;
9658       n--;
9659       value = value == 'p' ? 'P' : 'p'; /* toggle */
9660       while (isSPACE(UCHARAT(RExC_parse))) {
9661        RExC_parse++;
9662        n--;
9663       }
9664      }
9665
9666      /* Add the property name to the list.  If /i matching, give
9667      * a different name which consists of the normal name
9668      * sandwiched between two underscores and '_i'.  The design
9669      * is discussed in the commit message for this. */
9670      Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9671           (value=='p' ? '+' : '!'),
9672           (FOLD) ? "__" : "",
9673           (int)n,
9674           RExC_parse,
9675           (FOLD) ? "_i" : ""
9676          );
9677     }
9678     RExC_parse = e + 1;
9679
9680     /* The \p could match something in the Latin1 range, hence
9681     * something that isn't utf8 */
9682     ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9683     namedclass = ANYOF_MAX;  /* no official name, but it's named */
9684
9685     /* \p means they want Unicode semantics */
9686     RExC_uni_semantics = 1;
9687     }
9688     break;
9689    case 'n': value = '\n';   break;
9690    case 'r': value = '\r';   break;
9691    case 't': value = '\t';   break;
9692    case 'f': value = '\f';   break;
9693    case 'b': value = '\b';   break;
9694    case 'e': value = ASCII_TO_NATIVE('\033');break;
9695    case 'a': value = ASCII_TO_NATIVE('\007');break;
9696    case 'o':
9697     RExC_parse--; /* function expects to be pointed at the 'o' */
9698     {
9699      const char* error_msg;
9700      bool valid = grok_bslash_o(RExC_parse,
9701            &value,
9702            &numlen,
9703            &error_msg,
9704            SIZE_ONLY);
9705      RExC_parse += numlen;
9706      if (! valid) {
9707       vFAIL(error_msg);
9708      }
9709     }
9710     if (PL_encoding && value < 0x100) {
9711      goto recode_encoding;
9712     }
9713     break;
9714    case 'x':
9715     if (*RExC_parse == '{') {
9716      I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9717       | PERL_SCAN_DISALLOW_PREFIX;
9718      char * const e = strchr(RExC_parse++, '}');
9719      if (!e)
9720       vFAIL("Missing right brace on \\x{}");
9721
9722      numlen = e - RExC_parse;
9723      value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9724      RExC_parse = e + 1;
9725     }
9726     else {
9727      I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9728      numlen = 2;
9729      value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9730      RExC_parse += numlen;
9731     }
9732     if (PL_encoding && value < 0x100)
9733      goto recode_encoding;
9734     break;
9735    case 'c':
9736     value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9737     break;
9738    case '0': case '1': case '2': case '3': case '4':
9739    case '5': case '6': case '7':
9740     {
9741      /* Take 1-3 octal digits */
9742      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9743      numlen = 3;
9744      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9745      RExC_parse += numlen;
9746      if (PL_encoding && value < 0x100)
9747       goto recode_encoding;
9748      break;
9749     }
9750    recode_encoding:
9751     if (! RExC_override_recoding) {
9752      SV* enc = PL_encoding;
9753      value = reg_recode((const char)(U8)value, &enc);
9754      if (!enc && SIZE_ONLY)
9755       ckWARNreg(RExC_parse,
9756         "Invalid escape in the specified encoding");
9757      break;
9758     }
9759    default:
9760     /* Allow \_ to not give an error */
9761     if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9762      ckWARN2reg(RExC_parse,
9763        "Unrecognized escape \\%c in character class passed through",
9764        (int)value);
9765     }
9766     break;
9767    }
9768   } /* end of \blah */
9769 #ifdef EBCDIC
9770   else
9771    literal_endpoint++;
9772 #endif
9773
9774   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9775
9776    /* What matches in a locale is not known until runtime, so need to
9777    * (one time per class) allocate extra space to pass to regexec.
9778    * The space will contain a bit for each named class that is to be
9779    * matched against.  This isn't needed for \p{} and pseudo-classes,
9780    * as they are not affected by locale, and hence are dealt with
9781    * separately */
9782    if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9783     need_class = 1;
9784     if (SIZE_ONLY) {
9785      RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9786     }
9787     else {
9788      RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9789      ANYOF_CLASS_ZERO(ret);
9790     }
9791     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9792    }
9793
9794    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
9795    * literal, as is the character that began the false range, i.e.
9796    * the 'a' in the examples */
9797    if (range) {
9798     if (!SIZE_ONLY) {
9799      const int w =
9800       RExC_parse >= rangebegin ?
9801       RExC_parse - rangebegin : 0;
9802      ckWARN4reg(RExC_parse,
9803        "False [] range \"%*.*s\"",
9804        w, w, rangebegin);
9805
9806      stored +=
9807       set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9808      if (prevvalue < 256) {
9809       stored +=
9810       set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
9811      }
9812      else {
9813       nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
9814      }
9815     }
9816
9817     range = 0; /* this was not a true range */
9818    }
9819
9820
9821
9822    if (!SIZE_ONLY) {
9823     const char *what = NULL;
9824     char yesno = 0;
9825
9826     /* Possible truncation here but in some 64-bit environments
9827     * the compiler gets heartburn about switch on 64-bit values.
9828     * A similar issue a little earlier when switching on value.
9829     * --jhi */
9830     switch ((I32)namedclass) {
9831
9832     case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9833     case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9834     case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9835     case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9836     case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9837     case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9838     case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9839     case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9840     case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9841     case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9842     /* \s, \w match all unicode if utf8. */
9843     case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9844     case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9845     case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9846     case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9847     case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9848     case ANYOF_ASCII:
9849      if (LOC)
9850       ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9851      else {
9852       for (value = 0; value < 128; value++)
9853        stored +=
9854        set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9855      }
9856      yesno = '+';
9857      what = NULL; /* Doesn't match outside ascii, so
9858           don't want to add +utf8:: */
9859      break;
9860     case ANYOF_NASCII:
9861      if (LOC)
9862       ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9863      else {
9864       for (value = 128; value < 256; value++)
9865        stored +=
9866        set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9867      }
9868      ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9869      yesno = '!';
9870      what = "ASCII";
9871      break;
9872     case ANYOF_DIGIT:
9873      if (LOC)
9874       ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9875      else {
9876       /* consecutive digits assumed */
9877       for (value = '0'; value <= '9'; value++)
9878        stored +=
9879        set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9880      }
9881      yesno = '+';
9882      what = "Digit";
9883      break;
9884     case ANYOF_NDIGIT:
9885      if (LOC)
9886       ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9887      else {
9888       /* consecutive digits assumed */
9889       for (value = 0; value < '0'; value++)
9890        stored +=
9891        set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9892       for (value = '9' + 1; value < 256; value++)
9893        stored +=
9894        set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9895      }
9896      yesno = '!';
9897      what = "Digit";
9898      if (AT_LEAST_ASCII_RESTRICTED ) {
9899       ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9900      }
9901      break;
9902     case ANYOF_MAX:
9903      /* this is to handle \p and \P */
9904      break;
9905     default:
9906      vFAIL("Invalid [::] class");
9907      break;
9908     }
9909     if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9910      /* Strings such as "+utf8::isWord\n" */
9911      Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9912     }
9913
9914     continue;
9915    }
9916   } /* end of namedclass \blah */
9917
9918   if (range) {
9919    if (prevvalue > (IV)value) /* b-a */ {
9920     const int w = RExC_parse - rangebegin;
9921     Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9922     range = 0; /* not a valid range */
9923    }
9924   }
9925   else {
9926    prevvalue = value; /* save the beginning of the range */
9927    if (RExC_parse+1 < RExC_end
9928     && *RExC_parse == '-'
9929     && RExC_parse[1] != ']')
9930    {
9931     RExC_parse++;
9932
9933     /* a bad range like \w-, [:word:]- ? */
9934     if (namedclass > OOB_NAMEDCLASS) {
9935      if (ckWARN(WARN_REGEXP)) {
9936       const int w =
9937        RExC_parse >= rangebegin ?
9938        RExC_parse - rangebegin : 0;
9939       vWARN4(RExC_parse,
9940        "False [] range \"%*.*s\"",
9941        w, w, rangebegin);
9942      }
9943      if (!SIZE_ONLY)
9944       stored +=
9945        set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9946     } else
9947      range = 1; /* yeah, it's a range! */
9948     continue; /* but do it the next time */
9949    }
9950   }
9951
9952   /* non-Latin1 code point implies unicode semantics.  Must be set in
9953   * pass1 so is there for the whole of pass 2 */
9954   if (value > 255) {
9955    RExC_uni_semantics = 1;
9956   }
9957
9958   /* now is the next time */
9959   if (!SIZE_ONLY) {
9960    if (prevvalue < 256) {
9961     const IV ceilvalue = value < 256 ? value : 255;
9962     IV i;
9963 #ifdef EBCDIC
9964     /* In EBCDIC [\x89-\x91] should include
9965     * the \x8e but [i-j] should not. */
9966     if (literal_endpoint == 2 &&
9967      ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9968      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9969     {
9970      if (isLOWER(prevvalue)) {
9971       for (i = prevvalue; i <= ceilvalue; i++)
9972        if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9973         stored +=
9974         set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9975        }
9976      } else {
9977       for (i = prevvalue; i <= ceilvalue; i++)
9978        if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9979         stored +=
9980         set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9981        }
9982      }
9983     }
9984     else
9985 #endif
9986      for (i = prevvalue; i <= ceilvalue; i++) {
9987       stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9988      }
9989   }
9990   if (value > 255) {
9991    const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
9992    const UV natvalue      = NATIVE_TO_UNI(value);
9993    nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
9994   }
9995 #ifdef EBCDIC
9996    literal_endpoint = 0;
9997 #endif
9998   }
9999
10000   range = 0; /* this range (if it was one) is done now */
10001  }
10002
10003
10004
10005  if (SIZE_ONLY)
10006   return ret;
10007  /****** !SIZE_ONLY AFTER HERE *********/
10008
10009  /* If folding and there are code points above 255, we calculate all
10010  * characters that could fold to or from the ones already on the list */
10011  if (FOLD && nonbitmap) {
10012   UV i;
10013
10014   HV* fold_intersection;
10015   UV* fold_list;
10016
10017   /* This is a list of all the characters that participate in folds
10018    * (except marks, etc in multi-char folds */
10019   if (! PL_utf8_foldable) {
10020    SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
10021    PL_utf8_foldable = _swash_to_invlist(swash);
10022   }
10023
10024   /* This is a hash that for a particular fold gives all characters
10025    * that are involved in it */
10026   if (! PL_utf8_foldclosures) {
10027
10028    /* If we were unable to find any folds, then we likely won't be
10029    * able to find the closures.  So just create an empty list.
10030    * Folding will effectively be restricted to the non-Unicode rules
10031    * hard-coded into Perl.  (This case happens legitimately during
10032    * compilation of Perl itself before the Unicode tables are
10033    * generated) */
10034    if (invlist_len(PL_utf8_foldable) == 0) {
10035     PL_utf8_foldclosures = _new_invlist(0);
10036    } else {
10037     /* If the folds haven't been read in, call a fold function
10038      * to force that */
10039     if (! PL_utf8_tofold) {
10040      U8 dummy[UTF8_MAXBYTES+1];
10041      STRLEN dummy_len;
10042      to_utf8_fold((U8*) "A", dummy, &dummy_len);
10043     }
10044     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10045    }
10046   }
10047
10048   /* Only the characters in this class that participate in folds need
10049    * be checked.  Get the intersection of this class and all the
10050    * possible characters that are foldable.  This can quickly narrow
10051    * down a large class */
10052   fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
10053
10054   /* Now look at the foldable characters in this class individually */
10055   fold_list = invlist_array(fold_intersection);
10056   for (i = 0; i < invlist_len(fold_intersection); i++) {
10057    UV j;
10058
10059    /* The next entry is the beginning of the range that is in the
10060    * class */
10061    UV start = fold_list[i++];
10062
10063
10064    /* The next entry is the beginning of the next range, which
10065     * isn't in the class, so the end of the current range is one
10066     * less than that */
10067    UV end = fold_list[i] - 1;
10068
10069    /* Look at every character in the range */
10070    for (j = start; j <= end; j++) {
10071
10072     /* Get its fold */
10073     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10074     STRLEN foldlen;
10075     const UV f =
10076      _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
10077
10078     if (foldlen > (STRLEN)UNISKIP(f)) {
10079
10080      /* Any multicharacter foldings (disallowed in
10081       * lookbehind patterns) require the following
10082       * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10083       * E folds into "pq" and F folds into "rst", all other
10084       * characters fold to single characters.  We save away
10085       * these multicharacter foldings, to be later saved as
10086       * part of the additional "s" data. */
10087      if (! RExC_in_lookbehind) {
10088       U8* loc = foldbuf;
10089       U8* e = foldbuf + foldlen;
10090
10091       /* If any of the folded characters of this are in
10092        * the Latin1 range, tell the regex engine that
10093        * this can match a non-utf8 target string.  The
10094        * only multi-byte fold whose source is in the
10095        * Latin1 range (U+00DF) applies only when the
10096        * target string is utf8, or under unicode rules */
10097       if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10098        while (loc < e) {
10099
10100         /* Can't mix ascii with non- under /aa */
10101         if (MORE_ASCII_RESTRICTED
10102          && (isASCII(*loc) != isASCII(j)))
10103         {
10104          goto end_multi_fold;
10105         }
10106         if (UTF8_IS_INVARIANT(*loc)
10107          || UTF8_IS_DOWNGRADEABLE_START(*loc))
10108         {
10109          /* Can't mix above and below 256 under
10110           * LOC */
10111          if (LOC) {
10112           goto end_multi_fold;
10113          }
10114          ANYOF_FLAGS(ret)
10115            |= ANYOF_NONBITMAP_NON_UTF8;
10116          break;
10117         }
10118         loc += UTF8SKIP(loc);
10119        }
10120       }
10121
10122       add_alternate(&unicode_alternate, foldbuf, foldlen);
10123      end_multi_fold: ;
10124      }
10125
10126      /* This is special-cased, as it is the only letter which
10127      * has both a multi-fold and single-fold in Latin1.  All
10128      * the other chars that have single and multi-folds are
10129      * always in utf8, and the utf8 folding algorithm catches
10130      * them */
10131      if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
10132       stored += set_regclass_bit(pRExC_state,
10133           ret,
10134           LATIN_SMALL_LETTER_SHARP_S,
10135           &l1_fold_invlist, &unicode_alternate);
10136      }
10137     }
10138     else {
10139      /* Single character fold.  Add everything in its fold
10140       * closure to the list that this node should match */
10141      SV** listp;
10142
10143      /* The fold closures data structure is a hash with the
10144       * keys being every character that is folded to, like
10145       * 'k', and the values each an array of everything that
10146       * folds to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
10147      if ((listp = hv_fetch(PL_utf8_foldclosures,
10148          (char *) foldbuf, foldlen, FALSE)))
10149      {
10150       AV* list = (AV*) *listp;
10151       IV k;
10152       for (k = 0; k <= av_len(list); k++) {
10153        SV** c_p = av_fetch(list, k, FALSE);
10154        UV c;
10155        if (c_p == NULL) {
10156         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10157        }
10158        c = SvUV(*c_p);
10159
10160        /* /aa doesn't allow folds between ASCII and
10161         * non-; /l doesn't allow them between above
10162         * and below 256 */
10163        if ((MORE_ASCII_RESTRICTED
10164         && (isASCII(c) != isASCII(j)))
10165          || (LOC && ((c < 256) != (j < 256))))
10166        {
10167         continue;
10168        }
10169
10170        if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10171         stored += set_regclass_bit(pRExC_state,
10172           ret,
10173           (U8) c,
10174           &l1_fold_invlist, &unicode_alternate);
10175        }
10176         /* It may be that the code point is already
10177          * in this range or already in the bitmap,
10178          * in which case we need do nothing */
10179        else if ((c < start || c > end)
10180           && (c > 255
10181            || ! ANYOF_BITMAP_TEST(ret, c)))
10182        {
10183         nonbitmap = add_cp_to_invlist(nonbitmap, c);
10184        }
10185       }
10186      }
10187     }
10188    }
10189   }
10190   invlist_destroy(fold_intersection);
10191  }
10192
10193  /* Combine the two lists into one. */
10194  if (l1_fold_invlist) {
10195   if (nonbitmap) {
10196    HV* temp = invlist_union(nonbitmap, l1_fold_invlist);
10197    invlist_destroy(nonbitmap);
10198    nonbitmap = temp;
10199    invlist_destroy(l1_fold_invlist);
10200   }
10201   else {
10202    nonbitmap = l1_fold_invlist;
10203   }
10204  }
10205
10206  /* Here, we have calculated what code points should be in the character
10207  * class.   Now we can see about various optimizations.  Fold calculation
10208  * needs to take place before inversion.  Otherwise /[^k]/i would invert to
10209  * include K, which under /i would match k. */
10210
10211  /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
10212  * set the FOLD flag yet, so this this does optimize those.  It doesn't
10213  * optimize locale.  Doing so perhaps could be done as long as there is
10214  * nothing like \w in it; some thought also would have to be given to the
10215  * interaction with above 0x100 chars */
10216  if (! LOC
10217   && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10218   && ! unicode_alternate
10219   && ! nonbitmap
10220   && SvCUR(listsv) == initial_listsv_len)
10221  {
10222   for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10223    ANYOF_BITMAP(ret)[value] ^= 0xFF;
10224   stored = 256 - stored;
10225
10226   /* The inversion means that everything above 255 is matched; and at the
10227   * same time we clear the invert flag */
10228   ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
10229  }
10230
10231  /* Folding in the bitmap is taken care of above, but not for locale (for
10232  * which we have to wait to see what folding is in effect at runtime), and
10233  * for things not in the bitmap.  Set run-time fold flag for these */
10234  if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
10235   ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10236  }
10237
10238  /* A single character class can be "optimized" into an EXACTish node.
10239  * Note that since we don't currently count how many characters there are
10240  * outside the bitmap, we are XXX missing optimization possibilities for
10241  * them.  This optimization can't happen unless this is a truly single
10242  * character class, which means that it can't be an inversion into a
10243  * many-character class, and there must be no possibility of there being
10244  * things outside the bitmap.  'stored' (only) for locales doesn't include
10245  * \w, etc, so have to make a special test that they aren't present
10246  *
10247  * Similarly A 2-character class of the very special form like [bB] can be
10248  * optimized into an EXACTFish node, but only for non-locales, and for
10249  * characters which only have the two folds; so things like 'fF' and 'Ii'
10250  * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10251  * FI'. */
10252  if (! nonbitmap
10253   && ! unicode_alternate
10254   && SvCUR(listsv) == initial_listsv_len
10255   && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
10256   && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10257        || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10258    || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10259         && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10260         /* If the latest code point has a fold whose
10261         * bit is set, it must be the only other one */
10262         && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10263         && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10264  {
10265   /* Note that the information needed to decide to do this optimization
10266   * is not currently available until the 2nd pass, and that the actually
10267   * used EXACTish node takes less space than the calculated ANYOF node,
10268   * and hence the amount of space calculated in the first pass is larger
10269   * than actually used, so this optimization doesn't gain us any space.
10270   * But an EXACT node is faster than an ANYOF node, and can be combined
10271   * with any adjacent EXACT nodes later by the optimizer for further
10272   * gains.  The speed of executing an EXACTF is similar to an ANYOF
10273   * node, so the optimization advantage comes from the ability to join
10274   * it to adjacent EXACT nodes */
10275
10276   const char * cur_parse= RExC_parse;
10277   U8 op;
10278   RExC_emit = (regnode *)orig_emit;
10279   RExC_parse = (char *)orig_parse;
10280
10281   if (stored == 1) {
10282
10283    /* A locale node with one point can be folded; all the other cases
10284    * with folding will have two points, since we calculate them above
10285    */
10286    if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10287     op = EXACTFL;
10288    }
10289    else {
10290     op = EXACT;
10291    }
10292   }   /* else 2 chars in the bit map: the folds of each other */
10293   else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10294
10295    /* To join adjacent nodes, they must be the exact EXACTish type.
10296    * Try to use the most likely type, by using EXACTFU if the regex
10297    * calls for them, or is required because the character is
10298    * non-ASCII */
10299    op = EXACTFU;
10300   }
10301   else {    /* Otherwise, more likely to be EXACTF type */
10302    op = EXACTF;
10303   }
10304
10305   ret = reg_node(pRExC_state, op);
10306   RExC_parse = (char *)cur_parse;
10307   if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10308    *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10309    *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10310    STR_LEN(ret)= 2;
10311    RExC_emit += STR_SZ(2);
10312   }
10313   else {
10314    *STRING(ret)= (char)value;
10315    STR_LEN(ret)= 1;
10316    RExC_emit += STR_SZ(1);
10317   }
10318   SvREFCNT_dec(listsv);
10319   return ret;
10320  }
10321
10322  if (nonbitmap) {
10323   UV* nonbitmap_array = invlist_array(nonbitmap);
10324   UV nonbitmap_len = invlist_len(nonbitmap);
10325   UV i;
10326
10327   /*  Here have the full list of items to match that aren't in the
10328   *  bitmap.  Convert to the structure that the rest of the code is
10329   *  expecting.   XXX That rest of the code should convert to this
10330   *  structure */
10331   for (i = 0; i < nonbitmap_len; i++) {
10332
10333    /* The next entry is the beginning of the range that is in the
10334    * class */
10335    UV start = nonbitmap_array[i++];
10336    UV end;
10337
10338    /* The next entry is the beginning of the next range, which isn't
10339    * in the class, so the end of the current range is one less than
10340    * that.  But if there is no next range, it means that the range
10341    * begun by 'start' extends to infinity, which for this platform
10342    * ends at UV_MAX */
10343    if (i == nonbitmap_len) {
10344     end = UV_MAX;
10345    }
10346    else {
10347     end = nonbitmap_array[i] - 1;
10348    }
10349
10350    if (start == end) {
10351     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10352    }
10353    else {
10354     /* The \t sets the whole range */
10355     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10356       /* XXX EBCDIC */
10357         start, end);
10358    }
10359   }
10360   invlist_destroy(nonbitmap);
10361  }
10362
10363  if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10364   ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10365   SvREFCNT_dec(listsv);
10366   SvREFCNT_dec(unicode_alternate);
10367  }
10368  else {
10369
10370   AV * const av = newAV();
10371   SV *rv;
10372   /* The 0th element stores the character class description
10373   * in its textual form: used later (regexec.c:Perl_regclass_swash())
10374   * to initialize the appropriate swash (which gets stored in
10375   * the 1st element), and also useful for dumping the regnode.
10376   * The 2nd element stores the multicharacter foldings,
10377   * used later (regexec.c:S_reginclass()). */
10378   av_store(av, 0, listsv);
10379   av_store(av, 1, NULL);
10380
10381   /* Store any computed multi-char folds only if we are allowing
10382   * them */
10383   if (allow_full_fold) {
10384    av_store(av, 2, MUTABLE_SV(unicode_alternate));
10385    if (unicode_alternate) { /* This node is variable length */
10386     OP(ret) = ANYOFV;
10387    }
10388   }
10389   else {
10390    av_store(av, 2, NULL);
10391   }
10392   rv = newRV_noinc(MUTABLE_SV(av));
10393   n = add_data(pRExC_state, 1, "s");
10394   RExC_rxi->data->data[n] = (void*)rv;
10395   ARG_SET(ret, n);
10396  }
10397  return ret;
10398 }
10399 #undef _C_C_T_
10400
10401
10402 /* reg_skipcomment()
10403
10404    Absorbs an /x style # comments from the input stream.
10405    Returns true if there is more text remaining in the stream.
10406    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10407    terminates the pattern without including a newline.
10408
10409    Note its the callers responsibility to ensure that we are
10410    actually in /x mode
10411
10412 */
10413
10414 STATIC bool
10415 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10416 {
10417  bool ended = 0;
10418
10419  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10420
10421  while (RExC_parse < RExC_end)
10422   if (*RExC_parse++ == '\n') {
10423    ended = 1;
10424    break;
10425   }
10426  if (!ended) {
10427   /* we ran off the end of the pattern without ending
10428   the comment, so we have to add an \n when wrapping */
10429   RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10430   return 0;
10431  } else
10432   return 1;
10433 }
10434
10435 /* nextchar()
10436
10437    Advances the parse position, and optionally absorbs
10438    "whitespace" from the inputstream.
10439
10440    Without /x "whitespace" means (?#...) style comments only,
10441    with /x this means (?#...) and # comments and whitespace proper.
10442
10443    Returns the RExC_parse point from BEFORE the scan occurs.
10444
10445    This is the /x friendly way of saying RExC_parse++.
10446 */
10447
10448 STATIC char*
10449 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10450 {
10451  char* const retval = RExC_parse++;
10452
10453  PERL_ARGS_ASSERT_NEXTCHAR;
10454
10455  for (;;) {
10456   if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10457     RExC_parse[2] == '#') {
10458    while (*RExC_parse != ')') {
10459     if (RExC_parse == RExC_end)
10460      FAIL("Sequence (?#... not terminated");
10461     RExC_parse++;
10462    }
10463    RExC_parse++;
10464    continue;
10465   }
10466   if (RExC_flags & RXf_PMf_EXTENDED) {
10467    if (isSPACE(*RExC_parse)) {
10468     RExC_parse++;
10469     continue;
10470    }
10471    else if (*RExC_parse == '#') {
10472     if ( reg_skipcomment( pRExC_state ) )
10473      continue;
10474    }
10475   }
10476   return retval;
10477  }
10478 }
10479
10480 /*
10481 - reg_node - emit a node
10482 */
10483 STATIC regnode *   /* Location. */
10484 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10485 {
10486  dVAR;
10487  register regnode *ptr;
10488  regnode * const ret = RExC_emit;
10489  GET_RE_DEBUG_FLAGS_DECL;
10490
10491  PERL_ARGS_ASSERT_REG_NODE;
10492
10493  if (SIZE_ONLY) {
10494   SIZE_ALIGN(RExC_size);
10495   RExC_size += 1;
10496   return(ret);
10497  }
10498  if (RExC_emit >= RExC_emit_bound)
10499   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10500
10501  NODE_ALIGN_FILL(ret);
10502  ptr = ret;
10503  FILL_ADVANCE_NODE(ptr, op);
10504  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
10505 #ifdef RE_TRACK_PATTERN_OFFSETS
10506  if (RExC_offsets) {         /* MJD */
10507   MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10508    "reg_node", __LINE__,
10509    PL_reg_name[op],
10510    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10511     ? "Overwriting end of array!\n" : "OK",
10512    (UV)(RExC_emit - RExC_emit_start),
10513    (UV)(RExC_parse - RExC_start),
10514    (UV)RExC_offsets[0]));
10515   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10516  }
10517 #endif
10518  RExC_emit = ptr;
10519  return(ret);
10520 }
10521
10522 /*
10523 - reganode - emit a node with an argument
10524 */
10525 STATIC regnode *   /* Location. */
10526 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10527 {
10528  dVAR;
10529  register regnode *ptr;
10530  regnode * const ret = RExC_emit;
10531  GET_RE_DEBUG_FLAGS_DECL;
10532
10533  PERL_ARGS_ASSERT_REGANODE;
10534
10535  if (SIZE_ONLY) {
10536   SIZE_ALIGN(RExC_size);
10537   RExC_size += 2;
10538   /*
10539   We can't do this:
10540
10541   assert(2==regarglen[op]+1);
10542
10543   Anything larger than this has to allocate the extra amount.
10544   If we changed this to be:
10545
10546   RExC_size += (1 + regarglen[op]);
10547
10548   then it wouldn't matter. Its not clear what side effect
10549   might come from that so its not done so far.
10550   -- dmq
10551   */
10552   return(ret);
10553  }
10554  if (RExC_emit >= RExC_emit_bound)
10555   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10556
10557  NODE_ALIGN_FILL(ret);
10558  ptr = ret;
10559  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10560  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
10561 #ifdef RE_TRACK_PATTERN_OFFSETS
10562  if (RExC_offsets) {         /* MJD */
10563   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10564    "reganode",
10565    __LINE__,
10566    PL_reg_name[op],
10567    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10568    "Overwriting end of array!\n" : "OK",
10569    (UV)(RExC_emit - RExC_emit_start),
10570    (UV)(RExC_parse - RExC_start),
10571    (UV)RExC_offsets[0]));
10572   Set_Cur_Node_Offset;
10573  }
10574 #endif
10575  RExC_emit = ptr;
10576  return(ret);
10577 }
10578
10579 /*
10580 - reguni - emit (if appropriate) a Unicode character
10581 */
10582 STATIC STRLEN
10583 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10584 {
10585  dVAR;
10586
10587  PERL_ARGS_ASSERT_REGUNI;
10588
10589  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10590 }
10591
10592 /*
10593 - reginsert - insert an operator in front of already-emitted operand
10594 *
10595 * Means relocating the operand.
10596 */
10597 STATIC void
10598 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10599 {
10600  dVAR;
10601  register regnode *src;
10602  register regnode *dst;
10603  register regnode *place;
10604  const int offset = regarglen[(U8)op];
10605  const int size = NODE_STEP_REGNODE + offset;
10606  GET_RE_DEBUG_FLAGS_DECL;
10607
10608  PERL_ARGS_ASSERT_REGINSERT;
10609  PERL_UNUSED_ARG(depth);
10610 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10611  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10612  if (SIZE_ONLY) {
10613   RExC_size += size;
10614   return;
10615  }
10616
10617  src = RExC_emit;
10618  RExC_emit += size;
10619  dst = RExC_emit;
10620  if (RExC_open_parens) {
10621   int paren;
10622   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10623   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10624    if ( RExC_open_parens[paren] >= opnd ) {
10625     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10626     RExC_open_parens[paren] += size;
10627    } else {
10628     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10629    }
10630    if ( RExC_close_parens[paren] >= opnd ) {
10631     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10632     RExC_close_parens[paren] += size;
10633    } else {
10634     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10635    }
10636   }
10637  }
10638
10639  while (src > opnd) {
10640   StructCopy(--src, --dst, regnode);
10641 #ifdef RE_TRACK_PATTERN_OFFSETS
10642   if (RExC_offsets) {     /* MJD 20010112 */
10643    MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10644     "reg_insert",
10645     __LINE__,
10646     PL_reg_name[op],
10647     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10648      ? "Overwriting end of array!\n" : "OK",
10649     (UV)(src - RExC_emit_start),
10650     (UV)(dst - RExC_emit_start),
10651     (UV)RExC_offsets[0]));
10652    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10653    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10654   }
10655 #endif
10656  }
10657
10658
10659  place = opnd;  /* Op node, where operand used to be. */
10660 #ifdef RE_TRACK_PATTERN_OFFSETS
10661  if (RExC_offsets) {         /* MJD */
10662   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10663    "reginsert",
10664    __LINE__,
10665    PL_reg_name[op],
10666    (UV)(place - RExC_emit_start) > RExC_offsets[0]
10667    ? "Overwriting end of array!\n" : "OK",
10668    (UV)(place - RExC_emit_start),
10669    (UV)(RExC_parse - RExC_start),
10670    (UV)RExC_offsets[0]));
10671   Set_Node_Offset(place, RExC_parse);
10672   Set_Node_Length(place, 1);
10673  }
10674 #endif
10675  src = NEXTOPER(place);
10676  FILL_ADVANCE_NODE(place, op);
10677  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
10678  Zero(src, offset, regnode);
10679 }
10680
10681 /*
10682 - regtail - set the next-pointer at the end of a node chain of p to val.
10683 - SEE ALSO: regtail_study
10684 */
10685 /* TODO: All three parms should be const */
10686 STATIC void
10687 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10688 {
10689  dVAR;
10690  register regnode *scan;
10691  GET_RE_DEBUG_FLAGS_DECL;
10692
10693  PERL_ARGS_ASSERT_REGTAIL;
10694 #ifndef DEBUGGING
10695  PERL_UNUSED_ARG(depth);
10696 #endif
10697
10698  if (SIZE_ONLY)
10699   return;
10700
10701  /* Find last node. */
10702  scan = p;
10703  for (;;) {
10704   regnode * const temp = regnext(scan);
10705   DEBUG_PARSE_r({
10706    SV * const mysv=sv_newmortal();
10707    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10708    regprop(RExC_rx, mysv, scan);
10709    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10710     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10711      (temp == NULL ? "->" : ""),
10712      (temp == NULL ? PL_reg_name[OP(val)] : "")
10713    );
10714   });
10715   if (temp == NULL)
10716    break;
10717   scan = temp;
10718  }
10719
10720  if (reg_off_by_arg[OP(scan)]) {
10721   ARG_SET(scan, val - scan);
10722  }
10723  else {
10724   NEXT_OFF(scan) = val - scan;
10725  }
10726 }
10727
10728 #ifdef DEBUGGING
10729 /*
10730 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10731 - Look for optimizable sequences at the same time.
10732 - currently only looks for EXACT chains.
10733
10734 This is experimental code. The idea is to use this routine to perform
10735 in place optimizations on branches and groups as they are constructed,
10736 with the long term intention of removing optimization from study_chunk so
10737 that it is purely analytical.
10738
10739 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10740 to control which is which.
10741
10742 */
10743 /* TODO: All four parms should be const */
10744
10745 STATIC U8
10746 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10747 {
10748  dVAR;
10749  register regnode *scan;
10750  U8 exact = PSEUDO;
10751 #ifdef EXPERIMENTAL_INPLACESCAN
10752  I32 min = 0;
10753 #endif
10754  GET_RE_DEBUG_FLAGS_DECL;
10755
10756  PERL_ARGS_ASSERT_REGTAIL_STUDY;
10757
10758
10759  if (SIZE_ONLY)
10760   return exact;
10761
10762  /* Find last node. */
10763
10764  scan = p;
10765  for (;;) {
10766   regnode * const temp = regnext(scan);
10767 #ifdef EXPERIMENTAL_INPLACESCAN
10768   if (PL_regkind[OP(scan)] == EXACT)
10769    if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10770     return EXACT;
10771 #endif
10772   if ( exact ) {
10773    switch (OP(scan)) {
10774     case EXACT:
10775     case EXACTF:
10776     case EXACTFA:
10777     case EXACTFU:
10778     case EXACTFL:
10779       if( exact == PSEUDO )
10780        exact= OP(scan);
10781       else if ( exact != OP(scan) )
10782        exact= 0;
10783     case NOTHING:
10784      break;
10785     default:
10786      exact= 0;
10787    }
10788   }
10789   DEBUG_PARSE_r({
10790    SV * const mysv=sv_newmortal();
10791    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10792    regprop(RExC_rx, mysv, scan);
10793    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10794     SvPV_nolen_const(mysv),
10795     REG_NODE_NUM(scan),
10796     PL_reg_name[exact]);
10797   });
10798   if (temp == NULL)
10799    break;
10800   scan = temp;
10801  }
10802  DEBUG_PARSE_r({
10803   SV * const mysv_val=sv_newmortal();
10804   DEBUG_PARSE_MSG("");
10805   regprop(RExC_rx, mysv_val, val);
10806   PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10807      SvPV_nolen_const(mysv_val),
10808      (IV)REG_NODE_NUM(val),
10809      (IV)(val - scan)
10810   );
10811  });
10812  if (reg_off_by_arg[OP(scan)]) {
10813   ARG_SET(scan, val - scan);
10814  }
10815  else {
10816   NEXT_OFF(scan) = val - scan;
10817  }
10818
10819  return exact;
10820 }
10821 #endif
10822
10823 /*
10824  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10825  */
10826 #ifdef DEBUGGING
10827 static void
10828 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10829 {
10830  int bit;
10831  int set=0;
10832  regex_charset cs;
10833
10834  for (bit=0; bit<32; bit++) {
10835   if (flags & (1<<bit)) {
10836    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
10837     continue;
10838    }
10839    if (!set++ && lead)
10840     PerlIO_printf(Perl_debug_log, "%s",lead);
10841    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10842   }
10843  }
10844  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10845    if (!set++ && lead) {
10846     PerlIO_printf(Perl_debug_log, "%s",lead);
10847    }
10848    switch (cs) {
10849     case REGEX_UNICODE_CHARSET:
10850      PerlIO_printf(Perl_debug_log, "UNICODE");
10851      break;
10852     case REGEX_LOCALE_CHARSET:
10853      PerlIO_printf(Perl_debug_log, "LOCALE");
10854      break;
10855     case REGEX_ASCII_RESTRICTED_CHARSET:
10856      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10857      break;
10858     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10859      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10860      break;
10861     default:
10862      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10863      break;
10864    }
10865  }
10866  if (lead)  {
10867   if (set)
10868    PerlIO_printf(Perl_debug_log, "\n");
10869   else
10870    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10871  }
10872 }
10873 #endif
10874
10875 void
10876 Perl_regdump(pTHX_ const regexp *r)
10877 {
10878 #ifdef DEBUGGING
10879  dVAR;
10880  SV * const sv = sv_newmortal();
10881  SV *dsv= sv_newmortal();
10882  RXi_GET_DECL(r,ri);
10883  GET_RE_DEBUG_FLAGS_DECL;
10884
10885  PERL_ARGS_ASSERT_REGDUMP;
10886
10887  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10888
10889  /* Header fields of interest. */
10890  if (r->anchored_substr) {
10891   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10892    RE_SV_DUMPLEN(r->anchored_substr), 30);
10893   PerlIO_printf(Perl_debug_log,
10894      "anchored %s%s at %"IVdf" ",
10895      s, RE_SV_TAIL(r->anchored_substr),
10896      (IV)r->anchored_offset);
10897  } else if (r->anchored_utf8) {
10898   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10899    RE_SV_DUMPLEN(r->anchored_utf8), 30);
10900   PerlIO_printf(Perl_debug_log,
10901      "anchored utf8 %s%s at %"IVdf" ",
10902      s, RE_SV_TAIL(r->anchored_utf8),
10903      (IV)r->anchored_offset);
10904  }
10905  if (r->float_substr) {
10906   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10907    RE_SV_DUMPLEN(r->float_substr), 30);
10908   PerlIO_printf(Perl_debug_log,
10909      "floating %s%s at %"IVdf"..%"UVuf" ",
10910      s, RE_SV_TAIL(r->float_substr),
10911      (IV)r->float_min_offset, (UV)r->float_max_offset);
10912  } else if (r->float_utf8) {
10913   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10914    RE_SV_DUMPLEN(r->float_utf8), 30);
10915   PerlIO_printf(Perl_debug_log,
10916      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10917      s, RE_SV_TAIL(r->float_utf8),
10918      (IV)r->float_min_offset, (UV)r->float_max_offset);
10919  }
10920  if (r->check_substr || r->check_utf8)
10921   PerlIO_printf(Perl_debug_log,
10922      (const char *)
10923      (r->check_substr == r->float_substr
10924      && r->check_utf8 == r->float_utf8
10925      ? "(checking floating" : "(checking anchored"));
10926  if (r->extflags & RXf_NOSCAN)
10927   PerlIO_printf(Perl_debug_log, " noscan");
10928  if (r->extflags & RXf_CHECK_ALL)
10929   PerlIO_printf(Perl_debug_log, " isall");
10930  if (r->check_substr || r->check_utf8)
10931   PerlIO_printf(Perl_debug_log, ") ");
10932
10933  if (ri->regstclass) {
10934   regprop(r, sv, ri->regstclass);
10935   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10936  }
10937  if (r->extflags & RXf_ANCH) {
10938   PerlIO_printf(Perl_debug_log, "anchored");
10939   if (r->extflags & RXf_ANCH_BOL)
10940    PerlIO_printf(Perl_debug_log, "(BOL)");
10941   if (r->extflags & RXf_ANCH_MBOL)
10942    PerlIO_printf(Perl_debug_log, "(MBOL)");
10943   if (r->extflags & RXf_ANCH_SBOL)
10944    PerlIO_printf(Perl_debug_log, "(SBOL)");
10945   if (r->extflags & RXf_ANCH_GPOS)
10946    PerlIO_printf(Perl_debug_log, "(GPOS)");
10947   PerlIO_putc(Perl_debug_log, ' ');
10948  }
10949  if (r->extflags & RXf_GPOS_SEEN)
10950   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10951  if (r->intflags & PREGf_SKIP)
10952   PerlIO_printf(Perl_debug_log, "plus ");
10953  if (r->intflags & PREGf_IMPLICIT)
10954   PerlIO_printf(Perl_debug_log, "implicit ");
10955  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10956  if (r->extflags & RXf_EVAL_SEEN)
10957   PerlIO_printf(Perl_debug_log, "with eval ");
10958  PerlIO_printf(Perl_debug_log, "\n");
10959  DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10960 #else
10961  PERL_ARGS_ASSERT_REGDUMP;
10962  PERL_UNUSED_CONTEXT;
10963  PERL_UNUSED_ARG(r);
10964 #endif /* DEBUGGING */
10965 }
10966
10967 /*
10968 - regprop - printable representation of opcode
10969 */
10970 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10971 STMT_START { \
10972   if (do_sep) {                           \
10973    Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10974    if (flags & ANYOF_INVERT)           \
10975     /*make sure the invert info is in each */ \
10976     sv_catpvs(sv, "^");             \
10977    do_sep = 0;                         \
10978   }                                       \
10979 } STMT_END
10980
10981 void
10982 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10983 {
10984 #ifdef DEBUGGING
10985  dVAR;
10986  register int k;
10987  RXi_GET_DECL(prog,progi);
10988  GET_RE_DEBUG_FLAGS_DECL;
10989
10990  PERL_ARGS_ASSERT_REGPROP;
10991
10992  sv_setpvs(sv, "");
10993
10994  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
10995   /* It would be nice to FAIL() here, but this may be called from
10996   regexec.c, and it would be hard to supply pRExC_state. */
10997   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
10998  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
10999
11000  k = PL_regkind[OP(o)];
11001
11002  if (k == EXACT) {
11003   sv_catpvs(sv, " ");
11004   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
11005   * is a crude hack but it may be the best for now since
11006   * we have no flag "this EXACTish node was UTF-8"
11007   * --jhi */
11008   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
11009     PERL_PV_ESCAPE_UNI_DETECT |
11010     PERL_PV_ESCAPE_NONASCII   |
11011     PERL_PV_PRETTY_ELLIPSES   |
11012     PERL_PV_PRETTY_LTGT       |
11013     PERL_PV_PRETTY_NOCLEAR
11014     );
11015  } else if (k == TRIE) {
11016   /* print the details of the trie in dumpuntil instead, as
11017   * progi->data isn't available here */
11018   const char op = OP(o);
11019   const U32 n = ARG(o);
11020   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
11021    (reg_ac_data *)progi->data->data[n] :
11022    NULL;
11023   const reg_trie_data * const trie
11024    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
11025
11026   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
11027   DEBUG_TRIE_COMPILE_r(
11028    Perl_sv_catpvf(aTHX_ sv,
11029     "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
11030     (UV)trie->startstate,
11031     (IV)trie->statecount-1, /* -1 because of the unused 0 element */
11032     (UV)trie->wordcount,
11033     (UV)trie->minlen,
11034     (UV)trie->maxlen,
11035     (UV)TRIE_CHARCOUNT(trie),
11036     (UV)trie->uniquecharcount
11037    )
11038   );
11039   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
11040    int i;
11041    int rangestart = -1;
11042    U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
11043    sv_catpvs(sv, "[");
11044    for (i = 0; i <= 256; i++) {
11045     if (i < 256 && BITMAP_TEST(bitmap,i)) {
11046      if (rangestart == -1)
11047       rangestart = i;
11048     } else if (rangestart != -1) {
11049      if (i <= rangestart + 3)
11050       for (; rangestart < i; rangestart++)
11051        put_byte(sv, rangestart);
11052      else {
11053       put_byte(sv, rangestart);
11054       sv_catpvs(sv, "-");
11055       put_byte(sv, i - 1);
11056      }
11057      rangestart = -1;
11058     }
11059    }
11060    sv_catpvs(sv, "]");
11061   }
11062
11063  } else if (k == CURLY) {
11064   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
11065    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
11066   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
11067  }
11068  else if (k == WHILEM && o->flags)   /* Ordinal/of */
11069   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
11070  else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
11071   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
11072   if ( RXp_PAREN_NAMES(prog) ) {
11073    if ( k != REF || (OP(o) < NREF)) {
11074     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
11075     SV **name= av_fetch(list, ARG(o), 0 );
11076     if (name)
11077      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11078    }
11079    else {
11080     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
11081     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
11082     I32 *nums=(I32*)SvPVX(sv_dat);
11083     SV **name= av_fetch(list, nums[0], 0 );
11084     I32 n;
11085     if (name) {
11086      for ( n=0; n<SvIVX(sv_dat); n++ ) {
11087       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
11088          (n ? "," : ""), (IV)nums[n]);
11089      }
11090      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11091     }
11092    }
11093   }
11094  } else if (k == GOSUB)
11095   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
11096  else if (k == VERB) {
11097   if (!o->flags)
11098    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
11099       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
11100  } else if (k == LOGICAL)
11101   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
11102  else if (k == FOLDCHAR)
11103   Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
11104  else if (k == ANYOF) {
11105   int i, rangestart = -1;
11106   const U8 flags = ANYOF_FLAGS(o);
11107   int do_sep = 0;
11108
11109   /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11110   static const char * const anyofs[] = {
11111    "\\w",
11112    "\\W",
11113    "\\s",
11114    "\\S",
11115    "\\d",
11116    "\\D",
11117    "[:alnum:]",
11118    "[:^alnum:]",
11119    "[:alpha:]",
11120    "[:^alpha:]",
11121    "[:ascii:]",
11122    "[:^ascii:]",
11123    "[:cntrl:]",
11124    "[:^cntrl:]",
11125    "[:graph:]",
11126    "[:^graph:]",
11127    "[:lower:]",
11128    "[:^lower:]",
11129    "[:print:]",
11130    "[:^print:]",
11131    "[:punct:]",
11132    "[:^punct:]",
11133    "[:upper:]",
11134    "[:^upper:]",
11135    "[:xdigit:]",
11136    "[:^xdigit:]",
11137    "[:space:]",
11138    "[:^space:]",
11139    "[:blank:]",
11140    "[:^blank:]"
11141   };
11142
11143   if (flags & ANYOF_LOCALE)
11144    sv_catpvs(sv, "{loc}");
11145   if (flags & ANYOF_LOC_NONBITMAP_FOLD)
11146    sv_catpvs(sv, "{i}");
11147   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
11148   if (flags & ANYOF_INVERT)
11149    sv_catpvs(sv, "^");
11150
11151   /* output what the standard cp 0-255 bitmap matches */
11152   for (i = 0; i <= 256; i++) {
11153    if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11154     if (rangestart == -1)
11155      rangestart = i;
11156    } else if (rangestart != -1) {
11157     if (i <= rangestart + 3)
11158      for (; rangestart < i; rangestart++)
11159       put_byte(sv, rangestart);
11160     else {
11161      put_byte(sv, rangestart);
11162      sv_catpvs(sv, "-");
11163      put_byte(sv, i - 1);
11164     }
11165     do_sep = 1;
11166     rangestart = -1;
11167    }
11168   }
11169
11170   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11171   /* output any special charclass tests (used entirely under use locale) */
11172   if (ANYOF_CLASS_TEST_ANY_SET(o))
11173    for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11174     if (ANYOF_CLASS_TEST(o,i)) {
11175      sv_catpv(sv, anyofs[i]);
11176      do_sep = 1;
11177     }
11178
11179   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11180
11181   if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11182    sv_catpvs(sv, "{non-utf8-latin1-all}");
11183   }
11184
11185   /* output information about the unicode matching */
11186   if (flags & ANYOF_UNICODE_ALL)
11187    sv_catpvs(sv, "{unicode_all}");
11188   else if (ANYOF_NONBITMAP(o))
11189    sv_catpvs(sv, "{unicode}");
11190   if (flags & ANYOF_NONBITMAP_NON_UTF8)
11191    sv_catpvs(sv, "{outside bitmap}");
11192
11193   if (ANYOF_NONBITMAP(o)) {
11194    SV *lv;
11195    SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11196
11197    if (lv) {
11198     if (sw) {
11199      U8 s[UTF8_MAXBYTES_CASE+1];
11200
11201      for (i = 0; i <= 256; i++) { /* just the first 256 */
11202       uvchr_to_utf8(s, i);
11203
11204       if (i < 256 && swash_fetch(sw, s, TRUE)) {
11205        if (rangestart == -1)
11206         rangestart = i;
11207       } else if (rangestart != -1) {
11208        if (i <= rangestart + 3)
11209         for (; rangestart < i; rangestart++) {
11210          const U8 * const e = uvchr_to_utf8(s,rangestart);
11211          U8 *p;
11212          for(p = s; p < e; p++)
11213           put_byte(sv, *p);
11214         }
11215        else {
11216         const U8 *e = uvchr_to_utf8(s,rangestart);
11217         U8 *p;
11218         for (p = s; p < e; p++)
11219          put_byte(sv, *p);
11220         sv_catpvs(sv, "-");
11221         e = uvchr_to_utf8(s, i-1);
11222         for (p = s; p < e; p++)
11223          put_byte(sv, *p);
11224         }
11225         rangestart = -1;
11226        }
11227       }
11228
11229      sv_catpvs(sv, "..."); /* et cetera */
11230     }
11231
11232     {
11233      char *s = savesvpv(lv);
11234      char * const origs = s;
11235
11236      while (*s && *s != '\n')
11237       s++;
11238
11239      if (*s == '\n') {
11240       const char * const t = ++s;
11241
11242       while (*s) {
11243        if (*s == '\n')
11244         *s = ' ';
11245        s++;
11246       }
11247       if (s[-1] == ' ')
11248        s[-1] = 0;
11249
11250       sv_catpv(sv, t);
11251      }
11252
11253      Safefree(origs);
11254     }
11255    }
11256   }
11257
11258   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11259  }
11260  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11261   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11262 #else
11263  PERL_UNUSED_CONTEXT;
11264  PERL_UNUSED_ARG(sv);
11265  PERL_UNUSED_ARG(o);
11266  PERL_UNUSED_ARG(prog);
11267 #endif /* DEBUGGING */
11268 }
11269
11270 SV *
11271 Perl_re_intuit_string(pTHX_ REGEXP * const r)
11272 {    /* Assume that RE_INTUIT is set */
11273  dVAR;
11274  struct regexp *const prog = (struct regexp *)SvANY(r);
11275  GET_RE_DEBUG_FLAGS_DECL;
11276
11277  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11278  PERL_UNUSED_CONTEXT;
11279
11280  DEBUG_COMPILE_r(
11281   {
11282    const char * const s = SvPV_nolen_const(prog->check_substr
11283      ? prog->check_substr : prog->check_utf8);
11284
11285    if (!PL_colorset) reginitcolors();
11286    PerlIO_printf(Perl_debug_log,
11287      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11288      PL_colors[4],
11289      prog->check_substr ? "" : "utf8 ",
11290      PL_colors[5],PL_colors[0],
11291      s,
11292      PL_colors[1],
11293      (strlen(s) > 60 ? "..." : ""));
11294   } );
11295
11296  return prog->check_substr ? prog->check_substr : prog->check_utf8;
11297 }
11298
11299 /*
11300    pregfree()
11301
11302    handles refcounting and freeing the perl core regexp structure. When
11303    it is necessary to actually free the structure the first thing it
11304    does is call the 'free' method of the regexp_engine associated to
11305    the regexp, allowing the handling of the void *pprivate; member
11306    first. (This routine is not overridable by extensions, which is why
11307    the extensions free is called first.)
11308
11309    See regdupe and regdupe_internal if you change anything here.
11310 */
11311 #ifndef PERL_IN_XSUB_RE
11312 void
11313 Perl_pregfree(pTHX_ REGEXP *r)
11314 {
11315  SvREFCNT_dec(r);
11316 }
11317
11318 void
11319 Perl_pregfree2(pTHX_ REGEXP *rx)
11320 {
11321  dVAR;
11322  struct regexp *const r = (struct regexp *)SvANY(rx);
11323  GET_RE_DEBUG_FLAGS_DECL;
11324
11325  PERL_ARGS_ASSERT_PREGFREE2;
11326
11327  if (r->mother_re) {
11328   ReREFCNT_dec(r->mother_re);
11329  } else {
11330   CALLREGFREE_PVT(rx); /* free the private data */
11331   SvREFCNT_dec(RXp_PAREN_NAMES(r));
11332  }
11333  if (r->substrs) {
11334   SvREFCNT_dec(r->anchored_substr);
11335   SvREFCNT_dec(r->anchored_utf8);
11336   SvREFCNT_dec(r->float_substr);
11337   SvREFCNT_dec(r->float_utf8);
11338   Safefree(r->substrs);
11339  }
11340  RX_MATCH_COPY_FREE(rx);
11341 #ifdef PERL_OLD_COPY_ON_WRITE
11342  SvREFCNT_dec(r->saved_copy);
11343 #endif
11344  Safefree(r->offs);
11345 }
11346
11347 /*  reg_temp_copy()
11348
11349  This is a hacky workaround to the structural issue of match results
11350  being stored in the regexp structure which is in turn stored in
11351  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11352  could be PL_curpm in multiple contexts, and could require multiple
11353  result sets being associated with the pattern simultaneously, such
11354  as when doing a recursive match with (??{$qr})
11355
11356  The solution is to make a lightweight copy of the regexp structure
11357  when a qr// is returned from the code executed by (??{$qr}) this
11358  lightweight copy doesn't actually own any of its data except for
11359  the starp/end and the actual regexp structure itself.
11360
11361 */
11362
11363
11364 REGEXP *
11365 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11366 {
11367  struct regexp *ret;
11368  struct regexp *const r = (struct regexp *)SvANY(rx);
11369  register const I32 npar = r->nparens+1;
11370
11371  PERL_ARGS_ASSERT_REG_TEMP_COPY;
11372
11373  if (!ret_x)
11374   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11375  ret = (struct regexp *)SvANY(ret_x);
11376
11377  (void)ReREFCNT_inc(rx);
11378  /* We can take advantage of the existing "copied buffer" mechanism in SVs
11379  by pointing directly at the buffer, but flagging that the allocated
11380  space in the copy is zero. As we've just done a struct copy, it's now
11381  a case of zero-ing that, rather than copying the current length.  */
11382  SvPV_set(ret_x, RX_WRAPPED(rx));
11383  SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11384  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11385   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11386  SvLEN_set(ret_x, 0);
11387  SvSTASH_set(ret_x, NULL);
11388  SvMAGIC_set(ret_x, NULL);
11389  Newx(ret->offs, npar, regexp_paren_pair);
11390  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11391  if (r->substrs) {
11392   Newx(ret->substrs, 1, struct reg_substr_data);
11393   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11394
11395   SvREFCNT_inc_void(ret->anchored_substr);
11396   SvREFCNT_inc_void(ret->anchored_utf8);
11397   SvREFCNT_inc_void(ret->float_substr);
11398   SvREFCNT_inc_void(ret->float_utf8);
11399
11400   /* check_substr and check_utf8, if non-NULL, point to either their
11401   anchored or float namesakes, and don't hold a second reference.  */
11402  }
11403  RX_MATCH_COPIED_off(ret_x);
11404 #ifdef PERL_OLD_COPY_ON_WRITE
11405  ret->saved_copy = NULL;
11406 #endif
11407  ret->mother_re = rx;
11408
11409  return ret_x;
11410 }
11411 #endif
11412
11413 /* regfree_internal()
11414
11415    Free the private data in a regexp. This is overloadable by
11416    extensions. Perl takes care of the regexp structure in pregfree(),
11417    this covers the *pprivate pointer which technically perl doesn't
11418    know about, however of course we have to handle the
11419    regexp_internal structure when no extension is in use.
11420
11421    Note this is called before freeing anything in the regexp
11422    structure.
11423  */
11424
11425 void
11426 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11427 {
11428  dVAR;
11429  struct regexp *const r = (struct regexp *)SvANY(rx);
11430  RXi_GET_DECL(r,ri);
11431  GET_RE_DEBUG_FLAGS_DECL;
11432
11433  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11434
11435  DEBUG_COMPILE_r({
11436   if (!PL_colorset)
11437    reginitcolors();
11438   {
11439    SV *dsv= sv_newmortal();
11440    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11441     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11442    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11443     PL_colors[4],PL_colors[5],s);
11444   }
11445  });
11446 #ifdef RE_TRACK_PATTERN_OFFSETS
11447  if (ri->u.offsets)
11448   Safefree(ri->u.offsets);             /* 20010421 MJD */
11449 #endif
11450  if (ri->data) {
11451   int n = ri->data->count;
11452   PAD* new_comppad = NULL;
11453   PAD* old_comppad;
11454   PADOFFSET refcnt;
11455
11456   while (--n >= 0) {
11457   /* If you add a ->what type here, update the comment in regcomp.h */
11458    switch (ri->data->what[n]) {
11459    case 'a':
11460    case 's':
11461    case 'S':
11462    case 'u':
11463     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11464     break;
11465    case 'f':
11466     Safefree(ri->data->data[n]);
11467     break;
11468    case 'p':
11469     new_comppad = MUTABLE_AV(ri->data->data[n]);
11470     break;
11471    case 'o':
11472     if (new_comppad == NULL)
11473      Perl_croak(aTHX_ "panic: pregfree comppad");
11474     PAD_SAVE_LOCAL(old_comppad,
11475      /* Watch out for global destruction's random ordering. */
11476      (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11477     );
11478     OP_REFCNT_LOCK;
11479     refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11480     OP_REFCNT_UNLOCK;
11481     if (!refcnt)
11482      op_free((OP_4tree*)ri->data->data[n]);
11483
11484     PAD_RESTORE_LOCAL(old_comppad);
11485     SvREFCNT_dec(MUTABLE_SV(new_comppad));
11486     new_comppad = NULL;
11487     break;
11488    case 'n':
11489     break;
11490    case 'T':
11491     { /* Aho Corasick add-on structure for a trie node.
11492      Used in stclass optimization only */
11493      U32 refcount;
11494      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11495      OP_REFCNT_LOCK;
11496      refcount = --aho->refcount;
11497      OP_REFCNT_UNLOCK;
11498      if ( !refcount ) {
11499       PerlMemShared_free(aho->states);
11500       PerlMemShared_free(aho->fail);
11501       /* do this last!!!! */
11502       PerlMemShared_free(ri->data->data[n]);
11503       PerlMemShared_free(ri->regstclass);
11504      }
11505     }
11506     break;
11507    case 't':
11508     {
11509      /* trie structure. */
11510      U32 refcount;
11511      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11512      OP_REFCNT_LOCK;
11513      refcount = --trie->refcount;
11514      OP_REFCNT_UNLOCK;
11515      if ( !refcount ) {
11516       PerlMemShared_free(trie->charmap);
11517       PerlMemShared_free(trie->states);
11518       PerlMemShared_free(trie->trans);
11519       if (trie->bitmap)
11520        PerlMemShared_free(trie->bitmap);
11521       if (trie->jump)
11522        PerlMemShared_free(trie->jump);
11523       PerlMemShared_free(trie->wordinfo);
11524       /* do this last!!!! */
11525       PerlMemShared_free(ri->data->data[n]);
11526      }
11527     }
11528     break;
11529    default:
11530     Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11531    }
11532   }
11533   Safefree(ri->data->what);
11534   Safefree(ri->data);
11535  }
11536
11537  Safefree(ri);
11538 }
11539
11540 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11541 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11542 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
11543
11544 /*
11545    re_dup - duplicate a regexp.
11546
11547    This routine is expected to clone a given regexp structure. It is only
11548    compiled under USE_ITHREADS.
11549
11550    After all of the core data stored in struct regexp is duplicated
11551    the regexp_engine.dupe method is used to copy any private data
11552    stored in the *pprivate pointer. This allows extensions to handle
11553    any duplication it needs to do.
11554
11555    See pregfree() and regfree_internal() if you change anything here.
11556 */
11557 #if defined(USE_ITHREADS)
11558 #ifndef PERL_IN_XSUB_RE
11559 void
11560 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11561 {
11562  dVAR;
11563  I32 npar;
11564  const struct regexp *r = (const struct regexp *)SvANY(sstr);
11565  struct regexp *ret = (struct regexp *)SvANY(dstr);
11566
11567  PERL_ARGS_ASSERT_RE_DUP_GUTS;
11568
11569  npar = r->nparens+1;
11570  Newx(ret->offs, npar, regexp_paren_pair);
11571  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11572  if(ret->swap) {
11573   /* no need to copy these */
11574   Newx(ret->swap, npar, regexp_paren_pair);
11575  }
11576
11577  if (ret->substrs) {
11578   /* Do it this way to avoid reading from *r after the StructCopy().
11579   That way, if any of the sv_dup_inc()s dislodge *r from the L1
11580   cache, it doesn't matter.  */
11581   const bool anchored = r->check_substr
11582    ? r->check_substr == r->anchored_substr
11583    : r->check_utf8 == r->anchored_utf8;
11584   Newx(ret->substrs, 1, struct reg_substr_data);
11585   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11586
11587   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11588   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11589   ret->float_substr = sv_dup_inc(ret->float_substr, param);
11590   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11591
11592   /* check_substr and check_utf8, if non-NULL, point to either their
11593   anchored or float namesakes, and don't hold a second reference.  */
11594
11595   if (ret->check_substr) {
11596    if (anchored) {
11597     assert(r->check_utf8 == r->anchored_utf8);
11598     ret->check_substr = ret->anchored_substr;
11599     ret->check_utf8 = ret->anchored_utf8;
11600    } else {
11601     assert(r->check_substr == r->float_substr);
11602     assert(r->check_utf8 == r->float_utf8);
11603     ret->check_substr = ret->float_substr;
11604     ret->check_utf8 = ret->float_utf8;
11605    }
11606   } else if (ret->check_utf8) {
11607    if (anchored) {
11608     ret->check_utf8 = ret->anchored_utf8;
11609    } else {
11610     ret->check_utf8 = ret->float_utf8;
11611    }
11612   }
11613  }
11614
11615  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11616
11617  if (ret->pprivate)
11618   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11619
11620  if (RX_MATCH_COPIED(dstr))
11621   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
11622  else
11623   ret->subbeg = NULL;
11624 #ifdef PERL_OLD_COPY_ON_WRITE
11625  ret->saved_copy = NULL;
11626 #endif
11627
11628  if (ret->mother_re) {
11629   if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11630    /* Our storage points directly to our mother regexp, but that's
11631    1: a buffer in a different thread
11632    2: something we no longer hold a reference on
11633    so we need to copy it locally.  */
11634    /* Note we need to sue SvCUR() on our mother_re, because it, in
11635    turn, may well be pointing to its own mother_re.  */
11636    SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11637         SvCUR(ret->mother_re)+1));
11638    SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11639   }
11640   ret->mother_re      = NULL;
11641  }
11642  ret->gofs = 0;
11643 }
11644 #endif /* PERL_IN_XSUB_RE */
11645
11646 /*
11647    regdupe_internal()
11648
11649    This is the internal complement to regdupe() which is used to copy
11650    the structure pointed to by the *pprivate pointer in the regexp.
11651    This is the core version of the extension overridable cloning hook.
11652    The regexp structure being duplicated will be copied by perl prior
11653    to this and will be provided as the regexp *r argument, however
11654    with the /old/ structures pprivate pointer value. Thus this routine
11655    may override any copying normally done by perl.
11656
11657    It returns a pointer to the new regexp_internal structure.
11658 */
11659
11660 void *
11661 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11662 {
11663  dVAR;
11664  struct regexp *const r = (struct regexp *)SvANY(rx);
11665  regexp_internal *reti;
11666  int len, npar;
11667  RXi_GET_DECL(r,ri);
11668
11669  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11670
11671  npar = r->nparens+1;
11672  len = ProgLen(ri);
11673
11674  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11675  Copy(ri->program, reti->program, len+1, regnode);
11676
11677
11678  reti->regstclass = NULL;
11679
11680  if (ri->data) {
11681   struct reg_data *d;
11682   const int count = ri->data->count;
11683   int i;
11684
11685   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11686     char, struct reg_data);
11687   Newx(d->what, count, U8);
11688
11689   d->count = count;
11690   for (i = 0; i < count; i++) {
11691    d->what[i] = ri->data->what[i];
11692    switch (d->what[i]) {
11693     /* legal options are one of: sSfpontTua
11694     see also regcomp.h and pregfree() */
11695    case 'a': /* actually an AV, but the dup function is identical.  */
11696    case 's':
11697    case 'S':
11698    case 'p': /* actually an AV, but the dup function is identical.  */
11699    case 'u': /* actually an HV, but the dup function is identical.  */
11700     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11701     break;
11702    case 'f':
11703     /* This is cheating. */
11704     Newx(d->data[i], 1, struct regnode_charclass_class);
11705     StructCopy(ri->data->data[i], d->data[i],
11706        struct regnode_charclass_class);
11707     reti->regstclass = (regnode*)d->data[i];
11708     break;
11709    case 'o':
11710     /* Compiled op trees are readonly and in shared memory,
11711     and can thus be shared without duplication. */
11712     OP_REFCNT_LOCK;
11713     d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11714     OP_REFCNT_UNLOCK;
11715     break;
11716    case 'T':
11717     /* Trie stclasses are readonly and can thus be shared
11718     * without duplication. We free the stclass in pregfree
11719     * when the corresponding reg_ac_data struct is freed.
11720     */
11721     reti->regstclass= ri->regstclass;
11722     /* Fall through */
11723    case 't':
11724     OP_REFCNT_LOCK;
11725     ((reg_trie_data*)ri->data->data[i])->refcount++;
11726     OP_REFCNT_UNLOCK;
11727     /* Fall through */
11728    case 'n':
11729     d->data[i] = ri->data->data[i];
11730     break;
11731    default:
11732     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11733    }
11734   }
11735
11736   reti->data = d;
11737  }
11738  else
11739   reti->data = NULL;
11740
11741  reti->name_list_idx = ri->name_list_idx;
11742
11743 #ifdef RE_TRACK_PATTERN_OFFSETS
11744  if (ri->u.offsets) {
11745   Newx(reti->u.offsets, 2*len+1, U32);
11746   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11747  }
11748 #else
11749  SetProgLen(reti,len);
11750 #endif
11751
11752  return (void*)reti;
11753 }
11754
11755 #endif    /* USE_ITHREADS */
11756
11757 #ifndef PERL_IN_XSUB_RE
11758
11759 /*
11760  - regnext - dig the "next" pointer out of a node
11761  */
11762 regnode *
11763 Perl_regnext(pTHX_ register regnode *p)
11764 {
11765  dVAR;
11766  register I32 offset;
11767
11768  if (!p)
11769   return(NULL);
11770
11771  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
11772   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11773  }
11774
11775  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11776  if (offset == 0)
11777   return(NULL);
11778
11779  return(p+offset);
11780 }
11781 #endif
11782
11783 STATIC void
11784 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11785 {
11786  va_list args;
11787  STRLEN l1 = strlen(pat1);
11788  STRLEN l2 = strlen(pat2);
11789  char buf[512];
11790  SV *msv;
11791  const char *message;
11792
11793  PERL_ARGS_ASSERT_RE_CROAK2;
11794
11795  if (l1 > 510)
11796   l1 = 510;
11797  if (l1 + l2 > 510)
11798   l2 = 510 - l1;
11799  Copy(pat1, buf, l1 , char);
11800  Copy(pat2, buf + l1, l2 , char);
11801  buf[l1 + l2] = '\n';
11802  buf[l1 + l2 + 1] = '\0';
11803 #ifdef I_STDARG
11804  /* ANSI variant takes additional second argument */
11805  va_start(args, pat2);
11806 #else
11807  va_start(args);
11808 #endif
11809  msv = vmess(buf, &args);
11810  va_end(args);
11811  message = SvPV_const(msv,l1);
11812  if (l1 > 512)
11813   l1 = 512;
11814  Copy(message, buf, l1 , char);
11815  buf[l1-1] = '\0';   /* Overwrite \n */
11816  Perl_croak(aTHX_ "%s", buf);
11817 }
11818
11819 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
11820
11821 #ifndef PERL_IN_XSUB_RE
11822 void
11823 Perl_save_re_context(pTHX)
11824 {
11825  dVAR;
11826
11827  struct re_save_state *state;
11828
11829  SAVEVPTR(PL_curcop);
11830  SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11831
11832  state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11833  PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11834  SSPUSHUV(SAVEt_RE_STATE);
11835
11836  Copy(&PL_reg_state, state, 1, struct re_save_state);
11837
11838  PL_reg_start_tmp = 0;
11839  PL_reg_start_tmpl = 0;
11840  PL_reg_oldsaved = NULL;
11841  PL_reg_oldsavedlen = 0;
11842  PL_reg_maxiter = 0;
11843  PL_reg_leftiter = 0;
11844  PL_reg_poscache = NULL;
11845  PL_reg_poscache_size = 0;
11846 #ifdef PERL_OLD_COPY_ON_WRITE
11847  PL_nrs = NULL;
11848 #endif
11849
11850  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11851  if (PL_curpm) {
11852   const REGEXP * const rx = PM_GETRE(PL_curpm);
11853   if (rx) {
11854    U32 i;
11855    for (i = 1; i <= RX_NPARENS(rx); i++) {
11856     char digits[TYPE_CHARS(long)];
11857     const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11858     GV *const *const gvp
11859      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11860
11861     if (gvp) {
11862      GV * const gv = *gvp;
11863      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11864       save_scalar(gv);
11865     }
11866    }
11867   }
11868  }
11869 }
11870 #endif
11871
11872 static void
11873 clear_re(pTHX_ void *r)
11874 {
11875  dVAR;
11876  ReREFCNT_dec((REGEXP *)r);
11877 }
11878
11879 #ifdef DEBUGGING
11880
11881 STATIC void
11882 S_put_byte(pTHX_ SV *sv, int c)
11883 {
11884  PERL_ARGS_ASSERT_PUT_BYTE;
11885
11886  /* Our definition of isPRINT() ignores locales, so only bytes that are
11887  not part of UTF-8 are considered printable. I assume that the same
11888  holds for UTF-EBCDIC.
11889  Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11890  which Wikipedia says:
11891
11892  EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11893  ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11894  identical, to the ASCII delete (DEL) or rubout control character.
11895  ) So the old condition can be simplified to !isPRINT(c)  */
11896  if (!isPRINT(c)) {
11897   if (c < 256) {
11898    Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11899   }
11900   else {
11901    Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11902   }
11903  }
11904  else {
11905   const char string = c;
11906   if (c == '-' || c == ']' || c == '\\' || c == '^')
11907    sv_catpvs(sv, "\\");
11908   sv_catpvn(sv, &string, 1);
11909  }
11910 }
11911
11912
11913 #define CLEAR_OPTSTART \
11914  if (optstart) STMT_START { \
11915    DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11916    optstart=NULL; \
11917  } STMT_END
11918
11919 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11920
11921 STATIC const regnode *
11922 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11923    const regnode *last, const regnode *plast,
11924    SV* sv, I32 indent, U32 depth)
11925 {
11926  dVAR;
11927  register U8 op = PSEUDO; /* Arbitrary non-END op. */
11928  register const regnode *next;
11929  const regnode *optstart= NULL;
11930
11931  RXi_GET_DECL(r,ri);
11932  GET_RE_DEBUG_FLAGS_DECL;
11933
11934  PERL_ARGS_ASSERT_DUMPUNTIL;
11935
11936 #ifdef DEBUG_DUMPUNTIL
11937  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11938   last ? last-start : 0,plast ? plast-start : 0);
11939 #endif
11940
11941  if (plast && plast < last)
11942   last= plast;
11943
11944  while (PL_regkind[op] != END && (!last || node < last)) {
11945   /* While that wasn't END last time... */
11946   NODE_ALIGN(node);
11947   op = OP(node);
11948   if (op == CLOSE || op == WHILEM)
11949    indent--;
11950   next = regnext((regnode *)node);
11951
11952   /* Where, what. */
11953   if (OP(node) == OPTIMIZED) {
11954    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11955     optstart = node;
11956    else
11957     goto after_print;
11958   } else
11959    CLEAR_OPTSTART;
11960
11961   regprop(r, sv, node);
11962   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11963      (int)(2*indent + 1), "", SvPVX_const(sv));
11964
11965   if (OP(node) != OPTIMIZED) {
11966    if (next == NULL)  /* Next ptr. */
11967     PerlIO_printf(Perl_debug_log, " (0)");
11968    else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11969     PerlIO_printf(Perl_debug_log, " (FAIL)");
11970    else
11971     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11972    (void)PerlIO_putc(Perl_debug_log, '\n');
11973   }
11974
11975  after_print:
11976   if (PL_regkind[(U8)op] == BRANCHJ) {
11977    assert(next);
11978    {
11979     register const regnode *nnode = (OP(next) == LONGJMP
11980            ? regnext((regnode *)next)
11981            : next);
11982     if (last && nnode > last)
11983      nnode = last;
11984     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11985    }
11986   }
11987   else if (PL_regkind[(U8)op] == BRANCH) {
11988    assert(next);
11989    DUMPUNTIL(NEXTOPER(node), next);
11990   }
11991   else if ( PL_regkind[(U8)op]  == TRIE ) {
11992    const regnode *this_trie = node;
11993    const char op = OP(node);
11994    const U32 n = ARG(node);
11995    const reg_ac_data * const ac = op>=AHOCORASICK ?
11996    (reg_ac_data *)ri->data->data[n] :
11997    NULL;
11998    const reg_trie_data * const trie =
11999     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
12000 #ifdef DEBUGGING
12001    AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
12002 #endif
12003    const regnode *nextbranch= NULL;
12004    I32 word_idx;
12005    sv_setpvs(sv, "");
12006    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
12007     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
12008
12009     PerlIO_printf(Perl_debug_log, "%*s%s ",
12010     (int)(2*(indent+3)), "",
12011      elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
12012        PL_colors[0], PL_colors[1],
12013        (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
12014        PERL_PV_PRETTY_ELLIPSES    |
12015        PERL_PV_PRETTY_LTGT
12016        )
12017        : "???"
12018     );
12019     if (trie->jump) {
12020      U16 dist= trie->jump[word_idx+1];
12021      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
12022         (UV)((dist ? this_trie + dist : next) - start));
12023      if (dist) {
12024       if (!nextbranch)
12025        nextbranch= this_trie + trie->jump[0];
12026       DUMPUNTIL(this_trie + dist, nextbranch);
12027      }
12028      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
12029       nextbranch= regnext((regnode *)nextbranch);
12030     } else {
12031      PerlIO_printf(Perl_debug_log, "\n");
12032     }
12033    }
12034    if (last && next > last)
12035     node= last;
12036    else
12037     node= next;
12038   }
12039   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
12040    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
12041      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
12042   }
12043   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
12044    assert(next);
12045    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
12046   }
12047   else if ( op == PLUS || op == STAR) {
12048    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
12049   }
12050   else if (PL_regkind[(U8)op] == ANYOF) {
12051    /* arglen 1 + class block */
12052    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
12053      ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
12054    node = NEXTOPER(node);
12055   }
12056   else if (PL_regkind[(U8)op] == EXACT) {
12057    /* Literal string, where present. */
12058    node += NODE_SZ_STR(node) - 1;
12059    node = NEXTOPER(node);
12060   }
12061   else {
12062    node = NEXTOPER(node);
12063    node += regarglen[(U8)op];
12064   }
12065   if (op == CURLYX || op == OPEN)
12066    indent++;
12067  }
12068  CLEAR_OPTSTART;
12069 #ifdef DEBUG_DUMPUNTIL
12070  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
12071 #endif
12072  return node;
12073 }
12074
12075 #endif /* DEBUGGING */
12076
12077 /*
12078  * Local variables:
12079  * c-indentation-style: bsd
12080  * c-basic-offset: 4
12081  * indent-tabs-mode: t
12082  * End:
12083  *
12084  * ex: set ts=8 sts=4 sw=4 noet:
12085  */