src/5014000/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #undef PERL_IN_XSUB_RE
  76 #define PERL_IN_XSUB_RE 1
  77 #include "perl.h"
  78 #undef PERL_IN_XSUB_RE
  79
  80 #ifndef PERL_IN_XSUB_RE
  81 #include "re_defs.h"
  82 #endif
  83
  84 #define REG_COMP_C
  85 #ifdef PERL_IN_XSUB_RE
  86 #  include "re_comp.h"
  87 #else
  88 #  include "regcomp.h"
  89 #endif
  90
  91 #include "dquote_static.c"
  92
  93 #ifdef op
  94 #undef op
  95 #endif /* op */
  96
  97 #ifdef MSDOS
  98 #  if defined(BUGGY_MSC6)
  99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 100 #    pragma optimize("a",off)
 101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 102 #    pragma optimize("w",on )
 103 #  endif /* BUGGY_MSC6 */
 104 #endif /* MSDOS */
 105
 106 #ifndef STATIC
 107 #define STATIC static
 108 #endif
 109
 110 typedef struct RExC_state_t {
 111  U32  flags;   /* are we folding, multilining? */
 112  char *precomp;  /* uncompiled string. */
 113  REGEXP *rx_sv;   /* The SV that is the regexp. */
 114  regexp *rx;                    /* perl core regexp structure */
 115  regexp_internal *rxi;           /* internal data for regexp object pprivate field */
 116  char *start;   /* Start of input for compile */
 117  char *end;   /* End of input for compile */
 118  char *parse;   /* Input-scan pointer. */
 119  I32  whilem_seen;  /* number of WHILEM in this expr */
 120  regnode *emit_start;  /* Start of emitted-code area */
 121  regnode *emit_bound;  /* First regnode outside of the allocated space */
 122  regnode *emit;   /* Code-emit pointer; &regdummy = don't = compiling */
 123  I32  naughty;  /* How bad is this pattern? */
 124  I32  sawback;  /* Did we see \1, ...? */
 125  U32  seen;
 126  I32  size;   /* Code size. */
 127  I32  npar;   /* Capture buffer count, (OPEN). */
 128  I32  cpar;   /* Capture buffer count, (CLOSE). */
 129  I32  nestroot;  /* root parens we are in - used by accept */
 130  I32  extralen;
 131  I32  seen_zerolen;
 132  I32  seen_evals;
 133  regnode **open_parens;  /* pointers to open parens */
 134  regnode **close_parens;  /* pointers to close parens */
 135  regnode *opend;   /* END node in program */
 136  I32  utf8;  /* whether the pattern is utf8 or not */
 137  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 138         /* XXX use this for future optimisation of case
 139         * where pattern must be upgraded to utf8. */
 140  I32  uni_semantics; /* If a d charset modifier should use unicode
 141         rules, even if the pattern is not in
 142         utf8 */
 143  HV  *paren_names;  /* Paren names */
 144
 145  regnode **recurse;  /* Recurse regops */
 146  I32  recurse_count;  /* Number of recurse regops */
 147  I32  in_lookbehind;
 148  I32  contains_locale;
 149  I32  override_recoding;
 150 #if ADD_TO_REGEXEC
 151  char  *starttry;  /* -Dr: where regtry was called. */
 152 #define RExC_starttry (pRExC_state->starttry)
 153 #endif
 154 #ifdef DEBUGGING
 155  const char  *lastparse;
 156  I32         lastnum;
 157  AV          *paren_name_list;       /* idx -> name */
 158 #define RExC_lastparse (pRExC_state->lastparse)
 159 #define RExC_lastnum (pRExC_state->lastnum)
 160 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 161 #endif
 162 } RExC_state_t;
 163
 164 #define RExC_flags (pRExC_state->flags)
 165 #define RExC_precomp (pRExC_state->precomp)
 166 #define RExC_rx_sv (pRExC_state->rx_sv)
 167 #define RExC_rx  (pRExC_state->rx)
 168 #define RExC_rxi (pRExC_state->rxi)
 169 #define RExC_start (pRExC_state->start)
 170 #define RExC_end (pRExC_state->end)
 171 #define RExC_parse (pRExC_state->parse)
 172 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 173 #ifdef RE_TRACK_PATTERN_OFFSETS
 174 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
 175 #endif
 176 #define RExC_emit (pRExC_state->emit)
 177 #define RExC_emit_start (pRExC_state->emit_start)
 178 #define RExC_emit_bound (pRExC_state->emit_bound)
 179 #define RExC_naughty (pRExC_state->naughty)
 180 #define RExC_sawback (pRExC_state->sawback)
 181 #define RExC_seen (pRExC_state->seen)
 182 #define RExC_size (pRExC_state->size)
 183 #define RExC_npar (pRExC_state->npar)
 184 #define RExC_nestroot   (pRExC_state->nestroot)
 185 #define RExC_extralen (pRExC_state->extralen)
 186 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 187 #define RExC_seen_evals (pRExC_state->seen_evals)
 188 #define RExC_utf8 (pRExC_state->utf8)
 189 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 190 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 191 #define RExC_open_parens (pRExC_state->open_parens)
 192 #define RExC_close_parens (pRExC_state->close_parens)
 193 #define RExC_opend (pRExC_state->opend)
 194 #define RExC_paren_names (pRExC_state->paren_names)
 195 #define RExC_recurse (pRExC_state->recurse)
 196 #define RExC_recurse_count (pRExC_state->recurse_count)
 197 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 198 #define RExC_contains_locale (pRExC_state->contains_locale)
 199 #define RExC_override_recoding (pRExC_state->override_recoding)
 200
 201
 202 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 203 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 204   ((*s) == '{' && regcurly(s)))
 205
 206 #ifdef SPSTART
 207 #undef SPSTART  /* dratted cpp namespace... */
 208 #endif
 209 /*
 210  * Flags to be passed up and down.
 211  */
 212 #define WORST  0 /* Worst case. */
 213 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 214
 215 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 216  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 217 #define SIMPLE  0x02
 218 #define SPSTART  0x04 /* Starts with * or +. */
 219 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
 220 #define POSTPONED 0x10    /* (?1),(?&name), (??{...}) or similar */
 221
 222 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 223
 224 /* whether trie related optimizations are enabled */
 225 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 226 #define TRIE_STUDY_OPT
 227 #define FULL_TRIE_STUDY
 228 #define TRIE_STCLASS
 229 #endif
 230
 231
 232
 233 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 234 #define PBITVAL(paren) (1 << ((paren) & 7))
 235 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 236 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 237 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 238
 239 /* If not already in utf8, do a longjmp back to the beginning */
 240 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 241 #define REQUIRE_UTF8 STMT_START {                                       \
 242          if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 243       } STMT_END
 244
 245 /* About scan_data_t.
 246
 247   During optimisation we recurse through the regexp program performing
 248   various inplace (keyhole style) optimisations. In addition study_chunk
 249   and scan_commit populate this data structure with information about
 250   what strings MUST appear in the pattern. We look for the longest
 251   string that must appear at a fixed location, and we look for the
 252   longest string that may appear at a floating location. So for instance
 253   in the pattern:
 254
 255  /FOO[xX]A.*B[xX]BAR/
 256
 257   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 258   strings (because they follow a .* construct). study_chunk will identify
 259   both FOO and BAR as being the longest fixed and floating strings respectively.
 260
 261   The strings can be composites, for instance
 262
 263  /(f)(o)(o)/
 264
 265   will result in a composite fixed substring 'foo'.
 266
 267   For each string some basic information is maintained:
 268
 269   - offset or min_offset
 270  This is the position the string must appear at, or not before.
 271  It also implicitly (when combined with minlenp) tells us how many
 272  characters must match before the string we are searching for.
 273  Likewise when combined with minlenp and the length of the string it
 274  tells us how many characters must appear after the string we have
 275  found.
 276
 277   - max_offset
 278  Only used for floating strings. This is the rightmost point that
 279  the string can appear at. If set to I32 max it indicates that the
 280  string can occur infinitely far to the right.
 281
 282   - minlenp
 283  A pointer to the minimum length of the pattern that the string
 284  was found inside. This is important as in the case of positive
 285  lookahead or positive lookbehind we can have multiple patterns
 286  involved. Consider
 287
 288  /(?=FOO).*F/
 289
 290  The minimum length of the pattern overall is 3, the minimum length
 291  of the lookahead part is 3, but the minimum length of the part that
 292  will actually match is 1. So 'FOO's minimum length is 3, but the
 293  minimum length for the F is 1. This is important as the minimum length
 294  is used to determine offsets in front of and behind the string being
 295  looked for.  Since strings can be composites this is the length of the
 296  pattern at the time it was committed with a scan_commit. Note that
 297  the length is calculated by study_chunk, so that the minimum lengths
 298  are not known until the full pattern has been compiled, thus the
 299  pointer to the value.
 300
 301   - lookbehind
 302
 303  In the case of lookbehind the string being searched for can be
 304  offset past the start point of the final matching string.
 305  If this value was just blithely removed from the min_offset it would
 306  invalidate some of the calculations for how many chars must match
 307  before or after (as they are derived from min_offset and minlen and
 308  the length of the string being searched for).
 309  When the final pattern is compiled and the data is moved from the
 310  scan_data_t structure into the regexp structure the information
 311  about lookbehind is factored in, with the information that would
 312  have been lost precalculated in the end_shift field for the
 313  associated string.
 314
 315   The fields pos_min and pos_delta are used to store the minimum offset
 316   and the delta to the maximum offset at the current point in the pattern.
 317
 318 */
 319
 320 typedef struct scan_data_t {
 321  /*I32 len_min;      unused */
 322  /*I32 len_delta;    unused */
 323  I32 pos_min;
 324  I32 pos_delta;
 325  SV *last_found;
 326  I32 last_end;     /* min value, <0 unless valid. */
 327  I32 last_start_min;
 328  I32 last_start_max;
 329  SV **longest;     /* Either &l_fixed, or &l_float. */
 330  SV *longest_fixed;      /* longest fixed string found in pattern */
 331  I32 offset_fixed;       /* offset where it starts */
 332  I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 333  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 334  SV *longest_float;      /* longest floating string found in pattern */
 335  I32 offset_float_min;   /* earliest point in string it can appear */
 336  I32 offset_float_max;   /* latest point in string it can appear */
 337  I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 338  I32 lookbehind_float;   /* is the position of the string modified by LB */
 339  I32 flags;
 340  I32 whilem_c;
 341  I32 *last_closep;
 342  struct regnode_charclass_class *start_class;
 343 } scan_data_t;
 344
 345 /*
 346  * Forward declarations for pregcomp()'s friends.
 347  */
 348
 349 static const scan_data_t zero_scan_data =
 350   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 351
 352 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 353 #define SF_BEFORE_SEOL  0x0001
 354 #define SF_BEFORE_MEOL  0x0002
 355 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 356 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 357
 358 #ifdef NO_UNARY_PLUS
 359 #  define SF_FIX_SHIFT_EOL (0+2)
 360 #  define SF_FL_SHIFT_EOL  (0+4)
 361 #else
 362 #  define SF_FIX_SHIFT_EOL (+2)
 363 #  define SF_FL_SHIFT_EOL  (+4)
 364 #endif
 365
 366 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 367 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 368
 369 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 370 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 371 #define SF_IS_INF  0x0040
 372 #define SF_HAS_PAR  0x0080
 373 #define SF_IN_PAR  0x0100
 374 #define SF_HAS_EVAL  0x0200
 375 #define SCF_DO_SUBSTR  0x0400
 376 #define SCF_DO_STCLASS_AND 0x0800
 377 #define SCF_DO_STCLASS_OR 0x1000
 378 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 379 #define SCF_WHILEM_VISITED_POS 0x2000
 380
 381 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 382 #define SCF_SEEN_ACCEPT         0x8000
 383
 384 #define UTF cBOOL(RExC_utf8)
 385 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 386 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 387 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 388 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 389 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 390 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 391 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 392
 393 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 394
 395 #define OOB_UNICODE  12345678
 396 #define OOB_NAMEDCLASS  -1
 397
 398 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 399 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 400
 401
 402 /* length of regex to show in messages that don't mark a position within */
 403 #define RegexLengthToShowInErrorMessages 127
 404
 405 /*
 406  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 407  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 408  * op/pragma/warn/regcomp.
 409  */
 410 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 411 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 412
 413 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 414
 415 /*
 416  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 417  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 418  * "...".
 419  */
 420 #define _FAIL(code) STMT_START {     \
 421  const char *ellipses = "";      \
 422  IV len = RExC_end - RExC_precomp;     \
 423                   \
 424  if (!SIZE_ONLY)       \
 425   SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 426  if (len > RegexLengthToShowInErrorMessages) {   \
 427   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 428   len = RegexLengthToShowInErrorMessages - 10;   \
 429   ellipses = "...";      \
 430  }         \
 431  code;                                                               \
 432 } STMT_END
 433
 434 #define FAIL(msg) _FAIL(       \
 435  Perl_croak(aTHX_ "%s in regex m/%.*s%s/",     \
 436    msg, (int)len, RExC_precomp, ellipses))
 437
 438 #define FAIL2(msg,arg) _FAIL(       \
 439  Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 440    arg, (int)len, RExC_precomp, ellipses))
 441
 442 /*
 443  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 444  */
 445 #define Simple_vFAIL(m) STMT_START {     \
 446  const IV offset = RExC_parse - RExC_precomp;   \
 447  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 448    m, (int)offset, RExC_precomp, RExC_precomp + offset); \
 449 } STMT_END
 450
 451 /*
 452  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 453  */
 454 #define vFAIL(m) STMT_START {    \
 455  if (!SIZE_ONLY)     \
 456   SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
 457  Simple_vFAIL(m);     \
 458 } STMT_END
 459
 460 /*
 461  * Like Simple_vFAIL(), but accepts two arguments.
 462  */
 463 #define Simple_vFAIL2(m,a1) STMT_START {   \
 464  const IV offset = RExC_parse - RExC_precomp;   \
 465  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,   \
 466    (int)offset, RExC_precomp, RExC_precomp + offset); \
 467 } STMT_END
 468
 469 /*
 470  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 471  */
 472 #define vFAIL2(m,a1) STMT_START {   \
 473  if (!SIZE_ONLY)     \
 474   SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
 475  Simple_vFAIL2(m, a1);    \
 476 } STMT_END
 477
 478
 479 /*
 480  * Like Simple_vFAIL(), but accepts three arguments.
 481  */
 482 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 483  const IV offset = RExC_parse - RExC_precomp;  \
 484  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,  \
 485    (int)offset, RExC_precomp, RExC_precomp + offset); \
 486 } STMT_END
 487
 488 /*
 489  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 490  */
 491 #define vFAIL3(m,a1,a2) STMT_START {   \
 492  if (!SIZE_ONLY)     \
 493   SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
 494  Simple_vFAIL3(m, a1, a2);    \
 495 } STMT_END
 496
 497 /*
 498  * Like Simple_vFAIL(), but accepts four arguments.
 499  */
 500 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 501  const IV offset = RExC_parse - RExC_precomp;  \
 502  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,  \
 503    (int)offset, RExC_precomp, RExC_precomp + offset); \
 504 } STMT_END
 505
 506 #define ckWARNreg(loc,m) STMT_START {     \
 507  const IV offset = loc - RExC_precomp;    \
 508  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 509    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 510 } STMT_END
 511
 512 #define ckWARNregdep(loc,m) STMT_START {    \
 513  const IV offset = loc - RExC_precomp;    \
 514  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 515    m REPORT_LOCATION,      \
 516    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 517 } STMT_END
 518
 519 #define ckWARN2regdep(loc,m, a1) STMT_START {    \
 520  const IV offset = loc - RExC_precomp;    \
 521  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 522    m REPORT_LOCATION,      \
 523    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 524 } STMT_END
 525
 526 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 527  const IV offset = loc - RExC_precomp;    \
 528  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 529    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 530 } STMT_END
 531
 532 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 533  const IV offset = loc - RExC_precomp;    \
 534  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 535    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 536 } STMT_END
 537
 538 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 539  const IV offset = loc - RExC_precomp;    \
 540  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 541    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 542 } STMT_END
 543
 544 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 545  const IV offset = loc - RExC_precomp;    \
 546  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 547    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 548 } STMT_END
 549
 550 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 551  const IV offset = loc - RExC_precomp;    \
 552  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 553    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 554 } STMT_END
 555
 556 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 557  const IV offset = loc - RExC_precomp;    \
 558  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 559    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 560 } STMT_END
 561
 562
 563 /* Allow for side effects in s */
 564 #define REGC(c,s) STMT_START {   \
 565  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 566 } STMT_END
 567
 568 /* Macros for recording node offsets.   20001227 mjd@plover.com
 569  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 570  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 571  * Element 0 holds the number n.
 572  * Position is 1 indexed.
 573  */
 574 #ifndef RE_TRACK_PATTERN_OFFSETS
 575 #define Set_Node_Offset_To_R(node,byte)
 576 #define Set_Node_Offset(node,byte)
 577 #define Set_Cur_Node_Offset
 578 #define Set_Node_Length_To_R(node,len)
 579 #define Set_Node_Length(node,len)
 580 #define Set_Node_Cur_Length(node)
 581 #define Node_Offset(n)
 582 #define Node_Length(n)
 583 #define Set_Node_Offset_Length(node,offset,len)
 584 #define ProgLen(ri) ri->u.proglen
 585 #define SetProgLen(ri,x) ri->u.proglen = x
 586 #else
 587 #define ProgLen(ri) ri->u.offsets[0]
 588 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 589 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 590  if (! SIZE_ONLY) {       \
 591   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 592      __LINE__, (int)(node), (int)(byte)));  \
 593   if((node) < 0) {      \
 594    Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 595   } else {       \
 596    RExC_offsets[2*(node)-1] = (byte);    \
 597   }        \
 598  }         \
 599 } STMT_END
 600
 601 #define Set_Node_Offset(node,byte) \
 602  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 603 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 604
 605 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 606  if (! SIZE_ONLY) {       \
 607   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 608     __LINE__, (int)(node), (int)(len)));   \
 609   if((node) < 0) {      \
 610    Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 611   } else {       \
 612    RExC_offsets[2*(node)] = (len);    \
 613   }        \
 614  }         \
 615 } STMT_END
 616
 617 #define Set_Node_Length(node,len) \
 618  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 619 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 620 #define Set_Node_Cur_Length(node) \
 621  Set_Node_Length(node, RExC_parse - parse_start)
 622
 623 /* Get offsets and lengths */
 624 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 625 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 626
 627 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 628  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 629  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 630 } STMT_END
 631 #endif
 632
 633 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 634 #define EXPERIMENTAL_INPLACESCAN
 635 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 636
 637 #define DEBUG_STUDYDATA(str,data,depth)                              \
 638 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 639  PerlIO_printf(Perl_debug_log,                                    \
 640   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 641   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 642   (int)(depth)*2, "",                                          \
 643   (IV)((data)->pos_min),                                       \
 644   (IV)((data)->pos_delta),                                     \
 645   (UV)((data)->flags),                                         \
 646   (IV)((data)->whilem_c),                                      \
 647   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 648   is_inf ? "INF " : ""                                         \
 649  );                                                               \
 650  if ((data)->last_found)                                          \
 651   PerlIO_printf(Perl_debug_log,                                \
 652    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 653    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 654    SvPVX_const((data)->last_found),                         \
 655    (IV)((data)->last_end),                                  \
 656    (IV)((data)->last_start_min),                            \
 657    (IV)((data)->last_start_max),                            \
 658    ((data)->longest &&                                      \
 659    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 660    SvPVX_const((data)->longest_fixed),                      \
 661    (IV)((data)->offset_fixed),                              \
 662    ((data)->longest &&                                      \
 663    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 664    SvPVX_const((data)->longest_float),                      \
 665    (IV)((data)->offset_float_min),                          \
 666    (IV)((data)->offset_float_max)                           \
 667   );                                                           \
 668  PerlIO_printf(Perl_debug_log,"\n");                              \
 669 });
 670
 671 static void clear_re(pTHX_ void *r);
 672
 673 /* Mark that we cannot extend a found fixed substring at this point.
 674    Update the longest found anchored substring and the longest found
 675    floating substrings if needed. */
 676
 677 STATIC void
 678 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 679 {
 680  const STRLEN l = CHR_SVLEN(data->last_found);
 681  const STRLEN old_l = CHR_SVLEN(*data->longest);
 682  GET_RE_DEBUG_FLAGS_DECL;
 683
 684  PERL_ARGS_ASSERT_SCAN_COMMIT;
 685
 686  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 687   SvSetMagicSV(*data->longest, data->last_found);
 688   if (*data->longest == data->longest_fixed) {
 689    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 690    if (data->flags & SF_BEFORE_EOL)
 691     data->flags
 692      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 693    else
 694     data->flags &= ~SF_FIX_BEFORE_EOL;
 695    data->minlen_fixed=minlenp;
 696    data->lookbehind_fixed=0;
 697   }
 698   else { /* *data->longest == data->longest_float */
 699    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 700    data->offset_float_max = (l
 701          ? data->last_start_max
 702          : data->pos_min + data->pos_delta);
 703    if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 704     data->offset_float_max = I32_MAX;
 705    if (data->flags & SF_BEFORE_EOL)
 706     data->flags
 707      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 708    else
 709     data->flags &= ~SF_FL_BEFORE_EOL;
 710    data->minlen_float=minlenp;
 711    data->lookbehind_float=0;
 712   }
 713  }
 714  SvCUR_set(data->last_found, 0);
 715  {
 716   SV * const sv = data->last_found;
 717   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 718    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 719    if (mg)
 720     mg->mg_len = 0;
 721   }
 722  }
 723  data->last_end = -1;
 724  data->flags &= ~SF_BEFORE_EOL;
 725  DEBUG_STUDYDATA("commit: ",data,0);
 726 }
 727
 728 /* Can match anything (initialization) */
 729 STATIC void
 730 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 731 {
 732  PERL_ARGS_ASSERT_CL_ANYTHING;
 733
 734  ANYOF_BITMAP_SETALL(cl);
 735  cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 736     |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 737
 738  /* If any portion of the regex is to operate under locale rules,
 739  * initialization includes it.  The reason this isn't done for all regexes
 740  * is that the optimizer was written under the assumption that locale was
 741  * all-or-nothing.  Given the complexity and lack of documentation in the
 742  * optimizer, and that there are inadequate test cases for locale, so many
 743  * parts of it may not work properly, it is safest to avoid locale unless
 744  * necessary. */
 745  if (RExC_contains_locale) {
 746   ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 747   cl->flags |= ANYOF_LOCALE;
 748  }
 749  else {
 750   ANYOF_CLASS_ZERO(cl);     /* Only /l uses class now */
 751  }
 752 }
 753
 754 /* Can match anything (initialization) */
 755 STATIC int
 756 S_cl_is_anything(const struct regnode_charclass_class *cl)
 757 {
 758  int value;
 759
 760  PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 761
 762  for (value = 0; value <= ANYOF_MAX; value += 2)
 763   if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 764    return 1;
 765  if (!(cl->flags & ANYOF_UNICODE_ALL))
 766   return 0;
 767  if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 768   return 0;
 769  return 1;
 770 }
 771
 772 /* Can match anything (initialization) */
 773 STATIC void
 774 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 775 {
 776  PERL_ARGS_ASSERT_CL_INIT;
 777
 778  Zero(cl, 1, struct regnode_charclass_class);
 779  cl->type = ANYOF;
 780  cl_anything(pRExC_state, cl);
 781  ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 782 }
 783
 784 /* These two functions currently do the exact same thing */
 785 #define cl_init_zero  S_cl_init
 786
 787 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 788  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 789  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 790 STATIC void
 791 S_cl_and(struct regnode_charclass_class *cl,
 792   const struct regnode_charclass_class *and_with)
 793 {
 794  PERL_ARGS_ASSERT_CL_AND;
 795
 796  assert(and_with->type == ANYOF);
 797
 798  /* I (khw) am not sure all these restrictions are necessary XXX */
 799  if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 800   && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 801   && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 802   && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 803   && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 804   int i;
 805
 806   if (and_with->flags & ANYOF_INVERT)
 807    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 808     cl->bitmap[i] &= ~and_with->bitmap[i];
 809   else
 810    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 811     cl->bitmap[i] &= and_with->bitmap[i];
 812  } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 813
 814  if (and_with->flags & ANYOF_INVERT) {
 815
 816   /* Here, the and'ed node is inverted.  Get the AND of the flags that
 817   * aren't affected by the inversion.  Those that are affected are
 818   * handled individually below */
 819   U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 820   cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 821   cl->flags |= affected_flags;
 822
 823   /* We currently don't know how to deal with things that aren't in the
 824   * bitmap, but we know that the intersection is no greater than what
 825   * is already in cl, so let there be false positives that get sorted
 826   * out after the synthetic start class succeeds, and the node is
 827   * matched for real. */
 828
 829   /* The inversion of these two flags indicate that the resulting
 830   * intersection doesn't have them */
 831   if (and_with->flags & ANYOF_UNICODE_ALL) {
 832    cl->flags &= ~ANYOF_UNICODE_ALL;
 833   }
 834   if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 835    cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 836   }
 837  }
 838  else {   /* and'd node is not inverted */
 839   U8 outside_bitmap_but_not_utf8; /* Temp variable */
 840
 841   if (! ANYOF_NONBITMAP(and_with)) {
 842
 843    /* Here 'and_with' doesn't match anything outside the bitmap
 844    * (except possibly ANYOF_UNICODE_ALL), which means the
 845    * intersection can't either, except for ANYOF_UNICODE_ALL, in
 846    * which case we don't know what the intersection is, but it's no
 847    * greater than what cl already has, so can just leave it alone,
 848    * with possible false positives */
 849    if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 850     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 851     cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 852    }
 853   }
 854   else if (! ANYOF_NONBITMAP(cl)) {
 855
 856    /* Here, 'and_with' does match something outside the bitmap, and cl
 857    * doesn't have a list of things to match outside the bitmap.  If
 858    * cl can match all code points above 255, the intersection will
 859    * be those above-255 code points that 'and_with' matches.  If cl
 860    * can't match all Unicode code points, it means that it can't
 861    * match anything outside the bitmap (since the 'if' that got us
 862    * into this block tested for that), so we leave the bitmap empty.
 863    */
 864    if (cl->flags & ANYOF_UNICODE_ALL) {
 865     ARG_SET(cl, ARG(and_with));
 866
 867     /* and_with's ARG may match things that don't require UTF8.
 868     * And now cl's will too, in spite of this being an 'and'.  See
 869     * the comments below about the kludge */
 870     cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 871    }
 872   }
 873   else {
 874    /* Here, both 'and_with' and cl match something outside the
 875    * bitmap.  Currently we do not do the intersection, so just match
 876    * whatever cl had at the beginning.  */
 877   }
 878
 879
 880   /* Take the intersection of the two sets of flags.  However, the
 881   * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 882   * kludge around the fact that this flag is not treated like the others
 883   * which are initialized in cl_anything().  The way the optimizer works
 884   * is that the synthetic start class (SSC) is initialized to match
 885   * anything, and then the first time a real node is encountered, its
 886   * values are AND'd with the SSC's with the result being the values of
 887   * the real node.  However, there are paths through the optimizer where
 888   * the AND never gets called, so those initialized bits are set
 889   * inappropriately, which is not usually a big deal, as they just cause
 890   * false positives in the SSC, which will just mean a probably
 891   * imperceptible slow down in execution.  However this bit has a
 892   * higher false positive consequence in that it can cause utf8.pm,
 893   * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 894   * bigger slowdown and also causes significant extra memory to be used.
 895   * In order to prevent this, the code now takes a different tack.  The
 896   * bit isn't set unless some part of the regular expression needs it,
 897   * but once set it won't get cleared.  This means that these extra
 898   * modules won't get loaded unless there was some path through the
 899   * pattern that would have required them anyway, and  so any false
 900   * positives that occur by not ANDing them out when they could be
 901   * aren't as severe as they would be if we treated this bit like all
 902   * the others */
 903   outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 904          & ANYOF_NONBITMAP_NON_UTF8;
 905   cl->flags &= and_with->flags;
 906   cl->flags |= outside_bitmap_but_not_utf8;
 907  }
 908 }
 909
 910 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 911  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 912  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 913 STATIC void
 914 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 915 {
 916  PERL_ARGS_ASSERT_CL_OR;
 917
 918  if (or_with->flags & ANYOF_INVERT) {
 919
 920   /* Here, the or'd node is to be inverted.  This means we take the
 921   * complement of everything not in the bitmap, but currently we don't
 922   * know what that is, so give up and match anything */
 923   if (ANYOF_NONBITMAP(or_with)) {
 924    cl_anything(pRExC_state, cl);
 925   }
 926   /* We do not use
 927   * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 928   *   <= (B1 | !B2) | (CL1 | !CL2)
 929   * which is wasteful if CL2 is small, but we ignore CL2:
 930   *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 931   * XXXX Can we handle case-fold?  Unclear:
 932   *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 933   *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 934   */
 935   else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 936    && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 937    && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 938    int i;
 939
 940    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 941     cl->bitmap[i] |= ~or_with->bitmap[i];
 942   } /* XXXX: logic is complicated otherwise */
 943   else {
 944    cl_anything(pRExC_state, cl);
 945   }
 946
 947   /* And, we can just take the union of the flags that aren't affected
 948   * by the inversion */
 949   cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 950
 951   /* For the remaining flags:
 952    ANYOF_UNICODE_ALL and inverted means to not match anything above
 953      255, which means that the union with cl should just be
 954      what cl has in it, so can ignore this flag
 955    ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 956      is 127-255 to match them, but then invert that, so the
 957      union with cl should just be what cl has in it, so can
 958      ignore this flag
 959   */
 960  } else {    /* 'or_with' is not inverted */
 961   /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 962   if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 963    && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 964     || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 965    int i;
 966
 967    /* OR char bitmap and class bitmap separately */
 968    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 969     cl->bitmap[i] |= or_with->bitmap[i];
 970    if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 971     for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 972      cl->classflags[i] |= or_with->classflags[i];
 973     cl->flags |= ANYOF_CLASS;
 974    }
 975   }
 976   else { /* XXXX: logic is complicated, leave it along for a moment. */
 977    cl_anything(pRExC_state, cl);
 978   }
 979
 980   if (ANYOF_NONBITMAP(or_with)) {
 981
 982    /* Use the added node's outside-the-bit-map match if there isn't a
 983    * conflict.  If there is a conflict (both nodes match something
 984    * outside the bitmap, but what they match outside is not the same
 985    * pointer, and hence not easily compared until XXX we extend
 986    * inversion lists this far), give up and allow the start class to
 987    * match everything outside the bitmap.  If that stuff is all above
 988    * 255, can just set UNICODE_ALL, otherwise caould be anything. */
 989    if (! ANYOF_NONBITMAP(cl)) {
 990     ARG_SET(cl, ARG(or_with));
 991    }
 992    else if (ARG(cl) != ARG(or_with)) {
 993
 994     if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
 995      cl_anything(pRExC_state, cl);
 996     }
 997     else {
 998      cl->flags |= ANYOF_UNICODE_ALL;
 999     }
1000    }
1001   }
1002
1003   /* Take the union */
1004   cl->flags |= or_with->flags;
1005  }
1006 }
1007
1008 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1009 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1010 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1011 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1012
1013
1014 #ifdef DEBUGGING
1015 /*
1016    dump_trie(trie,widecharmap,revcharmap)
1017    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1018    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1019
1020    These routines dump out a trie in a somewhat readable format.
1021    The _interim_ variants are used for debugging the interim
1022    tables that are used to generate the final compressed
1023    representation which is what dump_trie expects.
1024
1025    Part of the reason for their existence is to provide a form
1026    of documentation as to how the different representations function.
1027
1028 */
1029
1030 /*
1031   Dumps the final compressed table form of the trie to Perl_debug_log.
1032   Used for debugging make_trie().
1033 */
1034
1035 STATIC void
1036 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1037    AV *revcharmap, U32 depth)
1038 {
1039  U32 state;
1040  SV *sv=sv_newmortal();
1041  int colwidth= widecharmap ? 6 : 4;
1042  U16 word;
1043  GET_RE_DEBUG_FLAGS_DECL;
1044
1045  PERL_ARGS_ASSERT_DUMP_TRIE;
1046
1047  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1048   (int)depth * 2 + 2,"",
1049   "Match","Base","Ofs" );
1050
1051  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1052   SV ** const tmp = av_fetch( revcharmap, state, 0);
1053   if ( tmp ) {
1054    PerlIO_printf( Perl_debug_log, "%*s",
1055     colwidth,
1056     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1057        PL_colors[0], PL_colors[1],
1058        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1059        PERL_PV_ESCAPE_FIRSTCHAR
1060     )
1061    );
1062   }
1063  }
1064  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1065   (int)depth * 2 + 2,"");
1066
1067  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1068   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1069  PerlIO_printf( Perl_debug_log, "\n");
1070
1071  for( state = 1 ; state < trie->statecount ; state++ ) {
1072   const U32 base = trie->states[ state ].trans.base;
1073
1074   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1075
1076   if ( trie->states[ state ].wordnum ) {
1077    PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1078   } else {
1079    PerlIO_printf( Perl_debug_log, "%6s", "" );
1080   }
1081
1082   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1083
1084   if ( base ) {
1085    U32 ofs = 0;
1086
1087    while( ( base + ofs  < trie->uniquecharcount ) ||
1088     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1089      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1090      ofs++;
1091
1092    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1093
1094    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1095     if ( ( base + ofs >= trie->uniquecharcount ) &&
1096      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1097      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1098     {
1099     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1100      colwidth,
1101      (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1102     } else {
1103      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1104     }
1105    }
1106
1107    PerlIO_printf( Perl_debug_log, "]");
1108
1109   }
1110   PerlIO_printf( Perl_debug_log, "\n" );
1111  }
1112  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1113  for (word=1; word <= trie->wordcount; word++) {
1114   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1115    (int)word, (int)(trie->wordinfo[word].prev),
1116    (int)(trie->wordinfo[word].len));
1117  }
1118  PerlIO_printf(Perl_debug_log, "\n" );
1119 }
1120 /*
1121   Dumps a fully constructed but uncompressed trie in list form.
1122   List tries normally only are used for construction when the number of
1123   possible chars (trie->uniquecharcount) is very high.
1124   Used for debugging make_trie().
1125 */
1126 STATIC void
1127 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1128       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1129       U32 depth)
1130 {
1131  U32 state;
1132  SV *sv=sv_newmortal();
1133  int colwidth= widecharmap ? 6 : 4;
1134  GET_RE_DEBUG_FLAGS_DECL;
1135
1136  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1137
1138  /* print out the table precompression.  */
1139  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1140   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1141   "------:-----+-----------------\n" );
1142
1143  for( state=1 ; state < next_alloc ; state ++ ) {
1144   U16 charid;
1145
1146   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1147    (int)depth * 2 + 2,"", (UV)state  );
1148   if ( ! trie->states[ state ].wordnum ) {
1149    PerlIO_printf( Perl_debug_log, "%5s| ","");
1150   } else {
1151    PerlIO_printf( Perl_debug_log, "W%4x| ",
1152     trie->states[ state ].wordnum
1153    );
1154   }
1155   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1156    SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1157    if ( tmp ) {
1158     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1159      colwidth,
1160      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1161        PL_colors[0], PL_colors[1],
1162        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1163        PERL_PV_ESCAPE_FIRSTCHAR
1164      ) ,
1165      TRIE_LIST_ITEM(state,charid).forid,
1166      (UV)TRIE_LIST_ITEM(state,charid).newstate
1167     );
1168     if (!(charid % 10))
1169      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1170       (int)((depth * 2) + 14), "");
1171    }
1172   }
1173   PerlIO_printf( Perl_debug_log, "\n");
1174  }
1175 }
1176
1177 /*
1178   Dumps a fully constructed but uncompressed trie in table form.
1179   This is the normal DFA style state transition table, with a few
1180   twists to facilitate compression later.
1181   Used for debugging make_trie().
1182 */
1183 STATIC void
1184 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1185       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1186       U32 depth)
1187 {
1188  U32 state;
1189  U16 charid;
1190  SV *sv=sv_newmortal();
1191  int colwidth= widecharmap ? 6 : 4;
1192  GET_RE_DEBUG_FLAGS_DECL;
1193
1194  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1195
1196  /*
1197  print out the table precompression so that we can do a visual check
1198  that they are identical.
1199  */
1200
1201  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1202
1203  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1204   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1205   if ( tmp ) {
1206    PerlIO_printf( Perl_debug_log, "%*s",
1207     colwidth,
1208     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1209        PL_colors[0], PL_colors[1],
1210        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1211        PERL_PV_ESCAPE_FIRSTCHAR
1212     )
1213    );
1214   }
1215  }
1216
1217  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1218
1219  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1220   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1221  }
1222
1223  PerlIO_printf( Perl_debug_log, "\n" );
1224
1225  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1226
1227   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1228    (int)depth * 2 + 2,"",
1229    (UV)TRIE_NODENUM( state ) );
1230
1231   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1232    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1233    if (v)
1234     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1235    else
1236     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1237   }
1238   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1239    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1240   } else {
1241    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1242    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1243   }
1244  }
1245 }
1246
1247 #endif
1248
1249
1250 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1251   startbranch: the first branch in the whole branch sequence
1252   first      : start branch of sequence of branch-exact nodes.
1253    May be the same as startbranch
1254   last       : Thing following the last branch.
1255    May be the same as tail.
1256   tail       : item following the branch sequence
1257   count      : words in the sequence
1258   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1259   depth      : indent depth
1260
1261 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1262
1263 A trie is an N'ary tree where the branches are determined by digital
1264 decomposition of the key. IE, at the root node you look up the 1st character and
1265 follow that branch repeat until you find the end of the branches. Nodes can be
1266 marked as "accepting" meaning they represent a complete word. Eg:
1267
1268   /he|she|his|hers/
1269
1270 would convert into the following structure. Numbers represent states, letters
1271 following numbers represent valid transitions on the letter from that state, if
1272 the number is in square brackets it represents an accepting state, otherwise it
1273 will be in parenthesis.
1274
1275  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1276  |    |
1277  |   (2)
1278  |    |
1279  (1)   +-i->(6)-+-s->[7]
1280  |
1281  +-s->(3)-+-h->(4)-+-e->[5]
1282
1283  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1284
1285 This shows that when matching against the string 'hers' we will begin at state 1
1286 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1287 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1288 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1289 single traverse. We store a mapping from accepting to state to which word was
1290 matched, and then when we have multiple possibilities we try to complete the
1291 rest of the regex in the order in which they occured in the alternation.
1292
1293 The only prior NFA like behaviour that would be changed by the TRIE support is
1294 the silent ignoring of duplicate alternations which are of the form:
1295
1296  / (DUPE|DUPE) X? (?{ ... }) Y /x
1297
1298 Thus EVAL blocks following a trie may be called a different number of times with
1299 and without the optimisation. With the optimisations dupes will be silently
1300 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1301 the following demonstrates:
1302
1303  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1304
1305 which prints out 'word' three times, but
1306
1307  'words'=~/(word|word|word)(?{ print $1 })S/
1308
1309 which doesnt print it out at all. This is due to other optimisations kicking in.
1310
1311 Example of what happens on a structural level:
1312
1313 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1314
1315    1: CURLYM[1] {1,32767}(18)
1316    5:   BRANCH(8)
1317    6:     EXACT <ac>(16)
1318    8:   BRANCH(11)
1319    9:     EXACT <ad>(16)
1320   11:   BRANCH(14)
1321   12:     EXACT <ab>(16)
1322   16:   SUCCEED(0)
1323   17:   NOTHING(18)
1324   18: END(0)
1325
1326 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1327 and should turn into:
1328
1329    1: CURLYM[1] {1,32767}(18)
1330    5:   TRIE(16)
1331   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1332   <ac>
1333   <ad>
1334   <ab>
1335   16:   SUCCEED(0)
1336   17:   NOTHING(18)
1337   18: END(0)
1338
1339 Cases where tail != last would be like /(?foo|bar)baz/:
1340
1341    1: BRANCH(4)
1342    2:   EXACT <foo>(8)
1343    4: BRANCH(7)
1344    5:   EXACT <bar>(8)
1345    7: TAIL(8)
1346    8: EXACT <baz>(10)
1347   10: END(0)
1348
1349 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1350 and would end up looking like:
1351
1352  1: TRIE(8)
1353  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1354   <foo>
1355   <bar>
1356    7: TAIL(8)
1357    8: EXACT <baz>(10)
1358   10: END(0)
1359
1360  d = uvuni_to_utf8_flags(d, uv, 0);
1361
1362 is the recommended Unicode-aware way of saying
1363
1364  *(d++) = uv;
1365 */
1366
1367 #define TRIE_STORE_REVCHAR                                                 \
1368  STMT_START {                                                           \
1369   if (UTF) {          \
1370    SV *zlopp = newSV(2);        \
1371    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1372    unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1373    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1374    SvPOK_on(zlopp);         \
1375    SvUTF8_on(zlopp);         \
1376    av_push(revcharmap, zlopp);        \
1377   } else {          \
1378    char ooooff = (char)uvc;            \
1379    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1380   }           \
1381   } STMT_END
1382
1383 #define TRIE_READ_CHAR STMT_START {                                           \
1384  wordlen++;                                                                \
1385  if ( UTF ) {                                                              \
1386   if ( folder ) {                                                       \
1387    if ( foldlen > 0 ) {                                              \
1388    uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1389    foldlen -= len;                                                \
1390    scan += len;                                                   \
1391    len = 0;                                                       \
1392    } else {                                                          \
1393     uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1394     uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1395     foldlen -= UNISKIP( uvc );                                    \
1396     scan = foldbuf + UNISKIP( uvc );                              \
1397    }                                                                 \
1398   } else {                                                              \
1399    uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1400   }                                                                     \
1401  } else {                                                                  \
1402   uvc = (U32)*uc;                                                       \
1403   len = 1;                                                              \
1404  }                                                                         \
1405 } STMT_END
1406
1407
1408
1409 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1410  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1411   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1412   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1413  }                                                           \
1414  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1415  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1416  TRIE_LIST_CUR( state )++;                                   \
1417 } STMT_END
1418
1419 #define TRIE_LIST_NEW(state) STMT_START {                       \
1420  Newxz( trie->states[ state ].trans.list,               \
1421   4, reg_trie_trans_le );                                 \
1422  TRIE_LIST_CUR( state ) = 1;                                \
1423  TRIE_LIST_LEN( state ) = 4;                                \
1424 } STMT_END
1425
1426 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1427  U16 dupe= trie->states[ state ].wordnum;                    \
1428  regnode * const noper_next = regnext( noper );              \
1429                 \
1430  DEBUG_r({                                                   \
1431   /* store the word for dumping */                        \
1432   SV* tmp;                                                \
1433   if (OP(noper) != NOTHING)                               \
1434    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1435   else                                                    \
1436    tmp = newSVpvn_utf8( "", 0, UTF );   \
1437   av_push( trie_words, tmp );                             \
1438  });                                                         \
1439                 \
1440  curword++;                                                  \
1441  trie->wordinfo[curword].prev   = 0;                         \
1442  trie->wordinfo[curword].len    = wordlen;                   \
1443  trie->wordinfo[curword].accept = state;                     \
1444                 \
1445  if ( noper_next < tail ) {                                  \
1446   if (!trie->jump)                                        \
1447    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1448   trie->jump[curword] = (U16)(noper_next - convert);      \
1449   if (!jumper)                                            \
1450    jumper = noper_next;                                \
1451   if (!nextbranch)                                        \
1452    nextbranch= regnext(cur);                           \
1453  }                                                           \
1454                 \
1455  if ( dupe ) {                                               \
1456   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1457   /* chain, so that when the bits of chain are later    */\
1458   /* linked together, the dups appear in the chain      */\
1459   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1460   trie->wordinfo[dupe].prev = curword;                    \
1461  } else {                                                    \
1462   /* we haven't inserted this word yet.                */ \
1463   trie->states[ state ].wordnum = curword;                \
1464  }                                                           \
1465 } STMT_END
1466
1467
1468 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1469  ( ( base + charid >=  ucharcount     \
1470   && base + charid < ubound     \
1471   && state == trie->trans[ base - ucharcount + charid ].check \
1472   && trie->trans[ base - ucharcount + charid ].next )  \
1473   ? trie->trans[ base - ucharcount + charid ].next  \
1474   : ( state==1 ? special : 0 )     \
1475  )
1476
1477 #define MADE_TRIE       1
1478 #define MADE_JUMP_TRIE  2
1479 #define MADE_EXACT_TRIE 4
1480
1481 STATIC I32
1482 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1483 {
1484  dVAR;
1485  /* first pass, loop through and scan words */
1486  reg_trie_data *trie;
1487  HV *widecharmap = NULL;
1488  AV *revcharmap = newAV();
1489  regnode *cur;
1490  const U32 uniflags = UTF8_ALLOW_DEFAULT;
1491  STRLEN len = 0;
1492  UV uvc = 0;
1493  U16 curword = 0;
1494  U32 next_alloc = 0;
1495  regnode *jumper = NULL;
1496  regnode *nextbranch = NULL;
1497  regnode *convert = NULL;
1498  U32 *prev_states; /* temp array mapping each state to previous one */
1499  /* we just use folder as a flag in utf8 */
1500  const U8 * folder = NULL;
1501
1502 #ifdef DEBUGGING
1503  const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1504  AV *trie_words = NULL;
1505  /* along with revcharmap, this only used during construction but both are
1506  * useful during debugging so we store them in the struct when debugging.
1507  */
1508 #else
1509  const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1510  STRLEN trie_charcount=0;
1511 #endif
1512  SV *re_trie_maxbuff;
1513  GET_RE_DEBUG_FLAGS_DECL;
1514
1515  PERL_ARGS_ASSERT_MAKE_TRIE;
1516 #ifndef DEBUGGING
1517  PERL_UNUSED_ARG(depth);
1518 #endif
1519
1520  switch (flags) {
1521   case EXACTFA:
1522   case EXACTFU: folder = PL_fold_latin1; break;
1523   case EXACTF:  folder = PL_fold; break;
1524   case EXACTFL: folder = PL_fold_locale; break;
1525  }
1526
1527  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1528  trie->refcount = 1;
1529  trie->startstate = 1;
1530  trie->wordcount = word_count;
1531  RExC_rxi->data->data[ data_slot ] = (void*)trie;
1532  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1533  if (!(UTF && folder))
1534   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1535  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1536      trie->wordcount+1, sizeof(reg_trie_wordinfo));
1537
1538  DEBUG_r({
1539   trie_words = newAV();
1540  });
1541
1542  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1543  if (!SvIOK(re_trie_maxbuff)) {
1544   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1545  }
1546  DEBUG_OPTIMISE_r({
1547     PerlIO_printf( Perl_debug_log,
1548     "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1549     (int)depth * 2 + 2, "",
1550     REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1551     REG_NODE_NUM(last), REG_NODE_NUM(tail),
1552     (int)depth);
1553  });
1554
1555    /* Find the node we are going to overwrite */
1556  if ( first == startbranch && OP( last ) != BRANCH ) {
1557   /* whole branch chain */
1558   convert = first;
1559  } else {
1560   /* branch sub-chain */
1561   convert = NEXTOPER( first );
1562  }
1563
1564  /*  -- First loop and Setup --
1565
1566  We first traverse the branches and scan each word to determine if it
1567  contains widechars, and how many unique chars there are, this is
1568  important as we have to build a table with at least as many columns as we
1569  have unique chars.
1570
1571  We use an array of integers to represent the character codes 0..255
1572  (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1573  native representation of the character value as the key and IV's for the
1574  coded index.
1575
1576  *TODO* If we keep track of how many times each character is used we can
1577  remap the columns so that the table compression later on is more
1578  efficient in terms of memory by ensuring the most common value is in the
1579  middle and the least common are on the outside.  IMO this would be better
1580  than a most to least common mapping as theres a decent chance the most
1581  common letter will share a node with the least common, meaning the node
1582  will not be compressible. With a middle is most common approach the worst
1583  case is when we have the least common nodes twice.
1584
1585  */
1586
1587  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1588   regnode * const noper = NEXTOPER( cur );
1589   const U8 *uc = (U8*)STRING( noper );
1590   const U8 * const e  = uc + STR_LEN( noper );
1591   STRLEN foldlen = 0;
1592   U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1593   const U8 *scan = (U8*)NULL;
1594   U32 wordlen      = 0;         /* required init */
1595   STRLEN chars = 0;
1596   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1597
1598   if (OP(noper) == NOTHING) {
1599    trie->minlen= 0;
1600    continue;
1601   }
1602   if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1603    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1604           regardless of encoding */
1605
1606   for ( ; uc < e ; uc += len ) {
1607    TRIE_CHARCOUNT(trie)++;
1608    TRIE_READ_CHAR;
1609    chars++;
1610    if ( uvc < 256 ) {
1611     if ( !trie->charmap[ uvc ] ) {
1612      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1613      if ( folder )
1614       trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1615      TRIE_STORE_REVCHAR;
1616     }
1617     if ( set_bit ) {
1618      /* store the codepoint in the bitmap, and its folded
1619      * equivalent. */
1620      TRIE_BITMAP_SET(trie,uvc);
1621
1622      /* store the folded codepoint */
1623      if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1624
1625      if ( !UTF ) {
1626       /* store first byte of utf8 representation of
1627       variant codepoints */
1628       if (! UNI_IS_INVARIANT(uvc)) {
1629        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1630       }
1631      }
1632      set_bit = 0; /* We've done our bit :-) */
1633     }
1634    } else {
1635     SV** svpp;
1636     if ( !widecharmap )
1637      widecharmap = newHV();
1638
1639     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1640
1641     if ( !svpp )
1642      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1643
1644     if ( !SvTRUE( *svpp ) ) {
1645      sv_setiv( *svpp, ++trie->uniquecharcount );
1646      TRIE_STORE_REVCHAR;
1647     }
1648    }
1649   }
1650   if( cur == first ) {
1651    trie->minlen=chars;
1652    trie->maxlen=chars;
1653   } else if (chars < trie->minlen) {
1654    trie->minlen=chars;
1655   } else if (chars > trie->maxlen) {
1656    trie->maxlen=chars;
1657   }
1658
1659  } /* end first pass */
1660  DEBUG_TRIE_COMPILE_r(
1661   PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1662     (int)depth * 2 + 2,"",
1663     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1664     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1665     (int)trie->minlen, (int)trie->maxlen )
1666  );
1667
1668  /*
1669   We now know what we are dealing with in terms of unique chars and
1670   string sizes so we can calculate how much memory a naive
1671   representation using a flat table  will take. If it's over a reasonable
1672   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1673   conservative but potentially much slower representation using an array
1674   of lists.
1675
1676   At the end we convert both representations into the same compressed
1677   form that will be used in regexec.c for matching with. The latter
1678   is a form that cannot be used to construct with but has memory
1679   properties similar to the list form and access properties similar
1680   to the table form making it both suitable for fast searches and
1681   small enough that its feasable to store for the duration of a program.
1682
1683   See the comment in the code where the compressed table is produced
1684   inplace from the flat tabe representation for an explanation of how
1685   the compression works.
1686
1687  */
1688
1689
1690  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1691  prev_states[1] = 0;
1692
1693  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1694   /*
1695    Second Pass -- Array Of Lists Representation
1696
1697    Each state will be represented by a list of charid:state records
1698    (reg_trie_trans_le) the first such element holds the CUR and LEN
1699    points of the allocated array. (See defines above).
1700
1701    We build the initial structure using the lists, and then convert
1702    it into the compressed table form which allows faster lookups
1703    (but cant be modified once converted).
1704   */
1705
1706   STRLEN transcount = 1;
1707
1708   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1709    "%*sCompiling trie using list compiler\n",
1710    (int)depth * 2 + 2, ""));
1711
1712   trie->states = (reg_trie_state *)
1713    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1714         sizeof(reg_trie_state) );
1715   TRIE_LIST_NEW(1);
1716   next_alloc = 2;
1717
1718   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1719
1720    regnode * const noper = NEXTOPER( cur );
1721    U8 *uc           = (U8*)STRING( noper );
1722    const U8 * const e = uc + STR_LEN( noper );
1723    U32 state        = 1;         /* required init */
1724    U16 charid       = 0;         /* sanity init */
1725    U8 *scan         = (U8*)NULL; /* sanity init */
1726    STRLEN foldlen   = 0;         /* required init */
1727    U32 wordlen      = 0;         /* required init */
1728    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1729
1730    if (OP(noper) != NOTHING) {
1731     for ( ; uc < e ; uc += len ) {
1732
1733      TRIE_READ_CHAR;
1734
1735      if ( uvc < 256 ) {
1736       charid = trie->charmap[ uvc ];
1737      } else {
1738       SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1739       if ( !svpp ) {
1740        charid = 0;
1741       } else {
1742        charid=(U16)SvIV( *svpp );
1743       }
1744      }
1745      /* charid is now 0 if we dont know the char read, or nonzero if we do */
1746      if ( charid ) {
1747
1748       U16 check;
1749       U32 newstate = 0;
1750
1751       charid--;
1752       if ( !trie->states[ state ].trans.list ) {
1753        TRIE_LIST_NEW( state );
1754       }
1755       for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1756        if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1757         newstate = TRIE_LIST_ITEM( state, check ).newstate;
1758         break;
1759        }
1760       }
1761       if ( ! newstate ) {
1762        newstate = next_alloc++;
1763        prev_states[newstate] = state;
1764        TRIE_LIST_PUSH( state, charid, newstate );
1765        transcount++;
1766       }
1767       state = newstate;
1768      } else {
1769       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1770      }
1771     }
1772    }
1773    TRIE_HANDLE_WORD(state);
1774
1775   } /* end second pass */
1776
1777   /* next alloc is the NEXT state to be allocated */
1778   trie->statecount = next_alloc;
1779   trie->states = (reg_trie_state *)
1780    PerlMemShared_realloc( trie->states,
1781         next_alloc
1782         * sizeof(reg_trie_state) );
1783
1784   /* and now dump it out before we compress it */
1785   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1786               revcharmap, next_alloc,
1787               depth+1)
1788   );
1789
1790   trie->trans = (reg_trie_trans *)
1791    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1792   {
1793    U32 state;
1794    U32 tp = 0;
1795    U32 zp = 0;
1796
1797
1798    for( state=1 ; state < next_alloc ; state ++ ) {
1799     U32 base=0;
1800
1801     /*
1802     DEBUG_TRIE_COMPILE_MORE_r(
1803      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1804     );
1805     */
1806
1807     if (trie->states[state].trans.list) {
1808      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1809      U16 maxid=minid;
1810      U16 idx;
1811
1812      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1813       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1814       if ( forid < minid ) {
1815        minid=forid;
1816       } else if ( forid > maxid ) {
1817        maxid=forid;
1818       }
1819      }
1820      if ( transcount < tp + maxid - minid + 1) {
1821       transcount *= 2;
1822       trie->trans = (reg_trie_trans *)
1823        PerlMemShared_realloc( trie->trans,
1824              transcount
1825              * sizeof(reg_trie_trans) );
1826       Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1827      }
1828      base = trie->uniquecharcount + tp - minid;
1829      if ( maxid == minid ) {
1830       U32 set = 0;
1831       for ( ; zp < tp ; zp++ ) {
1832        if ( ! trie->trans[ zp ].next ) {
1833         base = trie->uniquecharcount + zp - minid;
1834         trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1835         trie->trans[ zp ].check = state;
1836         set = 1;
1837         break;
1838        }
1839       }
1840       if ( !set ) {
1841        trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1842        trie->trans[ tp ].check = state;
1843        tp++;
1844        zp = tp;
1845       }
1846      } else {
1847       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1848        const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1849        trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1850        trie->trans[ tid ].check = state;
1851       }
1852       tp += ( maxid - minid + 1 );
1853      }
1854      Safefree(trie->states[ state ].trans.list);
1855     }
1856     /*
1857     DEBUG_TRIE_COMPILE_MORE_r(
1858      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1859     );
1860     */
1861     trie->states[ state ].trans.base=base;
1862    }
1863    trie->lasttrans = tp + 1;
1864   }
1865  } else {
1866   /*
1867   Second Pass -- Flat Table Representation.
1868
1869   we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1870   We know that we will need Charcount+1 trans at most to store the data
1871   (one row per char at worst case) So we preallocate both structures
1872   assuming worst case.
1873
1874   We then construct the trie using only the .next slots of the entry
1875   structs.
1876
1877   We use the .check field of the first entry of the node temporarily to
1878   make compression both faster and easier by keeping track of how many non
1879   zero fields are in the node.
1880
1881   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1882   transition.
1883
1884   There are two terms at use here: state as a TRIE_NODEIDX() which is a
1885   number representing the first entry of the node, and state as a
1886   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1887   TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1888   are 2 entrys per node. eg:
1889
1890    A B       A B
1891   1. 2 4    1. 3 7
1892   2. 0 3    3. 0 5
1893   3. 0 0    5. 0 0
1894   4. 0 0    7. 0 0
1895
1896   The table is internally in the right hand, idx form. However as we also
1897   have to deal with the states array which is indexed by nodenum we have to
1898   use TRIE_NODENUM() to convert.
1899
1900   */
1901   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1902    "%*sCompiling trie using table compiler\n",
1903    (int)depth * 2 + 2, ""));
1904
1905   trie->trans = (reg_trie_trans *)
1906    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1907         * trie->uniquecharcount + 1,
1908         sizeof(reg_trie_trans) );
1909   trie->states = (reg_trie_state *)
1910    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1911         sizeof(reg_trie_state) );
1912   next_alloc = trie->uniquecharcount + 1;
1913
1914
1915   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1916
1917    regnode * const noper   = NEXTOPER( cur );
1918    const U8 *uc     = (U8*)STRING( noper );
1919    const U8 * const e = uc + STR_LEN( noper );
1920
1921    U32 state        = 1;         /* required init */
1922
1923    U16 charid       = 0;         /* sanity init */
1924    U32 accept_state = 0;         /* sanity init */
1925    U8 *scan         = (U8*)NULL; /* sanity init */
1926
1927    STRLEN foldlen   = 0;         /* required init */
1928    U32 wordlen      = 0;         /* required init */
1929    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1930
1931    if ( OP(noper) != NOTHING ) {
1932     for ( ; uc < e ; uc += len ) {
1933
1934      TRIE_READ_CHAR;
1935
1936      if ( uvc < 256 ) {
1937       charid = trie->charmap[ uvc ];
1938      } else {
1939       SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1940       charid = svpp ? (U16)SvIV(*svpp) : 0;
1941      }
1942      if ( charid ) {
1943       charid--;
1944       if ( !trie->trans[ state + charid ].next ) {
1945        trie->trans[ state + charid ].next = next_alloc;
1946        trie->trans[ state ].check++;
1947        prev_states[TRIE_NODENUM(next_alloc)]
1948          = TRIE_NODENUM(state);
1949        next_alloc += trie->uniquecharcount;
1950       }
1951       state = trie->trans[ state + charid ].next;
1952      } else {
1953       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1954      }
1955      /* charid is now 0 if we dont know the char read, or nonzero if we do */
1956     }
1957    }
1958    accept_state = TRIE_NODENUM( state );
1959    TRIE_HANDLE_WORD(accept_state);
1960
1961   } /* end second pass */
1962
1963   /* and now dump it out before we compress it */
1964   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1965               revcharmap,
1966               next_alloc, depth+1));
1967
1968   {
1969   /*
1970   * Inplace compress the table.*
1971
1972   For sparse data sets the table constructed by the trie algorithm will
1973   be mostly 0/FAIL transitions or to put it another way mostly empty.
1974   (Note that leaf nodes will not contain any transitions.)
1975
1976   This algorithm compresses the tables by eliminating most such
1977   transitions, at the cost of a modest bit of extra work during lookup:
1978
1979   - Each states[] entry contains a .base field which indicates the
1980   index in the state[] array wheres its transition data is stored.
1981
1982   - If .base is 0 there are no valid transitions from that node.
1983
1984   - If .base is nonzero then charid is added to it to find an entry in
1985   the trans array.
1986
1987   -If trans[states[state].base+charid].check!=state then the
1988   transition is taken to be a 0/Fail transition. Thus if there are fail
1989   transitions at the front of the node then the .base offset will point
1990   somewhere inside the previous nodes data (or maybe even into a node
1991   even earlier), but the .check field determines if the transition is
1992   valid.
1993
1994   XXX - wrong maybe?
1995   The following process inplace converts the table to the compressed
1996   table: We first do not compress the root node 1,and mark all its
1997   .check pointers as 1 and set its .base pointer as 1 as well. This
1998   allows us to do a DFA construction from the compressed table later,
1999   and ensures that any .base pointers we calculate later are greater
2000   than 0.
2001
2002   - We set 'pos' to indicate the first entry of the second node.
2003
2004   - We then iterate over the columns of the node, finding the first and
2005   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2006   and set the .check pointers accordingly, and advance pos
2007   appropriately and repreat for the next node. Note that when we copy
2008   the next pointers we have to convert them from the original
2009   NODEIDX form to NODENUM form as the former is not valid post
2010   compression.
2011
2012   - If a node has no transitions used we mark its base as 0 and do not
2013   advance the pos pointer.
2014
2015   - If a node only has one transition we use a second pointer into the
2016   structure to fill in allocated fail transitions from other states.
2017   This pointer is independent of the main pointer and scans forward
2018   looking for null transitions that are allocated to a state. When it
2019   finds one it writes the single transition into the "hole".  If the
2020   pointer doesnt find one the single transition is appended as normal.
2021
2022   - Once compressed we can Renew/realloc the structures to release the
2023   excess space.
2024
2025   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2026   specifically Fig 3.47 and the associated pseudocode.
2027
2028   demq
2029   */
2030   const U32 laststate = TRIE_NODENUM( next_alloc );
2031   U32 state, charid;
2032   U32 pos = 0, zp=0;
2033   trie->statecount = laststate;
2034
2035   for ( state = 1 ; state < laststate ; state++ ) {
2036    U8 flag = 0;
2037    const U32 stateidx = TRIE_NODEIDX( state );
2038    const U32 o_used = trie->trans[ stateidx ].check;
2039    U32 used = trie->trans[ stateidx ].check;
2040    trie->trans[ stateidx ].check = 0;
2041
2042    for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2043     if ( flag || trie->trans[ stateidx + charid ].next ) {
2044      if ( trie->trans[ stateidx + charid ].next ) {
2045       if (o_used == 1) {
2046        for ( ; zp < pos ; zp++ ) {
2047         if ( ! trie->trans[ zp ].next ) {
2048          break;
2049         }
2050        }
2051        trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2052        trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2053        trie->trans[ zp ].check = state;
2054        if ( ++zp > pos ) pos = zp;
2055        break;
2056       }
2057       used--;
2058      }
2059      if ( !flag ) {
2060       flag = 1;
2061       trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2062      }
2063      trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2064      trie->trans[ pos ].check = state;
2065      pos++;
2066     }
2067    }
2068   }
2069   trie->lasttrans = pos + 1;
2070   trie->states = (reg_trie_state *)
2071    PerlMemShared_realloc( trie->states, laststate
2072         * sizeof(reg_trie_state) );
2073   DEBUG_TRIE_COMPILE_MORE_r(
2074     PerlIO_printf( Perl_debug_log,
2075      "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2076      (int)depth * 2 + 2,"",
2077      (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2078      (IV)next_alloc,
2079      (IV)pos,
2080      ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2081    );
2082
2083   } /* end table compress */
2084  }
2085  DEBUG_TRIE_COMPILE_MORE_r(
2086    PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2087     (int)depth * 2 + 2, "",
2088     (UV)trie->statecount,
2089     (UV)trie->lasttrans)
2090  );
2091  /* resize the trans array to remove unused space */
2092  trie->trans = (reg_trie_trans *)
2093   PerlMemShared_realloc( trie->trans, trie->lasttrans
2094        * sizeof(reg_trie_trans) );
2095
2096  {   /* Modify the program and insert the new TRIE node */
2097   U8 nodetype =(U8)(flags & 0xFF);
2098   char *str=NULL;
2099
2100 #ifdef DEBUGGING
2101   regnode *optimize = NULL;
2102 #ifdef RE_TRACK_PATTERN_OFFSETS
2103
2104   U32 mjd_offset = 0;
2105   U32 mjd_nodelen = 0;
2106 #endif /* RE_TRACK_PATTERN_OFFSETS */
2107 #endif /* DEBUGGING */
2108   /*
2109   This means we convert either the first branch or the first Exact,
2110   depending on whether the thing following (in 'last') is a branch
2111   or not and whther first is the startbranch (ie is it a sub part of
2112   the alternation or is it the whole thing.)
2113   Assuming its a sub part we convert the EXACT otherwise we convert
2114   the whole branch sequence, including the first.
2115   */
2116   /* Find the node we are going to overwrite */
2117   if ( first != startbranch || OP( last ) == BRANCH ) {
2118    /* branch sub-chain */
2119    NEXT_OFF( first ) = (U16)(last - first);
2120 #ifdef RE_TRACK_PATTERN_OFFSETS
2121    DEBUG_r({
2122     mjd_offset= Node_Offset((convert));
2123     mjd_nodelen= Node_Length((convert));
2124    });
2125 #endif
2126    /* whole branch chain */
2127   }
2128 #ifdef RE_TRACK_PATTERN_OFFSETS
2129   else {
2130    DEBUG_r({
2131     const  regnode *nop = NEXTOPER( convert );
2132     mjd_offset= Node_Offset((nop));
2133     mjd_nodelen= Node_Length((nop));
2134    });
2135   }
2136   DEBUG_OPTIMISE_r(
2137    PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2138     (int)depth * 2 + 2, "",
2139     (UV)mjd_offset, (UV)mjd_nodelen)
2140   );
2141 #endif
2142   /* But first we check to see if there is a common prefix we can
2143   split out as an EXACT and put in front of the TRIE node.  */
2144   trie->startstate= 1;
2145   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2146    U32 state;
2147    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2148     U32 ofs = 0;
2149     I32 idx = -1;
2150     U32 count = 0;
2151     const U32 base = trie->states[ state ].trans.base;
2152
2153     if ( trie->states[state].wordnum )
2154       count = 1;
2155
2156     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2157      if ( ( base + ofs >= trie->uniquecharcount ) &&
2158       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2159       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2160      {
2161       if ( ++count > 1 ) {
2162        SV **tmp = av_fetch( revcharmap, ofs, 0);
2163        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2164        if ( state == 1 ) break;
2165        if ( count == 2 ) {
2166         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2167         DEBUG_OPTIMISE_r(
2168          PerlIO_printf(Perl_debug_log,
2169           "%*sNew Start State=%"UVuf" Class: [",
2170           (int)depth * 2 + 2, "",
2171           (UV)state));
2172         if (idx >= 0) {
2173          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2174          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2175
2176          TRIE_BITMAP_SET(trie,*ch);
2177          if ( folder )
2178           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2179          DEBUG_OPTIMISE_r(
2180           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2181          );
2182         }
2183        }
2184        TRIE_BITMAP_SET(trie,*ch);
2185        if ( folder )
2186         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2187        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2188       }
2189       idx = ofs;
2190      }
2191     }
2192     if ( count == 1 ) {
2193      SV **tmp = av_fetch( revcharmap, idx, 0);
2194      STRLEN len;
2195      char *ch = SvPV( *tmp, len );
2196      DEBUG_OPTIMISE_r({
2197       SV *sv=sv_newmortal();
2198       PerlIO_printf( Perl_debug_log,
2199        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2200        (int)depth * 2 + 2, "",
2201        (UV)state, (UV)idx,
2202        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2203         PL_colors[0], PL_colors[1],
2204         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2205         PERL_PV_ESCAPE_FIRSTCHAR
2206        )
2207       );
2208      });
2209      if ( state==1 ) {
2210       OP( convert ) = nodetype;
2211       str=STRING(convert);
2212       STR_LEN(convert)=0;
2213      }
2214      STR_LEN(convert) += len;
2215      while (len--)
2216       *str++ = *ch++;
2217     } else {
2218 #ifdef DEBUGGING
2219      if (state>1)
2220       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2221 #endif
2222      break;
2223     }
2224    }
2225    trie->prefixlen = (state-1);
2226    if (str) {
2227     regnode *n = convert+NODE_SZ_STR(convert);
2228     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2229     trie->startstate = state;
2230     trie->minlen -= (state - 1);
2231     trie->maxlen -= (state - 1);
2232 #ifdef DEBUGGING
2233    /* At least the UNICOS C compiler choked on this
2234     * being argument to DEBUG_r(), so let's just have
2235     * it right here. */
2236    if (
2237 #ifdef PERL_EXT_RE_BUILD
2238     1
2239 #else
2240     DEBUG_r_TEST
2241 #endif
2242     ) {
2243     regnode *fix = convert;
2244     U32 word = trie->wordcount;
2245     mjd_nodelen++;
2246     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2247     while( ++fix < n ) {
2248      Set_Node_Offset_Length(fix, 0, 0);
2249     }
2250     while (word--) {
2251      SV ** const tmp = av_fetch( trie_words, word, 0 );
2252      if (tmp) {
2253       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2254        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2255       else
2256        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2257      }
2258     }
2259    }
2260 #endif
2261     if (trie->maxlen) {
2262      convert = n;
2263     } else {
2264      NEXT_OFF(convert) = (U16)(tail - convert);
2265      DEBUG_r(optimize= n);
2266     }
2267    }
2268   }
2269   if (!jumper)
2270    jumper = last;
2271   if ( trie->maxlen ) {
2272    NEXT_OFF( convert ) = (U16)(tail - convert);
2273    ARG_SET( convert, data_slot );
2274    /* Store the offset to the first unabsorbed branch in
2275    jump[0], which is otherwise unused by the jump logic.
2276    We use this when dumping a trie and during optimisation. */
2277    if (trie->jump)
2278     trie->jump[0] = (U16)(nextbranch - convert);
2279
2280    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2281    *   and there is a bitmap
2282    *   and the first "jump target" node we found leaves enough room
2283    * then convert the TRIE node into a TRIEC node, with the bitmap
2284    * embedded inline in the opcode - this is hypothetically faster.
2285    */
2286    if ( !trie->states[trie->startstate].wordnum
2287     && trie->bitmap
2288     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2289    {
2290     OP( convert ) = TRIEC;
2291     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2292     PerlMemShared_free(trie->bitmap);
2293     trie->bitmap= NULL;
2294    } else
2295     OP( convert ) = TRIE;
2296
2297    /* store the type in the flags */
2298    convert->flags = nodetype;
2299    DEBUG_r({
2300    optimize = convert
2301      + NODE_STEP_REGNODE
2302      + regarglen[ OP( convert ) ];
2303    });
2304    /* XXX We really should free up the resource in trie now,
2305     as we won't use them - (which resources?) dmq */
2306   }
2307   /* needed for dumping*/
2308   DEBUG_r(if (optimize) {
2309    regnode *opt = convert;
2310
2311    while ( ++opt < optimize) {
2312     Set_Node_Offset_Length(opt,0,0);
2313    }
2314    /*
2315     Try to clean up some of the debris left after the
2316     optimisation.
2317    */
2318    while( optimize < jumper ) {
2319     mjd_nodelen += Node_Length((optimize));
2320     OP( optimize ) = OPTIMIZED;
2321     Set_Node_Offset_Length(optimize,0,0);
2322     optimize++;
2323    }
2324    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2325   });
2326  } /* end node insert */
2327  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2328
2329  /*  Finish populating the prev field of the wordinfo array.  Walk back
2330  *  from each accept state until we find another accept state, and if
2331  *  so, point the first word's .prev field at the second word. If the
2332  *  second already has a .prev field set, stop now. This will be the
2333  *  case either if we've already processed that word's accept state,
2334  *  or that state had multiple words, and the overspill words were
2335  *  already linked up earlier.
2336  */
2337  {
2338   U16 word;
2339   U32 state;
2340   U16 prev;
2341
2342   for (word=1; word <= trie->wordcount; word++) {
2343    prev = 0;
2344    if (trie->wordinfo[word].prev)
2345     continue;
2346    state = trie->wordinfo[word].accept;
2347    while (state) {
2348     state = prev_states[state];
2349     if (!state)
2350      break;
2351     prev = trie->states[state].wordnum;
2352     if (prev)
2353      break;
2354    }
2355    trie->wordinfo[word].prev = prev;
2356   }
2357   Safefree(prev_states);
2358  }
2359
2360
2361  /* and now dump out the compressed format */
2362  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2363
2364  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2365 #ifdef DEBUGGING
2366  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2367  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2368 #else
2369  SvREFCNT_dec(revcharmap);
2370 #endif
2371  return trie->jump
2372   ? MADE_JUMP_TRIE
2373   : trie->startstate>1
2374    ? MADE_EXACT_TRIE
2375    : MADE_TRIE;
2376 }
2377
2378 STATIC void
2379 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2380 {
2381 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2382
2383    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2384    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2385    ISBN 0-201-10088-6
2386
2387    We find the fail state for each state in the trie, this state is the longest proper
2388    suffix of the current state's 'word' that is also a proper prefix of another word in our
2389    trie. State 1 represents the word '' and is thus the default fail state. This allows
2390    the DFA not to have to restart after its tried and failed a word at a given point, it
2391    simply continues as though it had been matching the other word in the first place.
2392    Consider
2393  'abcdgu'=~/abcdefg|cdgu/
2394    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2395    fail, which would bring us to the state representing 'd' in the second word where we would
2396    try 'g' and succeed, proceeding to match 'cdgu'.
2397  */
2398  /* add a fail transition */
2399  const U32 trie_offset = ARG(source);
2400  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2401  U32 *q;
2402  const U32 ucharcount = trie->uniquecharcount;
2403  const U32 numstates = trie->statecount;
2404  const U32 ubound = trie->lasttrans + ucharcount;
2405  U32 q_read = 0;
2406  U32 q_write = 0;
2407  U32 charid;
2408  U32 base = trie->states[ 1 ].trans.base;
2409  U32 *fail;
2410  reg_ac_data *aho;
2411  const U32 data_slot = add_data( pRExC_state, 1, "T" );
2412  GET_RE_DEBUG_FLAGS_DECL;
2413
2414  PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2415 #ifndef DEBUGGING
2416  PERL_UNUSED_ARG(depth);
2417 #endif
2418
2419
2420  ARG_SET( stclass, data_slot );
2421  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2422  RExC_rxi->data->data[ data_slot ] = (void*)aho;
2423  aho->trie=trie_offset;
2424  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2425  Copy( trie->states, aho->states, numstates, reg_trie_state );
2426  Newxz( q, numstates, U32);
2427  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2428  aho->refcount = 1;
2429  fail = aho->fail;
2430  /* initialize fail[0..1] to be 1 so that we always have
2431  a valid final fail state */
2432  fail[ 0 ] = fail[ 1 ] = 1;
2433
2434  for ( charid = 0; charid < ucharcount ; charid++ ) {
2435   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2436   if ( newstate ) {
2437    q[ q_write ] = newstate;
2438    /* set to point at the root */
2439    fail[ q[ q_write++ ] ]=1;
2440   }
2441  }
2442  while ( q_read < q_write) {
2443   const U32 cur = q[ q_read++ % numstates ];
2444   base = trie->states[ cur ].trans.base;
2445
2446   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2447    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2448    if (ch_state) {
2449     U32 fail_state = cur;
2450     U32 fail_base;
2451     do {
2452      fail_state = fail[ fail_state ];
2453      fail_base = aho->states[ fail_state ].trans.base;
2454     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2455
2456     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2457     fail[ ch_state ] = fail_state;
2458     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2459     {
2460       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2461     }
2462     q[ q_write++ % numstates] = ch_state;
2463    }
2464   }
2465  }
2466  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2467  when we fail in state 1, this allows us to use the
2468  charclass scan to find a valid start char. This is based on the principle
2469  that theres a good chance the string being searched contains lots of stuff
2470  that cant be a start char.
2471  */
2472  fail[ 0 ] = fail[ 1 ] = 0;
2473  DEBUG_TRIE_COMPILE_r({
2474   PerlIO_printf(Perl_debug_log,
2475      "%*sStclass Failtable (%"UVuf" states): 0",
2476      (int)(depth * 2), "", (UV)numstates
2477   );
2478   for( q_read=1; q_read<numstates; q_read++ ) {
2479    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2480   }
2481   PerlIO_printf(Perl_debug_log, "\n");
2482  });
2483  Safefree(q);
2484  /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2485 }
2486
2487
2488 /*
2489  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2490  * These need to be revisited when a newer toolchain becomes available.
2491  */
2492 #if defined(__sparc64__) && defined(__GNUC__)
2493 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2494 #       undef  SPARC64_GCC_WORKAROUND
2495 #       define SPARC64_GCC_WORKAROUND 1
2496 #   endif
2497 #endif
2498
2499 #define DEBUG_PEEP(str,scan,depth) \
2500  DEBUG_OPTIMISE_r({if (scan){ \
2501  SV * const mysv=sv_newmortal(); \
2502  regnode *Next = regnext(scan); \
2503  regprop(RExC_rx, mysv, scan); \
2504  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2505  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2506  Next ? (REG_NODE_NUM(Next)) : 0 ); \
2507    }});
2508
2509
2510
2511
2512
2513 #define JOIN_EXACT(scan,min,flags) \
2514  if (PL_regkind[OP(scan)] == EXACT) \
2515   join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2516
2517 STATIC U32
2518 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2519  /* Merge several consecutive EXACTish nodes into one. */
2520  regnode *n = regnext(scan);
2521  U32 stringok = 1;
2522  regnode *next = scan + NODE_SZ_STR(scan);
2523  U32 merged = 0;
2524  U32 stopnow = 0;
2525 #ifdef DEBUGGING
2526  regnode *stop = scan;
2527  GET_RE_DEBUG_FLAGS_DECL;
2528 #else
2529  PERL_UNUSED_ARG(depth);
2530 #endif
2531
2532  PERL_ARGS_ASSERT_JOIN_EXACT;
2533 #ifndef EXPERIMENTAL_INPLACESCAN
2534  PERL_UNUSED_ARG(flags);
2535  PERL_UNUSED_ARG(val);
2536 #endif
2537  DEBUG_PEEP("join",scan,depth);
2538
2539  /* Skip NOTHING, merge EXACT*. */
2540  while (n &&
2541   ( PL_regkind[OP(n)] == NOTHING ||
2542    (stringok && (OP(n) == OP(scan))))
2543   && NEXT_OFF(n)
2544   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2545
2546   if (OP(n) == TAIL || n > next)
2547    stringok = 0;
2548   if (PL_regkind[OP(n)] == NOTHING) {
2549    DEBUG_PEEP("skip:",n,depth);
2550    NEXT_OFF(scan) += NEXT_OFF(n);
2551    next = n + NODE_STEP_REGNODE;
2552 #ifdef DEBUGGING
2553    if (stringok)
2554     stop = n;
2555 #endif
2556    n = regnext(n);
2557   }
2558   else if (stringok) {
2559    const unsigned int oldl = STR_LEN(scan);
2560    regnode * const nnext = regnext(n);
2561
2562    DEBUG_PEEP("merg",n,depth);
2563
2564    merged++;
2565    if (oldl + STR_LEN(n) > U8_MAX)
2566     break;
2567    NEXT_OFF(scan) += NEXT_OFF(n);
2568    STR_LEN(scan) += STR_LEN(n);
2569    next = n + NODE_SZ_STR(n);
2570    /* Now we can overwrite *n : */
2571    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2572 #ifdef DEBUGGING
2573    stop = next - 1;
2574 #endif
2575    n = nnext;
2576    if (stopnow) break;
2577   }
2578
2579 #ifdef EXPERIMENTAL_INPLACESCAN
2580   if (flags && !NEXT_OFF(n)) {
2581    DEBUG_PEEP("atch", val, depth);
2582    if (reg_off_by_arg[OP(n)]) {
2583     ARG_SET(n, val - n);
2584    }
2585    else {
2586     NEXT_OFF(n) = val - n;
2587    }
2588    stopnow = 1;
2589   }
2590 #endif
2591  }
2592 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2593 #define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2594 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2595 #define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2596
2597  if (UTF
2598   && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2599   && ( STR_LEN(scan) >= 6 ) )
2600  {
2601  /*
2602  Two problematic code points in Unicode casefolding of EXACT nodes:
2603
2604  U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2605  U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2606
2607  which casefold to
2608
2609  Unicode                      UTF-8
2610
2611  U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2612  U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2613
2614  This means that in case-insensitive matching (or "loose matching",
2615  as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2616  length of the above casefolded versions) can match a target string
2617  of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2618  This would rather mess up the minimum length computation.
2619
2620  What we'll do is to look for the tail four bytes, and then peek
2621  at the preceding two bytes to see whether we need to decrease
2622  the minimum length by four (six minus two).
2623
2624  Thanks to the design of UTF-8, there cannot be false matches:
2625  A sequence of valid UTF-8 bytes cannot be a subsequence of
2626  another valid sequence of UTF-8 bytes.
2627
2628  */
2629   char * const s0 = STRING(scan), *s, *t;
2630   char * const s1 = s0 + STR_LEN(scan) - 1;
2631   char * const s2 = s1 - 4;
2632 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2633   const char t0[] = "\xaf\x49\xaf\x42";
2634 #else
2635   const char t0[] = "\xcc\x88\xcc\x81";
2636 #endif
2637   const char * const t1 = t0 + 3;
2638
2639   for (s = s0 + 2;
2640    s < s2 && (t = ninstr(s, s1, t0, t1));
2641    s = t + 4) {
2642 #ifdef EBCDIC
2643    if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2644     ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2645 #else
2646    if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2647     ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2648 #endif
2649     *min -= 4;
2650   }
2651  }
2652
2653 #ifdef DEBUGGING
2654  /* Allow dumping */
2655  n = scan + NODE_SZ_STR(scan);
2656  while (n <= stop) {
2657   if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2658    OP(n) = OPTIMIZED;
2659    NEXT_OFF(n) = 0;
2660   }
2661   n++;
2662  }
2663 #endif
2664  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2665  return stopnow;
2666 }
2667
2668 /* REx optimizer.  Converts nodes into quicker variants "in place".
2669    Finds fixed substrings.  */
2670
2671 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2672    to the position after last scanned or to NULL. */
2673
2674 #define INIT_AND_WITHP \
2675  assert(!and_withp); \
2676  Newx(and_withp,1,struct regnode_charclass_class); \
2677  SAVEFREEPV(and_withp)
2678
2679 /* this is a chain of data about sub patterns we are processing that
2680    need to be handled separately/specially in study_chunk. Its so
2681    we can simulate recursion without losing state.  */
2682 struct scan_frame;
2683 typedef struct scan_frame {
2684  regnode *last;  /* last node to process in this frame */
2685  regnode *next;  /* next node to process when last is reached */
2686  struct scan_frame *prev; /*previous frame*/
2687  I32 stop; /* what stopparen do we use */
2688 } scan_frame;
2689
2690
2691 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2692
2693 #define CASE_SYNST_FNC(nAmE)                                       \
2694 case nAmE:                                                         \
2695  if (flags & SCF_DO_STCLASS_AND) {                              \
2696    for (value = 0; value < 256; value++)                  \
2697     if (!is_ ## nAmE ## _cp(value))                       \
2698      ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2699  }                                                              \
2700  else {                                                         \
2701    for (value = 0; value < 256; value++)                  \
2702     if (is_ ## nAmE ## _cp(value))                        \
2703      ANYOF_BITMAP_SET(data->start_class, value);    \
2704  }                                                              \
2705  break;                                                         \
2706 case N ## nAmE:                                                    \
2707  if (flags & SCF_DO_STCLASS_AND) {                              \
2708    for (value = 0; value < 256; value++)                   \
2709     if (is_ ## nAmE ## _cp(value))                         \
2710      ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2711  }                                                               \
2712  else {                                                          \
2713    for (value = 0; value < 256; value++)                   \
2714     if (!is_ ## nAmE ## _cp(value))                        \
2715      ANYOF_BITMAP_SET(data->start_class, value);     \
2716  }                                                               \
2717  break
2718
2719
2720
2721 STATIC I32
2722 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2723       I32 *minlenp, I32 *deltap,
2724       regnode *last,
2725       scan_data_t *data,
2726       I32 stopparen,
2727       U8* recursed,
2728       struct regnode_charclass_class *and_withp,
2729       U32 flags, U32 depth)
2730       /* scanp: Start here (read-write). */
2731       /* deltap: Write maxlen-minlen here. */
2732       /* last: Stop before this one. */
2733       /* data: string data about the pattern */
2734       /* stopparen: treat close N as END */
2735       /* recursed: which subroutines have we recursed into */
2736       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2737 {
2738  dVAR;
2739  I32 min = 0, pars = 0, code;
2740  regnode *scan = *scanp, *next;
2741  I32 delta = 0;
2742  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2743  int is_inf_internal = 0;  /* The studied chunk is infinite */
2744  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2745  scan_data_t data_fake;
2746  SV *re_trie_maxbuff = NULL;
2747  regnode *first_non_open = scan;
2748  I32 stopmin = I32_MAX;
2749  scan_frame *frame = NULL;
2750  GET_RE_DEBUG_FLAGS_DECL;
2751
2752  PERL_ARGS_ASSERT_STUDY_CHUNK;
2753
2754 #ifdef DEBUGGING
2755  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2756 #endif
2757
2758  if ( depth == 0 ) {
2759   while (first_non_open && OP(first_non_open) == OPEN)
2760    first_non_open=regnext(first_non_open);
2761  }
2762
2763
2764   fake_study_recurse:
2765  while ( scan && OP(scan) != END && scan < last ){
2766   /* Peephole optimizer: */
2767   DEBUG_STUDYDATA("Peep:", data,depth);
2768   DEBUG_PEEP("Peep",scan,depth);
2769   JOIN_EXACT(scan,&min,0);
2770
2771   /* Follow the next-chain of the current node and optimize
2772   away all the NOTHINGs from it.  */
2773   if (OP(scan) != CURLYX) {
2774    const int max = (reg_off_by_arg[OP(scan)]
2775      ? I32_MAX
2776      /* I32 may be smaller than U16 on CRAYs! */
2777      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2778    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2779    int noff;
2780    regnode *n = scan;
2781
2782    /* Skip NOTHING and LONGJMP. */
2783    while ((n = regnext(n))
2784     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2785      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2786     && off + noff < max)
2787     off += noff;
2788    if (reg_off_by_arg[OP(scan)])
2789     ARG(scan) = off;
2790    else
2791     NEXT_OFF(scan) = off;
2792   }
2793
2794
2795
2796   /* The principal pseudo-switch.  Cannot be a switch, since we
2797   look into several different things.  */
2798   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2799     || OP(scan) == IFTHEN) {
2800    next = regnext(scan);
2801    code = OP(scan);
2802    /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2803
2804    if (OP(next) == code || code == IFTHEN) {
2805     /* NOTE - There is similar code to this block below for handling
2806     TRIE nodes on a re-study.  If you change stuff here check there
2807     too. */
2808     I32 max1 = 0, min1 = I32_MAX, num = 0;
2809     struct regnode_charclass_class accum;
2810     regnode * const startbranch=scan;
2811
2812     if (flags & SCF_DO_SUBSTR)
2813      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2814     if (flags & SCF_DO_STCLASS)
2815      cl_init_zero(pRExC_state, &accum);
2816
2817     while (OP(scan) == code) {
2818      I32 deltanext, minnext, f = 0, fake;
2819      struct regnode_charclass_class this_class;
2820
2821      num++;
2822      data_fake.flags = 0;
2823      if (data) {
2824       data_fake.whilem_c = data->whilem_c;
2825       data_fake.last_closep = data->last_closep;
2826      }
2827      else
2828       data_fake.last_closep = &fake;
2829
2830      data_fake.pos_delta = delta;
2831      next = regnext(scan);
2832      scan = NEXTOPER(scan);
2833      if (code != BRANCH)
2834       scan = NEXTOPER(scan);
2835      if (flags & SCF_DO_STCLASS) {
2836       cl_init(pRExC_state, &this_class);
2837       data_fake.start_class = &this_class;
2838       f = SCF_DO_STCLASS_AND;
2839      }
2840      if (flags & SCF_WHILEM_VISITED_POS)
2841       f |= SCF_WHILEM_VISITED_POS;
2842
2843      /* we suppose the run is continuous, last=next...*/
2844      minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2845           next, &data_fake,
2846           stopparen, recursed, NULL, f,depth+1);
2847      if (min1 > minnext)
2848       min1 = minnext;
2849      if (max1 < minnext + deltanext)
2850       max1 = minnext + deltanext;
2851      if (deltanext == I32_MAX)
2852       is_inf = is_inf_internal = 1;
2853      scan = next;
2854      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2855       pars++;
2856      if (data_fake.flags & SCF_SEEN_ACCEPT) {
2857       if ( stopmin > minnext)
2858        stopmin = min + min1;
2859       flags &= ~SCF_DO_SUBSTR;
2860       if (data)
2861        data->flags |= SCF_SEEN_ACCEPT;
2862      }
2863      if (data) {
2864       if (data_fake.flags & SF_HAS_EVAL)
2865        data->flags |= SF_HAS_EVAL;
2866       data->whilem_c = data_fake.whilem_c;
2867      }
2868      if (flags & SCF_DO_STCLASS)
2869       cl_or(pRExC_state, &accum, &this_class);
2870     }
2871     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2872      min1 = 0;
2873     if (flags & SCF_DO_SUBSTR) {
2874      data->pos_min += min1;
2875      data->pos_delta += max1 - min1;
2876      if (max1 != min1 || is_inf)
2877       data->longest = &(data->longest_float);
2878     }
2879     min += min1;
2880     delta += max1 - min1;
2881     if (flags & SCF_DO_STCLASS_OR) {
2882      cl_or(pRExC_state, data->start_class, &accum);
2883      if (min1) {
2884       cl_and(data->start_class, and_withp);
2885       flags &= ~SCF_DO_STCLASS;
2886      }
2887     }
2888     else if (flags & SCF_DO_STCLASS_AND) {
2889      if (min1) {
2890       cl_and(data->start_class, &accum);
2891       flags &= ~SCF_DO_STCLASS;
2892      }
2893      else {
2894       /* Switch to OR mode: cache the old value of
2895       * data->start_class */
2896       INIT_AND_WITHP;
2897       StructCopy(data->start_class, and_withp,
2898         struct regnode_charclass_class);
2899       flags &= ~SCF_DO_STCLASS_AND;
2900       StructCopy(&accum, data->start_class,
2901         struct regnode_charclass_class);
2902       flags |= SCF_DO_STCLASS_OR;
2903       data->start_class->flags |= ANYOF_EOS;
2904      }
2905     }
2906
2907     if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2908     /* demq.
2909
2910     Assuming this was/is a branch we are dealing with: 'scan' now
2911     points at the item that follows the branch sequence, whatever
2912     it is. We now start at the beginning of the sequence and look
2913     for subsequences of
2914
2915     BRANCH->EXACT=>x1
2916     BRANCH->EXACT=>x2
2917     tail
2918
2919     which would be constructed from a pattern like /A|LIST|OF|WORDS/
2920
2921     If we can find such a subsequence we need to turn the first
2922     element into a trie and then add the subsequent branch exact
2923     strings to the trie.
2924
2925     We have two cases
2926
2927      1. patterns where the whole set of branches can be converted.
2928
2929      2. patterns where only a subset can be converted.
2930
2931     In case 1 we can replace the whole set with a single regop
2932     for the trie. In case 2 we need to keep the start and end
2933     branches so
2934
2935      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2936      becomes BRANCH TRIE; BRANCH X;
2937
2938     There is an additional case, that being where there is a
2939     common prefix, which gets split out into an EXACT like node
2940     preceding the TRIE node.
2941
2942     If x(1..n)==tail then we can do a simple trie, if not we make
2943     a "jump" trie, such that when we match the appropriate word
2944     we "jump" to the appropriate tail node. Essentially we turn
2945     a nested if into a case structure of sorts.
2946
2947     */
2948
2949      int made=0;
2950      if (!re_trie_maxbuff) {
2951       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2952       if (!SvIOK(re_trie_maxbuff))
2953        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2954      }
2955      if ( SvIV(re_trie_maxbuff)>=0  ) {
2956       regnode *cur;
2957       regnode *first = (regnode *)NULL;
2958       regnode *last = (regnode *)NULL;
2959       regnode *tail = scan;
2960       U8 optype = 0;
2961       U32 count=0;
2962
2963 #ifdef DEBUGGING
2964       SV * const mysv = sv_newmortal();       /* for dumping */
2965 #endif
2966       /* var tail is used because there may be a TAIL
2967       regop in the way. Ie, the exacts will point to the
2968       thing following the TAIL, but the last branch will
2969       point at the TAIL. So we advance tail. If we
2970       have nested (?:) we may have to move through several
2971       tails.
2972       */
2973
2974       while ( OP( tail ) == TAIL ) {
2975        /* this is the TAIL generated by (?:) */
2976        tail = regnext( tail );
2977       }
2978
2979
2980       DEBUG_OPTIMISE_r({
2981        regprop(RExC_rx, mysv, tail );
2982        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2983         (int)depth * 2 + 2, "",
2984         "Looking for TRIE'able sequences. Tail node is: ",
2985         SvPV_nolen_const( mysv )
2986        );
2987       });
2988
2989       /*
2990
2991       step through the branches, cur represents each
2992       branch, noper is the first thing to be matched
2993       as part of that branch and noper_next is the
2994       regnext() of that node. if noper is an EXACT
2995       and noper_next is the same as scan (our current
2996       position in the regex) then the EXACT branch is
2997       a possible optimization target. Once we have
2998       two or more consecutive such branches we can
2999       create a trie of the EXACT's contents and stich
3000       it in place. If the sequence represents all of
3001       the branches we eliminate the whole thing and
3002       replace it with a single TRIE. If it is a
3003       subsequence then we need to stitch it in. This
3004       means the first branch has to remain, and needs
3005       to be repointed at the item on the branch chain
3006       following the last branch optimized. This could
3007       be either a BRANCH, in which case the
3008       subsequence is internal, or it could be the
3009       item following the branch sequence in which
3010       case the subsequence is at the end.
3011
3012       */
3013
3014       /* dont use tail as the end marker for this traverse */
3015       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3016        regnode * const noper = NEXTOPER( cur );
3017 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3018        regnode * const noper_next = regnext( noper );
3019 #endif
3020
3021        DEBUG_OPTIMISE_r({
3022         regprop(RExC_rx, mysv, cur);
3023         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3024         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3025
3026         regprop(RExC_rx, mysv, noper);
3027         PerlIO_printf( Perl_debug_log, " -> %s",
3028          SvPV_nolen_const(mysv));
3029
3030         if ( noper_next ) {
3031         regprop(RExC_rx, mysv, noper_next );
3032         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3033          SvPV_nolen_const(mysv));
3034         }
3035         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3036         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3037        });
3038        if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3039           : PL_regkind[ OP( noper ) ] == EXACT )
3040         || OP(noper) == NOTHING )
3041 #ifdef NOJUMPTRIE
3042         && noper_next == tail
3043 #endif
3044         && count < U16_MAX)
3045        {
3046         count++;
3047         if ( !first || optype == NOTHING ) {
3048          if (!first) first = cur;
3049          optype = OP( noper );
3050         } else {
3051          last = cur;
3052         }
3053        } else {
3054 /*
3055  Currently we do not believe that the trie logic can
3056  handle case insensitive matching properly when the
3057  pattern is not unicode (thus forcing unicode semantics).
3058
3059  If/when this is fixed the following define can be swapped
3060  in below to fully enable trie logic.
3061
3062  XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
3063  not /aa
3064
3065 #define TRIE_TYPE_IS_SAFE 1
3066
3067 */
3068 #define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
3069
3070         if ( last && TRIE_TYPE_IS_SAFE ) {
3071          make_trie( pRExC_state,
3072            startbranch, first, cur, tail, count,
3073            optype, depth+1 );
3074         }
3075         if ( PL_regkind[ OP( noper ) ] == EXACT
3076 #ifdef NOJUMPTRIE
3077          && noper_next == tail
3078 #endif
3079         ){
3080          count = 1;
3081          first = cur;
3082          optype = OP( noper );
3083         } else {
3084          count = 0;
3085          first = NULL;
3086          optype = 0;
3087         }
3088         last = NULL;
3089        }
3090       }
3091       DEBUG_OPTIMISE_r({
3092        regprop(RExC_rx, mysv, cur);
3093        PerlIO_printf( Perl_debug_log,
3094        "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3095        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3096
3097       });
3098
3099       if ( last && TRIE_TYPE_IS_SAFE ) {
3100        made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3101 #ifdef TRIE_STUDY_OPT
3102        if ( ((made == MADE_EXACT_TRIE &&
3103         startbranch == first)
3104         || ( first_non_open == first )) &&
3105         depth==0 ) {
3106         flags |= SCF_TRIE_RESTUDY;
3107         if ( startbranch == first
3108          && scan == tail )
3109         {
3110          RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3111         }
3112        }
3113 #endif
3114       }
3115      }
3116
3117     } /* do trie */
3118
3119    }
3120    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3121     scan = NEXTOPER(NEXTOPER(scan));
3122    } else   /* single branch is optimized. */
3123     scan = NEXTOPER(scan);
3124    continue;
3125   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3126    scan_frame *newframe = NULL;
3127    I32 paren;
3128    regnode *start;
3129    regnode *end;
3130
3131    if (OP(scan) != SUSPEND) {
3132    /* set the pointer */
3133     if (OP(scan) == GOSUB) {
3134      paren = ARG(scan);
3135      RExC_recurse[ARG2L(scan)] = scan;
3136      start = RExC_open_parens[paren-1];
3137      end   = RExC_close_parens[paren-1];
3138     } else {
3139      paren = 0;
3140      start = RExC_rxi->program + 1;
3141      end   = RExC_opend;
3142     }
3143     if (!recursed) {
3144      Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3145      SAVEFREEPV(recursed);
3146     }
3147     if (!PAREN_TEST(recursed,paren+1)) {
3148      PAREN_SET(recursed,paren+1);
3149      Newx(newframe,1,scan_frame);
3150     } else {
3151      if (flags & SCF_DO_SUBSTR) {
3152       SCAN_COMMIT(pRExC_state,data,minlenp);
3153       data->longest = &(data->longest_float);
3154      }
3155      is_inf = is_inf_internal = 1;
3156      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3157       cl_anything(pRExC_state, data->start_class);
3158      flags &= ~SCF_DO_STCLASS;
3159     }
3160    } else {
3161     Newx(newframe,1,scan_frame);
3162     paren = stopparen;
3163     start = scan+2;
3164     end = regnext(scan);
3165    }
3166    if (newframe) {
3167     assert(start);
3168     assert(end);
3169     SAVEFREEPV(newframe);
3170     newframe->next = regnext(scan);
3171     newframe->last = last;
3172     newframe->stop = stopparen;
3173     newframe->prev = frame;
3174
3175     frame = newframe;
3176     scan =  start;
3177     stopparen = paren;
3178     last = end;
3179
3180     continue;
3181    }
3182   }
3183   else if (OP(scan) == EXACT) {
3184    I32 l = STR_LEN(scan);
3185    UV uc;
3186    if (UTF) {
3187     const U8 * const s = (U8*)STRING(scan);
3188     l = utf8_length(s, s + l);
3189     uc = utf8_to_uvchr(s, NULL);
3190    } else {
3191     uc = *((U8*)STRING(scan));
3192    }
3193    min += l;
3194    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3195     /* The code below prefers earlier match for fixed
3196     offset, later match for variable offset.  */
3197     if (data->last_end == -1) { /* Update the start info. */
3198      data->last_start_min = data->pos_min;
3199      data->last_start_max = is_inf
3200       ? I32_MAX : data->pos_min + data->pos_delta;
3201     }
3202     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3203     if (UTF)
3204      SvUTF8_on(data->last_found);
3205     {
3206      SV * const sv = data->last_found;
3207      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3208       mg_find(sv, PERL_MAGIC_utf8) : NULL;
3209      if (mg && mg->mg_len >= 0)
3210       mg->mg_len += utf8_length((U8*)STRING(scan),
3211             (U8*)STRING(scan)+STR_LEN(scan));
3212     }
3213     data->last_end = data->pos_min + l;
3214     data->pos_min += l; /* As in the first entry. */
3215     data->flags &= ~SF_BEFORE_EOL;
3216    }
3217    if (flags & SCF_DO_STCLASS_AND) {
3218     /* Check whether it is compatible with what we know already! */
3219     int compat = 1;
3220
3221
3222     /* If compatible, we or it in below.  It is compatible if is
3223     * in the bitmp and either 1) its bit or its fold is set, or 2)
3224     * it's for a locale.  Even if there isn't unicode semantics
3225     * here, at runtime there may be because of matching against a
3226     * utf8 string, so accept a possible false positive for
3227     * latin1-range folds */
3228     if (uc >= 0x100 ||
3229      (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3230      && !ANYOF_BITMAP_TEST(data->start_class, uc)
3231      && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3232       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3233      )
3234     {
3235      compat = 0;
3236     }
3237     ANYOF_CLASS_ZERO(data->start_class);
3238     ANYOF_BITMAP_ZERO(data->start_class);
3239     if (compat)
3240      ANYOF_BITMAP_SET(data->start_class, uc);
3241     else if (uc >= 0x100) {
3242      int i;
3243
3244      /* Some Unicode code points fold to the Latin1 range; as
3245      * XXX temporary code, instead of figuring out if this is
3246      * one, just assume it is and set all the start class bits
3247      * that could be some such above 255 code point's fold
3248      * which will generate fals positives.  As the code
3249      * elsewhere that does compute the fold settles down, it
3250      * can be extracted out and re-used here */
3251      for (i = 0; i < 256; i++){
3252       if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3253        ANYOF_BITMAP_SET(data->start_class, i);
3254       }
3255      }
3256     }
3257     data->start_class->flags &= ~ANYOF_EOS;
3258     if (uc < 0x100)
3259     data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3260    }
3261    else if (flags & SCF_DO_STCLASS_OR) {
3262     /* false positive possible if the class is case-folded */
3263     if (uc < 0x100)
3264      ANYOF_BITMAP_SET(data->start_class, uc);
3265     else
3266      data->start_class->flags |= ANYOF_UNICODE_ALL;
3267     data->start_class->flags &= ~ANYOF_EOS;
3268     cl_and(data->start_class, and_withp);
3269    }
3270    flags &= ~SCF_DO_STCLASS;
3271   }
3272   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3273    I32 l = STR_LEN(scan);
3274    UV uc = *((U8*)STRING(scan));
3275
3276    /* Search for fixed substrings supports EXACT only. */
3277    if (flags & SCF_DO_SUBSTR) {
3278     assert(data);
3279     SCAN_COMMIT(pRExC_state, data, minlenp);
3280    }
3281    if (UTF) {
3282     const U8 * const s = (U8 *)STRING(scan);
3283     l = utf8_length(s, s + l);
3284     uc = utf8_to_uvchr(s, NULL);
3285    }
3286    min += l;
3287    if (flags & SCF_DO_SUBSTR)
3288     data->pos_min += l;
3289    if (flags & SCF_DO_STCLASS_AND) {
3290     /* Check whether it is compatible with what we know already! */
3291     int compat = 1;
3292     if (uc >= 0x100 ||
3293     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3294     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3295     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3296     {
3297      compat = 0;
3298     }
3299     ANYOF_CLASS_ZERO(data->start_class);
3300     ANYOF_BITMAP_ZERO(data->start_class);
3301     if (compat) {
3302      ANYOF_BITMAP_SET(data->start_class, uc);
3303      data->start_class->flags &= ~ANYOF_EOS;
3304      data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3305      if (OP(scan) == EXACTFL) {
3306       /* XXX This set is probably no longer necessary, and
3307       * probably wrong as LOCALE now is on in the initial
3308       * state */
3309       data->start_class->flags |= ANYOF_LOCALE;
3310      }
3311      else {
3312
3313       /* Also set the other member of the fold pair.  In case
3314       * that unicode semantics is called for at runtime, use
3315       * the full latin1 fold.  (Can't do this for locale,
3316       * because not known until runtime */
3317       ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3318      }
3319     }
3320     else if (uc >= 0x100) {
3321      int i;
3322      for (i = 0; i < 256; i++){
3323       if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3324        ANYOF_BITMAP_SET(data->start_class, i);
3325       }
3326      }
3327     }
3328    }
3329    else if (flags & SCF_DO_STCLASS_OR) {
3330     if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3331      /* false positive possible if the class is case-folded.
3332      Assume that the locale settings are the same... */
3333      if (uc < 0x100) {
3334       ANYOF_BITMAP_SET(data->start_class, uc);
3335       if (OP(scan) != EXACTFL) {
3336
3337        /* And set the other member of the fold pair, but
3338        * can't do that in locale because not known until
3339        * run-time */
3340        ANYOF_BITMAP_SET(data->start_class,
3341            PL_fold_latin1[uc]);
3342       }
3343      }
3344      data->start_class->flags &= ~ANYOF_EOS;
3345     }
3346     cl_and(data->start_class, and_withp);
3347    }
3348    flags &= ~SCF_DO_STCLASS;
3349   }
3350   else if (REGNODE_VARIES(OP(scan))) {
3351    I32 mincount, maxcount, minnext, deltanext, fl = 0;
3352    I32 f = flags, pos_before = 0;
3353    regnode * const oscan = scan;
3354    struct regnode_charclass_class this_class;
3355    struct regnode_charclass_class *oclass = NULL;
3356    I32 next_is_eval = 0;
3357
3358    switch (PL_regkind[OP(scan)]) {
3359    case WHILEM:  /* End of (?:...)* . */
3360     scan = NEXTOPER(scan);
3361     goto finish;
3362    case PLUS:
3363     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3364      next = NEXTOPER(scan);
3365      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3366       mincount = 1;
3367       maxcount = REG_INFTY;
3368       next = regnext(scan);
3369       scan = NEXTOPER(scan);
3370       goto do_curly;
3371      }
3372     }
3373     if (flags & SCF_DO_SUBSTR)
3374      data->pos_min++;
3375     min++;
3376     /* Fall through. */
3377    case STAR:
3378     if (flags & SCF_DO_STCLASS) {
3379      mincount = 0;
3380      maxcount = REG_INFTY;
3381      next = regnext(scan);
3382      scan = NEXTOPER(scan);
3383      goto do_curly;
3384     }
3385     is_inf = is_inf_internal = 1;
3386     scan = regnext(scan);
3387     if (flags & SCF_DO_SUBSTR) {
3388      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3389      data->longest = &(data->longest_float);
3390     }
3391     goto optimize_curly_tail;
3392    case CURLY:
3393     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3394      && (scan->flags == stopparen))
3395     {
3396      mincount = 1;
3397      maxcount = 1;
3398     } else {
3399      mincount = ARG1(scan);
3400      maxcount = ARG2(scan);
3401     }
3402     next = regnext(scan);
3403     if (OP(scan) == CURLYX) {
3404      I32 lp = (data ? *(data->last_closep) : 0);
3405      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3406     }
3407     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3408     next_is_eval = (OP(scan) == EVAL);
3409    do_curly:
3410     if (flags & SCF_DO_SUBSTR) {
3411      if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3412      pos_before = data->pos_min;
3413     }
3414     if (data) {
3415      fl = data->flags;
3416      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3417      if (is_inf)
3418       data->flags |= SF_IS_INF;
3419     }
3420     if (flags & SCF_DO_STCLASS) {
3421      cl_init(pRExC_state, &this_class);
3422      oclass = data->start_class;
3423      data->start_class = &this_class;
3424      f |= SCF_DO_STCLASS_AND;
3425      f &= ~SCF_DO_STCLASS_OR;
3426     }
3427     /* Exclude from super-linear cache processing any {n,m}
3428     regops for which the combination of input pos and regex
3429     pos is not enough information to determine if a match
3430     will be possible.
3431
3432     For example, in the regex /foo(bar\s*){4,8}baz/ with the
3433     regex pos at the \s*, the prospects for a match depend not
3434     only on the input position but also on how many (bar\s*)
3435     repeats into the {4,8} we are. */
3436    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3437      f &= ~SCF_WHILEM_VISITED_POS;
3438
3439     /* This will finish on WHILEM, setting scan, or on NULL: */
3440     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3441          last, data, stopparen, recursed, NULL,
3442          (mincount == 0
3443           ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3444
3445     if (flags & SCF_DO_STCLASS)
3446      data->start_class = oclass;
3447     if (mincount == 0 || minnext == 0) {
3448      if (flags & SCF_DO_STCLASS_OR) {
3449       cl_or(pRExC_state, data->start_class, &this_class);
3450      }
3451      else if (flags & SCF_DO_STCLASS_AND) {
3452       /* Switch to OR mode: cache the old value of
3453       * data->start_class */
3454       INIT_AND_WITHP;
3455       StructCopy(data->start_class, and_withp,
3456         struct regnode_charclass_class);
3457       flags &= ~SCF_DO_STCLASS_AND;
3458       StructCopy(&this_class, data->start_class,
3459         struct regnode_charclass_class);
3460       flags |= SCF_DO_STCLASS_OR;
3461       data->start_class->flags |= ANYOF_EOS;
3462      }
3463     } else {  /* Non-zero len */
3464      if (flags & SCF_DO_STCLASS_OR) {
3465       cl_or(pRExC_state, data->start_class, &this_class);
3466       cl_and(data->start_class, and_withp);
3467      }
3468      else if (flags & SCF_DO_STCLASS_AND)
3469       cl_and(data->start_class, &this_class);
3470      flags &= ~SCF_DO_STCLASS;
3471     }
3472     if (!scan)   /* It was not CURLYX, but CURLY. */
3473      scan = next;
3474     if ( /* ? quantifier ok, except for (?{ ... }) */
3475      (next_is_eval || !(mincount == 0 && maxcount == 1))
3476      && (minnext == 0) && (deltanext == 0)
3477      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3478      && maxcount <= REG_INFTY/3) /* Complement check for big count */
3479     {
3480      ckWARNreg(RExC_parse,
3481        "Quantifier unexpected on zero-length expression");
3482     }
3483
3484     min += minnext * mincount;
3485     is_inf_internal |= ((maxcount == REG_INFTY
3486          && (minnext + deltanext) > 0)
3487          || deltanext == I32_MAX);
3488     is_inf |= is_inf_internal;
3489     delta += (minnext + deltanext) * maxcount - minnext * mincount;
3490
3491     /* Try powerful optimization CURLYX => CURLYN. */
3492     if (  OP(oscan) == CURLYX && data
3493      && data->flags & SF_IN_PAR
3494      && !(data->flags & SF_HAS_EVAL)
3495      && !deltanext && minnext == 1 ) {
3496      /* Try to optimize to CURLYN.  */
3497      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3498      regnode * const nxt1 = nxt;
3499 #ifdef DEBUGGING
3500      regnode *nxt2;
3501 #endif
3502
3503      /* Skip open. */
3504      nxt = regnext(nxt);
3505      if (!REGNODE_SIMPLE(OP(nxt))
3506       && !(PL_regkind[OP(nxt)] == EXACT
3507        && STR_LEN(nxt) == 1))
3508       goto nogo;
3509 #ifdef DEBUGGING
3510      nxt2 = nxt;
3511 #endif
3512      nxt = regnext(nxt);
3513      if (OP(nxt) != CLOSE)
3514       goto nogo;
3515      if (RExC_open_parens) {
3516       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3517       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3518      }
3519      /* Now we know that nxt2 is the only contents: */
3520      oscan->flags = (U8)ARG(nxt);
3521      OP(oscan) = CURLYN;
3522      OP(nxt1) = NOTHING; /* was OPEN. */
3523
3524 #ifdef DEBUGGING
3525      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3526      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3527      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3528      OP(nxt) = OPTIMIZED; /* was CLOSE. */
3529      OP(nxt + 1) = OPTIMIZED; /* was count. */
3530      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3531 #endif
3532     }
3533    nogo:
3534
3535     /* Try optimization CURLYX => CURLYM. */
3536     if (  OP(oscan) == CURLYX && data
3537      && !(data->flags & SF_HAS_PAR)
3538      && !(data->flags & SF_HAS_EVAL)
3539      && !deltanext /* atom is fixed width */
3540      && minnext != 0 /* CURLYM can't handle zero width */
3541     ) {
3542      /* XXXX How to optimize if data == 0? */
3543      /* Optimize to a simpler form.  */
3544      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3545      regnode *nxt2;
3546
3547      OP(oscan) = CURLYM;
3548      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3549        && (OP(nxt2) != WHILEM))
3550       nxt = nxt2;
3551      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3552      /* Need to optimize away parenths. */
3553      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3554       /* Set the parenth number.  */
3555       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3556
3557       oscan->flags = (U8)ARG(nxt);
3558       if (RExC_open_parens) {
3559        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3560        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3561       }
3562       OP(nxt1) = OPTIMIZED; /* was OPEN. */
3563       OP(nxt) = OPTIMIZED; /* was CLOSE. */
3564
3565 #ifdef DEBUGGING
3566       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3567       OP(nxt + 1) = OPTIMIZED; /* was count. */
3568       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3569       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3570 #endif
3571 #if 0
3572       while ( nxt1 && (OP(nxt1) != WHILEM)) {
3573        regnode *nnxt = regnext(nxt1);
3574        if (nnxt == nxt) {
3575         if (reg_off_by_arg[OP(nxt1)])
3576          ARG_SET(nxt1, nxt2 - nxt1);
3577         else if (nxt2 - nxt1 < U16_MAX)
3578          NEXT_OFF(nxt1) = nxt2 - nxt1;
3579         else
3580          OP(nxt) = NOTHING; /* Cannot beautify */
3581        }
3582        nxt1 = nnxt;
3583       }
3584 #endif
3585       /* Optimize again: */
3586       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3587          NULL, stopparen, recursed, NULL, 0,depth+1);
3588      }
3589      else
3590       oscan->flags = 0;
3591     }
3592     else if ((OP(oscan) == CURLYX)
3593       && (flags & SCF_WHILEM_VISITED_POS)
3594       /* See the comment on a similar expression above.
3595        However, this time it's not a subexpression
3596        we care about, but the expression itself. */
3597       && (maxcount == REG_INFTY)
3598       && data && ++data->whilem_c < 16) {
3599      /* This stays as CURLYX, we can put the count/of pair. */
3600      /* Find WHILEM (as in regexec.c) */
3601      regnode *nxt = oscan + NEXT_OFF(oscan);
3602
3603      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3604       nxt += ARG(nxt);
3605      PREVOPER(nxt)->flags = (U8)(data->whilem_c
3606       | (RExC_whilem_seen << 4)); /* On WHILEM */
3607     }
3608     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3609      pars++;
3610     if (flags & SCF_DO_SUBSTR) {
3611      SV *last_str = NULL;
3612      int counted = mincount != 0;
3613
3614      if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3615 #if defined(SPARC64_GCC_WORKAROUND)
3616       I32 b = 0;
3617       STRLEN l = 0;
3618       const char *s = NULL;
3619       I32 old = 0;
3620
3621       if (pos_before >= data->last_start_min)
3622        b = pos_before;
3623       else
3624        b = data->last_start_min;
3625
3626       l = 0;
3627       s = SvPV_const(data->last_found, l);
3628       old = b - data->last_start_min;
3629
3630 #else
3631       I32 b = pos_before >= data->last_start_min
3632        ? pos_before : data->last_start_min;
3633       STRLEN l;
3634       const char * const s = SvPV_const(data->last_found, l);
3635       I32 old = b - data->last_start_min;
3636 #endif
3637
3638       if (UTF)
3639        old = utf8_hop((U8*)s, old) - (U8*)s;
3640       l -= old;
3641       /* Get the added string: */
3642       last_str = newSVpvn_utf8(s  + old, l, UTF);
3643       if (deltanext == 0 && pos_before == b) {
3644        /* What was added is a constant string */
3645        if (mincount > 1) {
3646         SvGROW(last_str, (mincount * l) + 1);
3647         repeatcpy(SvPVX(last_str) + l,
3648           SvPVX_const(last_str), l, mincount - 1);
3649         SvCUR_set(last_str, SvCUR(last_str) * mincount);
3650         /* Add additional parts. */
3651         SvCUR_set(data->last_found,
3652           SvCUR(data->last_found) - l);
3653         sv_catsv(data->last_found, last_str);
3654         {
3655          SV * sv = data->last_found;
3656          MAGIC *mg =
3657           SvUTF8(sv) && SvMAGICAL(sv) ?
3658           mg_find(sv, PERL_MAGIC_utf8) : NULL;
3659          if (mg && mg->mg_len >= 0)
3660           mg->mg_len += CHR_SVLEN(last_str) - l;
3661         }
3662         data->last_end += l * (mincount - 1);
3663        }
3664       } else {
3665        /* start offset must point into the last copy */
3666        data->last_start_min += minnext * (mincount - 1);
3667        data->last_start_max += is_inf ? I32_MAX
3668         : (maxcount - 1) * (minnext + data->pos_delta);
3669       }
3670      }
3671      /* It is counted once already... */
3672      data->pos_min += minnext * (mincount - counted);
3673      data->pos_delta += - counted * deltanext +
3674       (minnext + deltanext) * maxcount - minnext * mincount;
3675      if (mincount != maxcount) {
3676       /* Cannot extend fixed substrings found inside
3677        the group.  */
3678       SCAN_COMMIT(pRExC_state,data,minlenp);
3679       if (mincount && last_str) {
3680        SV * const sv = data->last_found;
3681        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3682         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3683
3684        if (mg)
3685         mg->mg_len = -1;
3686        sv_setsv(sv, last_str);
3687        data->last_end = data->pos_min;
3688        data->last_start_min =
3689         data->pos_min - CHR_SVLEN(last_str);
3690        data->last_start_max = is_inf
3691         ? I32_MAX
3692         : data->pos_min + data->pos_delta
3693         - CHR_SVLEN(last_str);
3694       }
3695       data->longest = &(data->longest_float);
3696      }
3697      SvREFCNT_dec(last_str);
3698     }
3699     if (data && (fl & SF_HAS_EVAL))
3700      data->flags |= SF_HAS_EVAL;
3701    optimize_curly_tail:
3702     if (OP(oscan) != CURLYX) {
3703      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3704       && NEXT_OFF(next))
3705       NEXT_OFF(oscan) += NEXT_OFF(next);
3706     }
3707     continue;
3708    default:   /* REF, ANYOFV, and CLUMP only? */
3709     if (flags & SCF_DO_SUBSTR) {
3710      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3711      data->longest = &(data->longest_float);
3712     }
3713     is_inf = is_inf_internal = 1;
3714     if (flags & SCF_DO_STCLASS_OR)
3715      cl_anything(pRExC_state, data->start_class);
3716     flags &= ~SCF_DO_STCLASS;
3717     break;
3718    }
3719   }
3720   else if (OP(scan) == LNBREAK) {
3721    if (flags & SCF_DO_STCLASS) {
3722     int value = 0;
3723     data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3724      if (flags & SCF_DO_STCLASS_AND) {
3725      for (value = 0; value < 256; value++)
3726       if (!is_VERTWS_cp(value))
3727        ANYOF_BITMAP_CLEAR(data->start_class, value);
3728     }
3729     else {
3730      for (value = 0; value < 256; value++)
3731       if (is_VERTWS_cp(value))
3732        ANYOF_BITMAP_SET(data->start_class, value);
3733     }
3734     if (flags & SCF_DO_STCLASS_OR)
3735      cl_and(data->start_class, and_withp);
3736     flags &= ~SCF_DO_STCLASS;
3737    }
3738    min += 1;
3739    delta += 1;
3740    if (flags & SCF_DO_SUBSTR) {
3741      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3742      data->pos_min += 1;
3743     data->pos_delta += 1;
3744     data->longest = &(data->longest_float);
3745     }
3746   }
3747   else if (OP(scan) == FOLDCHAR) {
3748    int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3749    flags &= ~SCF_DO_STCLASS;
3750    min += 1;
3751    delta += d;
3752    if (flags & SCF_DO_SUBSTR) {
3753     SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3754     data->pos_min += 1;
3755     data->pos_delta += d;
3756     data->longest = &(data->longest_float);
3757    }
3758   }
3759   else if (REGNODE_SIMPLE(OP(scan))) {
3760    int value = 0;
3761
3762    if (flags & SCF_DO_SUBSTR) {
3763     SCAN_COMMIT(pRExC_state,data,minlenp);
3764     data->pos_min++;
3765    }
3766    min++;
3767    if (flags & SCF_DO_STCLASS) {
3768     data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3769
3770     /* Some of the logic below assumes that switching
3771     locale on will only add false positives. */
3772     switch (PL_regkind[OP(scan)]) {
3773     case SANY:
3774     default:
3775     do_default:
3776      /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3777      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3778       cl_anything(pRExC_state, data->start_class);
3779      break;
3780     case REG_ANY:
3781      if (OP(scan) == SANY)
3782       goto do_default;
3783      if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3784       value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3785         || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3786       cl_anything(pRExC_state, data->start_class);
3787      }
3788      if (flags & SCF_DO_STCLASS_AND || !value)
3789       ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3790      break;
3791     case ANYOF:
3792      if (flags & SCF_DO_STCLASS_AND)
3793       cl_and(data->start_class,
3794        (struct regnode_charclass_class*)scan);
3795      else
3796       cl_or(pRExC_state, data->start_class,
3797        (struct regnode_charclass_class*)scan);
3798      break;
3799     case ALNUM:
3800      if (flags & SCF_DO_STCLASS_AND) {
3801       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3802        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3803        if (OP(scan) == ALNUMU) {
3804         for (value = 0; value < 256; value++) {
3805          if (!isWORDCHAR_L1(value)) {
3806           ANYOF_BITMAP_CLEAR(data->start_class, value);
3807          }
3808         }
3809        } else {
3810         for (value = 0; value < 256; value++) {
3811          if (!isALNUM(value)) {
3812           ANYOF_BITMAP_CLEAR(data->start_class, value);
3813          }
3814         }
3815        }
3816       }
3817      }
3818      else {
3819       if (data->start_class->flags & ANYOF_LOCALE)
3820        ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3821
3822       /* Even if under locale, set the bits for non-locale
3823       * in case it isn't a true locale-node.  This will
3824       * create false positives if it truly is locale */
3825       if (OP(scan) == ALNUMU) {
3826        for (value = 0; value < 256; value++) {
3827         if (isWORDCHAR_L1(value)) {
3828          ANYOF_BITMAP_SET(data->start_class, value);
3829         }
3830        }
3831       } else {
3832        for (value = 0; value < 256; value++) {
3833         if (isALNUM(value)) {
3834          ANYOF_BITMAP_SET(data->start_class, value);
3835         }
3836        }
3837       }
3838      }
3839      break;
3840     case NALNUM:
3841      if (flags & SCF_DO_STCLASS_AND) {
3842       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3843        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3844        if (OP(scan) == NALNUMU) {
3845         for (value = 0; value < 256; value++) {
3846          if (isWORDCHAR_L1(value)) {
3847           ANYOF_BITMAP_CLEAR(data->start_class, value);
3848          }
3849         }
3850        } else {
3851         for (value = 0; value < 256; value++) {
3852          if (isALNUM(value)) {
3853           ANYOF_BITMAP_CLEAR(data->start_class, value);
3854          }
3855         }
3856        }
3857       }
3858      }
3859      else {
3860       if (data->start_class->flags & ANYOF_LOCALE)
3861        ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3862
3863       /* Even if under locale, set the bits for non-locale in
3864       * case it isn't a true locale-node.  This will create
3865       * false positives if it truly is locale */
3866       if (OP(scan) == NALNUMU) {
3867        for (value = 0; value < 256; value++) {
3868         if (! isWORDCHAR_L1(value)) {
3869          ANYOF_BITMAP_SET(data->start_class, value);
3870         }
3871        }
3872       } else {
3873        for (value = 0; value < 256; value++) {
3874         if (! isALNUM(value)) {
3875          ANYOF_BITMAP_SET(data->start_class, value);
3876         }
3877        }
3878       }
3879      }
3880      break;
3881     case SPACE:
3882      if (flags & SCF_DO_STCLASS_AND) {
3883       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3884        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3885        if (OP(scan) == SPACEU) {
3886         for (value = 0; value < 256; value++) {
3887          if (!isSPACE_L1(value)) {
3888           ANYOF_BITMAP_CLEAR(data->start_class, value);
3889          }
3890         }
3891        } else {
3892         for (value = 0; value < 256; value++) {
3893          if (!isSPACE(value)) {
3894           ANYOF_BITMAP_CLEAR(data->start_class, value);
3895          }
3896         }
3897        }
3898       }
3899      }
3900      else {
3901       if (data->start_class->flags & ANYOF_LOCALE) {
3902        ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3903       }
3904       if (OP(scan) == SPACEU) {
3905        for (value = 0; value < 256; value++) {
3906         if (isSPACE_L1(value)) {
3907          ANYOF_BITMAP_SET(data->start_class, value);
3908         }
3909        }
3910       } else {
3911        for (value = 0; value < 256; value++) {
3912         if (isSPACE(value)) {
3913          ANYOF_BITMAP_SET(data->start_class, value);
3914         }
3915        }
3916       }
3917      }
3918      break;
3919     case NSPACE:
3920      if (flags & SCF_DO_STCLASS_AND) {
3921       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3922        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3923        if (OP(scan) == NSPACEU) {
3924         for (value = 0; value < 256; value++) {
3925          if (isSPACE_L1(value)) {
3926           ANYOF_BITMAP_CLEAR(data->start_class, value);
3927          }
3928         }
3929        } else {
3930         for (value = 0; value < 256; value++) {
3931          if (isSPACE(value)) {
3932           ANYOF_BITMAP_CLEAR(data->start_class, value);
3933          }
3934         }
3935        }
3936       }
3937      }
3938      else {
3939       if (data->start_class->flags & ANYOF_LOCALE)
3940        ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3941       if (OP(scan) == NSPACEU) {
3942        for (value = 0; value < 256; value++) {
3943         if (!isSPACE_L1(value)) {
3944          ANYOF_BITMAP_SET(data->start_class, value);
3945         }
3946        }
3947       }
3948       else {
3949        for (value = 0; value < 256; value++) {
3950         if (!isSPACE(value)) {
3951          ANYOF_BITMAP_SET(data->start_class, value);
3952         }
3953        }
3954       }
3955      }
3956      break;
3957     case DIGIT:
3958      if (flags & SCF_DO_STCLASS_AND) {
3959       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3960        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3961        for (value = 0; value < 256; value++)
3962         if (!isDIGIT(value))
3963          ANYOF_BITMAP_CLEAR(data->start_class, value);
3964       }
3965      }
3966      else {
3967       if (data->start_class->flags & ANYOF_LOCALE)
3968        ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3969       for (value = 0; value < 256; value++)
3970        if (isDIGIT(value))
3971         ANYOF_BITMAP_SET(data->start_class, value);
3972      }
3973      break;
3974     case NDIGIT:
3975      if (flags & SCF_DO_STCLASS_AND) {
3976       if (!(data->start_class->flags & ANYOF_LOCALE))
3977        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3978       for (value = 0; value < 256; value++)
3979        if (isDIGIT(value))
3980         ANYOF_BITMAP_CLEAR(data->start_class, value);
3981      }
3982      else {
3983       if (data->start_class->flags & ANYOF_LOCALE)
3984        ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3985       for (value = 0; value < 256; value++)
3986        if (!isDIGIT(value))
3987         ANYOF_BITMAP_SET(data->start_class, value);
3988      }
3989      break;
3990     CASE_SYNST_FNC(VERTWS);
3991     CASE_SYNST_FNC(HORIZWS);
3992
3993     }
3994     if (flags & SCF_DO_STCLASS_OR)
3995      cl_and(data->start_class, and_withp);
3996     flags &= ~SCF_DO_STCLASS;
3997    }
3998   }
3999   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4000    data->flags |= (OP(scan) == MEOL
4001        ? SF_BEFORE_MEOL
4002        : SF_BEFORE_SEOL);
4003   }
4004   else if (  PL_regkind[OP(scan)] == BRANCHJ
4005     /* Lookbehind, or need to calculate parens/evals/stclass: */
4006     && (scan->flags || data || (flags & SCF_DO_STCLASS))
4007     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4008    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4009     || OP(scan) == UNLESSM )
4010    {
4011     /* Negative Lookahead/lookbehind
4012     In this case we can't do fixed string optimisation.
4013     */
4014
4015     I32 deltanext, minnext, fake = 0;
4016     regnode *nscan;
4017     struct regnode_charclass_class intrnl;
4018     int f = 0;
4019
4020     data_fake.flags = 0;
4021     if (data) {
4022      data_fake.whilem_c = data->whilem_c;
4023      data_fake.last_closep = data->last_closep;
4024     }
4025     else
4026      data_fake.last_closep = &fake;
4027     data_fake.pos_delta = delta;
4028     if ( flags & SCF_DO_STCLASS && !scan->flags
4029      && OP(scan) == IFMATCH ) { /* Lookahead */
4030      cl_init(pRExC_state, &intrnl);
4031      data_fake.start_class = &intrnl;
4032      f |= SCF_DO_STCLASS_AND;
4033     }
4034     if (flags & SCF_WHILEM_VISITED_POS)
4035      f |= SCF_WHILEM_VISITED_POS;
4036     next = regnext(scan);
4037     nscan = NEXTOPER(NEXTOPER(scan));
4038     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4039      last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4040     if (scan->flags) {
4041      if (deltanext) {
4042       FAIL("Variable length lookbehind not implemented");
4043      }
4044      else if (minnext > (I32)U8_MAX) {
4045       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4046      }
4047      scan->flags = (U8)minnext;
4048     }
4049     if (data) {
4050      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4051       pars++;
4052      if (data_fake.flags & SF_HAS_EVAL)
4053       data->flags |= SF_HAS_EVAL;
4054      data->whilem_c = data_fake.whilem_c;
4055     }
4056     if (f & SCF_DO_STCLASS_AND) {
4057      if (flags & SCF_DO_STCLASS_OR) {
4058       /* OR before, AND after: ideally we would recurse with
4059       * data_fake to get the AND applied by study of the
4060       * remainder of the pattern, and then derecurse;
4061       * *** HACK *** for now just treat as "no information".
4062       * See [perl #56690].
4063       */
4064       cl_init(pRExC_state, data->start_class);
4065      }  else {
4066       /* AND before and after: combine and continue */
4067       const int was = (data->start_class->flags & ANYOF_EOS);
4068
4069       cl_and(data->start_class, &intrnl);
4070       if (was)
4071        data->start_class->flags |= ANYOF_EOS;
4072      }
4073     }
4074    }
4075 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4076    else {
4077     /* Positive Lookahead/lookbehind
4078     In this case we can do fixed string optimisation,
4079     but we must be careful about it. Note in the case of
4080     lookbehind the positions will be offset by the minimum
4081     length of the pattern, something we won't know about
4082     until after the recurse.
4083     */
4084     I32 deltanext, fake = 0;
4085     regnode *nscan;
4086     struct regnode_charclass_class intrnl;
4087     int f = 0;
4088     /* We use SAVEFREEPV so that when the full compile
4089      is finished perl will clean up the allocated
4090      minlens when it's all done. This way we don't
4091      have to worry about freeing them when we know
4092      they wont be used, which would be a pain.
4093     */
4094     I32 *minnextp;
4095     Newx( minnextp, 1, I32 );
4096     SAVEFREEPV(minnextp);
4097
4098     if (data) {
4099      StructCopy(data, &data_fake, scan_data_t);
4100      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4101       f |= SCF_DO_SUBSTR;
4102       if (scan->flags)
4103        SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4104       data_fake.last_found=newSVsv(data->last_found);
4105      }
4106     }
4107     else
4108      data_fake.last_closep = &fake;
4109     data_fake.flags = 0;
4110     data_fake.pos_delta = delta;
4111     if (is_inf)
4112      data_fake.flags |= SF_IS_INF;
4113     if ( flags & SCF_DO_STCLASS && !scan->flags
4114      && OP(scan) == IFMATCH ) { /* Lookahead */
4115      cl_init(pRExC_state, &intrnl);
4116      data_fake.start_class = &intrnl;
4117      f |= SCF_DO_STCLASS_AND;
4118     }
4119     if (flags & SCF_WHILEM_VISITED_POS)
4120      f |= SCF_WHILEM_VISITED_POS;
4121     next = regnext(scan);
4122     nscan = NEXTOPER(NEXTOPER(scan));
4123
4124     *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4125      last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4126     if (scan->flags) {
4127      if (deltanext) {
4128       FAIL("Variable length lookbehind not implemented");
4129      }
4130      else if (*minnextp > (I32)U8_MAX) {
4131       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4132      }
4133      scan->flags = (U8)*minnextp;
4134     }
4135
4136     *minnextp += min;
4137
4138     if (f & SCF_DO_STCLASS_AND) {
4139      const int was = (data->start_class->flags & ANYOF_EOS);
4140
4141      cl_and(data->start_class, &intrnl);
4142      if (was)
4143       data->start_class->flags |= ANYOF_EOS;
4144     }
4145     if (data) {
4146      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4147       pars++;
4148      if (data_fake.flags & SF_HAS_EVAL)
4149       data->flags |= SF_HAS_EVAL;
4150      data->whilem_c = data_fake.whilem_c;
4151      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4152       if (RExC_rx->minlen<*minnextp)
4153        RExC_rx->minlen=*minnextp;
4154       SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4155       SvREFCNT_dec(data_fake.last_found);
4156
4157       if ( data_fake.minlen_fixed != minlenp )
4158       {
4159        data->offset_fixed= data_fake.offset_fixed;
4160        data->minlen_fixed= data_fake.minlen_fixed;
4161        data->lookbehind_fixed+= scan->flags;
4162       }
4163       if ( data_fake.minlen_float != minlenp )
4164       {
4165        data->minlen_float= data_fake.minlen_float;
4166        data->offset_float_min=data_fake.offset_float_min;
4167        data->offset_float_max=data_fake.offset_float_max;
4168        data->lookbehind_float+= scan->flags;
4169       }
4170      }
4171     }
4172
4173
4174    }
4175 #endif
4176   }
4177   else if (OP(scan) == OPEN) {
4178    if (stopparen != (I32)ARG(scan))
4179     pars++;
4180   }
4181   else if (OP(scan) == CLOSE) {
4182    if (stopparen == (I32)ARG(scan)) {
4183     break;
4184    }
4185    if ((I32)ARG(scan) == is_par) {
4186     next = regnext(scan);
4187
4188     if ( next && (OP(next) != WHILEM) && next < last)
4189      is_par = 0;  /* Disable optimization */
4190    }
4191    if (data)
4192     *(data->last_closep) = ARG(scan);
4193   }
4194   else if (OP(scan) == EVAL) {
4195     if (data)
4196      data->flags |= SF_HAS_EVAL;
4197   }
4198   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4199    if (flags & SCF_DO_SUBSTR) {
4200     SCAN_COMMIT(pRExC_state,data,minlenp);
4201     flags &= ~SCF_DO_SUBSTR;
4202    }
4203    if (data && OP(scan)==ACCEPT) {
4204     data->flags |= SCF_SEEN_ACCEPT;
4205     if (stopmin > min)
4206      stopmin = min;
4207    }
4208   }
4209   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4210   {
4211     if (flags & SCF_DO_SUBSTR) {
4212      SCAN_COMMIT(pRExC_state,data,minlenp);
4213      data->longest = &(data->longest_float);
4214     }
4215     is_inf = is_inf_internal = 1;
4216     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4217      cl_anything(pRExC_state, data->start_class);
4218     flags &= ~SCF_DO_STCLASS;
4219   }
4220   else if (OP(scan) == GPOS) {
4221    if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4222     !(delta || is_inf || (data && data->pos_delta)))
4223    {
4224     if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4225      RExC_rx->extflags |= RXf_ANCH_GPOS;
4226     if (RExC_rx->gofs < (U32)min)
4227      RExC_rx->gofs = min;
4228    } else {
4229     RExC_rx->extflags |= RXf_GPOS_FLOAT;
4230     RExC_rx->gofs = 0;
4231    }
4232   }
4233 #ifdef TRIE_STUDY_OPT
4234 #ifdef FULL_TRIE_STUDY
4235   else if (PL_regkind[OP(scan)] == TRIE) {
4236    /* NOTE - There is similar code to this block above for handling
4237    BRANCH nodes on the initial study.  If you change stuff here
4238    check there too. */
4239    regnode *trie_node= scan;
4240    regnode *tail= regnext(scan);
4241    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4242    I32 max1 = 0, min1 = I32_MAX;
4243    struct regnode_charclass_class accum;
4244
4245    if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4246     SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4247    if (flags & SCF_DO_STCLASS)
4248     cl_init_zero(pRExC_state, &accum);
4249
4250    if (!trie->jump) {
4251     min1= trie->minlen;
4252     max1= trie->maxlen;
4253    } else {
4254     const regnode *nextbranch= NULL;
4255     U32 word;
4256
4257     for ( word=1 ; word <= trie->wordcount ; word++)
4258     {
4259      I32 deltanext=0, minnext=0, f = 0, fake;
4260      struct regnode_charclass_class this_class;
4261
4262      data_fake.flags = 0;
4263      if (data) {
4264       data_fake.whilem_c = data->whilem_c;
4265       data_fake.last_closep = data->last_closep;
4266      }
4267      else
4268       data_fake.last_closep = &fake;
4269      data_fake.pos_delta = delta;
4270      if (flags & SCF_DO_STCLASS) {
4271       cl_init(pRExC_state, &this_class);
4272       data_fake.start_class = &this_class;
4273       f = SCF_DO_STCLASS_AND;
4274      }
4275      if (flags & SCF_WHILEM_VISITED_POS)
4276       f |= SCF_WHILEM_VISITED_POS;
4277
4278      if (trie->jump[word]) {
4279       if (!nextbranch)
4280        nextbranch = trie_node + trie->jump[0];
4281       scan= trie_node + trie->jump[word];
4282       /* We go from the jump point to the branch that follows
4283       it. Note this means we need the vestigal unused branches
4284       even though they arent otherwise used.
4285       */
4286       minnext = study_chunk(pRExC_state, &scan, minlenp,
4287        &deltanext, (regnode *)nextbranch, &data_fake,
4288        stopparen, recursed, NULL, f,depth+1);
4289      }
4290      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4291       nextbranch= regnext((regnode*)nextbranch);
4292
4293      if (min1 > (I32)(minnext + trie->minlen))
4294       min1 = minnext + trie->minlen;
4295      if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4296       max1 = minnext + deltanext + trie->maxlen;
4297      if (deltanext == I32_MAX)
4298       is_inf = is_inf_internal = 1;
4299
4300      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4301       pars++;
4302      if (data_fake.flags & SCF_SEEN_ACCEPT) {
4303       if ( stopmin > min + min1)
4304        stopmin = min + min1;
4305       flags &= ~SCF_DO_SUBSTR;
4306       if (data)
4307        data->flags |= SCF_SEEN_ACCEPT;
4308      }
4309      if (data) {
4310       if (data_fake.flags & SF_HAS_EVAL)
4311        data->flags |= SF_HAS_EVAL;
4312       data->whilem_c = data_fake.whilem_c;
4313      }
4314      if (flags & SCF_DO_STCLASS)
4315       cl_or(pRExC_state, &accum, &this_class);
4316     }
4317    }
4318    if (flags & SCF_DO_SUBSTR) {
4319     data->pos_min += min1;
4320     data->pos_delta += max1 - min1;
4321     if (max1 != min1 || is_inf)
4322      data->longest = &(data->longest_float);
4323    }
4324    min += min1;
4325    delta += max1 - min1;
4326    if (flags & SCF_DO_STCLASS_OR) {
4327     cl_or(pRExC_state, data->start_class, &accum);
4328     if (min1) {
4329      cl_and(data->start_class, and_withp);
4330      flags &= ~SCF_DO_STCLASS;
4331     }
4332    }
4333    else if (flags & SCF_DO_STCLASS_AND) {
4334     if (min1) {
4335      cl_and(data->start_class, &accum);
4336      flags &= ~SCF_DO_STCLASS;
4337     }
4338     else {
4339      /* Switch to OR mode: cache the old value of
4340      * data->start_class */
4341      INIT_AND_WITHP;
4342      StructCopy(data->start_class, and_withp,
4343        struct regnode_charclass_class);
4344      flags &= ~SCF_DO_STCLASS_AND;
4345      StructCopy(&accum, data->start_class,
4346        struct regnode_charclass_class);
4347      flags |= SCF_DO_STCLASS_OR;
4348      data->start_class->flags |= ANYOF_EOS;
4349     }
4350    }
4351    scan= tail;
4352    continue;
4353   }
4354 #else
4355   else if (PL_regkind[OP(scan)] == TRIE) {
4356    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4357    U8*bang=NULL;
4358
4359    min += trie->minlen;
4360    delta += (trie->maxlen - trie->minlen);
4361    flags &= ~SCF_DO_STCLASS; /* xxx */
4362    if (flags & SCF_DO_SUBSTR) {
4363      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4364      data->pos_min += trie->minlen;
4365      data->pos_delta += (trie->maxlen - trie->minlen);
4366     if (trie->maxlen != trie->minlen)
4367      data->longest = &(data->longest_float);
4368     }
4369     if (trie->jump) /* no more substrings -- for now /grr*/
4370      flags &= ~SCF_DO_SUBSTR;
4371   }
4372 #endif /* old or new */
4373 #endif /* TRIE_STUDY_OPT */
4374
4375   /* Else: zero-length, ignore. */
4376   scan = regnext(scan);
4377  }
4378  if (frame) {
4379   last = frame->last;
4380   scan = frame->next;
4381   stopparen = frame->stop;
4382   frame = frame->prev;
4383   goto fake_study_recurse;
4384  }
4385
4386   finish:
4387  assert(!frame);
4388  DEBUG_STUDYDATA("pre-fin:",data,depth);
4389
4390  *scanp = scan;
4391  *deltap = is_inf_internal ? I32_MAX : delta;
4392  if (flags & SCF_DO_SUBSTR && is_inf)
4393   data->pos_delta = I32_MAX - data->pos_min;
4394  if (is_par > (I32)U8_MAX)
4395   is_par = 0;
4396  if (is_par && pars==1 && data) {
4397   data->flags |= SF_IN_PAR;
4398   data->flags &= ~SF_HAS_PAR;
4399  }
4400  else if (pars && data) {
4401   data->flags |= SF_HAS_PAR;
4402   data->flags &= ~SF_IN_PAR;
4403  }
4404  if (flags & SCF_DO_STCLASS_OR)
4405   cl_and(data->start_class, and_withp);
4406  if (flags & SCF_TRIE_RESTUDY)
4407   data->flags |=  SCF_TRIE_RESTUDY;
4408
4409  DEBUG_STUDYDATA("post-fin:",data,depth);
4410
4411  return min < stopmin ? min : stopmin;
4412 }
4413
4414 STATIC U32
4415 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4416 {
4417  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4418
4419  PERL_ARGS_ASSERT_ADD_DATA;
4420
4421  Renewc(RExC_rxi->data,
4422   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4423   char, struct reg_data);
4424  if(count)
4425   Renew(RExC_rxi->data->what, count + n, U8);
4426  else
4427   Newx(RExC_rxi->data->what, n, U8);
4428  RExC_rxi->data->count = count + n;
4429  Copy(s, RExC_rxi->data->what + count, n, U8);
4430  return count;
4431 }
4432
4433 /*XXX: todo make this not included in a non debugging perl */
4434 #ifndef PERL_IN_XSUB_RE
4435 void
4436 Perl_reginitcolors(pTHX)
4437 {
4438  dVAR;
4439  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4440  if (s) {
4441   char *t = savepv(s);
4442   int i = 0;
4443   PL_colors[0] = t;
4444   while (++i < 6) {
4445    t = strchr(t, '\t');
4446    if (t) {
4447     *t = '\0';
4448     PL_colors[i] = ++t;
4449    }
4450    else
4451     PL_colors[i] = t = (char *)"";
4452   }
4453  } else {
4454   int i = 0;
4455   while (i < 6)
4456    PL_colors[i++] = (char *)"";
4457  }
4458  PL_colorset = 1;
4459 }
4460 #endif
4461
4462
4463 #ifdef TRIE_STUDY_OPT
4464 #define CHECK_RESTUDY_GOTO                                  \
4465   if (                                                \
4466    (data.flags & SCF_TRIE_RESTUDY)               \
4467    && ! restudied++                              \
4468   )     goto reStudy
4469 #else
4470 #define CHECK_RESTUDY_GOTO
4471 #endif
4472
4473 /*
4474  - pregcomp - compile a regular expression into internal code
4475  *
4476  * We can't allocate space until we know how big the compiled form will be,
4477  * but we can't compile it (and thus know how big it is) until we've got a
4478  * place to put the code.  So we cheat:  we compile it twice, once with code
4479  * generation turned off and size counting turned on, and once "for real".
4480  * This also means that we don't allocate space until we are sure that the
4481  * thing really will compile successfully, and we never have to move the
4482  * code and thus invalidate pointers into it.  (Note that it has to be in
4483  * one piece because free() must be able to free it all.) [NB: not true in perl]
4484  *
4485  * Beware that the optimization-preparation code in here knows about some
4486  * of the structure of the compiled regexp.  [I'll say.]
4487  */
4488
4489
4490
4491 #ifndef PERL_IN_XSUB_RE
4492 #define RE_ENGINE_PTR &reh_regexp_engine
4493 #else
4494 extern const struct regexp_engine my_reg_engine;
4495 #define RE_ENGINE_PTR &my_reg_engine
4496 #endif
4497
4498 #ifndef PERL_IN_XSUB_RE
4499 REGEXP *
4500 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4501 {
4502  dVAR;
4503  HV * const table = GvHV(PL_hintgv);
4504
4505  PERL_ARGS_ASSERT_PREGCOMP;
4506
4507  /* Dispatch a request to compile a regexp to correct
4508  regexp engine. */
4509  if (table) {
4510   SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4511   GET_RE_DEBUG_FLAGS_DECL;
4512   if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4513    const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4514    DEBUG_COMPILE_r({
4515     PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4516      SvIV(*ptr));
4517    });
4518    return CALLREGCOMP_ENG(eng, pattern, flags);
4519   }
4520  }
4521  return Perl_re_compile(aTHX_ pattern, flags);
4522 }
4523 #endif
4524
4525 REGEXP *
4526 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4527 {
4528  dVAR;
4529  REGEXP *rx;
4530  struct regexp *r;
4531  register regexp_internal *ri;
4532  STRLEN plen;
4533  char  *exp;
4534  char* xend;
4535  regnode *scan;
4536  I32 flags;
4537  I32 minlen = 0;
4538  U32 pm_flags;
4539
4540  /* these are all flags - maybe they should be turned
4541  * into a single int with different bit masks */
4542  I32 sawlookahead = 0;
4543  I32 sawplus = 0;
4544  I32 sawopen = 0;
4545  bool used_setjump = FALSE;
4546  regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4547
4548  U8 jump_ret = 0;
4549  dJMPENV;
4550  scan_data_t data;
4551  RExC_state_t RExC_state;
4552  RExC_state_t * const pRExC_state = &RExC_state;
4553 #ifdef TRIE_STUDY_OPT
4554  int restudied;
4555  RExC_state_t copyRExC_state;
4556 #endif
4557  GET_RE_DEBUG_FLAGS_DECL;
4558
4559  PERL_ARGS_ASSERT_RE_COMPILE;
4560
4561  DEBUG_r(if (!PL_colorset) reginitcolors());
4562
4563  RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4564  RExC_uni_semantics = 0;
4565  RExC_contains_locale = 0;
4566
4567  /****************** LONG JUMP TARGET HERE***********************/
4568  /* Longjmp back to here if have to switch in midstream to utf8 */
4569  if (! RExC_orig_utf8) {
4570   JMPENV_PUSH(jump_ret);
4571   used_setjump = TRUE;
4572  }
4573
4574  if (jump_ret == 0) {    /* First time through */
4575   exp = SvPV(pattern, plen);
4576   xend = exp + plen;
4577   /* ignore the utf8ness if the pattern is 0 length */
4578   if (plen == 0) {
4579    RExC_utf8 = RExC_orig_utf8 = 0;
4580   }
4581
4582   DEBUG_COMPILE_r({
4583    SV *dsv= sv_newmortal();
4584    RE_PV_QUOTED_DECL(s, RExC_utf8,
4585     dsv, exp, plen, 60);
4586    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4587       PL_colors[4],PL_colors[5],s);
4588   });
4589  }
4590  else {  /* longjumped back */
4591   STRLEN len = plen;
4592
4593   /* If the cause for the longjmp was other than changing to utf8, pop
4594   * our own setjmp, and longjmp to the correct handler */
4595   if (jump_ret != UTF8_LONGJMP) {
4596    JMPENV_POP;
4597    JMPENV_JUMP(jump_ret);
4598   }
4599
4600   GET_RE_DEBUG_FLAGS;
4601
4602   /* It's possible to write a regexp in ascii that represents Unicode
4603   codepoints outside of the byte range, such as via \x{100}. If we
4604   detect such a sequence we have to convert the entire pattern to utf8
4605   and then recompile, as our sizing calculation will have been based
4606   on 1 byte == 1 character, but we will need to use utf8 to encode
4607   at least some part of the pattern, and therefore must convert the whole
4608   thing.
4609   -- dmq */
4610   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4611    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4612   exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4613   xend = exp + len;
4614   RExC_orig_utf8 = RExC_utf8 = 1;
4615   SAVEFREEPV(exp);
4616  }
4617
4618 #ifdef TRIE_STUDY_OPT
4619  restudied = 0;
4620 #endif
4621
4622  pm_flags = orig_pm_flags;
4623
4624  if (initial_charset == REGEX_LOCALE_CHARSET) {
4625   RExC_contains_locale = 1;
4626  }
4627  else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4628
4629   /* Set to use unicode semantics if the pattern is in utf8 and has the
4630   * 'depends' charset specified, as it means unicode when utf8  */
4631   set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4632  }
4633
4634  RExC_precomp = exp;
4635  RExC_flags = pm_flags;
4636  RExC_sawback = 0;
4637
4638  RExC_seen = 0;
4639  RExC_in_lookbehind = 0;
4640  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4641  RExC_seen_evals = 0;
4642  RExC_extralen = 0;
4643  RExC_override_recoding = 0;
4644
4645  /* First pass: determine size, legality. */
4646  RExC_parse = exp;
4647  RExC_start = exp;
4648  RExC_end = xend;
4649  RExC_naughty = 0;
4650  RExC_npar = 1;
4651  RExC_nestroot = 0;
4652  RExC_size = 0L;
4653  RExC_emit = &PL_regdummy;
4654  RExC_whilem_seen = 0;
4655  RExC_open_parens = NULL;
4656  RExC_close_parens = NULL;
4657  RExC_opend = NULL;
4658  RExC_paren_names = NULL;
4659 #ifdef DEBUGGING
4660  RExC_paren_name_list = NULL;
4661 #endif
4662  RExC_recurse = NULL;
4663  RExC_recurse_count = 0;
4664
4665 #if 0 /* REGC() is (currently) a NOP at the first pass.
4666  * Clever compilers notice this and complain. --jhi */
4667  REGC((U8)REG_MAGIC, (char*)RExC_emit);
4668 #endif
4669  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4670  if (reg(pRExC_state, 0, &flags,1) == NULL) {
4671   RExC_precomp = NULL;
4672   return(NULL);
4673  }
4674
4675  /* Here, finished first pass.  Get rid of any added setjmp */
4676  if (used_setjump) {
4677   JMPENV_POP;
4678  }
4679
4680  DEBUG_PARSE_r({
4681   PerlIO_printf(Perl_debug_log,
4682    "Required size %"IVdf" nodes\n"
4683    "Starting second pass (creation)\n",
4684    (IV)RExC_size);
4685   RExC_lastnum=0;
4686   RExC_lastparse=NULL;
4687  });
4688
4689  /* The first pass could have found things that force Unicode semantics */
4690  if ((RExC_utf8 || RExC_uni_semantics)
4691   && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4692  {
4693   set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4694  }
4695
4696  /* Small enough for pointer-storage convention?
4697  If extralen==0, this means that we will not need long jumps. */
4698  if (RExC_size >= 0x10000L && RExC_extralen)
4699   RExC_size += RExC_extralen;
4700  else
4701   RExC_extralen = 0;
4702  if (RExC_whilem_seen > 15)
4703   RExC_whilem_seen = 15;
4704
4705  /* Allocate space and zero-initialize. Note, the two step process
4706  of zeroing when in debug mode, thus anything assigned has to
4707  happen after that */
4708  rx = (REGEXP*) newSV_type(SVt_REGEXP);
4709  r = (struct regexp*)SvANY(rx);
4710  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4711   char, regexp_internal);
4712  if ( r == NULL || ri == NULL )
4713   FAIL("Regexp out of space");
4714 #ifdef DEBUGGING
4715  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4716  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4717 #else
4718  /* bulk initialize base fields with 0. */
4719  Zero(ri, sizeof(regexp_internal), char);
4720 #endif
4721
4722  /* non-zero initialization begins here */
4723  RXi_SET( r, ri );
4724  r->engine= RE_ENGINE_PTR;
4725  r->extflags = pm_flags;
4726  {
4727   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4728   bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4729
4730   /* The caret is output if there are any defaults: if not all the STD
4731   * flags are set, or if no character set specifier is needed */
4732   bool has_default =
4733      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4734      || ! has_charset);
4735   bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4736   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4737        >> RXf_PMf_STD_PMMOD_SHIFT);
4738   const char *fptr = STD_PAT_MODS;        /*"msix"*/
4739   char *p;
4740   /* Allocate for the worst case, which is all the std flags are turned
4741   * on.  If more precision is desired, we could do a population count of
4742   * the flags set.  This could be done with a small lookup table, or by
4743   * shifting, masking and adding, or even, when available, assembly
4744   * language for a machine-language population count.
4745   * We never output a minus, as all those are defaults, so are
4746   * covered by the caret */
4747   const STRLEN wraplen = plen + has_p + has_runon
4748    + has_default       /* If needs a caret */
4749
4750     /* If needs a character set specifier */
4751    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4752    + (sizeof(STD_PAT_MODS) - 1)
4753    + (sizeof("(?:)") - 1);
4754
4755   p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4756   SvPOK_on(rx);
4757   SvFLAGS(rx) |= SvUTF8(pattern);
4758   *p++='('; *p++='?';
4759
4760   /* If a default, cover it using the caret */
4761   if (has_default) {
4762    *p++= DEFAULT_PAT_MOD;
4763   }
4764   if (has_charset) {
4765    STRLEN len;
4766    const char* const name = get_regex_charset_name(r->extflags, &len);
4767    Copy(name, p, len, char);
4768    p += len;
4769   }
4770   if (has_p)
4771    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4772   {
4773    char ch;
4774    while((ch = *fptr++)) {
4775     if(reganch & 1)
4776      *p++ = ch;
4777     reganch >>= 1;
4778    }
4779   }
4780
4781   *p++ = ':';
4782   Copy(RExC_precomp, p, plen, char);
4783   assert ((RX_WRAPPED(rx) - p) < 16);
4784   r->pre_prefix = p - RX_WRAPPED(rx);
4785   p += plen;
4786   if (has_runon)
4787    *p++ = '\n';
4788   *p++ = ')';
4789   *p = 0;
4790   SvCUR_set(rx, p - SvPVX_const(rx));
4791  }
4792
4793  r->intflags = 0;
4794  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4795
4796  if (RExC_seen & REG_SEEN_RECURSE) {
4797   Newxz(RExC_open_parens, RExC_npar,regnode *);
4798   SAVEFREEPV(RExC_open_parens);
4799   Newxz(RExC_close_parens,RExC_npar,regnode *);
4800   SAVEFREEPV(RExC_close_parens);
4801  }
4802
4803  /* Useful during FAIL. */
4804 #ifdef RE_TRACK_PATTERN_OFFSETS
4805  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4806  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4807       "%s %"UVuf" bytes for offset annotations.\n",
4808       ri->u.offsets ? "Got" : "Couldn't get",
4809       (UV)((2*RExC_size+1) * sizeof(U32))));
4810 #endif
4811  SetProgLen(ri,RExC_size);
4812  RExC_rx_sv = rx;
4813  RExC_rx = r;
4814  RExC_rxi = ri;
4815  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4816
4817  /* Second pass: emit code. */
4818  RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4819  RExC_parse = exp;
4820  RExC_end = xend;
4821  RExC_naughty = 0;
4822  RExC_npar = 1;
4823  RExC_emit_start = ri->program;
4824  RExC_emit = ri->program;
4825  RExC_emit_bound = ri->program + RExC_size + 1;
4826
4827  /* Store the count of eval-groups for security checks: */
4828  RExC_rx->seen_evals = RExC_seen_evals;
4829  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4830  if (reg(pRExC_state, 0, &flags,1) == NULL) {
4831   ReREFCNT_dec(rx);
4832   return(NULL);
4833  }
4834  /* XXXX To minimize changes to RE engine we always allocate
4835  3-units-long substrs field. */
4836  Newx(r->substrs, 1, struct reg_substr_data);
4837  if (RExC_recurse_count) {
4838   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4839   SAVEFREEPV(RExC_recurse);
4840  }
4841
4842 reStudy:
4843  r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4844  Zero(r->substrs, 1, struct reg_substr_data);
4845
4846 #ifdef TRIE_STUDY_OPT
4847  if (!restudied) {
4848   StructCopy(&zero_scan_data, &data, scan_data_t);
4849   copyRExC_state = RExC_state;
4850  } else {
4851   U32 seen=RExC_seen;
4852   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4853
4854   RExC_state = copyRExC_state;
4855   if (seen & REG_TOP_LEVEL_BRANCHES)
4856    RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4857   else
4858    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4859   if (data.last_found) {
4860    SvREFCNT_dec(data.longest_fixed);
4861    SvREFCNT_dec(data.longest_float);
4862    SvREFCNT_dec(data.last_found);
4863   }
4864   StructCopy(&zero_scan_data, &data, scan_data_t);
4865  }
4866 #else
4867  StructCopy(&zero_scan_data, &data, scan_data_t);
4868 #endif
4869
4870  /* Dig out information for optimizations. */
4871  r->extflags = RExC_flags; /* was pm_op */
4872  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4873
4874  if (UTF)
4875   SvUTF8_on(rx); /* Unicode in it? */
4876  ri->regstclass = NULL;
4877  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4878   r->intflags |= PREGf_NAUGHTY;
4879  scan = ri->program + 1;  /* First BRANCH. */
4880
4881  /* testing for BRANCH here tells us whether there is "must appear"
4882  data in the pattern. If there is then we can use it for optimisations */
4883  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4884   I32 fake;
4885   STRLEN longest_float_length, longest_fixed_length;
4886   struct regnode_charclass_class ch_class; /* pointed to by data */
4887   int stclass_flag;
4888   I32 last_close = 0; /* pointed to by data */
4889   regnode *first= scan;
4890   regnode *first_next= regnext(first);
4891   /*
4892   * Skip introductions and multiplicators >= 1
4893   * so that we can extract the 'meat' of the pattern that must
4894   * match in the large if() sequence following.
4895   * NOTE that EXACT is NOT covered here, as it is normally
4896   * picked up by the optimiser separately.
4897   *
4898   * This is unfortunate as the optimiser isnt handling lookahead
4899   * properly currently.
4900   *
4901   */
4902   while ((OP(first) == OPEN && (sawopen = 1)) ||
4903    /* An OR of *one* alternative - should not happen now. */
4904    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4905    /* for now we can't handle lookbehind IFMATCH*/
4906    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4907    (OP(first) == PLUS) ||
4908    (OP(first) == MINMOD) ||
4909    /* An {n,m} with n>0 */
4910    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4911    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4912   {
4913     /*
4914     * the only op that could be a regnode is PLUS, all the rest
4915     * will be regnode_1 or regnode_2.
4916     *
4917     */
4918     if (OP(first) == PLUS)
4919      sawplus = 1;
4920     else
4921      first += regarglen[OP(first)];
4922
4923     first = NEXTOPER(first);
4924     first_next= regnext(first);
4925   }
4926
4927   /* Starting-point info. */
4928  again:
4929   DEBUG_PEEP("first:",first,0);
4930   /* Ignore EXACT as we deal with it later. */
4931   if (PL_regkind[OP(first)] == EXACT) {
4932    if (OP(first) == EXACT)
4933     NOOP; /* Empty, get anchored substr later. */
4934    else
4935     ri->regstclass = first;
4936   }
4937 #ifdef TRIE_STCLASS
4938   else if (PL_regkind[OP(first)] == TRIE &&
4939     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4940   {
4941    regnode *trie_op;
4942    /* this can happen only on restudy */
4943    if ( OP(first) == TRIE ) {
4944     struct regnode_1 *trieop = (struct regnode_1 *)
4945      PerlMemShared_calloc(1, sizeof(struct regnode_1));
4946     StructCopy(first,trieop,struct regnode_1);
4947     trie_op=(regnode *)trieop;
4948    } else {
4949     struct regnode_charclass *trieop = (struct regnode_charclass *)
4950      PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4951     StructCopy(first,trieop,struct regnode_charclass);
4952     trie_op=(regnode *)trieop;
4953    }
4954    OP(trie_op)+=2;
4955    make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4956    ri->regstclass = trie_op;
4957   }
4958 #endif
4959   else if (REGNODE_SIMPLE(OP(first)))
4960    ri->regstclass = first;
4961   else if (PL_regkind[OP(first)] == BOUND ||
4962     PL_regkind[OP(first)] == NBOUND)
4963    ri->regstclass = first;
4964   else if (PL_regkind[OP(first)] == BOL) {
4965    r->extflags |= (OP(first) == MBOL
4966       ? RXf_ANCH_MBOL
4967       : (OP(first) == SBOL
4968        ? RXf_ANCH_SBOL
4969        : RXf_ANCH_BOL));
4970    first = NEXTOPER(first);
4971    goto again;
4972   }
4973   else if (OP(first) == GPOS) {
4974    r->extflags |= RXf_ANCH_GPOS;
4975    first = NEXTOPER(first);
4976    goto again;
4977   }
4978   else if ((!sawopen || !RExC_sawback) &&
4979    (OP(first) == STAR &&
4980    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4981    !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4982   {
4983    /* turn .* into ^.* with an implied $*=1 */
4984    const int type =
4985     (OP(NEXTOPER(first)) == REG_ANY)
4986      ? RXf_ANCH_MBOL
4987      : RXf_ANCH_SBOL;
4988    r->extflags |= type;
4989    r->intflags |= PREGf_IMPLICIT;
4990    first = NEXTOPER(first);
4991    goto again;
4992   }
4993   if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4994    && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4995    /* x+ must match at the 1st pos of run of x's */
4996    r->intflags |= PREGf_SKIP;
4997
4998   /* Scan is after the zeroth branch, first is atomic matcher. */
4999 #ifdef TRIE_STUDY_OPT
5000   DEBUG_PARSE_r(
5001    if (!restudied)
5002     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5003        (IV)(first - scan + 1))
5004   );
5005 #else
5006   DEBUG_PARSE_r(
5007    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5008     (IV)(first - scan + 1))
5009   );
5010 #endif
5011
5012
5013   /*
5014   * If there's something expensive in the r.e., find the
5015   * longest literal string that must appear and make it the
5016   * regmust.  Resolve ties in favor of later strings, since
5017   * the regstart check works with the beginning of the r.e.
5018   * and avoiding duplication strengthens checking.  Not a
5019   * strong reason, but sufficient in the absence of others.
5020   * [Now we resolve ties in favor of the earlier string if
5021   * it happens that c_offset_min has been invalidated, since the
5022   * earlier string may buy us something the later one won't.]
5023   */
5024
5025   data.longest_fixed = newSVpvs("");
5026   data.longest_float = newSVpvs("");
5027   data.last_found = newSVpvs("");
5028   data.longest = &(data.longest_fixed);
5029   first = scan;
5030   if (!ri->regstclass) {
5031    cl_init(pRExC_state, &ch_class);
5032    data.start_class = &ch_class;
5033    stclass_flag = SCF_DO_STCLASS_AND;
5034   } else    /* XXXX Check for BOUND? */
5035    stclass_flag = 0;
5036   data.last_closep = &last_close;
5037
5038   minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5039    &data, -1, NULL, NULL,
5040    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5041
5042
5043   CHECK_RESTUDY_GOTO;
5044
5045
5046   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5047    && data.last_start_min == 0 && data.last_end > 0
5048    && !RExC_seen_zerolen
5049    && !(RExC_seen & REG_SEEN_VERBARG)
5050    && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5051    r->extflags |= RXf_CHECK_ALL;
5052   scan_commit(pRExC_state, &data,&minlen,0);
5053   SvREFCNT_dec(data.last_found);
5054
5055   /* Note that code very similar to this but for anchored string
5056   follows immediately below, changes may need to be made to both.
5057   Be careful.
5058   */
5059   longest_float_length = CHR_SVLEN(data.longest_float);
5060   if (longest_float_length
5061    || (data.flags & SF_FL_BEFORE_EOL
5062     && (!(data.flags & SF_FL_BEFORE_MEOL)
5063      || (RExC_flags & RXf_PMf_MULTILINE))))
5064   {
5065    I32 t,ml;
5066
5067    if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5068     && data.offset_fixed == data.offset_float_min
5069     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
5070      goto remove_float;  /* As in (a)+. */
5071
5072    /* copy the information about the longest float from the reg_scan_data
5073    over to the program. */
5074    if (SvUTF8(data.longest_float)) {
5075     r->float_utf8 = data.longest_float;
5076     r->float_substr = NULL;
5077    } else {
5078     r->float_substr = data.longest_float;
5079     r->float_utf8 = NULL;
5080    }
5081    /* float_end_shift is how many chars that must be matched that
5082    follow this item. We calculate it ahead of time as once the
5083    lookbehind offset is added in we lose the ability to correctly
5084    calculate it.*/
5085    ml = data.minlen_float ? *(data.minlen_float)
5086         : (I32)longest_float_length;
5087    r->float_end_shift = ml - data.offset_float_min
5088     - longest_float_length + (SvTAIL(data.longest_float) != 0)
5089     + data.lookbehind_float;
5090    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5091    r->float_max_offset = data.offset_float_max;
5092    if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5093     r->float_max_offset -= data.lookbehind_float;
5094
5095    t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5096      && (!(data.flags & SF_FL_BEFORE_MEOL)
5097       || (RExC_flags & RXf_PMf_MULTILINE)));
5098    fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5099   }
5100   else {
5101   remove_float:
5102    r->float_substr = r->float_utf8 = NULL;
5103    SvREFCNT_dec(data.longest_float);
5104    longest_float_length = 0;
5105   }
5106
5107   /* Note that code very similar to this but for floating string
5108   is immediately above, changes may need to be made to both.
5109   Be careful.
5110   */
5111   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5112   if (longest_fixed_length
5113    || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5114     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5115      || (RExC_flags & RXf_PMf_MULTILINE))))
5116   {
5117    I32 t,ml;
5118
5119    /* copy the information about the longest fixed
5120    from the reg_scan_data over to the program. */
5121    if (SvUTF8(data.longest_fixed)) {
5122     r->anchored_utf8 = data.longest_fixed;
5123     r->anchored_substr = NULL;
5124    } else {
5125     r->anchored_substr = data.longest_fixed;
5126     r->anchored_utf8 = NULL;
5127    }
5128    /* fixed_end_shift is how many chars that must be matched that
5129    follow this item. We calculate it ahead of time as once the
5130    lookbehind offset is added in we lose the ability to correctly
5131    calculate it.*/
5132    ml = data.minlen_fixed ? *(data.minlen_fixed)
5133         : (I32)longest_fixed_length;
5134    r->anchored_end_shift = ml - data.offset_fixed
5135     - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5136     + data.lookbehind_fixed;
5137    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5138
5139    t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5140     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5141      || (RExC_flags & RXf_PMf_MULTILINE)));
5142    fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5143   }
5144   else {
5145    r->anchored_substr = r->anchored_utf8 = NULL;
5146    SvREFCNT_dec(data.longest_fixed);
5147    longest_fixed_length = 0;
5148   }
5149   if (ri->regstclass
5150    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5151    ri->regstclass = NULL;
5152
5153   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5154    && stclass_flag
5155    && !(data.start_class->flags & ANYOF_EOS)
5156    && !cl_is_anything(data.start_class))
5157   {
5158    const U32 n = add_data(pRExC_state, 1, "f");
5159    data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5160
5161    Newx(RExC_rxi->data->data[n], 1,
5162     struct regnode_charclass_class);
5163    StructCopy(data.start_class,
5164      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5165      struct regnode_charclass_class);
5166    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5167    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5168    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5169      regprop(r, sv, (regnode*)data.start_class);
5170      PerlIO_printf(Perl_debug_log,
5171          "synthetic stclass \"%s\".\n",
5172          SvPVX_const(sv));});
5173   }
5174
5175   /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5176   if (longest_fixed_length > longest_float_length) {
5177    r->check_end_shift = r->anchored_end_shift;
5178    r->check_substr = r->anchored_substr;
5179    r->check_utf8 = r->anchored_utf8;
5180    r->check_offset_min = r->check_offset_max = r->anchored_offset;
5181    if (r->extflags & RXf_ANCH_SINGLE)
5182     r->extflags |= RXf_NOSCAN;
5183   }
5184   else {
5185    r->check_end_shift = r->float_end_shift;
5186    r->check_substr = r->float_substr;
5187    r->check_utf8 = r->float_utf8;
5188    r->check_offset_min = r->float_min_offset;
5189    r->check_offset_max = r->float_max_offset;
5190   }
5191   /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5192   This should be changed ASAP!  */
5193   if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5194    r->extflags |= RXf_USE_INTUIT;
5195    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5196     r->extflags |= RXf_INTUIT_TAIL;
5197   }
5198   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5199   if ( (STRLEN)minlen < longest_float_length )
5200    minlen= longest_float_length;
5201   if ( (STRLEN)minlen < longest_fixed_length )
5202    minlen= longest_fixed_length;
5203   */
5204  }
5205  else {
5206   /* Several toplevels. Best we can is to set minlen. */
5207   I32 fake;
5208   struct regnode_charclass_class ch_class;
5209   I32 last_close = 0;
5210
5211   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5212
5213   scan = ri->program + 1;
5214   cl_init(pRExC_state, &ch_class);
5215   data.start_class = &ch_class;
5216   data.last_closep = &last_close;
5217
5218
5219   minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5220    &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5221
5222   CHECK_RESTUDY_GOTO;
5223
5224   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5225     = r->float_substr = r->float_utf8 = NULL;
5226
5227   if (!(data.start_class->flags & ANYOF_EOS)
5228    && !cl_is_anything(data.start_class))
5229   {
5230    const U32 n = add_data(pRExC_state, 1, "f");
5231    data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5232
5233    Newx(RExC_rxi->data->data[n], 1,
5234     struct regnode_charclass_class);
5235    StructCopy(data.start_class,
5236      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5237      struct regnode_charclass_class);
5238    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5239    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5240    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5241      regprop(r, sv, (regnode*)data.start_class);
5242      PerlIO_printf(Perl_debug_log,
5243          "synthetic stclass \"%s\".\n",
5244          SvPVX_const(sv));});
5245   }
5246  }
5247
5248  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5249  the "real" pattern. */
5250  DEBUG_OPTIMISE_r({
5251   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5252      (IV)minlen, (IV)r->minlen);
5253  });
5254  r->minlenret = minlen;
5255  if (r->minlen < minlen)
5256   r->minlen = minlen;
5257
5258  if (RExC_seen & REG_SEEN_GPOS)
5259   r->extflags |= RXf_GPOS_SEEN;
5260  if (RExC_seen & REG_SEEN_LOOKBEHIND)
5261   r->extflags |= RXf_LOOKBEHIND_SEEN;
5262  if (RExC_seen & REG_SEEN_EVAL)
5263   r->extflags |= RXf_EVAL_SEEN;
5264  if (RExC_seen & REG_SEEN_CANY)
5265   r->extflags |= RXf_CANY_SEEN;
5266  if (RExC_seen & REG_SEEN_VERBARG)
5267   r->intflags |= PREGf_VERBARG_SEEN;
5268  if (RExC_seen & REG_SEEN_CUTGROUP)
5269   r->intflags |= PREGf_CUTGROUP_SEEN;
5270  if (RExC_paren_names)
5271   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5272  else
5273   RXp_PAREN_NAMES(r) = NULL;
5274
5275 #ifdef STUPID_PATTERN_CHECKS
5276  if (RX_PRELEN(rx) == 0)
5277   r->extflags |= RXf_NULL;
5278  if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5279   /* XXX: this should happen BEFORE we compile */
5280   r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5281  else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5282   r->extflags |= RXf_WHITE;
5283  else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5284   r->extflags |= RXf_START_ONLY;
5285 #else
5286  if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5287    /* XXX: this should happen BEFORE we compile */
5288    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5289  else {
5290   regnode *first = ri->program + 1;
5291   U8 fop = OP(first);
5292
5293   if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5294    r->extflags |= RXf_NULL;
5295   else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5296    r->extflags |= RXf_START_ONLY;
5297   else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5298        && OP(regnext(first)) == END)
5299    r->extflags |= RXf_WHITE;
5300  }
5301 #endif
5302 #ifdef DEBUGGING
5303  if (RExC_paren_names) {
5304   ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5305   ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5306  } else
5307 #endif
5308   ri->name_list_idx = 0;
5309
5310  if (RExC_recurse_count) {
5311   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5312    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5313    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5314   }
5315  }
5316  Newxz(r->offs, RExC_npar, regexp_paren_pair);
5317  /* assume we don't need to swap parens around before we match */
5318
5319  DEBUG_DUMP_r({
5320   PerlIO_printf(Perl_debug_log,"Final program:\n");
5321   regdump(r);
5322  });
5323 #ifdef RE_TRACK_PATTERN_OFFSETS
5324  DEBUG_OFFSETS_r(if (ri->u.offsets) {
5325   const U32 len = ri->u.offsets[0];
5326   U32 i;
5327   GET_RE_DEBUG_FLAGS_DECL;
5328   PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5329   for (i = 1; i <= len; i++) {
5330    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5331     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5332     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5333    }
5334   PerlIO_printf(Perl_debug_log, "\n");
5335  });
5336 #endif
5337  return rx;
5338 }
5339
5340 #undef RE_ENGINE_PTR
5341
5342
5343 SV*
5344 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5345      const U32 flags)
5346 {
5347  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5348
5349  PERL_UNUSED_ARG(value);
5350
5351  if (flags & RXapif_FETCH) {
5352   return reg_named_buff_fetch(rx, key, flags);
5353  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5354   Perl_croak_no_modify(aTHX);
5355   return NULL;
5356  } else if (flags & RXapif_EXISTS) {
5357   return reg_named_buff_exists(rx, key, flags)
5358    ? &PL_sv_yes
5359    : &PL_sv_no;
5360  } else if (flags & RXapif_REGNAMES) {
5361   return reg_named_buff_all(rx, flags);
5362  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5363   return reg_named_buff_scalar(rx, flags);
5364  } else {
5365   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5366   return NULL;
5367  }
5368 }
5369
5370 SV*
5371 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5372       const U32 flags)
5373 {
5374  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5375  PERL_UNUSED_ARG(lastkey);
5376
5377  if (flags & RXapif_FIRSTKEY)
5378   return reg_named_buff_firstkey(rx, flags);
5379  else if (flags & RXapif_NEXTKEY)
5380   return reg_named_buff_nextkey(rx, flags);
5381  else {
5382   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5383   return NULL;
5384  }
5385 }
5386
5387 SV*
5388 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5389       const U32 flags)
5390 {
5391  AV *retarray = NULL;
5392  SV *ret;
5393  struct regexp *const rx = (struct regexp *)SvANY(r);
5394
5395  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5396
5397  if (flags & RXapif_ALL)
5398   retarray=newAV();
5399
5400  if (rx && RXp_PAREN_NAMES(rx)) {
5401   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5402   if (he_str) {
5403    IV i;
5404    SV* sv_dat=HeVAL(he_str);
5405    I32 *nums=(I32*)SvPVX(sv_dat);
5406    for ( i=0; i<SvIVX(sv_dat); i++ ) {
5407     if ((I32)(rx->nparens) >= nums[i]
5408      && rx->offs[nums[i]].start != -1
5409      && rx->offs[nums[i]].end != -1)
5410     {
5411      ret = newSVpvs("");
5412      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5413      if (!retarray)
5414       return ret;
5415     } else {
5416      ret = newSVsv(&PL_sv_undef);
5417     }
5418     if (retarray)
5419      av_push(retarray, ret);
5420    }
5421    if (retarray)
5422     return newRV_noinc(MUTABLE_SV(retarray));
5423   }
5424  }
5425  return NULL;
5426 }
5427
5428 bool
5429 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5430       const U32 flags)
5431 {
5432  struct regexp *const rx = (struct regexp *)SvANY(r);
5433
5434  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5435
5436  if (rx && RXp_PAREN_NAMES(rx)) {
5437   if (flags & RXapif_ALL) {
5438    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5439   } else {
5440    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5441    if (sv) {
5442     SvREFCNT_dec(sv);
5443     return TRUE;
5444    } else {
5445     return FALSE;
5446    }
5447   }
5448  } else {
5449   return FALSE;
5450  }
5451 }
5452
5453 SV*
5454 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5455 {
5456  struct regexp *const rx = (struct regexp *)SvANY(r);
5457
5458  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5459
5460  if ( rx && RXp_PAREN_NAMES(rx) ) {
5461   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5462
5463   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5464  } else {
5465   return FALSE;
5466  }
5467 }
5468
5469 SV*
5470 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5471 {
5472  struct regexp *const rx = (struct regexp *)SvANY(r);
5473  GET_RE_DEBUG_FLAGS_DECL;
5474
5475  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5476
5477  if (rx && RXp_PAREN_NAMES(rx)) {
5478   HV *hv = RXp_PAREN_NAMES(rx);
5479   HE *temphe;
5480   while ( (temphe = hv_iternext_flags(hv,0)) ) {
5481    IV i;
5482    IV parno = 0;
5483    SV* sv_dat = HeVAL(temphe);
5484    I32 *nums = (I32*)SvPVX(sv_dat);
5485    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5486     if ((I32)(rx->lastparen) >= nums[i] &&
5487      rx->offs[nums[i]].start != -1 &&
5488      rx->offs[nums[i]].end != -1)
5489     {
5490      parno = nums[i];
5491      break;
5492     }
5493    }
5494    if (parno || flags & RXapif_ALL) {
5495     return newSVhek(HeKEY_hek(temphe));
5496    }
5497   }
5498  }
5499  return NULL;
5500 }
5501
5502 SV*
5503 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5504 {
5505  SV *ret;
5506  AV *av;
5507  I32 length;
5508  struct regexp *const rx = (struct regexp *)SvANY(r);
5509
5510  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5511
5512  if (rx && RXp_PAREN_NAMES(rx)) {
5513   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5514    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5515   } else if (flags & RXapif_ONE) {
5516    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5517    av = MUTABLE_AV(SvRV(ret));
5518    length = av_len(av);
5519    SvREFCNT_dec(ret);
5520    return newSViv(length + 1);
5521   } else {
5522    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5523    return NULL;
5524   }
5525  }
5526  return &PL_sv_undef;
5527 }
5528
5529 SV*
5530 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5531 {
5532  struct regexp *const rx = (struct regexp *)SvANY(r);
5533  AV *av = newAV();
5534
5535  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5536
5537  if (rx && RXp_PAREN_NAMES(rx)) {
5538   HV *hv= RXp_PAREN_NAMES(rx);
5539   HE *temphe;
5540   (void)hv_iterinit(hv);
5541   while ( (temphe = hv_iternext_flags(hv,0)) ) {
5542    IV i;
5543    IV parno = 0;
5544    SV* sv_dat = HeVAL(temphe);
5545    I32 *nums = (I32*)SvPVX(sv_dat);
5546    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5547     if ((I32)(rx->lastparen) >= nums[i] &&
5548      rx->offs[nums[i]].start != -1 &&
5549      rx->offs[nums[i]].end != -1)
5550     {
5551      parno = nums[i];
5552      break;
5553     }
5554    }
5555    if (parno || flags & RXapif_ALL) {
5556     av_push(av, newSVhek(HeKEY_hek(temphe)));
5557    }
5558   }
5559  }
5560
5561  return newRV_noinc(MUTABLE_SV(av));
5562 }
5563
5564 void
5565 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5566        SV * const sv)
5567 {
5568  struct regexp *const rx = (struct regexp *)SvANY(r);
5569  char *s = NULL;
5570  I32 i = 0;
5571  I32 s1, t1;
5572
5573  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5574
5575  if (!rx->subbeg) {
5576   sv_setsv(sv,&PL_sv_undef);
5577   return;
5578  }
5579  else
5580  if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5581   /* $` */
5582   i = rx->offs[0].start;
5583   s = rx->subbeg;
5584  }
5585  else
5586  if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5587   /* $' */
5588   s = rx->subbeg + rx->offs[0].end;
5589   i = rx->sublen - rx->offs[0].end;
5590  }
5591  else
5592  if ( 0 <= paren && paren <= (I32)rx->nparens &&
5593   (s1 = rx->offs[paren].start) != -1 &&
5594   (t1 = rx->offs[paren].end) != -1)
5595  {
5596   /* $& $1 ... */
5597   i = t1 - s1;
5598   s = rx->subbeg + s1;
5599  } else {
5600   sv_setsv(sv,&PL_sv_undef);
5601   return;
5602  }
5603  assert(rx->sublen >= (s - rx->subbeg) + i );
5604  if (i >= 0) {
5605   const int oldtainted = PL_tainted;
5606   TAINT_NOT;
5607   sv_setpvn(sv, s, i);
5608   PL_tainted = oldtainted;
5609   if ( (rx->extflags & RXf_CANY_SEEN)
5610    ? (RXp_MATCH_UTF8(rx)
5611       && (!i || is_utf8_string((U8*)s, i)))
5612    : (RXp_MATCH_UTF8(rx)) )
5613   {
5614    SvUTF8_on(sv);
5615   }
5616   else
5617    SvUTF8_off(sv);
5618   if (PL_tainting) {
5619    if (RXp_MATCH_TAINTED(rx)) {
5620     if (SvTYPE(sv) >= SVt_PVMG) {
5621      MAGIC* const mg = SvMAGIC(sv);
5622      MAGIC* mgt;
5623      PL_tainted = 1;
5624      SvMAGIC_set(sv, mg->mg_moremagic);
5625      SvTAINT(sv);
5626      if ((mgt = SvMAGIC(sv))) {
5627       mg->mg_moremagic = mgt;
5628       SvMAGIC_set(sv, mg);
5629      }
5630     } else {
5631      PL_tainted = 1;
5632      SvTAINT(sv);
5633     }
5634    } else
5635     SvTAINTED_off(sv);
5636   }
5637  } else {
5638   sv_setsv(sv,&PL_sv_undef);
5639   return;
5640  }
5641 }
5642
5643 void
5644 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5645               SV const * const value)
5646 {
5647  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5648
5649  PERL_UNUSED_ARG(rx);
5650  PERL_UNUSED_ARG(paren);
5651  PERL_UNUSED_ARG(value);
5652
5653  if (!PL_localizing)
5654   Perl_croak_no_modify(aTHX);
5655 }
5656
5657 I32
5658 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5659        const I32 paren)
5660 {
5661  struct regexp *const rx = (struct regexp *)SvANY(r);
5662  I32 i;
5663  I32 s1, t1;
5664
5665  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5666
5667  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5668   switch (paren) {
5669  /* $` / ${^PREMATCH} */
5670  case RX_BUFF_IDX_PREMATCH:
5671   if (rx->offs[0].start != -1) {
5672       i = rx->offs[0].start;
5673       if (i > 0) {
5674         s1 = 0;
5675         t1 = i;
5676         goto getlen;
5677       }
5678    }
5679   return 0;
5680  /* $' / ${^POSTMATCH} */
5681  case RX_BUFF_IDX_POSTMATCH:
5682    if (rx->offs[0].end != -1) {
5683       i = rx->sublen - rx->offs[0].end;
5684       if (i > 0) {
5685         s1 = rx->offs[0].end;
5686         t1 = rx->sublen;
5687         goto getlen;
5688       }
5689    }
5690   return 0;
5691  /* $& / ${^MATCH}, $1, $2, ... */
5692  default:
5693    if (paren <= (I32)rx->nparens &&
5694    (s1 = rx->offs[paren].start) != -1 &&
5695    (t1 = rx->offs[paren].end) != -1)
5696    {
5697    i = t1 - s1;
5698    goto getlen;
5699   } else {
5700    if (ckWARN(WARN_UNINITIALIZED))
5701     report_uninit((const SV *)sv);
5702    return 0;
5703   }
5704  }
5705   getlen:
5706  if (i > 0 && RXp_MATCH_UTF8(rx)) {
5707   const char * const s = rx->subbeg + s1;
5708   const U8 *ep;
5709   STRLEN el;
5710
5711   i = t1 - s1;
5712   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5713       i = el;
5714  }
5715  return i;
5716 }
5717
5718 SV*
5719 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5720 {
5721  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5722   PERL_UNUSED_ARG(rx);
5723   if (0)
5724    return NULL;
5725   else
5726    return newSVpvs("Regexp");
5727 }
5728
5729 /* Scans the name of a named buffer from the pattern.
5730  * If flags is REG_RSN_RETURN_NULL returns null.
5731  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5732  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5733  * to the parsed name as looked up in the RExC_paren_names hash.
5734  * If there is an error throws a vFAIL().. type exception.
5735  */
5736
5737 #define REG_RSN_RETURN_NULL    0
5738 #define REG_RSN_RETURN_NAME    1
5739 #define REG_RSN_RETURN_DATA    2
5740
5741 STATIC SV*
5742 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5743 {
5744  char *name_start = RExC_parse;
5745
5746  PERL_ARGS_ASSERT_REG_SCAN_NAME;
5747
5748  if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5749   /* skip IDFIRST by using do...while */
5750   if (UTF)
5751    do {
5752     RExC_parse += UTF8SKIP(RExC_parse);
5753    } while (isALNUM_utf8((U8*)RExC_parse));
5754   else
5755    do {
5756     RExC_parse++;
5757    } while (isALNUM(*RExC_parse));
5758  }
5759
5760  if ( flags ) {
5761   SV* sv_name
5762    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5763        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5764   if ( flags == REG_RSN_RETURN_NAME)
5765    return sv_name;
5766   else if (flags==REG_RSN_RETURN_DATA) {
5767    HE *he_str = NULL;
5768    SV *sv_dat = NULL;
5769    if ( ! sv_name )      /* should not happen*/
5770     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5771    if (RExC_paren_names)
5772     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5773    if ( he_str )
5774     sv_dat = HeVAL(he_str);
5775    if ( ! sv_dat )
5776     vFAIL("Reference to nonexistent named group");
5777    return sv_dat;
5778   }
5779   else {
5780    Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5781   }
5782   /* NOT REACHED */
5783  }
5784  return NULL;
5785 }
5786
5787 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5788  int rem=(int)(RExC_end - RExC_parse);                       \
5789  int cut;                                                    \
5790  int num;                                                    \
5791  int iscut=0;                                                \
5792  if (rem>10) {                                               \
5793   rem=10;                                                 \
5794   iscut=1;                                                \
5795  }                                                           \
5796  cut=10-rem;                                                 \
5797  if (RExC_lastparse!=RExC_parse)                             \
5798   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5799    rem, RExC_parse,                                    \
5800    cut + 4,                                            \
5801    iscut ? "..." : "<"                                 \
5802   );                                                      \
5803  else                                                        \
5804   PerlIO_printf(Perl_debug_log,"%16s","");                \
5805                 \
5806  if (SIZE_ONLY)                                              \
5807  num = RExC_size + 1;                                     \
5808  else                                                        \
5809  num=REG_NODE_NUM(RExC_emit);                             \
5810  if (RExC_lastnum!=num)                                      \
5811  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5812  else                                                        \
5813  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5814  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5815   (int)((depth*2)), "",                                   \
5816   (funcname)                                              \
5817  );                                                          \
5818  RExC_lastnum=num;                                           \
5819  RExC_lastparse=RExC_parse;                                  \
5820 })
5821
5822
5823
5824 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5825  DEBUG_PARSE_MSG((funcname));                            \
5826  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5827 })
5828 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5829  DEBUG_PARSE_MSG((funcname));                            \
5830  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5831 })
5832
5833 /* This section of code defines the inversion list object and its methods.  The
5834  * interfaces are highly subject to change, so as much as possible is static to
5835  * this file.  An inversion list is here implemented as a malloc'd C array with
5836  * some added info.  More will be coming when functionality is added later.
5837  *
5838  * Some of the methods should always be private to the implementation, and some
5839  * should eventually be made public */
5840
5841 #define INVLIST_INITIAL_LEN 10
5842 #define INVLIST_ARRAY_KEY "array"
5843 #define INVLIST_MAX_KEY "max"
5844 #define INVLIST_LEN_KEY "len"
5845
5846 PERL_STATIC_INLINE UV*
5847 S_invlist_array(pTHX_ HV* const invlist)
5848 {
5849  /* Returns the pointer to the inversion list's array.  Every time the
5850  * length changes, this needs to be called in case malloc or realloc moved
5851  * it */
5852
5853  SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5854
5855  PERL_ARGS_ASSERT_INVLIST_ARRAY;
5856
5857  if (list_ptr == NULL) {
5858   Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5859                INVLIST_ARRAY_KEY);
5860  }
5861
5862  return INT2PTR(UV *, SvUV(*list_ptr));
5863 }
5864
5865 PERL_STATIC_INLINE void
5866 S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5867 {
5868  PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5869
5870  /* Sets the array stored in the inversion list to the memory beginning with
5871  * the parameter */
5872
5873  if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5874   Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5875                INVLIST_ARRAY_KEY);
5876  }
5877 }
5878
5879 PERL_STATIC_INLINE UV
5880 S_invlist_len(pTHX_ HV* const invlist)
5881 {
5882  /* Returns the current number of elements in the inversion list's array */
5883
5884  SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5885
5886  PERL_ARGS_ASSERT_INVLIST_LEN;
5887
5888  if (len_ptr == NULL) {
5889   Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5890                INVLIST_LEN_KEY);
5891  }
5892
5893  return SvUV(*len_ptr);
5894 }
5895
5896 PERL_STATIC_INLINE UV
5897 S_invlist_max(pTHX_ HV* const invlist)
5898 {
5899  /* Returns the maximum number of elements storable in the inversion list's
5900  * array, without having to realloc() */
5901
5902  SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5903
5904  PERL_ARGS_ASSERT_INVLIST_MAX;
5905
5906  if (max_ptr == NULL) {
5907   Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5908                INVLIST_MAX_KEY);
5909  }
5910
5911  return SvUV(*max_ptr);
5912 }
5913
5914 PERL_STATIC_INLINE void
5915 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5916 {
5917  /* Sets the current number of elements stored in the inversion list */
5918
5919  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5920
5921  if (len != 0 && len > invlist_max(invlist)) {
5922   Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5923  }
5924
5925  if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5926   Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5927                INVLIST_LEN_KEY);
5928  }
5929 }
5930
5931 PERL_STATIC_INLINE void
5932 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5933 {
5934
5935  /* Sets the maximum number of elements storable in the inversion list
5936  * without having to realloc() */
5937
5938  PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5939
5940  if (max < invlist_len(invlist)) {
5941   Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5942  }
5943
5944  if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5945   Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5946                INVLIST_LEN_KEY);
5947  }
5948 }
5949
5950 #ifndef PERL_IN_XSUB_RE
5951 HV*
5952 Perl__new_invlist(pTHX_ IV initial_size)
5953 {
5954
5955  /* Return a pointer to a newly constructed inversion list, with enough
5956  * space to store 'initial_size' elements.  If that number is negative, a
5957  * system default is used instead */
5958
5959  HV* invlist = newHV();
5960  UV* list;
5961
5962  if (initial_size < 0) {
5963   initial_size = INVLIST_INITIAL_LEN;
5964  }
5965
5966  /* Allocate the initial space */
5967  Newx(list, initial_size, UV);
5968  invlist_set_array(invlist, list);
5969
5970  /* set_len has to come before set_max, as the latter inspects the len */
5971  invlist_set_len(invlist, 0);
5972  invlist_set_max(invlist, initial_size);
5973
5974  return invlist;
5975 }
5976 #endif
5977
5978 PERL_STATIC_INLINE void
5979 S_invlist_destroy(pTHX_ HV* const invlist)
5980 {
5981    /* Inversion list destructor */
5982
5983  SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5984
5985  PERL_ARGS_ASSERT_INVLIST_DESTROY;
5986
5987  if (list_ptr != NULL) {
5988   UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5989   Safefree(list);
5990  }
5991 }
5992
5993 STATIC void
5994 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5995 {
5996  /* Change the maximum size of an inversion list (up or down) */
5997
5998  UV* orig_array;
5999  UV* array;
6000  const UV old_max = invlist_max(invlist);
6001
6002  PERL_ARGS_ASSERT_INVLIST_EXTEND;
6003
6004  if (old_max == new_max) { /* If a no-op */
6005   return;
6006  }
6007
6008  array = orig_array = invlist_array(invlist);
6009  Renew(array, new_max, UV);
6010
6011  /* If the size change moved the list in memory, set the new one */
6012  if (array != orig_array) {
6013   invlist_set_array(invlist, array);
6014  }
6015
6016  invlist_set_max(invlist, new_max);
6017
6018 }
6019
6020 PERL_STATIC_INLINE void
6021 S_invlist_trim(pTHX_ HV* const invlist)
6022 {
6023  PERL_ARGS_ASSERT_INVLIST_TRIM;
6024
6025  /* Change the length of the inversion list to how many entries it currently
6026  * has */
6027
6028  invlist_extend(invlist, invlist_len(invlist));
6029 }
6030
6031 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6032  * etc */
6033
6034 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
6035
6036 #ifndef PERL_IN_XSUB_RE
6037 void
6038 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
6039 {
6040    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6041  * the end of the inversion list.  The range must be above any existing
6042  * ones. */
6043
6044  UV* array = invlist_array(invlist);
6045  UV max = invlist_max(invlist);
6046  UV len = invlist_len(invlist);
6047
6048  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6049
6050  if (len > 0) {
6051
6052   /* Here, the existing list is non-empty. The current max entry in the
6053   * list is generally the first value not in the set, except when the
6054   * set extends to the end of permissible values, in which case it is
6055   * the first entry in that final set, and so this call is an attempt to
6056   * append out-of-order */
6057
6058   UV final_element = len - 1;
6059   if (array[final_element] > start
6060    || ELEMENT_IN_INVLIST_SET(final_element))
6061   {
6062    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
6063   }
6064
6065   /* Here, it is a legal append.  If the new range begins with the first
6066   * value not in the set, it is extending the set, so the new first
6067   * value not in the set is one greater than the newly extended range.
6068   * */
6069   if (array[final_element] == start) {
6070    if (end != UV_MAX) {
6071     array[final_element] = end + 1;
6072    }
6073    else {
6074     /* But if the end is the maximum representable on the machine,
6075     * just let the range that this would extend have no end */
6076     invlist_set_len(invlist, len - 1);
6077    }
6078    return;
6079   }
6080  }
6081
6082  /* Here the new range doesn't extend any existing set.  Add it */
6083
6084  len += 2; /* Includes an element each for the start and end of range */
6085
6086  /* If overflows the existing space, extend, which may cause the array to be
6087  * moved */
6088  if (max < len) {
6089   invlist_extend(invlist, len);
6090   array = invlist_array(invlist);
6091  }
6092
6093  invlist_set_len(invlist, len);
6094
6095  /* The next item on the list starts the range, the one after that is
6096  * one past the new range.  */
6097  array[len - 2] = start;
6098  if (end != UV_MAX) {
6099   array[len - 1] = end + 1;
6100  }
6101  else {
6102   /* But if the end is the maximum representable on the machine, just let
6103   * the range have no end */
6104   invlist_set_len(invlist, len - 1);
6105  }
6106 }
6107 #endif
6108
6109 STATIC HV*
6110 S_invlist_union(pTHX_ HV* const a, HV* const b)
6111 {
6112  /* Return a new inversion list which is the union of two inversion lists.
6113  * The basis for this comes from "Unicode Demystified" Chapter 13 by
6114  * Richard Gillam, published by Addison-Wesley, and explained at some
6115  * length there.  The preface says to incorporate its examples into your
6116  * code at your own risk.
6117  *
6118  * The algorithm is like a merge sort.
6119  *
6120  * XXX A potential performance improvement is to keep track as we go along
6121  * if only one of the inputs contributes to the result, meaning the other
6122  * is a subset of that one.  In that case, we can skip the final copy and
6123  * return the larger of the input lists */
6124
6125  UV* array_a = invlist_array(a);   /* a's array */
6126  UV* array_b = invlist_array(b);
6127  UV len_a = invlist_len(a); /* length of a's array */
6128  UV len_b = invlist_len(b);
6129
6130  HV* u;   /* the resulting union */
6131  UV* array_u;
6132  UV len_u;
6133
6134  UV i_a = 0;      /* current index into a's array */
6135  UV i_b = 0;
6136  UV i_u = 0;
6137
6138  /* running count, as explained in the algorithm source book; items are
6139  * stopped accumulating and are output when the count changes to/from 0.
6140  * The count is incremented when we start a range that's in the set, and
6141  * decremented when we start a range that's not in the set.  So its range
6142  * is 0 to 2.  Only when the count is zero is something not in the set.
6143  */
6144  UV count = 0;
6145
6146  PERL_ARGS_ASSERT_INVLIST_UNION;
6147
6148  /* Size the union for the worst case: that the sets are completely
6149  * disjoint */
6150  u = _new_invlist(len_a + len_b);
6151  array_u = invlist_array(u);
6152
6153  /* Go through each list item by item, stopping when exhausted one of
6154  * them */
6155  while (i_a < len_a && i_b < len_b) {
6156   UV cp;     /* The element to potentially add to the union's array */
6157   bool cp_in_set;   /* is it in the the input list's set or not */
6158
6159   /* We need to take one or the other of the two inputs for the union.
6160   * Since we are merging two sorted lists, we take the smaller of the
6161   * next items.  In case of a tie, we take the one that is in its set
6162   * first.  If we took one not in the set first, it would decrement the
6163   * count, possibly to 0 which would cause it to be output as ending the
6164   * range, and the next time through we would take the same number, and
6165   * output it again as beginning the next range.  By doing it the
6166   * opposite way, there is no possibility that the count will be
6167   * momentarily decremented to 0, and thus the two adjoining ranges will
6168   * be seamlessly merged.  (In a tie and both are in the set or both not
6169   * in the set, it doesn't matter which we take first.) */
6170   if (array_a[i_a] < array_b[i_b]
6171    || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6172   {
6173    cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6174    cp= array_a[i_a++];
6175   }
6176   else {
6177    cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6178    cp= array_b[i_b++];
6179   }
6180
6181   /* Here, have chosen which of the two inputs to look at.  Only output
6182   * if the running count changes to/from 0, which marks the
6183   * beginning/end of a range in that's in the set */
6184   if (cp_in_set) {
6185    if (count == 0) {
6186     array_u[i_u++] = cp;
6187    }
6188    count++;
6189   }
6190   else {
6191    count--;
6192    if (count == 0) {
6193     array_u[i_u++] = cp;
6194    }
6195   }
6196  }
6197
6198  /* Here, we are finished going through at least one of the lists, which
6199  * means there is something remaining in at most one.  We check if the list
6200  * that hasn't been exhausted is positioned such that we are in the middle
6201  * of a range in its set or not.  (We are in the set if the next item in
6202  * the array marks the beginning of something not in the set)   If in the
6203  * set, we decrement 'count'; if 0, there is potentially more to output.
6204  * There are four cases:
6205  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
6206  *    in the union is entirely from the non-exhausted set.
6207  * 2) Both were in their sets, count is 2.  Nothing further should
6208  *    be output, as everything that remains will be in the exhausted
6209  *    list's set, hence in the union; decrementing to 1 but not 0 insures
6210  *    that
6211  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6212  *    Nothing further should be output because the union includes
6213  *    everything from the exhausted set.  Not decrementing insures that.
6214  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6215  *    decrementing to 0 insures that we look at the remainder of the
6216  *    non-exhausted set */
6217  if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6218   || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6219  {
6220   count--;
6221  }
6222
6223  /* The final length is what we've output so far, plus what else is about to
6224  * be output.  (If 'count' is non-zero, then the input list we exhausted
6225  * has everything remaining up to the machine's limit in its set, and hence
6226  * in the union, so there will be no further output. */
6227  len_u = i_u;
6228  if (count == 0) {
6229   /* At most one of the subexpressions will be non-zero */
6230   len_u += (len_a - i_a) + (len_b - i_b);
6231  }
6232
6233  /* Set result to final length, which can change the pointer to array_u, so
6234  * re-find it */
6235  if (len_u != invlist_len(u)) {
6236   invlist_set_len(u, len_u);
6237   invlist_trim(u);
6238   array_u = invlist_array(u);
6239  }
6240
6241  /* When 'count' is 0, the list that was exhausted (if one was shorter than
6242  * the other) ended with everything above it not in its set.  That means
6243  * that the remaining part of the union is precisely the same as the
6244  * non-exhausted list, so can just copy it unchanged.  (If both list were
6245  * exhausted at the same time, then the operations below will be both 0.)
6246  */
6247  if (count == 0) {
6248   IV copy_count; /* At most one will have a non-zero copy count */
6249   if ((copy_count = len_a - i_a) > 0) {
6250    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6251   }
6252   else if ((copy_count = len_b - i_b) > 0) {
6253    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6254   }
6255  }
6256
6257  return u;
6258 }
6259
6260 STATIC HV*
6261 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6262 {
6263  /* Return the intersection of two inversion lists.  The basis for this
6264  * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6265  * by Addison-Wesley, and explained at some length there.  The preface says
6266  * to incorporate its examples into your code at your own risk.
6267  *
6268  * The algorithm is like a merge sort, and is essentially the same as the
6269  * union above
6270  */
6271
6272  UV* array_a = invlist_array(a);   /* a's array */
6273  UV* array_b = invlist_array(b);
6274  UV len_a = invlist_len(a); /* length of a's array */
6275  UV len_b = invlist_len(b);
6276
6277  HV* r;       /* the resulting intersection */
6278  UV* array_r;
6279  UV len_r;
6280
6281  UV i_a = 0;      /* current index into a's array */
6282  UV i_b = 0;
6283  UV i_r = 0;
6284
6285  /* running count, as explained in the algorithm source book; items are
6286  * stopped accumulating and are output when the count changes to/from 2.
6287  * The count is incremented when we start a range that's in the set, and
6288  * decremented when we start a range that's not in the set.  So its range
6289  * is 0 to 2.  Only when the count is 2 is something in the intersection.
6290  */
6291  UV count = 0;
6292
6293  PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6294
6295  /* Size the intersection for the worst case: that the intersection ends up
6296  * fragmenting everything to be completely disjoint */
6297  r= _new_invlist(len_a + len_b);
6298  array_r = invlist_array(r);
6299
6300  /* Go through each list item by item, stopping when exhausted one of
6301  * them */
6302  while (i_a < len_a && i_b < len_b) {
6303   UV cp;     /* The element to potentially add to the intersection's
6304      array */
6305   bool cp_in_set; /* Is it in the input list's set or not */
6306
6307   /* We need to take one or the other of the two inputs for the union.
6308   * Since we are merging two sorted lists, we take the smaller of the
6309   * next items.  In case of a tie, we take the one that is not in its
6310   * set first (a difference from the union algorithm).  If we took one
6311   * in the set first, it would increment the count, possibly to 2 which
6312   * would cause it to be output as starting a range in the intersection,
6313   * and the next time through we would take that same number, and output
6314   * it again as ending the set.  By doing it the opposite of this, we
6315   * there is no possibility that the count will be momentarily
6316   * incremented to 2.  (In a tie and both are in the set or both not in
6317   * the set, it doesn't matter which we take first.) */
6318   if (array_a[i_a] < array_b[i_b]
6319    || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6320   {
6321    cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6322    cp= array_a[i_a++];
6323   }
6324   else {
6325    cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6326    cp= array_b[i_b++];
6327   }
6328
6329   /* Here, have chosen which of the two inputs to look at.  Only output
6330   * if the running count changes to/from 2, which marks the
6331   * beginning/end of a range that's in the intersection */
6332   if (cp_in_set) {
6333    count++;
6334    if (count == 2) {
6335     array_r[i_r++] = cp;
6336    }
6337   }
6338   else {
6339    if (count == 2) {
6340     array_r[i_r++] = cp;
6341    }
6342    count--;
6343   }
6344  }
6345
6346  /* Here, we are finished going through at least one of the sets, which
6347  * means there is something remaining in at most one.  See the comments in
6348  * the union code */
6349  if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6350   || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6351  {
6352   count--;
6353  }
6354
6355  /* The final length is what we've output so far plus what else is in the
6356  * intersection.  Only one of the subexpressions below will be non-zero */
6357  len_r = i_r;
6358  if (count == 2) {
6359   len_r += (len_a - i_a) + (len_b - i_b);
6360  }
6361
6362  /* Set result to final length, which can change the pointer to array_r, so
6363  * re-find it */
6364  if (len_r != invlist_len(r)) {
6365   invlist_set_len(r, len_r);
6366   invlist_trim(r);
6367   array_r = invlist_array(r);
6368  }
6369
6370  /* Finish outputting any remaining */
6371  if (count == 2) { /* Only one of will have a non-zero copy count */
6372   IV copy_count;
6373   if ((copy_count = len_a - i_a) > 0) {
6374    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6375   }
6376   else if ((copy_count = len_b - i_b) > 0) {
6377    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6378   }
6379  }
6380
6381  return r;
6382 }
6383
6384 STATIC HV*
6385 S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
6386 {
6387  /* Add the range from 'start' to 'end' inclusive to the inversion list's
6388  * set.  A pointer to the inversion list is returned.  This may actually be
6389  * a new list, in which case the passed in one has been destroyed.  The
6390  * passed in inversion list can be NULL, in which case a new one is created
6391  * with just the one range in it */
6392
6393  HV* range_invlist;
6394  HV* added_invlist;
6395  UV len;
6396
6397  if (invlist == NULL) {
6398   invlist = _new_invlist(2);
6399   len = 0;
6400  }
6401  else {
6402   len = invlist_len(invlist);
6403  }
6404
6405  /* If comes after the final entry, can just append it to the end */
6406  if (len == 0
6407   || start >= invlist_array(invlist)
6408          [invlist_len(invlist) - 1])
6409  {
6410   _append_range_to_invlist(invlist, start, end);
6411   return invlist;
6412  }
6413
6414  /* Here, can't just append things, create and return a new inversion list
6415  * which is the union of this range and the existing inversion list */
6416  range_invlist = _new_invlist(2);
6417  _append_range_to_invlist(range_invlist, start, end);
6418
6419  added_invlist = invlist_union(invlist, range_invlist);
6420
6421  /* The passed in list can be freed, as well as our temporary */
6422  invlist_destroy(range_invlist);
6423  if (invlist != added_invlist) {
6424   invlist_destroy(invlist);
6425  }
6426
6427  return added_invlist;
6428 }
6429
6430 PERL_STATIC_INLINE HV*
6431 S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6432  return add_range_to_invlist(invlist, cp, cp);
6433 }
6434
6435 /* End of inversion list object */
6436
6437 /*
6438  - reg - regular expression, i.e. main body or parenthesized thing
6439  *
6440  * Caller must absorb opening parenthesis.
6441  *
6442  * Combining parenthesis handling with the base level of regular expression
6443  * is a trifle forced, but the need to tie the tails of the branches to what
6444  * follows makes it hard to avoid.
6445  */
6446 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6447 #ifdef DEBUGGING
6448 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6449 #else
6450 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6451 #endif
6452
6453 STATIC regnode *
6454 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6455  /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6456 {
6457  dVAR;
6458  register regnode *ret;  /* Will be the head of the group. */
6459  register regnode *br;
6460  register regnode *lastbr;
6461  register regnode *ender = NULL;
6462  register I32 parno = 0;
6463  I32 flags;
6464  U32 oregflags = RExC_flags;
6465  bool have_branch = 0;
6466  bool is_open = 0;
6467  I32 freeze_paren = 0;
6468  I32 after_freeze = 0;
6469
6470  /* for (?g), (?gc), and (?o) warnings; warning
6471  about (?c) will warn about (?g) -- japhy    */
6472
6473 #define WASTED_O  0x01
6474 #define WASTED_G  0x02
6475 #define WASTED_C  0x04
6476 #define WASTED_GC (0x02|0x04)
6477  I32 wastedflags = 0x00;
6478
6479  char * parse_start = RExC_parse; /* MJD */
6480  char * const oregcomp_parse = RExC_parse;
6481
6482  GET_RE_DEBUG_FLAGS_DECL;
6483
6484  PERL_ARGS_ASSERT_REG;
6485  DEBUG_PARSE("reg ");
6486
6487  *flagp = 0;    /* Tentatively. */
6488
6489
6490  /* Make an OPEN node, if parenthesized. */
6491  if (paren) {
6492   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6493    char *start_verb = RExC_parse;
6494    STRLEN verb_len = 0;
6495    char *start_arg = NULL;
6496    unsigned char op = 0;
6497    int argok = 1;
6498    int internal_argval = 0; /* internal_argval is only useful if !argok */
6499    while ( *RExC_parse && *RExC_parse != ')' ) {
6500     if ( *RExC_parse == ':' ) {
6501      start_arg = RExC_parse + 1;
6502      break;
6503     }
6504     RExC_parse++;
6505    }
6506    ++start_verb;
6507    verb_len = RExC_parse - start_verb;
6508    if ( start_arg ) {
6509     RExC_parse++;
6510     while ( *RExC_parse && *RExC_parse != ')' )
6511      RExC_parse++;
6512     if ( *RExC_parse != ')' )
6513      vFAIL("Unterminated verb pattern argument");
6514     if ( RExC_parse == start_arg )
6515      start_arg = NULL;
6516    } else {
6517     if ( *RExC_parse != ')' )
6518      vFAIL("Unterminated verb pattern");
6519    }
6520
6521    switch ( *start_verb ) {
6522    case 'A':  /* (*ACCEPT) */
6523     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6524      op = ACCEPT;
6525      internal_argval = RExC_nestroot;
6526     }
6527     break;
6528    case 'C':  /* (*COMMIT) */
6529     if ( memEQs(start_verb,verb_len,"COMMIT") )
6530      op = COMMIT;
6531     break;
6532    case 'F':  /* (*FAIL) */
6533     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6534      op = OPFAIL;
6535      argok = 0;
6536     }
6537     break;
6538    case ':':  /* (*:NAME) */
6539    case 'M':  /* (*MARK:NAME) */
6540     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6541      op = MARKPOINT;
6542      argok = -1;
6543     }
6544     break;
6545    case 'P':  /* (*PRUNE) */
6546     if ( memEQs(start_verb,verb_len,"PRUNE") )
6547      op = PRUNE;
6548     break;
6549    case 'S':   /* (*SKIP) */
6550     if ( memEQs(start_verb,verb_len,"SKIP") )
6551      op = SKIP;
6552     break;
6553    case 'T':  /* (*THEN) */
6554     /* [19:06] <TimToady> :: is then */
6555     if ( memEQs(start_verb,verb_len,"THEN") ) {
6556      op = CUTGROUP;
6557      RExC_seen |= REG_SEEN_CUTGROUP;
6558     }
6559     break;
6560    }
6561    if ( ! op ) {
6562     RExC_parse++;
6563     vFAIL3("Unknown verb pattern '%.*s'",
6564      verb_len, start_verb);
6565    }
6566    if ( argok ) {
6567     if ( start_arg && internal_argval ) {
6568      vFAIL3("Verb pattern '%.*s' may not have an argument",
6569       verb_len, start_verb);
6570     } else if ( argok < 0 && !start_arg ) {
6571      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6572       verb_len, start_verb);
6573     } else {
6574      ret = reganode(pRExC_state, op, internal_argval);
6575      if ( ! internal_argval && ! SIZE_ONLY ) {
6576       if (start_arg) {
6577        SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6578        ARG(ret) = add_data( pRExC_state, 1, "S" );
6579        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6580        ret->flags = 0;
6581       } else {
6582        ret->flags = 1;
6583       }
6584      }
6585     }
6586     if (!internal_argval)
6587      RExC_seen |= REG_SEEN_VERBARG;
6588    } else if ( start_arg ) {
6589     vFAIL3("Verb pattern '%.*s' may not have an argument",
6590       verb_len, start_verb);
6591    } else {
6592     ret = reg_node(pRExC_state, op);
6593    }
6594    nextchar(pRExC_state);
6595    return ret;
6596   } else
6597   if (*RExC_parse == '?') { /* (?...) */
6598    bool is_logical = 0;
6599    const char * const seqstart = RExC_parse;
6600    bool has_use_defaults = FALSE;
6601
6602    RExC_parse++;
6603    paren = *RExC_parse++;
6604    ret = NULL;   /* For look-ahead/behind. */
6605    switch (paren) {
6606
6607    case 'P': /* (?P...) variants for those used to PCRE/Python */
6608     paren = *RExC_parse++;
6609     if ( paren == '<')         /* (?P<...>) named capture */
6610      goto named_capture;
6611     else if (paren == '>') {   /* (?P>name) named recursion */
6612      goto named_recursion;
6613     }
6614     else if (paren == '=') {   /* (?P=...)  named backref */
6615      /* this pretty much dupes the code for \k<NAME> in regatom(), if
6616      you change this make sure you change that */
6617      char* name_start = RExC_parse;
6618      U32 num = 0;
6619      SV *sv_dat = reg_scan_name(pRExC_state,
6620       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6621      if (RExC_parse == name_start || *RExC_parse != ')')
6622       vFAIL2("Sequence %.3s... not terminated",parse_start);
6623
6624      if (!SIZE_ONLY) {
6625       num = add_data( pRExC_state, 1, "S" );
6626       RExC_rxi->data->data[num]=(void*)sv_dat;
6627       SvREFCNT_inc_simple_void(sv_dat);
6628      }
6629      RExC_sawback = 1;
6630      ret = reganode(pRExC_state,
6631         ((! FOLD)
6632          ? NREF
6633          : (MORE_ASCII_RESTRICTED)
6634          ? NREFFA
6635          : (AT_LEAST_UNI_SEMANTICS)
6636           ? NREFFU
6637           : (LOC)
6638           ? NREFFL
6639           : NREFF),
6640          num);
6641      *flagp |= HASWIDTH;
6642
6643      Set_Node_Offset(ret, parse_start+1);
6644      Set_Node_Cur_Length(ret); /* MJD */
6645
6646      nextchar(pRExC_state);
6647      return ret;
6648     }
6649     RExC_parse++;
6650     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6651     /*NOTREACHED*/
6652    case '<':           /* (?<...) */
6653     if (*RExC_parse == '!')
6654      paren = ',';
6655     else if (*RExC_parse != '=')
6656    named_capture:
6657     {               /* (?<...>) */
6658      char *name_start;
6659      SV *svname;
6660      paren= '>';
6661    case '\'':          /* (?'...') */
6662       name_start= RExC_parse;
6663       svname = reg_scan_name(pRExC_state,
6664        SIZE_ONLY ?  /* reverse test from the others */
6665        REG_RSN_RETURN_NAME :
6666        REG_RSN_RETURN_NULL);
6667      if (RExC_parse == name_start) {
6668       RExC_parse++;
6669       vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6670       /*NOTREACHED*/
6671      }
6672      if (*RExC_parse != paren)
6673       vFAIL2("Sequence (?%c... not terminated",
6674        paren=='>' ? '<' : paren);
6675      if (SIZE_ONLY) {
6676       HE *he_str;
6677       SV *sv_dat = NULL;
6678       if (!svname) /* shouldn't happen */
6679        Perl_croak(aTHX_
6680         "panic: reg_scan_name returned NULL");
6681       if (!RExC_paren_names) {
6682        RExC_paren_names= newHV();
6683        sv_2mortal(MUTABLE_SV(RExC_paren_names));
6684 #ifdef DEBUGGING
6685        RExC_paren_name_list= newAV();
6686        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6687 #endif
6688       }
6689       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6690       if ( he_str )
6691        sv_dat = HeVAL(he_str);
6692       if ( ! sv_dat ) {
6693        /* croak baby croak */
6694        Perl_croak(aTHX_
6695         "panic: paren_name hash element allocation failed");
6696       } else if ( SvPOK(sv_dat) ) {
6697        /* (?|...) can mean we have dupes so scan to check
6698        its already been stored. Maybe a flag indicating
6699        we are inside such a construct would be useful,
6700        but the arrays are likely to be quite small, so
6701        for now we punt -- dmq */
6702        IV count = SvIV(sv_dat);
6703        I32 *pv = (I32*)SvPVX(sv_dat);
6704        IV i;
6705        for ( i = 0 ; i < count ; i++ ) {
6706         if ( pv[i] == RExC_npar ) {
6707          count = 0;
6708          break;
6709         }
6710        }
6711        if ( count ) {
6712         pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6713         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6714         pv[count] = RExC_npar;
6715         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6716        }
6717       } else {
6718        (void)SvUPGRADE(sv_dat,SVt_PVNV);
6719        sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6720        SvIOK_on(sv_dat);
6721        SvIV_set(sv_dat, 1);
6722       }
6723 #ifdef DEBUGGING
6724       if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6725        SvREFCNT_dec(svname);
6726 #endif
6727
6728       /*sv_dump(sv_dat);*/
6729      }
6730      nextchar(pRExC_state);
6731      paren = 1;
6732      goto capturing_parens;
6733     }
6734     RExC_seen |= REG_SEEN_LOOKBEHIND;
6735     RExC_in_lookbehind++;
6736     RExC_parse++;
6737    case '=':           /* (?=...) */
6738     RExC_seen_zerolen++;
6739     break;
6740    case '!':           /* (?!...) */
6741     RExC_seen_zerolen++;
6742     if (*RExC_parse == ')') {
6743      ret=reg_node(pRExC_state, OPFAIL);
6744      nextchar(pRExC_state);
6745      return ret;
6746     }
6747     break;
6748    case '|':           /* (?|...) */
6749     /* branch reset, behave like a (?:...) except that
6750     buffers in alternations share the same numbers */
6751     paren = ':';
6752     after_freeze = freeze_paren = RExC_npar;
6753     break;
6754    case ':':           /* (?:...) */
6755    case '>':           /* (?>...) */
6756     break;
6757    case '$':           /* (?$...) */
6758    case '@':           /* (?@...) */
6759     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6760     break;
6761    case '#':           /* (?#...) */
6762     while (*RExC_parse && *RExC_parse != ')')
6763      RExC_parse++;
6764     if (*RExC_parse != ')')
6765      FAIL("Sequence (?#... not terminated");
6766     nextchar(pRExC_state);
6767     *flagp = TRYAGAIN;
6768     return NULL;
6769    case '0' :           /* (?0) */
6770    case 'R' :           /* (?R) */
6771     if (*RExC_parse != ')')
6772      FAIL("Sequence (?R) not terminated");
6773     ret = reg_node(pRExC_state, GOSTART);
6774     *flagp |= POSTPONED;
6775     nextchar(pRExC_state);
6776     return ret;
6777     /*notreached*/
6778    { /* named and numeric backreferences */
6779     I32 num;
6780    case '&':            /* (?&NAME) */
6781     parse_start = RExC_parse - 1;
6782    named_recursion:
6783     {
6784       SV *sv_dat = reg_scan_name(pRExC_state,
6785        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6786       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6787     }
6788     goto gen_recurse_regop;
6789     /* NOT REACHED */
6790    case '+':
6791     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6792      RExC_parse++;
6793      vFAIL("Illegal pattern");
6794     }
6795     goto parse_recursion;
6796     /* NOT REACHED*/
6797    case '-': /* (?-1) */
6798     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6799      RExC_parse--; /* rewind to let it be handled later */
6800      goto parse_flags;
6801     }
6802     /*FALLTHROUGH */
6803    case '1': case '2': case '3': case '4': /* (?1) */
6804    case '5': case '6': case '7': case '8': case '9':
6805     RExC_parse--;
6806    parse_recursion:
6807     num = atoi(RExC_parse);
6808     parse_start = RExC_parse - 1; /* MJD */
6809     if (*RExC_parse == '-')
6810      RExC_parse++;
6811     while (isDIGIT(*RExC_parse))
6812       RExC_parse++;
6813     if (*RExC_parse!=')')
6814      vFAIL("Expecting close bracket");
6815
6816    gen_recurse_regop:
6817     if ( paren == '-' ) {
6818      /*
6819      Diagram of capture buffer numbering.
6820      Top line is the normal capture buffer numbers
6821      Bottom line is the negative indexing as from
6822      the X (the (?-2))
6823
6824      +   1 2    3 4 5 X          6 7
6825      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6826      -   5 4    3 2 1 X          x x
6827
6828      */
6829      num = RExC_npar + num;
6830      if (num < 1)  {
6831       RExC_parse++;
6832       vFAIL("Reference to nonexistent group");
6833      }
6834     } else if ( paren == '+' ) {
6835      num = RExC_npar + num - 1;
6836     }
6837
6838     ret = reganode(pRExC_state, GOSUB, num);
6839     if (!SIZE_ONLY) {
6840      if (num > (I32)RExC_rx->nparens) {
6841       RExC_parse++;
6842       vFAIL("Reference to nonexistent group");
6843      }
6844      ARG2L_SET( ret, RExC_recurse_count++);
6845      RExC_emit++;
6846      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6847       "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6848     } else {
6849      RExC_size++;
6850      }
6851      RExC_seen |= REG_SEEN_RECURSE;
6852     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6853     Set_Node_Offset(ret, parse_start); /* MJD */
6854
6855     *flagp |= POSTPONED;
6856     nextchar(pRExC_state);
6857     return ret;
6858    } /* named and numeric backreferences */
6859    /* NOT REACHED */
6860
6861    case '?':           /* (??...) */
6862     is_logical = 1;
6863     if (*RExC_parse != '{') {
6864      RExC_parse++;
6865      vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6866      /*NOTREACHED*/
6867     }
6868     *flagp |= POSTPONED;
6869     paren = *RExC_parse++;
6870     /* FALL THROUGH */
6871    case '{':           /* (?{...}) */
6872    {
6873     I32 count = 1;
6874     U32 n = 0;
6875     char c;
6876     char *s = RExC_parse;
6877
6878     RExC_seen_zerolen++;
6879     RExC_seen |= REG_SEEN_EVAL;
6880     while (count && (c = *RExC_parse)) {
6881      if (c == '\\') {
6882       if (RExC_parse[1])
6883        RExC_parse++;
6884      }
6885      else if (c == '{')
6886       count++;
6887      else if (c == '}')
6888       count--;
6889      RExC_parse++;
6890     }
6891     if (*RExC_parse != ')') {
6892      RExC_parse = s;
6893      vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6894     }
6895     if (!SIZE_ONLY) {
6896      PAD *pad;
6897      OP_4tree *sop, *rop;
6898      SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6899
6900      ENTER;
6901      Perl_save_re_context(aTHX);
6902      rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6903      sop->op_private |= OPpREFCOUNTED;
6904      /* re_dup will OpREFCNT_inc */
6905      OpREFCNT_set(sop, 1);
6906      LEAVE;
6907
6908      n = add_data(pRExC_state, 3, "nop");
6909      RExC_rxi->data->data[n] = (void*)rop;
6910      RExC_rxi->data->data[n+1] = (void*)sop;
6911      RExC_rxi->data->data[n+2] = (void*)pad;
6912      SvREFCNT_dec(sv);
6913     }
6914     else {      /* First pass */
6915      if (PL_reginterp_cnt < ++RExC_seen_evals
6916       && IN_PERL_RUNTIME)
6917       /* No compiled RE interpolated, has runtime
6918       components ===> unsafe.  */
6919       FAIL("Eval-group not allowed at runtime, use re 'eval'");
6920      if (PL_tainting && PL_tainted)
6921       FAIL("Eval-group in insecure regular expression");
6922 #if PERL_VERSION > 8
6923      if (IN_PERL_COMPILETIME)
6924       PL_cv_has_eval = 1;
6925 #endif
6926     }
6927
6928     nextchar(pRExC_state);
6929     if (is_logical) {
6930      ret = reg_node(pRExC_state, LOGICAL);
6931      if (!SIZE_ONLY)
6932       ret->flags = 2;
6933      REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6934      /* deal with the length of this later - MJD */
6935      return ret;
6936     }
6937     ret = reganode(pRExC_state, EVAL, n);
6938     Set_Node_Length(ret, RExC_parse - parse_start + 1);
6939     Set_Node_Offset(ret, parse_start);
6940     return ret;
6941    }
6942    case '(':           /* (?(?{...})...) and (?(?=...)...) */
6943    {
6944     int is_define= 0;
6945     if (RExC_parse[0] == '?') {        /* (?(?...)) */
6946      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6947       || RExC_parse[1] == '<'
6948       || RExC_parse[1] == '{') { /* Lookahead or eval. */
6949       I32 flag;
6950
6951       ret = reg_node(pRExC_state, LOGICAL);
6952       if (!SIZE_ONLY)
6953        ret->flags = 1;
6954       REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6955       goto insert_if;
6956      }
6957     }
6958     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6959       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6960     {
6961      char ch = RExC_parse[0] == '<' ? '>' : '\'';
6962      char *name_start= RExC_parse++;
6963      U32 num = 0;
6964      SV *sv_dat=reg_scan_name(pRExC_state,
6965       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6966      if (RExC_parse == name_start || *RExC_parse != ch)
6967       vFAIL2("Sequence (?(%c... not terminated",
6968        (ch == '>' ? '<' : ch));
6969      RExC_parse++;
6970      if (!SIZE_ONLY) {
6971       num = add_data( pRExC_state, 1, "S" );
6972       RExC_rxi->data->data[num]=(void*)sv_dat;
6973       SvREFCNT_inc_simple_void(sv_dat);
6974      }
6975      ret = reganode(pRExC_state,NGROUPP,num);
6976      goto insert_if_check_paren;
6977     }
6978     else if (RExC_parse[0] == 'D' &&
6979       RExC_parse[1] == 'E' &&
6980       RExC_parse[2] == 'F' &&
6981       RExC_parse[3] == 'I' &&
6982       RExC_parse[4] == 'N' &&
6983       RExC_parse[5] == 'E')
6984     {
6985      ret = reganode(pRExC_state,DEFINEP,0);
6986      RExC_parse +=6 ;
6987      is_define = 1;
6988      goto insert_if_check_paren;
6989     }
6990     else if (RExC_parse[0] == 'R') {
6991      RExC_parse++;
6992      parno = 0;
6993      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6994       parno = atoi(RExC_parse++);
6995       while (isDIGIT(*RExC_parse))
6996        RExC_parse++;
6997      } else if (RExC_parse[0] == '&') {
6998       SV *sv_dat;
6999       RExC_parse++;
7000       sv_dat = reg_scan_name(pRExC_state,
7001         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7002        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7003      }
7004      ret = reganode(pRExC_state,INSUBP,parno);
7005      goto insert_if_check_paren;
7006     }
7007     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7008      /* (?(1)...) */
7009      char c;
7010      parno = atoi(RExC_parse++);
7011
7012      while (isDIGIT(*RExC_parse))
7013       RExC_parse++;
7014      ret = reganode(pRExC_state, GROUPP, parno);
7015
7016     insert_if_check_paren:
7017      if ((c = *nextchar(pRExC_state)) != ')')
7018       vFAIL("Switch condition not recognized");
7019     insert_if:
7020      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
7021      br = regbranch(pRExC_state, &flags, 1,depth+1);
7022      if (br == NULL)
7023       br = reganode(pRExC_state, LONGJMP, 0);
7024      else
7025       REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
7026      c = *nextchar(pRExC_state);
7027      if (flags&HASWIDTH)
7028       *flagp |= HASWIDTH;
7029      if (c == '|') {
7030       if (is_define)
7031        vFAIL("(?(DEFINE)....) does not allow branches");
7032       lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7033       regbranch(pRExC_state, &flags, 1,depth+1);
7034       REGTAIL(pRExC_state, ret, lastbr);
7035       if (flags&HASWIDTH)
7036        *flagp |= HASWIDTH;
7037       c = *nextchar(pRExC_state);
7038      }
7039      else
7040       lastbr = NULL;
7041      if (c != ')')
7042       vFAIL("Switch (?(condition)... contains too many branches");
7043      ender = reg_node(pRExC_state, TAIL);
7044      REGTAIL(pRExC_state, br, ender);
7045      if (lastbr) {
7046       REGTAIL(pRExC_state, lastbr, ender);
7047       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7048      }
7049      else
7050       REGTAIL(pRExC_state, ret, ender);
7051      RExC_size++; /* XXX WHY do we need this?!!
7052          For large programs it seems to be required
7053          but I can't figure out why. -- dmq*/
7054      return ret;
7055     }
7056     else {
7057      vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7058     }
7059    }
7060    case 0:
7061     RExC_parse--; /* for vFAIL to print correctly */
7062     vFAIL("Sequence (? incomplete");
7063     break;
7064    case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
7065          that follow */
7066     has_use_defaults = TRUE;
7067     STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7068     set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
7069             ? REGEX_UNICODE_CHARSET
7070             : REGEX_DEPENDS_CHARSET);
7071     goto parse_flags;
7072    default:
7073     --RExC_parse;
7074     parse_flags:      /* (?i) */
7075    {
7076     U32 posflags = 0, negflags = 0;
7077     U32 *flagsp = &posflags;
7078     char has_charset_modifier = '\0';
7079     regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
7080          ? REGEX_UNICODE_CHARSET
7081          : REGEX_DEPENDS_CHARSET;
7082
7083     while (*RExC_parse) {
7084      /* && strchr("iogcmsx", *RExC_parse) */
7085      /* (?g), (?gc) and (?o) are useless here
7086      and must be globally applied -- japhy */
7087      switch (*RExC_parse) {
7088      CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7089      case LOCALE_PAT_MOD:
7090       if (has_charset_modifier) {
7091        goto excess_modifier;
7092       }
7093       else if (flagsp == &negflags) {
7094        goto neg_modifier;
7095       }
7096       cs = REGEX_LOCALE_CHARSET;
7097       has_charset_modifier = LOCALE_PAT_MOD;
7098       RExC_contains_locale = 1;
7099       break;
7100      case UNICODE_PAT_MOD:
7101       if (has_charset_modifier) {
7102        goto excess_modifier;
7103       }
7104       else if (flagsp == &negflags) {
7105        goto neg_modifier;
7106       }
7107       cs = REGEX_UNICODE_CHARSET;
7108       has_charset_modifier = UNICODE_PAT_MOD;
7109       break;
7110      case ASCII_RESTRICT_PAT_MOD:
7111       if (flagsp == &negflags) {
7112        goto neg_modifier;
7113       }
7114       if (has_charset_modifier) {
7115        if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
7116         goto excess_modifier;
7117        }
7118        /* Doubled modifier implies more restricted */
7119        cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7120       }
7121       else {
7122        cs = REGEX_ASCII_RESTRICTED_CHARSET;
7123       }
7124       has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
7125       break;
7126      case DEPENDS_PAT_MOD:
7127       if (has_use_defaults) {
7128        goto fail_modifiers;
7129       }
7130       else if (flagsp == &negflags) {
7131        goto neg_modifier;
7132       }
7133       else if (has_charset_modifier) {
7134        goto excess_modifier;
7135       }
7136
7137       /* The dual charset means unicode semantics if the
7138       * pattern (or target, not known until runtime) are
7139       * utf8, or something in the pattern indicates unicode
7140       * semantics */
7141       cs = (RExC_utf8 || RExC_uni_semantics)
7142        ? REGEX_UNICODE_CHARSET
7143        : REGEX_DEPENDS_CHARSET;
7144       has_charset_modifier = DEPENDS_PAT_MOD;
7145       break;
7146      excess_modifier:
7147       RExC_parse++;
7148       if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
7149        vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
7150       }
7151       else if (has_charset_modifier == *(RExC_parse - 1)) {
7152        vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
7153       }
7154       else {
7155        vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
7156       }
7157       /*NOTREACHED*/
7158      neg_modifier:
7159       RExC_parse++;
7160       vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
7161       /*NOTREACHED*/
7162      case ONCE_PAT_MOD: /* 'o' */
7163      case GLOBAL_PAT_MOD: /* 'g' */
7164       if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7165        const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7166        if (! (wastedflags & wflagbit) ) {
7167         wastedflags |= wflagbit;
7168         vWARN5(
7169          RExC_parse + 1,
7170          "Useless (%s%c) - %suse /%c modifier",
7171          flagsp == &negflags ? "?-" : "?",
7172          *RExC_parse,
7173          flagsp == &negflags ? "don't " : "",
7174          *RExC_parse
7175         );
7176        }
7177       }
7178       break;
7179
7180      case CONTINUE_PAT_MOD: /* 'c' */
7181       if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7182        if (! (wastedflags & WASTED_C) ) {
7183         wastedflags |= WASTED_GC;
7184         vWARN3(
7185          RExC_parse + 1,
7186          "Useless (%sc) - %suse /gc modifier",
7187          flagsp == &negflags ? "?-" : "?",
7188          flagsp == &negflags ? "don't " : ""
7189         );
7190        }
7191       }
7192       break;
7193      case KEEPCOPY_PAT_MOD: /* 'p' */
7194       if (flagsp == &negflags) {
7195        if (SIZE_ONLY)
7196         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7197       } else {
7198        *flagsp |= RXf_PMf_KEEPCOPY;
7199       }
7200       break;
7201      case '-':
7202       /* A flag is a default iff it is following a minus, so
7203       * if there is a minus, it means will be trying to
7204       * re-specify a default which is an error */
7205       if (has_use_defaults || flagsp == &negflags) {
7206    fail_modifiers:
7207        RExC_parse++;
7208        vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7209        /*NOTREACHED*/
7210       }
7211       flagsp = &negflags;
7212       wastedflags = 0;  /* reset so (?g-c) warns twice */
7213       break;
7214      case ':':
7215       paren = ':';
7216       /*FALLTHROUGH*/
7217      case ')':
7218       RExC_flags |= posflags;
7219       RExC_flags &= ~negflags;
7220       set_regex_charset(&RExC_flags, cs);
7221       if (paren != ':') {
7222        oregflags |= posflags;
7223        oregflags &= ~negflags;
7224        set_regex_charset(&oregflags, cs);
7225       }
7226       nextchar(pRExC_state);
7227       if (paren != ':') {
7228        *flagp = TRYAGAIN;
7229        return NULL;
7230       } else {
7231        ret = NULL;
7232        goto parse_rest;
7233       }
7234       /*NOTREACHED*/
7235      default:
7236       RExC_parse++;
7237       vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7238       /*NOTREACHED*/
7239      }
7240      ++RExC_parse;
7241     }
7242    }} /* one for the default block, one for the switch */
7243   }
7244   else {                  /* (...) */
7245   capturing_parens:
7246    parno = RExC_npar;
7247    RExC_npar++;
7248
7249    ret = reganode(pRExC_state, OPEN, parno);
7250    if (!SIZE_ONLY ){
7251     if (!RExC_nestroot)
7252      RExC_nestroot = parno;
7253     if (RExC_seen & REG_SEEN_RECURSE
7254      && !RExC_open_parens[parno-1])
7255     {
7256      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7257       "Setting open paren #%"IVdf" to %d\n",
7258       (IV)parno, REG_NODE_NUM(ret)));
7259      RExC_open_parens[parno-1]= ret;
7260     }
7261    }
7262    Set_Node_Length(ret, 1); /* MJD */
7263    Set_Node_Offset(ret, RExC_parse); /* MJD */
7264    is_open = 1;
7265   }
7266  }
7267  else                        /* ! paren */
7268   ret = NULL;
7269
7270    parse_rest:
7271  /* Pick up the branches, linking them together. */
7272  parse_start = RExC_parse;   /* MJD */
7273  br = regbranch(pRExC_state, &flags, 1,depth+1);
7274
7275  /*     branch_len = (paren != 0); */
7276
7277  if (br == NULL)
7278   return(NULL);
7279  if (*RExC_parse == '|') {
7280   if (!SIZE_ONLY && RExC_extralen) {
7281    reginsert(pRExC_state, BRANCHJ, br, depth+1);
7282   }
7283   else {                  /* MJD */
7284    reginsert(pRExC_state, BRANCH, br, depth+1);
7285    Set_Node_Length(br, paren != 0);
7286    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7287   }
7288   have_branch = 1;
7289   if (SIZE_ONLY)
7290    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
7291  }
7292  else if (paren == ':') {
7293   *flagp |= flags&SIMPLE;
7294  }
7295  if (is_open) {    /* Starts with OPEN. */
7296   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
7297  }
7298  else if (paren != '?')  /* Not Conditional */
7299   ret = br;
7300  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7301  lastbr = br;
7302  while (*RExC_parse == '|') {
7303   if (!SIZE_ONLY && RExC_extralen) {
7304    ender = reganode(pRExC_state, LONGJMP,0);
7305    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7306   }
7307   if (SIZE_ONLY)
7308    RExC_extralen += 2;  /* Account for LONGJMP. */
7309   nextchar(pRExC_state);
7310   if (freeze_paren) {
7311    if (RExC_npar > after_freeze)
7312     after_freeze = RExC_npar;
7313    RExC_npar = freeze_paren;
7314   }
7315   br = regbranch(pRExC_state, &flags, 0, depth+1);
7316
7317   if (br == NULL)
7318    return(NULL);
7319   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
7320   lastbr = br;
7321   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7322  }
7323
7324  if (have_branch || paren != ':') {
7325   /* Make a closing node, and hook it on the end. */
7326   switch (paren) {
7327   case ':':
7328    ender = reg_node(pRExC_state, TAIL);
7329    break;
7330   case 1:
7331    ender = reganode(pRExC_state, CLOSE, parno);
7332    if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7333     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7334       "Setting close paren #%"IVdf" to %d\n",
7335       (IV)parno, REG_NODE_NUM(ender)));
7336     RExC_close_parens[parno-1]= ender;
7337     if (RExC_nestroot == parno)
7338      RExC_nestroot = 0;
7339    }
7340    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7341    Set_Node_Length(ender,1); /* MJD */
7342    break;
7343   case '<':
7344   case ',':
7345   case '=':
7346   case '!':
7347    *flagp &= ~HASWIDTH;
7348    /* FALL THROUGH */
7349   case '>':
7350    ender = reg_node(pRExC_state, SUCCEED);
7351    break;
7352   case 0:
7353    ender = reg_node(pRExC_state, END);
7354    if (!SIZE_ONLY) {
7355     assert(!RExC_opend); /* there can only be one! */
7356     RExC_opend = ender;
7357    }
7358    break;
7359   }
7360   REGTAIL(pRExC_state, lastbr, ender);
7361
7362   if (have_branch && !SIZE_ONLY) {
7363    if (depth==1)
7364     RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7365
7366    /* Hook the tails of the branches to the closing node. */
7367    for (br = ret; br; br = regnext(br)) {
7368     const U8 op = PL_regkind[OP(br)];
7369     if (op == BRANCH) {
7370      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7371     }
7372     else if (op == BRANCHJ) {
7373      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7374     }
7375    }
7376   }
7377  }
7378
7379  {
7380   const char *p;
7381   static const char parens[] = "=!<,>";
7382
7383   if (paren && (p = strchr(parens, paren))) {
7384    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7385    int flag = (p - parens) > 1;
7386
7387    if (paren == '>')
7388     node = SUSPEND, flag = 0;
7389    reginsert(pRExC_state, node,ret, depth+1);
7390    Set_Node_Cur_Length(ret);
7391    Set_Node_Offset(ret, parse_start + 1);
7392    ret->flags = flag;
7393    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7394   }
7395  }
7396
7397  /* Check for proper termination. */
7398  if (paren) {
7399   RExC_flags = oregflags;
7400   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7401    RExC_parse = oregcomp_parse;
7402    vFAIL("Unmatched (");
7403   }
7404  }
7405  else if (!paren && RExC_parse < RExC_end) {
7406   if (*RExC_parse == ')') {
7407    RExC_parse++;
7408    vFAIL("Unmatched )");
7409   }
7410   else
7411    FAIL("Junk on end of regexp"); /* "Can't happen". */
7412   /* NOTREACHED */
7413  }
7414
7415  if (RExC_in_lookbehind) {
7416   RExC_in_lookbehind--;
7417  }
7418  if (after_freeze > RExC_npar)
7419   RExC_npar = after_freeze;
7420  return(ret);
7421 }
7422
7423 /*
7424  - regbranch - one alternative of an | operator
7425  *
7426  * Implements the concatenation operator.
7427  */
7428 STATIC regnode *
7429 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7430 {
7431  dVAR;
7432  register regnode *ret;
7433  register regnode *chain = NULL;
7434  register regnode *latest;
7435  I32 flags = 0, c = 0;
7436  GET_RE_DEBUG_FLAGS_DECL;
7437
7438  PERL_ARGS_ASSERT_REGBRANCH;
7439
7440  DEBUG_PARSE("brnc");
7441
7442  if (first)
7443   ret = NULL;
7444  else {
7445   if (!SIZE_ONLY && RExC_extralen)
7446    ret = reganode(pRExC_state, BRANCHJ,0);
7447   else {
7448    ret = reg_node(pRExC_state, BRANCH);
7449    Set_Node_Length(ret, 1);
7450   }
7451  }
7452
7453  if (!first && SIZE_ONLY)
7454   RExC_extralen += 1;   /* BRANCHJ */
7455
7456  *flagp = WORST;   /* Tentatively. */
7457
7458  RExC_parse--;
7459  nextchar(pRExC_state);
7460  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7461   flags &= ~TRYAGAIN;
7462   latest = regpiece(pRExC_state, &flags,depth+1);
7463   if (latest == NULL) {
7464    if (flags & TRYAGAIN)
7465     continue;
7466    return(NULL);
7467   }
7468   else if (ret == NULL)
7469    ret = latest;
7470   *flagp |= flags&(HASWIDTH|POSTPONED);
7471   if (chain == NULL)  /* First piece. */
7472    *flagp |= flags&SPSTART;
7473   else {
7474    RExC_naughty++;
7475    REGTAIL(pRExC_state, chain, latest);
7476   }
7477   chain = latest;
7478   c++;
7479  }
7480  if (chain == NULL) { /* Loop ran zero times. */
7481   chain = reg_node(pRExC_state, NOTHING);
7482   if (ret == NULL)
7483    ret = chain;
7484  }
7485  if (c == 1) {
7486   *flagp |= flags&SIMPLE;
7487  }
7488
7489  return ret;
7490 }
7491
7492 /*
7493  - regpiece - something followed by possible [*+?]
7494  *
7495  * Note that the branching code sequences used for ? and the general cases
7496  * of * and + are somewhat optimized:  they use the same NOTHING node as
7497  * both the endmarker for their branch list and the body of the last branch.
7498  * It might seem that this node could be dispensed with entirely, but the
7499  * endmarker role is not redundant.
7500  */
7501 STATIC regnode *
7502 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7503 {
7504  dVAR;
7505  register regnode *ret;
7506  register char op;
7507  register char *next;
7508  I32 flags;
7509  const char * const origparse = RExC_parse;
7510  I32 min;
7511  I32 max = REG_INFTY;
7512  char *parse_start;
7513  const char *maxpos = NULL;
7514  GET_RE_DEBUG_FLAGS_DECL;
7515
7516  PERL_ARGS_ASSERT_REGPIECE;
7517
7518  DEBUG_PARSE("piec");
7519
7520  ret = regatom(pRExC_state, &flags,depth+1);
7521  if (ret == NULL) {
7522   if (flags & TRYAGAIN)
7523    *flagp |= TRYAGAIN;
7524   return(NULL);
7525  }
7526
7527  op = *RExC_parse;
7528
7529  if (op == '{' && regcurly(RExC_parse)) {
7530   maxpos = NULL;
7531   parse_start = RExC_parse; /* MJD */
7532   next = RExC_parse + 1;
7533   while (isDIGIT(*next) || *next == ',') {
7534    if (*next == ',') {
7535     if (maxpos)
7536      break;
7537     else
7538      maxpos = next;
7539    }
7540    next++;
7541   }
7542   if (*next == '}') {  /* got one */
7543    if (!maxpos)
7544     maxpos = next;
7545    RExC_parse++;
7546    min = atoi(RExC_parse);
7547    if (*maxpos == ',')
7548     maxpos++;
7549    else
7550     maxpos = RExC_parse;
7551    max = atoi(maxpos);
7552    if (!max && *maxpos != '0')
7553     max = REG_INFTY;  /* meaning "infinity" */
7554    else if (max >= REG_INFTY)
7555     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7556    RExC_parse = next;
7557    nextchar(pRExC_state);
7558
7559   do_curly:
7560    if ((flags&SIMPLE)) {
7561     RExC_naughty += 2 + RExC_naughty / 2;
7562     reginsert(pRExC_state, CURLY, ret, depth+1);
7563     Set_Node_Offset(ret, parse_start+1); /* MJD */
7564     Set_Node_Cur_Length(ret);
7565    }
7566    else {
7567     regnode * const w = reg_node(pRExC_state, WHILEM);
7568
7569     w->flags = 0;
7570     REGTAIL(pRExC_state, ret, w);
7571     if (!SIZE_ONLY && RExC_extralen) {
7572      reginsert(pRExC_state, LONGJMP,ret, depth+1);
7573      reginsert(pRExC_state, NOTHING,ret, depth+1);
7574      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
7575     }
7576     reginsert(pRExC_state, CURLYX,ret, depth+1);
7577         /* MJD hk */
7578     Set_Node_Offset(ret, parse_start+1);
7579     Set_Node_Length(ret,
7580         op == '{' ? (RExC_parse - parse_start) : 1);
7581
7582     if (!SIZE_ONLY && RExC_extralen)
7583      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
7584     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7585     if (SIZE_ONLY)
7586      RExC_whilem_seen++, RExC_extralen += 3;
7587     RExC_naughty += 4 + RExC_naughty; /* compound interest */
7588    }
7589    ret->flags = 0;
7590
7591    if (min > 0)
7592     *flagp = WORST;
7593    if (max > 0)
7594     *flagp |= HASWIDTH;
7595    if (max < min)
7596     vFAIL("Can't do {n,m} with n > m");
7597    if (!SIZE_ONLY) {
7598     ARG1_SET(ret, (U16)min);
7599     ARG2_SET(ret, (U16)max);
7600    }
7601
7602    goto nest_check;
7603   }
7604  }
7605
7606  if (!ISMULT1(op)) {
7607   *flagp = flags;
7608   return(ret);
7609  }
7610
7611 #if 0    /* Now runtime fix should be reliable. */
7612
7613  /* if this is reinstated, don't forget to put this back into perldiag:
7614
7615    =item Regexp *+ operand could be empty at {#} in regex m/%s/
7616
7617   (F) The part of the regexp subject to either the * or + quantifier
7618   could match an empty string. The {#} shows in the regular
7619   expression about where the problem was discovered.
7620
7621  */
7622
7623  if (!(flags&HASWIDTH) && op != '?')
7624  vFAIL("Regexp *+ operand could be empty");
7625 #endif
7626
7627  parse_start = RExC_parse;
7628  nextchar(pRExC_state);
7629
7630  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7631
7632  if (op == '*' && (flags&SIMPLE)) {
7633   reginsert(pRExC_state, STAR, ret, depth+1);
7634   ret->flags = 0;
7635   RExC_naughty += 4;
7636  }
7637  else if (op == '*') {
7638   min = 0;
7639   goto do_curly;
7640  }
7641  else if (op == '+' && (flags&SIMPLE)) {
7642   reginsert(pRExC_state, PLUS, ret, depth+1);
7643   ret->flags = 0;
7644   RExC_naughty += 3;
7645  }
7646  else if (op == '+') {
7647   min = 1;
7648   goto do_curly;
7649  }
7650  else if (op == '?') {
7651   min = 0; max = 1;
7652   goto do_curly;
7653  }
7654   nest_check:
7655  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7656   ckWARN3reg(RExC_parse,
7657     "%.*s matches null string many times",
7658     (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7659     origparse);
7660  }
7661
7662  if (RExC_parse < RExC_end && *RExC_parse == '?') {
7663   nextchar(pRExC_state);
7664   reginsert(pRExC_state, MINMOD, ret, depth+1);
7665   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7666  }
7667 #ifndef REG_ALLOW_MINMOD_SUSPEND
7668  else
7669 #endif
7670  if (RExC_parse < RExC_end && *RExC_parse == '+') {
7671   regnode *ender;
7672   nextchar(pRExC_state);
7673   ender = reg_node(pRExC_state, SUCCEED);
7674   REGTAIL(pRExC_state, ret, ender);
7675   reginsert(pRExC_state, SUSPEND, ret, depth+1);
7676   ret->flags = 0;
7677   ender = reg_node(pRExC_state, TAIL);
7678   REGTAIL(pRExC_state, ret, ender);
7679   /*ret= ender;*/
7680  }
7681
7682  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7683   RExC_parse++;
7684   vFAIL("Nested quantifiers");
7685  }
7686
7687  return(ret);
7688 }
7689
7690
7691 /* reg_namedseq(pRExC_state,UVp, UV depth)
7692
7693    This is expected to be called by a parser routine that has
7694    recognized '\N' and needs to handle the rest. RExC_parse is
7695    expected to point at the first char following the N at the time
7696    of the call.
7697
7698    The \N may be inside (indicated by valuep not being NULL) or outside a
7699    character class.
7700
7701    \N may begin either a named sequence, or if outside a character class, mean
7702    to match a non-newline.  For non single-quoted regexes, the tokenizer has
7703    attempted to decide which, and in the case of a named sequence converted it
7704    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7705    where c1... are the characters in the sequence.  For single-quoted regexes,
7706    the tokenizer passes the \N sequence through unchanged; this code will not
7707    attempt to determine this nor expand those.  The net effect is that if the
7708    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7709    signals that this \N occurrence means to match a non-newline.
7710
7711    Only the \N{U+...} form should occur in a character class, for the same
7712    reason that '.' inside a character class means to just match a period: it
7713    just doesn't make sense.
7714
7715    If valuep is non-null then it is assumed that we are parsing inside
7716    of a charclass definition and the first codepoint in the resolved
7717    string is returned via *valuep and the routine will return NULL.
7718    In this mode if a multichar string is returned from the charnames
7719    handler, a warning will be issued, and only the first char in the
7720    sequence will be examined. If the string returned is zero length
7721    then the value of *valuep is undefined and NON-NULL will
7722    be returned to indicate failure. (This will NOT be a valid pointer
7723    to a regnode.)
7724
7725    If valuep is null then it is assumed that we are parsing normal text and a
7726    new EXACT node is inserted into the program containing the resolved string,
7727    and a pointer to the new node is returned.  But if the string is zero length
7728    a NOTHING node is emitted instead.
7729
7730    On success RExC_parse is set to the char following the endbrace.
7731    Parsing failures will generate a fatal error via vFAIL(...)
7732  */
7733 STATIC regnode *
7734 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
7735 {
7736  char * endbrace;    /* '}' following the name */
7737  regnode *ret = NULL;
7738  char* p;
7739
7740  GET_RE_DEBUG_FLAGS_DECL;
7741
7742  PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7743
7744  GET_RE_DEBUG_FLAGS;
7745
7746  /* The [^\n] meaning of \N ignores spaces and comments under the /x
7747  * modifier.  The other meaning does not */
7748  p = (RExC_flags & RXf_PMf_EXTENDED)
7749   ? regwhite( pRExC_state, RExC_parse )
7750   : RExC_parse;
7751
7752  /* Disambiguate between \N meaning a named character versus \N meaning
7753  * [^\n].  The former is assumed when it can't be the latter. */
7754  if (*p != '{' || regcurly(p)) {
7755   RExC_parse = p;
7756   if (valuep) {
7757    /* no bare \N in a charclass */
7758    vFAIL("\\N in a character class must be a named character: \\N{...}");
7759   }
7760   nextchar(pRExC_state);
7761   ret = reg_node(pRExC_state, REG_ANY);
7762   *flagp |= HASWIDTH|SIMPLE;
7763   RExC_naughty++;
7764   RExC_parse--;
7765   Set_Node_Length(ret, 1); /* MJD */
7766   return ret;
7767  }
7768
7769  /* Here, we have decided it should be a named sequence */
7770
7771  /* The test above made sure that the next real character is a '{', but
7772  * under the /x modifier, it could be separated by space (or a comment and
7773  * \n) and this is not allowed (for consistency with \x{...} and the
7774  * tokenizer handling of \N{NAME}). */
7775  if (*RExC_parse != '{') {
7776   vFAIL("Missing braces on \\N{}");
7777  }
7778
7779  RExC_parse++; /* Skip past the '{' */
7780
7781  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7782   || ! (endbrace == RExC_parse  /* nothing between the {} */
7783    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
7784     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7785  {
7786   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
7787   vFAIL("\\N{NAME} must be resolved by the lexer");
7788  }
7789
7790  if (endbrace == RExC_parse) {   /* empty: \N{} */
7791   if (! valuep) {
7792    RExC_parse = endbrace + 1;
7793    return reg_node(pRExC_state,NOTHING);
7794   }
7795
7796   if (SIZE_ONLY) {
7797    ckWARNreg(RExC_parse,
7798      "Ignoring zero length \\N{} in character class"
7799    );
7800    RExC_parse = endbrace + 1;
7801   }
7802   *valuep = 0;
7803   return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7804  }
7805
7806  REQUIRE_UTF8; /* named sequences imply Unicode semantics */
7807  RExC_parse += 2; /* Skip past the 'U+' */
7808
7809  if (valuep) {   /* In a bracketed char class */
7810   /* We only pay attention to the first char of
7811   multichar strings being returned. I kinda wonder
7812   if this makes sense as it does change the behaviour
7813   from earlier versions, OTOH that behaviour was broken
7814   as well. XXX Solution is to recharacterize as
7815   [rest-of-class]|multi1|multi2... */
7816
7817   STRLEN length_of_hex;
7818   I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7819    | PERL_SCAN_DISALLOW_PREFIX
7820    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7821
7822   char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7823   if (endchar < endbrace) {
7824    ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7825   }
7826
7827   length_of_hex = (STRLEN)(endchar - RExC_parse);
7828   *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7829
7830   /* The tokenizer should have guaranteed validity, but it's possible to
7831   * bypass it by using single quoting, so check */
7832   if (length_of_hex == 0
7833    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7834   {
7835    RExC_parse += length_of_hex; /* Includes all the valid */
7836    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7837        ? UTF8SKIP(RExC_parse)
7838        : 1;
7839    /* Guard against malformed utf8 */
7840    if (RExC_parse >= endchar) RExC_parse = endchar;
7841    vFAIL("Invalid hexadecimal number in \\N{U+...}");
7842   }
7843
7844   RExC_parse = endbrace + 1;
7845   if (endchar == endbrace) return NULL;
7846
7847   ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
7848  }
7849  else { /* Not a char class */
7850
7851   /* What is done here is to convert this to a sub-pattern of the form
7852   * (?:\x{char1}\x{char2}...)
7853   * and then call reg recursively.  That way, it retains its atomicness,
7854   * while not having to worry about special handling that some code
7855   * points may have.  toke.c has converted the original Unicode values
7856   * to native, so that we can just pass on the hex values unchanged.  We
7857   * do have to set a flag to keep recoding from happening in the
7858   * recursion */
7859
7860   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
7861   STRLEN len;
7862   char *endchar;     /* Points to '.' or '}' ending cur char in the input
7863        stream */
7864   char *orig_end = RExC_end;
7865
7866   while (RExC_parse < endbrace) {
7867
7868    /* Code points are separated by dots.  If none, there is only one
7869    * code point, and is terminated by the brace */
7870    endchar = RExC_parse + strcspn(RExC_parse, ".}");
7871
7872    /* Convert to notation the rest of the code understands */
7873    sv_catpv(substitute_parse, "\\x{");
7874    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
7875    sv_catpv(substitute_parse, "}");
7876
7877    /* Point to the beginning of the next character in the sequence. */
7878    RExC_parse = endchar + 1;
7879   }
7880   sv_catpv(substitute_parse, ")");
7881
7882   RExC_parse = SvPV(substitute_parse, len);
7883
7884   /* Don't allow empty number */
7885   if (len < 8) {
7886    vFAIL("Invalid hexadecimal number in \\N{U+...}");
7887   }
7888   RExC_end = RExC_parse + len;
7889
7890   /* The values are Unicode, and therefore not subject to recoding */
7891   RExC_override_recoding = 1;
7892
7893   ret = reg(pRExC_state, 1, flagp, depth+1);
7894
7895   RExC_parse = endbrace;
7896   RExC_end = orig_end;
7897   RExC_override_recoding = 0;
7898
7899   nextchar(pRExC_state);
7900  }
7901
7902  return ret;
7903 }
7904
7905
7906 /*
7907  * reg_recode
7908  *
7909  * It returns the code point in utf8 for the value in *encp.
7910  *    value: a code value in the source encoding
7911  *    encp:  a pointer to an Encode object
7912  *
7913  * If the result from Encode is not a single character,
7914  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7915  */
7916 STATIC UV
7917 S_reg_recode(pTHX_ const char value, SV **encp)
7918 {
7919  STRLEN numlen = 1;
7920  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7921  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7922  const STRLEN newlen = SvCUR(sv);
7923  UV uv = UNICODE_REPLACEMENT;
7924
7925  PERL_ARGS_ASSERT_REG_RECODE;
7926
7927  if (newlen)
7928   uv = SvUTF8(sv)
7929    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7930    : *(U8*)s;
7931
7932  if (!newlen || numlen != newlen) {
7933   uv = UNICODE_REPLACEMENT;
7934   *encp = NULL;
7935  }
7936  return uv;
7937 }
7938
7939
7940 /*
7941  - regatom - the lowest level
7942
7943    Try to identify anything special at the start of the pattern. If there
7944    is, then handle it as required. This may involve generating a single regop,
7945    such as for an assertion; or it may involve recursing, such as to
7946    handle a () structure.
7947
7948    If the string doesn't start with something special then we gobble up
7949    as much literal text as we can.
7950
7951    Once we have been able to handle whatever type of thing started the
7952    sequence, we return.
7953
7954    Note: we have to be careful with escapes, as they can be both literal
7955    and special, and in the case of \10 and friends can either, depending
7956    on context. Specifically there are two separate switches for handling
7957    escape sequences, with the one for handling literal escapes requiring
7958    a dummy entry for all of the special escapes that are actually handled
7959    by the other.
7960 */
7961
7962 STATIC regnode *
7963 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7964 {
7965  dVAR;
7966  register regnode *ret = NULL;
7967  I32 flags;
7968  char *parse_start = RExC_parse;
7969  U8 op;
7970  GET_RE_DEBUG_FLAGS_DECL;
7971  DEBUG_PARSE("atom");
7972  *flagp = WORST;  /* Tentatively. */
7973
7974  PERL_ARGS_ASSERT_REGATOM;
7975
7976 tryagain:
7977  switch ((U8)*RExC_parse) {
7978  case '^':
7979   RExC_seen_zerolen++;
7980   nextchar(pRExC_state);
7981   if (RExC_flags & RXf_PMf_MULTILINE)
7982    ret = reg_node(pRExC_state, MBOL);
7983   else if (RExC_flags & RXf_PMf_SINGLELINE)
7984    ret = reg_node(pRExC_state, SBOL);
7985   else
7986    ret = reg_node(pRExC_state, BOL);
7987   Set_Node_Length(ret, 1); /* MJD */
7988   break;
7989  case '$':
7990   nextchar(pRExC_state);
7991   if (*RExC_parse)
7992    RExC_seen_zerolen++;
7993   if (RExC_flags & RXf_PMf_MULTILINE)
7994    ret = reg_node(pRExC_state, MEOL);
7995   else if (RExC_flags & RXf_PMf_SINGLELINE)
7996    ret = reg_node(pRExC_state, SEOL);
7997   else
7998    ret = reg_node(pRExC_state, EOL);
7999   Set_Node_Length(ret, 1); /* MJD */
8000   break;
8001  case '.':
8002   nextchar(pRExC_state);
8003   if (RExC_flags & RXf_PMf_SINGLELINE)
8004    ret = reg_node(pRExC_state, SANY);
8005   else
8006    ret = reg_node(pRExC_state, REG_ANY);
8007   *flagp |= HASWIDTH|SIMPLE;
8008   RExC_naughty++;
8009   Set_Node_Length(ret, 1); /* MJD */
8010   break;
8011  case '[':
8012  {
8013   char * const oregcomp_parse = ++RExC_parse;
8014   ret = regclass(pRExC_state,depth+1);
8015   if (*RExC_parse != ']') {
8016    RExC_parse = oregcomp_parse;
8017    vFAIL("Unmatched [");
8018   }
8019   nextchar(pRExC_state);
8020   *flagp |= HASWIDTH|SIMPLE;
8021   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8022   break;
8023  }
8024  case '(':
8025   nextchar(pRExC_state);
8026   ret = reg(pRExC_state, 1, &flags,depth+1);
8027   if (ret == NULL) {
8028     if (flags & TRYAGAIN) {
8029      if (RExC_parse == RExC_end) {
8030       /* Make parent create an empty node if needed. */
8031       *flagp |= TRYAGAIN;
8032       return(NULL);
8033      }
8034      goto tryagain;
8035     }
8036     return(NULL);
8037   }
8038   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8039   break;
8040  case '|':
8041  case ')':
8042   if (flags & TRYAGAIN) {
8043    *flagp |= TRYAGAIN;
8044    return NULL;
8045   }
8046   vFAIL("Internal urp");
8047         /* Supposed to be caught earlier. */
8048   break;
8049  case '{':
8050   if (!regcurly(RExC_parse)) {
8051    RExC_parse++;
8052    goto defchar;
8053   }
8054   /* FALL THROUGH */
8055  case '?':
8056  case '+':
8057  case '*':
8058   RExC_parse++;
8059   vFAIL("Quantifier follows nothing");
8060   break;
8061  case '\\':
8062   /* Special Escapes
8063
8064   This switch handles escape sequences that resolve to some kind
8065   of special regop and not to literal text. Escape sequnces that
8066   resolve to literal text are handled below in the switch marked
8067   "Literal Escapes".
8068
8069   Every entry in this switch *must* have a corresponding entry
8070   in the literal escape switch. However, the opposite is not
8071   required, as the default for this switch is to jump to the
8072   literal text handling code.
8073   */
8074   switch ((U8)*++RExC_parse) {
8075   /* Special Escapes */
8076   case 'A':
8077    RExC_seen_zerolen++;
8078    ret = reg_node(pRExC_state, SBOL);
8079    *flagp |= SIMPLE;
8080    goto finish_meta_pat;
8081   case 'G':
8082    ret = reg_node(pRExC_state, GPOS);
8083    RExC_seen |= REG_SEEN_GPOS;
8084    *flagp |= SIMPLE;
8085    goto finish_meta_pat;
8086   case 'K':
8087    RExC_seen_zerolen++;
8088    ret = reg_node(pRExC_state, KEEPS);
8089    *flagp |= SIMPLE;
8090    /* XXX:dmq : disabling in-place substitution seems to
8091    * be necessary here to avoid cases of memory corruption, as
8092    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8093    */
8094    RExC_seen |= REG_SEEN_LOOKBEHIND;
8095    goto finish_meta_pat;
8096   case 'Z':
8097    ret = reg_node(pRExC_state, SEOL);
8098    *flagp |= SIMPLE;
8099    RExC_seen_zerolen++;  /* Do not optimize RE away */
8100    goto finish_meta_pat;
8101   case 'z':
8102    ret = reg_node(pRExC_state, EOS);
8103    *flagp |= SIMPLE;
8104    RExC_seen_zerolen++;  /* Do not optimize RE away */
8105    goto finish_meta_pat;
8106   case 'C':
8107    ret = reg_node(pRExC_state, CANY);
8108    RExC_seen |= REG_SEEN_CANY;
8109    *flagp |= HASWIDTH|SIMPLE;
8110    goto finish_meta_pat;
8111   case 'X':
8112    ret = reg_node(pRExC_state, CLUMP);
8113    *flagp |= HASWIDTH;
8114    goto finish_meta_pat;
8115   case 'w':
8116    switch (get_regex_charset(RExC_flags)) {
8117     case REGEX_LOCALE_CHARSET:
8118      op = ALNUML;
8119      break;
8120     case REGEX_UNICODE_CHARSET:
8121      op = ALNUMU;
8122      break;
8123     case REGEX_ASCII_RESTRICTED_CHARSET:
8124     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8125      op = ALNUMA;
8126      break;
8127     case REGEX_DEPENDS_CHARSET:
8128      op = ALNUM;
8129      break;
8130     default:
8131      goto bad_charset;
8132    }
8133    ret = reg_node(pRExC_state, op);
8134    *flagp |= HASWIDTH|SIMPLE;
8135    goto finish_meta_pat;
8136   case 'W':
8137    switch (get_regex_charset(RExC_flags)) {
8138     case REGEX_LOCALE_CHARSET:
8139      op = NALNUML;
8140      break;
8141     case REGEX_UNICODE_CHARSET:
8142      op = NALNUMU;
8143      break;
8144     case REGEX_ASCII_RESTRICTED_CHARSET:
8145     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8146      op = NALNUMA;
8147      break;
8148     case REGEX_DEPENDS_CHARSET:
8149      op = NALNUM;
8150      break;
8151     default:
8152      goto bad_charset;
8153    }
8154    ret = reg_node(pRExC_state, op);
8155    *flagp |= HASWIDTH|SIMPLE;
8156    goto finish_meta_pat;
8157   case 'b':
8158    RExC_seen_zerolen++;
8159    RExC_seen |= REG_SEEN_LOOKBEHIND;
8160    switch (get_regex_charset(RExC_flags)) {
8161     case REGEX_LOCALE_CHARSET:
8162      op = BOUNDL;
8163      break;
8164     case REGEX_UNICODE_CHARSET:
8165      op = BOUNDU;
8166      break;
8167     case REGEX_ASCII_RESTRICTED_CHARSET:
8168     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8169      op = BOUNDA;
8170      break;
8171     case REGEX_DEPENDS_CHARSET:
8172      op = BOUND;
8173      break;
8174     default:
8175      goto bad_charset;
8176    }
8177    ret = reg_node(pRExC_state, op);
8178    FLAGS(ret) = get_regex_charset(RExC_flags);
8179    *flagp |= SIMPLE;
8180    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8181     ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8182    }
8183    goto finish_meta_pat;
8184   case 'B':
8185    RExC_seen_zerolen++;
8186    RExC_seen |= REG_SEEN_LOOKBEHIND;
8187    switch (get_regex_charset(RExC_flags)) {
8188     case REGEX_LOCALE_CHARSET:
8189      op = NBOUNDL;
8190      break;
8191     case REGEX_UNICODE_CHARSET:
8192      op = NBOUNDU;
8193      break;
8194     case REGEX_ASCII_RESTRICTED_CHARSET:
8195     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8196      op = NBOUNDA;
8197      break;
8198     case REGEX_DEPENDS_CHARSET:
8199      op = NBOUND;
8200      break;
8201     default:
8202      goto bad_charset;
8203    }
8204    ret = reg_node(pRExC_state, op);
8205    FLAGS(ret) = get_regex_charset(RExC_flags);
8206    *flagp |= SIMPLE;
8207    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8208     ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8209    }
8210    goto finish_meta_pat;
8211   case 's':
8212    switch (get_regex_charset(RExC_flags)) {
8213     case REGEX_LOCALE_CHARSET:
8214      op = SPACEL;
8215      break;
8216     case REGEX_UNICODE_CHARSET:
8217      op = SPACEU;
8218      break;
8219     case REGEX_ASCII_RESTRICTED_CHARSET:
8220     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8221      op = SPACEA;
8222      break;
8223     case REGEX_DEPENDS_CHARSET:
8224      op = SPACE;
8225      break;
8226     default:
8227      goto bad_charset;
8228    }
8229    ret = reg_node(pRExC_state, op);
8230    *flagp |= HASWIDTH|SIMPLE;
8231    goto finish_meta_pat;
8232   case 'S':
8233    switch (get_regex_charset(RExC_flags)) {
8234     case REGEX_LOCALE_CHARSET:
8235      op = NSPACEL;
8236      break;
8237     case REGEX_UNICODE_CHARSET:
8238      op = NSPACEU;
8239      break;
8240     case REGEX_ASCII_RESTRICTED_CHARSET:
8241     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8242      op = NSPACEA;
8243      break;
8244     case REGEX_DEPENDS_CHARSET:
8245      op = NSPACE;
8246      break;
8247     default:
8248      goto bad_charset;
8249    }
8250    ret = reg_node(pRExC_state, op);
8251    *flagp |= HASWIDTH|SIMPLE;
8252    goto finish_meta_pat;
8253   case 'd':
8254    switch (get_regex_charset(RExC_flags)) {
8255     case REGEX_LOCALE_CHARSET:
8256      op = DIGITL;
8257      break;
8258     case REGEX_ASCII_RESTRICTED_CHARSET:
8259     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8260      op = DIGITA;
8261      break;
8262     case REGEX_DEPENDS_CHARSET: /* No difference between these */
8263     case REGEX_UNICODE_CHARSET:
8264      op = DIGIT;
8265      break;
8266     default:
8267      goto bad_charset;
8268    }
8269    ret = reg_node(pRExC_state, op);
8270    *flagp |= HASWIDTH|SIMPLE;
8271    goto finish_meta_pat;
8272   case 'D':
8273    switch (get_regex_charset(RExC_flags)) {
8274     case REGEX_LOCALE_CHARSET:
8275      op = NDIGITL;
8276      break;
8277     case REGEX_ASCII_RESTRICTED_CHARSET:
8278     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8279      op = NDIGITA;
8280      break;
8281     case REGEX_DEPENDS_CHARSET: /* No difference between these */
8282     case REGEX_UNICODE_CHARSET:
8283      op = NDIGIT;
8284      break;
8285     default:
8286      goto bad_charset;
8287    }
8288    ret = reg_node(pRExC_state, op);
8289    *flagp |= HASWIDTH|SIMPLE;
8290    goto finish_meta_pat;
8291   case 'R':
8292    ret = reg_node(pRExC_state, LNBREAK);
8293    *flagp |= HASWIDTH|SIMPLE;
8294    goto finish_meta_pat;
8295   case 'h':
8296    ret = reg_node(pRExC_state, HORIZWS);
8297    *flagp |= HASWIDTH|SIMPLE;
8298    goto finish_meta_pat;
8299   case 'H':
8300    ret = reg_node(pRExC_state, NHORIZWS);
8301    *flagp |= HASWIDTH|SIMPLE;
8302    goto finish_meta_pat;
8303   case 'v':
8304    ret = reg_node(pRExC_state, VERTWS);
8305    *flagp |= HASWIDTH|SIMPLE;
8306    goto finish_meta_pat;
8307   case 'V':
8308    ret = reg_node(pRExC_state, NVERTWS);
8309    *flagp |= HASWIDTH|SIMPLE;
8310   finish_meta_pat:
8311    nextchar(pRExC_state);
8312    Set_Node_Length(ret, 2); /* MJD */
8313    break;
8314   case 'p':
8315   case 'P':
8316    {
8317     char* const oldregxend = RExC_end;
8318 #ifdef DEBUGGING
8319     char* parse_start = RExC_parse - 2;
8320 #endif
8321
8322     if (RExC_parse[1] == '{') {
8323     /* a lovely hack--pretend we saw [\pX] instead */
8324      RExC_end = strchr(RExC_parse, '}');
8325      if (!RExC_end) {
8326       const U8 c = (U8)*RExC_parse;
8327       RExC_parse += 2;
8328       RExC_end = oldregxend;
8329       vFAIL2("Missing right brace on \\%c{}", c);
8330      }
8331      RExC_end++;
8332     }
8333     else {
8334      RExC_end = RExC_parse + 2;
8335      if (RExC_end > oldregxend)
8336       RExC_end = oldregxend;
8337     }
8338     RExC_parse--;
8339
8340     ret = regclass(pRExC_state,depth+1);
8341
8342     RExC_end = oldregxend;
8343     RExC_parse--;
8344
8345     Set_Node_Offset(ret, parse_start + 2);
8346     Set_Node_Cur_Length(ret);
8347     nextchar(pRExC_state);
8348     *flagp |= HASWIDTH|SIMPLE;
8349    }
8350    break;
8351   case 'N':
8352    /* Handle \N and \N{NAME} here and not below because it can be
8353    multicharacter. join_exact() will join them up later on.
8354    Also this makes sure that things like /\N{BLAH}+/ and
8355    \N{BLAH} being multi char Just Happen. dmq*/
8356    ++RExC_parse;
8357    ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
8358    break;
8359   case 'k':    /* Handle \k<NAME> and \k'NAME' */
8360   parse_named_seq:
8361   {
8362    char ch= RExC_parse[1];
8363    if (ch != '<' && ch != '\'' && ch != '{') {
8364     RExC_parse++;
8365     vFAIL2("Sequence %.2s... not terminated",parse_start);
8366    } else {
8367     /* this pretty much dupes the code for (?P=...) in reg(), if
8368     you change this make sure you change that */
8369     char* name_start = (RExC_parse += 2);
8370     U32 num = 0;
8371     SV *sv_dat = reg_scan_name(pRExC_state,
8372      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8373     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8374     if (RExC_parse == name_start || *RExC_parse != ch)
8375      vFAIL2("Sequence %.3s... not terminated",parse_start);
8376
8377     if (!SIZE_ONLY) {
8378      num = add_data( pRExC_state, 1, "S" );
8379      RExC_rxi->data->data[num]=(void*)sv_dat;
8380      SvREFCNT_inc_simple_void(sv_dat);
8381     }
8382
8383     RExC_sawback = 1;
8384     ret = reganode(pRExC_state,
8385        ((! FOLD)
8386         ? NREF
8387         : (MORE_ASCII_RESTRICTED)
8388         ? NREFFA
8389         : (AT_LEAST_UNI_SEMANTICS)
8390          ? NREFFU
8391          : (LOC)
8392          ? NREFFL
8393          : NREFF),
8394         num);
8395     *flagp |= HASWIDTH;
8396
8397     /* override incorrect value set in reganode MJD */
8398     Set_Node_Offset(ret, parse_start+1);
8399     Set_Node_Cur_Length(ret); /* MJD */
8400     nextchar(pRExC_state);
8401
8402    }
8403    break;
8404   }
8405   case 'g':
8406   case '1': case '2': case '3': case '4':
8407   case '5': case '6': case '7': case '8': case '9':
8408    {
8409     I32 num;
8410     bool isg = *RExC_parse == 'g';
8411     bool isrel = 0;
8412     bool hasbrace = 0;
8413     if (isg) {
8414      RExC_parse++;
8415      if (*RExC_parse == '{') {
8416       RExC_parse++;
8417       hasbrace = 1;
8418      }
8419      if (*RExC_parse == '-') {
8420       RExC_parse++;
8421       isrel = 1;
8422      }
8423      if (hasbrace && !isDIGIT(*RExC_parse)) {
8424       if (isrel) RExC_parse--;
8425       RExC_parse -= 2;
8426       goto parse_named_seq;
8427     }   }
8428     num = atoi(RExC_parse);
8429     if (isg && num == 0)
8430      vFAIL("Reference to invalid group 0");
8431     if (isrel) {
8432      num = RExC_npar - num;
8433      if (num < 1)
8434       vFAIL("Reference to nonexistent or unclosed group");
8435     }
8436     if (!isg && num > 9 && num >= RExC_npar)
8437      goto defchar;
8438     else {
8439      char * const parse_start = RExC_parse - 1; /* MJD */
8440      while (isDIGIT(*RExC_parse))
8441       RExC_parse++;
8442      if (parse_start == RExC_parse - 1)
8443       vFAIL("Unterminated \\g... pattern");
8444      if (hasbrace) {
8445       if (*RExC_parse != '}')
8446        vFAIL("Unterminated \\g{...} pattern");
8447       RExC_parse++;
8448      }
8449      if (!SIZE_ONLY) {
8450       if (num > (I32)RExC_rx->nparens)
8451        vFAIL("Reference to nonexistent group");
8452      }
8453      RExC_sawback = 1;
8454      ret = reganode(pRExC_state,
8455         ((! FOLD)
8456          ? REF
8457          : (MORE_ASCII_RESTRICTED)
8458          ? REFFA
8459          : (AT_LEAST_UNI_SEMANTICS)
8460           ? REFFU
8461           : (LOC)
8462           ? REFFL
8463           : REFF),
8464          num);
8465      *flagp |= HASWIDTH;
8466
8467      /* override incorrect value set in reganode MJD */
8468      Set_Node_Offset(ret, parse_start+1);
8469      Set_Node_Cur_Length(ret); /* MJD */
8470      RExC_parse--;
8471      nextchar(pRExC_state);
8472     }
8473    }
8474    break;
8475   case '\0':
8476    if (RExC_parse >= RExC_end)
8477     FAIL("Trailing \\");
8478    /* FALL THROUGH */
8479   default:
8480    /* Do not generate "unrecognized" warnings here, we fall
8481    back into the quick-grab loop below */
8482    parse_start--;
8483    goto defchar;
8484   }
8485   break;
8486
8487  case '#':
8488   if (RExC_flags & RXf_PMf_EXTENDED) {
8489    if ( reg_skipcomment( pRExC_state ) )
8490     goto tryagain;
8491   }
8492   /* FALL THROUGH */
8493
8494  default:
8495
8496    parse_start = RExC_parse - 1;
8497
8498    RExC_parse++;
8499
8500   defchar: {
8501    typedef enum {
8502     generic_char = 0,
8503     char_s,
8504     upsilon_1,
8505     upsilon_2,
8506     iota_1,
8507     iota_2,
8508    } char_state;
8509    char_state latest_char_state = generic_char;
8510    register STRLEN len;
8511    register UV ender;
8512    register char *p;
8513    char *s;
8514    STRLEN foldlen;
8515    U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8516    regnode * orig_emit;
8517
8518    ender = 0;
8519    orig_emit = RExC_emit; /* Save the original output node position in
8520          case we need to output a different node
8521          type */
8522    ret = reg_node(pRExC_state,
8523       (U8) ((! FOLD) ? EXACT
8524           : (LOC)
8525            ? EXACTFL
8526            : (MORE_ASCII_RESTRICTED)
8527            ? EXACTFA
8528            : (AT_LEAST_UNI_SEMANTICS)
8529             ? EXACTFU
8530             : EXACTF)
8531      );
8532    s = STRING(ret);
8533    for (len = 0, p = RExC_parse - 1;
8534    len < 127 && p < RExC_end;
8535    len++)
8536    {
8537     char * const oldp = p;
8538
8539     if (RExC_flags & RXf_PMf_EXTENDED)
8540      p = regwhite( pRExC_state, p );
8541     switch ((U8)*p) {
8542     case '^':
8543     case '$':
8544     case '.':
8545     case '[':
8546     case '(':
8547     case ')':
8548     case '|':
8549      goto loopdone;
8550     case '\\':
8551      /* Literal Escapes Switch
8552
8553      This switch is meant to handle escape sequences that
8554      resolve to a literal character.
8555
8556      Every escape sequence that represents something
8557      else, like an assertion or a char class, is handled
8558      in the switch marked 'Special Escapes' above in this
8559      routine, but also has an entry here as anything that
8560      isn't explicitly mentioned here will be treated as
8561      an unescaped equivalent literal.
8562      */
8563
8564      switch ((U8)*++p) {
8565      /* These are all the special escapes. */
8566      case 'A':             /* Start assertion */
8567      case 'b': case 'B':   /* Word-boundary assertion*/
8568      case 'C':             /* Single char !DANGEROUS! */
8569      case 'd': case 'D':   /* digit class */
8570      case 'g': case 'G':   /* generic-backref, pos assertion */
8571      case 'h': case 'H':   /* HORIZWS */
8572      case 'k': case 'K':   /* named backref, keep marker */
8573      case 'N':             /* named char sequence */
8574      case 'p': case 'P':   /* Unicode property */
8575        case 'R':   /* LNBREAK */
8576      case 's': case 'S':   /* space class */
8577      case 'v': case 'V':   /* VERTWS */
8578      case 'w': case 'W':   /* word class */
8579      case 'X':             /* eXtended Unicode "combining character sequence" */
8580      case 'z': case 'Z':   /* End of line/string assertion */
8581       --p;
8582       goto loopdone;
8583
8584      /* Anything after here is an escape that resolves to a
8585      literal. (Except digits, which may or may not)
8586      */
8587      case 'n':
8588       ender = '\n';
8589       p++;
8590       break;
8591      case 'r':
8592       ender = '\r';
8593       p++;
8594       break;
8595      case 't':
8596       ender = '\t';
8597       p++;
8598       break;
8599      case 'f':
8600       ender = '\f';
8601       p++;
8602       break;
8603      case 'e':
8604       ender = ASCII_TO_NATIVE('\033');
8605       p++;
8606       break;
8607      case 'a':
8608       ender = ASCII_TO_NATIVE('\007');
8609       p++;
8610       break;
8611      case 'o':
8612       {
8613        STRLEN brace_len = len;
8614        UV result;
8615        const char* error_msg;
8616
8617        bool valid = grok_bslash_o(p,
8618              &result,
8619              &brace_len,
8620              &error_msg,
8621              1);
8622        p += brace_len;
8623        if (! valid) {
8624         RExC_parse = p; /* going to die anyway; point
8625             to exact spot of failure */
8626         vFAIL(error_msg);
8627        }
8628        else
8629        {
8630         ender = result;
8631        }
8632        if (PL_encoding && ender < 0x100) {
8633         goto recode_encoding;
8634        }
8635        if (ender > 0xff) {
8636         REQUIRE_UTF8;
8637        }
8638        break;
8639       }
8640      case 'x':
8641       if (*++p == '{') {
8642        char* const e = strchr(p, '}');
8643
8644        if (!e) {
8645         RExC_parse = p + 1;
8646         vFAIL("Missing right brace on \\x{}");
8647        }
8648        else {
8649         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8650          | PERL_SCAN_DISALLOW_PREFIX;
8651         STRLEN numlen = e - p - 1;
8652         ender = grok_hex(p + 1, &numlen, &flags, NULL);
8653         if (ender > 0xff)
8654          REQUIRE_UTF8;
8655         p = e + 1;
8656        }
8657       }
8658       else {
8659        I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8660        STRLEN numlen = 2;
8661        ender = grok_hex(p, &numlen, &flags, NULL);
8662        p += numlen;
8663       }
8664       if (PL_encoding && ender < 0x100)
8665        goto recode_encoding;
8666       break;
8667      case 'c':
8668       p++;
8669       ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8670       break;
8671      case '0': case '1': case '2': case '3':case '4':
8672      case '5': case '6': case '7': case '8':case '9':
8673       if (*p == '0' ||
8674        (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8675       {
8676        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8677        STRLEN numlen = 3;
8678        ender = grok_oct(p, &numlen, &flags, NULL);
8679        if (ender > 0xff) {
8680         REQUIRE_UTF8;
8681        }
8682        p += numlen;
8683       }
8684       else {
8685        --p;
8686        goto loopdone;
8687       }
8688       if (PL_encoding && ender < 0x100)
8689        goto recode_encoding;
8690       break;
8691      recode_encoding:
8692       if (! RExC_override_recoding) {
8693        SV* enc = PL_encoding;
8694        ender = reg_recode((const char)(U8)ender, &enc);
8695        if (!enc && SIZE_ONLY)
8696         ckWARNreg(p, "Invalid escape in the specified encoding");
8697        REQUIRE_UTF8;
8698       }
8699       break;
8700      case '\0':
8701       if (p >= RExC_end)
8702        FAIL("Trailing \\");
8703       /* FALL THROUGH */
8704      default:
8705       if (!SIZE_ONLY&& isALPHA(*p)) {
8706        /* Include any { following the alpha to emphasize
8707        * that it could be part of an escape at some point
8708        * in the future */
8709        int len = (*(p + 1) == '{') ? 2 : 1;
8710        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8711       }
8712       goto normal_default;
8713      }
8714      break;
8715     default:
8716     normal_default:
8717      if (UTF8_IS_START(*p) && UTF) {
8718       STRLEN numlen;
8719       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8720            &numlen, UTF8_ALLOW_DEFAULT);
8721       p += numlen;
8722      }
8723      else
8724       ender = (U8) *p++;
8725      break;
8726     } /* End of switch on the literal */
8727
8728     /* Certain characters are problematic because their folded
8729     * length is so different from their original length that it
8730     * isn't handleable by the optimizer.  They are therefore not
8731     * placed in an EXACTish node; and are here handled specially.
8732     * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8733     * putting it in a special node keeps regexec from having to
8734     * deal with a non-utf8 multi-char fold */
8735     if (FOLD
8736      && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)))
8737     {
8738      /* We look for either side of the fold.  For example \xDF
8739      * folds to 'ss'.  We look for both the single character
8740      * \xDF and the sequence 'ss'.  When we find something that
8741      * could be one of those, we stop and flush whatever we
8742      * have output so far into the EXACTish node that was being
8743      * built.  Then restore the input pointer to what it was.
8744      * regatom will return that EXACT node, and will be called
8745      * again, positioned so the first character is the one in
8746      * question, which we return in a different node type.
8747      * The multi-char folds are a sequence, so the occurrence
8748      * of the first character in that sequence doesn't
8749      * necessarily mean that what follows is the rest of the
8750      * sequence.  We keep track of that with a state machine,
8751      * with the state being set to the latest character
8752      * processed before the current one.  Most characters will
8753      * set the state to 0, but if one occurs that is part of a
8754      * potential tricky fold sequence, the state is set to that
8755      * character, and the next loop iteration sees if the state
8756      * should progress towards the final folded-from character,
8757      * or if it was a false alarm.  If it turns out to be a
8758      * false alarm, the character(s) will be output in a new
8759      * EXACTish node, and join_exact() will later combine them.
8760      * In the case of the 'ss' sequence, which is more common
8761      * and more easily checked, some look-ahead is done to
8762      * save time by ruling-out some false alarms */
8763      switch (ender) {
8764       default:
8765        latest_char_state = generic_char;
8766        break;
8767       case 's':
8768       case 'S':
8769        if (AT_LEAST_UNI_SEMANTICS) {
8770         if (latest_char_state == char_s) {  /* 'ss' */
8771          ender = LATIN_SMALL_LETTER_SHARP_S;
8772          goto do_tricky;
8773         }
8774         else if (p < RExC_end) {
8775
8776          /* Look-ahead at the next character.  If it
8777          * is also an s, we handle as a sharp s
8778          * tricky regnode.  */
8779          if (*p == 's' || *p == 'S') {
8780
8781           /* But first flush anything in the
8782           * EXACTish buffer */
8783           if (len != 0) {
8784            p = oldp;
8785            goto loopdone;
8786           }
8787           p++; /* Account for swallowing this
8788             's' up */
8789           ender = LATIN_SMALL_LETTER_SHARP_S;
8790           goto do_tricky;
8791          }
8792           /* Here, the next character is not a
8793           * literal 's', but still could
8794           * evaluate to one if part of a \o{},
8795           * \x or \OCTAL-DIGIT.  The minimum
8796           * length required for that is 4, eg
8797           * \x53 or \123 */
8798          else if (*p == '\\'
8799            && p < RExC_end - 4
8800            && (isDIGIT(*(p + 1))
8801             || *(p + 1) == 'x'
8802             || *(p + 1) == 'o' ))
8803          {
8804
8805           /* Here, it could be an 's', too much
8806           * bother to figure it out here.  Flush
8807           * the buffer if any; when come back
8808           * here, set the state so know that the
8809           * previous char was an 's' */
8810           if (len != 0) {
8811            latest_char_state = generic_char;
8812            p = oldp;
8813            goto loopdone;
8814           }
8815           latest_char_state = char_s;
8816           break;
8817          }
8818         }
8819        }
8820
8821        /* Here, can't be an 'ss' sequence, or at least not
8822        * one that could fold to/from the sharp ss */
8823        latest_char_state = generic_char;
8824        break;
8825       case 0x03C5: /* First char in upsilon series */
8826        if (p < RExC_end - 4) { /* Need >= 4 bytes left */
8827         latest_char_state = upsilon_1;
8828         if (len != 0) {
8829          p = oldp;
8830          goto loopdone;
8831         }
8832        }
8833        else {
8834         latest_char_state = generic_char;
8835        }
8836        break;
8837       case 0x03B9: /* First char in iota series */
8838        if (p < RExC_end - 4) {
8839         latest_char_state = iota_1;
8840         if (len != 0) {
8841          p = oldp;
8842          goto loopdone;
8843         }
8844        }
8845        else {
8846         latest_char_state = generic_char;
8847        }
8848        break;
8849       case 0x0308:
8850        if (latest_char_state == upsilon_1) {
8851         latest_char_state = upsilon_2;
8852        }
8853        else if (latest_char_state == iota_1) {
8854         latest_char_state = iota_2;
8855        }
8856        else {
8857         latest_char_state = generic_char;
8858        }
8859        break;
8860       case 0x301:
8861        if (latest_char_state == upsilon_2) {
8862         ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
8863         goto do_tricky;
8864        }
8865        else if (latest_char_state == iota_2) {
8866         ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
8867         goto do_tricky;
8868        }
8869        latest_char_state = generic_char;
8870        break;
8871
8872       /* These are the tricky fold characters.  Flush any
8873       * buffer first. */
8874       case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
8875       case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
8876       case LATIN_SMALL_LETTER_SHARP_S:
8877       case LATIN_CAPITAL_LETTER_SHARP_S:
8878       case 0x1FD3:
8879       case 0x1FE3:
8880        if (len != 0) {
8881         p = oldp;
8882         goto loopdone;
8883        }
8884        /* FALL THROUGH */
8885       do_tricky: {
8886        char* const oldregxend = RExC_end;
8887        U8 tmpbuf[UTF8_MAXBYTES+1];
8888
8889        /* Here, we know we need to generate a special
8890        * regnode, and 'ender' contains the tricky
8891        * character.  What's done is to pretend it's in a
8892        * [bracketed] class, and let the code that deals
8893        * with those handle it, as that code has all the
8894        * intelligence necessary.  First save the current
8895        * parse state, get rid of the already allocated
8896        * but empty EXACT node that the ANYOFV node will
8897        * replace, and point the parse to a buffer which
8898        * we fill with the character we want the regclass
8899        * code to think is being parsed */
8900        RExC_emit = orig_emit;
8901        RExC_parse = (char *) tmpbuf;
8902        if (UTF) {
8903         U8 *d = uvchr_to_utf8(tmpbuf, ender);
8904         *d = '\0';
8905         RExC_end = (char *) d;
8906        }
8907        else {  /* ender above 255 already excluded */
8908         tmpbuf[0] = (U8) ender;
8909         tmpbuf[1] = '\0';
8910         RExC_end = RExC_parse + 1;
8911        }
8912
8913        ret = regclass(pRExC_state,depth+1);
8914
8915        /* Here, have parsed the buffer.  Reset the parse to
8916        * the actual input, and return */
8917        RExC_end = oldregxend;
8918        RExC_parse = p - 1;
8919
8920        Set_Node_Offset(ret, RExC_parse);
8921        Set_Node_Cur_Length(ret);
8922        nextchar(pRExC_state);
8923        *flagp |= HASWIDTH|SIMPLE;
8924        return ret;
8925       }
8926      }
8927     }
8928
8929     if ( RExC_flags & RXf_PMf_EXTENDED)
8930      p = regwhite( pRExC_state, p );
8931     if (UTF && FOLD) {
8932      /* Prime the casefolded buffer.  Locale rules, which apply
8933      * only to code points < 256, aren't known until execution,
8934      * so for them, just output the original character using
8935      * utf8 */
8936      if (LOC && ender < 256) {
8937       if (UNI_IS_INVARIANT(ender)) {
8938        *tmpbuf = (U8) ender;
8939        foldlen = 1;
8940       } else {
8941        *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8942        *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8943        foldlen = 2;
8944       }
8945      }
8946      else if (isASCII(ender)) { /* Note: Here can't also be LOC
8947             */
8948       ender = toLOWER(ender);
8949       *tmpbuf = (U8) ender;
8950       foldlen = 1;
8951      }
8952      else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8953
8954       /* Locale and /aa require more selectivity about the
8955       * fold, so are handled below.  Otherwise, here, just
8956       * use the fold */
8957       ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8958      }
8959      else {
8960       /* Under locale rules or /aa we are not to mix,
8961       * respectively, ords < 256 or ASCII with non-.  So
8962       * reject folds that mix them, using only the
8963       * non-folded code point.  So do the fold to a
8964       * temporary, and inspect each character in it. */
8965       U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8966       U8* s = trialbuf;
8967       UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8968       U8* e = s + foldlen;
8969       bool fold_ok = TRUE;
8970
8971       while (s < e) {
8972        if (isASCII(*s)
8973         || (LOC && (UTF8_IS_INVARIANT(*s)
8974           || UTF8_IS_DOWNGRADEABLE_START(*s))))
8975        {
8976         fold_ok = FALSE;
8977         break;
8978        }
8979        s += UTF8SKIP(s);
8980       }
8981       if (fold_ok) {
8982        Copy(trialbuf, tmpbuf, foldlen, U8);
8983        ender = tmpender;
8984       }
8985       else {
8986        uvuni_to_utf8(tmpbuf, ender);
8987        foldlen = UNISKIP(ender);
8988       }
8989      }
8990     }
8991     if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8992      if (len)
8993       p = oldp;
8994      else if (UTF) {
8995       if (FOLD) {
8996        /* Emit all the Unicode characters. */
8997        STRLEN numlen;
8998        for (foldbuf = tmpbuf;
8999         foldlen;
9000         foldlen -= numlen) {
9001         ender = utf8_to_uvchr(foldbuf, &numlen);
9002         if (numlen > 0) {
9003           const STRLEN unilen = reguni(pRExC_state, ender, s);
9004           s       += unilen;
9005           len     += unilen;
9006           /* In EBCDIC the numlen
9007           * and unilen can differ. */
9008           foldbuf += numlen;
9009           if (numlen >= foldlen)
9010            break;
9011         }
9012         else
9013           break; /* "Can't happen." */
9014        }
9015       }
9016       else {
9017        const STRLEN unilen = reguni(pRExC_state, ender, s);
9018        if (unilen > 0) {
9019         s   += unilen;
9020         len += unilen;
9021        }
9022       }
9023      }
9024      else {
9025       len++;
9026       REGC((char)ender, s++);
9027      }
9028      break;
9029     }
9030     if (UTF) {
9031      if (FOLD) {
9032       /* Emit all the Unicode characters. */
9033       STRLEN numlen;
9034       for (foldbuf = tmpbuf;
9035        foldlen;
9036        foldlen -= numlen) {
9037        ender = utf8_to_uvchr(foldbuf, &numlen);
9038        if (numlen > 0) {
9039          const STRLEN unilen = reguni(pRExC_state, ender, s);
9040          len     += unilen;
9041          s       += unilen;
9042          /* In EBCDIC the numlen
9043          * and unilen can differ. */
9044          foldbuf += numlen;
9045          if (numlen >= foldlen)
9046           break;
9047        }
9048        else
9049          break;
9050       }
9051      }
9052      else {
9053       const STRLEN unilen = reguni(pRExC_state, ender, s);
9054       if (unilen > 0) {
9055        s   += unilen;
9056        len += unilen;
9057       }
9058      }
9059      len--;
9060     }
9061     else {
9062      REGC((char)ender, s++);
9063     }
9064    }
9065   loopdone:   /* Jumped to when encounters something that shouldn't be in
9066      the node */
9067    RExC_parse = p - 1;
9068    Set_Node_Cur_Length(ret); /* MJD */
9069    nextchar(pRExC_state);
9070    {
9071     /* len is STRLEN which is unsigned, need to copy to signed */
9072     IV iv = len;
9073     if (iv < 0)
9074      vFAIL("Internal disaster");
9075    }
9076    if (len > 0)
9077     *flagp |= HASWIDTH;
9078    if (len == 1 && UNI_IS_INVARIANT(ender))
9079     *flagp |= SIMPLE;
9080
9081    if (SIZE_ONLY)
9082     RExC_size += STR_SZ(len);
9083    else {
9084     STR_LEN(ret) = len;
9085     RExC_emit += STR_SZ(len);
9086    }
9087   }
9088   break;
9089  }
9090
9091  return(ret);
9092
9093 /* Jumped to when an unrecognized character set is encountered */
9094 bad_charset:
9095  Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9096  return(NULL);
9097 }
9098
9099 STATIC char *
9100 S_regwhite( RExC_state_t *pRExC_state, char *p )
9101 {
9102  const char *e = RExC_end;
9103
9104  PERL_ARGS_ASSERT_REGWHITE;
9105
9106  while (p < e) {
9107   if (isSPACE(*p))
9108    ++p;
9109   else if (*p == '#') {
9110    bool ended = 0;
9111    do {
9112     if (*p++ == '\n') {
9113      ended = 1;
9114      break;
9115     }
9116    } while (p < e);
9117    if (!ended)
9118     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9119   }
9120   else
9121    break;
9122  }
9123  return p;
9124 }
9125
9126 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9127    Character classes ([:foo:]) can also be negated ([:^foo:]).
9128    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9129    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9130    but trigger failures because they are currently unimplemented. */
9131
9132 #define POSIXCC_DONE(c)   ((c) == ':')
9133 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9134 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9135
9136 STATIC I32
9137 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9138 {
9139  dVAR;
9140  I32 namedclass = OOB_NAMEDCLASS;
9141
9142  PERL_ARGS_ASSERT_REGPPOSIXCC;
9143
9144  if (value == '[' && RExC_parse + 1 < RExC_end &&
9145   /* I smell either [: or [= or [. -- POSIX has been here, right? */
9146   POSIXCC(UCHARAT(RExC_parse))) {
9147   const char c = UCHARAT(RExC_parse);
9148   char* const s = RExC_parse++;
9149
9150   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9151    RExC_parse++;
9152   if (RExC_parse == RExC_end)
9153    /* Grandfather lone [:, [=, [. */
9154    RExC_parse = s;
9155   else {
9156    const char* const t = RExC_parse++; /* skip over the c */
9157    assert(*t == c);
9158
9159    if (UCHARAT(RExC_parse) == ']') {
9160     const char *posixcc = s + 1;
9161     RExC_parse++; /* skip over the ending ] */
9162
9163     if (*s == ':') {
9164      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9165      const I32 skip = t - posixcc;
9166
9167      /* Initially switch on the length of the name.  */
9168      switch (skip) {
9169      case 4:
9170       if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9171        namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9172       break;
9173      case 5:
9174       /* Names all of length 5.  */
9175       /* alnum alpha ascii blank cntrl digit graph lower
9176       print punct space upper  */
9177       /* Offset 4 gives the best switch position.  */
9178       switch (posixcc[4]) {
9179       case 'a':
9180        if (memEQ(posixcc, "alph", 4)) /* alpha */
9181         namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9182        break;
9183       case 'e':
9184        if (memEQ(posixcc, "spac", 4)) /* space */
9185         namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9186        break;
9187       case 'h':
9188        if (memEQ(posixcc, "grap", 4)) /* graph */
9189         namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9190        break;
9191       case 'i':
9192        if (memEQ(posixcc, "asci", 4)) /* ascii */
9193         namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9194        break;
9195       case 'k':
9196        if (memEQ(posixcc, "blan", 4)) /* blank */
9197         namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9198        break;
9199       case 'l':
9200        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9201         namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9202        break;
9203       case 'm':
9204        if (memEQ(posixcc, "alnu", 4)) /* alnum */
9205         namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9206        break;
9207       case 'r':
9208        if (memEQ(posixcc, "lowe", 4)) /* lower */
9209         namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9210        else if (memEQ(posixcc, "uppe", 4)) /* upper */
9211         namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9212        break;
9213       case 't':
9214        if (memEQ(posixcc, "digi", 4)) /* digit */
9215         namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9216        else if (memEQ(posixcc, "prin", 4)) /* print */
9217         namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9218        else if (memEQ(posixcc, "punc", 4)) /* punct */
9219         namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9220        break;
9221       }
9222       break;
9223      case 6:
9224       if (memEQ(posixcc, "xdigit", 6))
9225        namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9226       break;
9227      }
9228
9229      if (namedclass == OOB_NAMEDCLASS)
9230       Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9231          t - s - 1, s + 1);
9232      assert (posixcc[skip] == ':');
9233      assert (posixcc[skip+1] == ']');
9234     } else if (!SIZE_ONLY) {
9235      /* [[=foo=]] and [[.foo.]] are still future. */
9236
9237      /* adjust RExC_parse so the warning shows after
9238      the class closes */
9239      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9240       RExC_parse++;
9241      Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9242     }
9243    } else {
9244     /* Maternal grandfather:
9245     * "[:" ending in ":" but not in ":]" */
9246     RExC_parse = s;
9247    }
9248   }
9249  }
9250
9251  return namedclass;
9252 }
9253
9254 STATIC void
9255 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9256 {
9257  dVAR;
9258
9259  PERL_ARGS_ASSERT_CHECKPOSIXCC;
9260
9261  if (POSIXCC(UCHARAT(RExC_parse))) {
9262   const char *s = RExC_parse;
9263   const char  c = *s++;
9264
9265   while (isALNUM(*s))
9266    s++;
9267   if (*s && c == *s && s[1] == ']') {
9268    ckWARN3reg(s+2,
9269      "POSIX syntax [%c %c] belongs inside character classes",
9270      c, c);
9271
9272    /* [[=foo=]] and [[.foo.]] are still future. */
9273    if (POSIXCC_NOTYET(c)) {
9274     /* adjust RExC_parse so the error shows after
9275     the class closes */
9276     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9277      NOOP;
9278     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9279    }
9280   }
9281  }
9282 }
9283
9284 /* No locale test, and always Unicode semantics */
9285 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
9286 ANYOF_##NAME:                                                                  \
9287   for (value = 0; value < 256; value++)                                  \
9288    if (TEST)                                                          \
9289    stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9290  yesno = '+';                                                               \
9291  what = WORD;                                                               \
9292  break;                                                                     \
9293 case ANYOF_N##NAME:                                                            \
9294   for (value = 0; value < 256; value++)                                  \
9295    if (!TEST)                                                         \
9296    stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9297  yesno = '!';                                                               \
9298  what = WORD;                                                               \
9299  break
9300
9301 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9302  * there are two tests passed in, to use depending on that. There aren't any
9303  * cases where the label is different from the name, so no need for that
9304  * parameter */
9305 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD)                                    \
9306 ANYOF_##NAME:                                                                  \
9307  if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
9308  else if (UNI_SEMANTICS) {                                                  \
9309   for (value = 0; value < 256; value++) {                                \
9310    if (TEST_8(value)) stored +=                                       \
9311      set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9312   }                                                                      \
9313  }                                                                          \
9314  else {                                                                     \
9315   for (value = 0; value < 128; value++) {                                \
9316    if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
9317     set_regclass_bit(pRExC_state, ret,                     \
9318         (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);                 \
9319   }                                                                      \
9320  }                                                                          \
9321  yesno = '+';                                                               \
9322  what = WORD;                                                               \
9323  break;                                                                     \
9324 case ANYOF_N##NAME:                                                            \
9325  if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
9326  else if (UNI_SEMANTICS) {                                                  \
9327   for (value = 0; value < 256; value++) {                                \
9328    if (! TEST_8(value)) stored +=                                     \
9329      set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);    \
9330   }                                                                      \
9331  }                                                                          \
9332  else {                                                                     \
9333   for (value = 0; value < 128; value++) {                                \
9334    if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
9335       pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);    \
9336   }                                                                      \
9337   if (AT_LEAST_ASCII_RESTRICTED) {                                       \
9338    for (value = 128; value < 256; value++) {                          \
9339    stored += set_regclass_bit(                                     \
9340       pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9341    }                                                                  \
9342    ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;                             \
9343   }                                                                      \
9344   else {                                                                 \
9345    /* For a non-ut8 target string with DEPENDS semantics, all above   \
9346    * ASCII Latin1 code points match the complement of any of the     \
9347    * classes.  But in utf8, they have their Unicode semantics, so    \
9348    * can't just set them in the bitmap, or else regexec.c will think \
9349    * they matched when they shouldn't. */                            \
9350    ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;                     \
9351   }                                                                      \
9352  }                                                                          \
9353  yesno = '!';                                                               \
9354  what = WORD;                                                               \
9355  break
9356
9357 STATIC U8
9358 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9359 {
9360
9361  /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9362  * Locale folding is done at run-time, so this function should not be
9363  * called for nodes that are for locales.
9364  *
9365  * This function sets the bit corresponding to the fold of the input
9366  * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
9367  * 'F' is 'f'.
9368  *
9369  * It also knows about the characters that are in the bitmap that have
9370  * folds that are matchable only outside it, and sets the appropriate lists
9371  * and flags.
9372  *
9373  * It returns the number of bits that actually changed from 0 to 1 */
9374
9375  U8 stored = 0;
9376  U8 fold;
9377
9378  PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9379
9380  fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9381          : PL_fold[value];
9382
9383  /* It assumes the bit for 'value' has already been set */
9384  if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9385   ANYOF_BITMAP_SET(node, fold);
9386   stored++;
9387  }
9388  if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9389   /* Certain Latin1 characters have matches outside the bitmap.  To get
9390   * here, 'value' is one of those characters.   None of these matches is
9391   * valid for ASCII characters under /aa, which have been excluded by
9392   * the 'if' above.  The matches fall into three categories:
9393   * 1) They are singly folded-to or -from an above 255 character, as
9394   *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9395   *    WITH DIAERESIS;
9396   * 2) They are part of a multi-char fold with another character in the
9397   *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9398   * 3) They are part of a multi-char fold with a character not in the
9399   *    bitmap, such as various ligatures.
9400   * We aren't dealing fully with multi-char folds, except we do deal
9401   * with the pattern containing a character that has a multi-char fold
9402   * (not so much the inverse).
9403   * For types 1) and 3), the matches only happen when the target string
9404   * is utf8; that's not true for 2), and we set a flag for it.
9405   *
9406   * The code below adds to the passed in inversion list the single fold
9407   * closures for 'value'.  The values are hard-coded here so that an
9408   * innocent-looking character class, like /[ks]/i won't have to go out
9409   * to disk to find the possible matches.  XXX It would be better to
9410   * generate these via regen, in case a new version of the Unicode
9411   * standard adds new mappings, though that is not really likely. */
9412   switch (value) {
9413    case 'k':
9414    case 'K':
9415     /* KELVIN SIGN */
9416     *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9417     break;
9418    case 's':
9419    case 'S':
9420     /* LATIN SMALL LETTER LONG S */
9421     *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9422     break;
9423    case MICRO_SIGN:
9424     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9425             GREEK_SMALL_LETTER_MU);
9426     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9427             GREEK_CAPITAL_LETTER_MU);
9428     break;
9429    case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9430    case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9431     /* ANGSTROM SIGN */
9432     *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9433     if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
9434      *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9435              PL_fold_latin1[value]);
9436     }
9437     break;
9438    case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9439     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9440           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9441     break;
9442    case LATIN_SMALL_LETTER_SHARP_S:
9443     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9444           LATIN_CAPITAL_LETTER_SHARP_S);
9445
9446     /* Under /a, /d, and /u, this can match the two chars "ss" */
9447     if (! MORE_ASCII_RESTRICTED) {
9448      add_alternate(alternate_ptr, (U8 *) "ss", 2);
9449
9450      /* And under /u or /a, it can match even if the target is
9451      * not utf8 */
9452      if (AT_LEAST_UNI_SEMANTICS) {
9453       ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9454      }
9455     }
9456     break;
9457    case 'F': case 'f':
9458    case 'I': case 'i':
9459    case 'L': case 'l':
9460    case 'T': case 't':
9461    case 'A': case 'a':
9462    case 'H': case 'h':
9463    case 'J': case 'j':
9464    case 'N': case 'n':
9465    case 'W': case 'w':
9466    case 'Y': case 'y':
9467     /* These all are targets of multi-character folds from code
9468     * points that require UTF8 to express, so they can't match
9469     * unless the target string is in UTF-8, so no action here is
9470     * necessary, as regexec.c properly handles the general case
9471     * for UTF-8 matching */
9472     break;
9473    default:
9474     /* Use deprecated warning to increase the chances of this
9475     * being output */
9476     ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9477     break;
9478   }
9479  }
9480  else if (DEPENDS_SEMANTICS
9481    && ! isASCII(value)
9482    && PL_fold_latin1[value] != value)
9483  {
9484   /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9485    * folds only when the target string is in UTF-8.  We add the fold
9486    * here to the list of things to match outside the bitmap, which
9487    * won't be looked at unless it is UTF8 (or else if something else
9488    * says to look even if not utf8, but those things better not happen
9489    * under DEPENDS semantics. */
9490   *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
9491  }
9492
9493  return stored;
9494 }
9495
9496
9497 PERL_STATIC_INLINE U8
9498 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9499 {
9500  /* This inline function sets a bit in the bitmap if not already set, and if
9501  * appropriate, its fold, returning the number of bits that actually
9502  * changed from 0 to 1 */
9503
9504  U8 stored;
9505
9506  PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9507
9508  if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
9509   return 0;
9510  }
9511
9512  ANYOF_BITMAP_SET(node, value);
9513  stored = 1;
9514
9515  if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
9516   stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
9517  }
9518
9519  return stored;
9520 }
9521
9522 STATIC void
9523 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9524 {
9525  /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9526  * alternate list, pointed to by 'alternate_ptr'.  This is an array of
9527  * the multi-character folds of characters in the node */
9528  SV *sv;
9529
9530  PERL_ARGS_ASSERT_ADD_ALTERNATE;
9531
9532  if (! *alternate_ptr) {
9533   *alternate_ptr = newAV();
9534  }
9535  sv = newSVpvn_utf8((char*)string, len, TRUE);
9536  av_push(*alternate_ptr, sv);
9537  return;
9538 }
9539
9540 /*
9541    parse a class specification and produce either an ANYOF node that
9542    matches the pattern or perhaps will be optimized into an EXACTish node
9543    instead. The node contains a bit map for the first 256 characters, with the
9544    corresponding bit set if that character is in the list.  For characters
9545    above 255, a range list is used */
9546
9547 STATIC regnode *
9548 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9549 {
9550  dVAR;
9551  register UV nextvalue;
9552  register IV prevvalue = OOB_UNICODE;
9553  register IV range = 0;
9554  UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9555  register regnode *ret;
9556  STRLEN numlen;
9557  IV namedclass;
9558  char *rangebegin = NULL;
9559  bool need_class = 0;
9560  bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
9561  SV *listsv = NULL;
9562  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9563          than just initialized.  */
9564  UV n;
9565
9566  /* code points this node matches that can't be stored in the bitmap */
9567  HV* nonbitmap = NULL;
9568
9569  /* The items that are to match that aren't stored in the bitmap, but are a
9570  * result of things that are stored there.  This is the fold closure of
9571  * such a character, either because it has DEPENDS semantics and shouldn't
9572  * be matched unless the target string is utf8, or is a code point that is
9573  * too large for the bit map, as for example, the fold of the MICRO SIGN is
9574  * above 255.  This all is solely for performance reasons.  By having this
9575  * code know the outside-the-bitmap folds that the bitmapped characters are
9576  * involved with, we don't have to go out to disk to find the list of
9577  * matches, unless the character class includes code points that aren't
9578  * storable in the bit map.  That means that a character class with an 's'
9579  * in it, for example, doesn't need to go out to disk to find everything
9580  * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
9581  * empty unless there is something whose fold we don't know about, and will
9582  * have to go out to the disk to find. */
9583  HV* l1_fold_invlist = NULL;
9584
9585  /* List of multi-character folds that are matched by this node */
9586  AV* unicode_alternate  = NULL;
9587 #ifdef EBCDIC
9588  UV literal_endpoint = 0;
9589 #endif
9590  UV stored = 0;  /* how many chars stored in the bitmap */
9591
9592  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9593   case we need to change the emitted regop to an EXACT. */
9594  const char * orig_parse = RExC_parse;
9595  GET_RE_DEBUG_FLAGS_DECL;
9596
9597  PERL_ARGS_ASSERT_REGCLASS;
9598 #ifndef DEBUGGING
9599  PERL_UNUSED_ARG(depth);
9600 #endif
9601
9602  DEBUG_PARSE("clas");
9603
9604  /* Assume we are going to generate an ANYOF node. */
9605  ret = reganode(pRExC_state, ANYOF, 0);
9606
9607
9608  if (!SIZE_ONLY) {
9609   ANYOF_FLAGS(ret) = 0;
9610  }
9611
9612  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
9613   RExC_naughty++;
9614   RExC_parse++;
9615   if (!SIZE_ONLY)
9616    ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9617
9618   /* We have decided to not allow multi-char folds in inverted character
9619   * classes, due to the confusion that can happen, even with classes
9620   * that are designed for a non-Unicode world:  You have the peculiar
9621   * case that:
9622    "s s" =~ /^[^\xDF]+$/i => Y
9623    "ss"  =~ /^[^\xDF]+$/i => N
9624   *
9625   * See [perl #89750] */
9626   allow_full_fold = FALSE;
9627  }
9628
9629  if (SIZE_ONLY) {
9630   RExC_size += ANYOF_SKIP;
9631   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9632  }
9633  else {
9634   RExC_emit += ANYOF_SKIP;
9635   if (LOC) {
9636    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9637   }
9638   ANYOF_BITMAP_ZERO(ret);
9639   listsv = newSVpvs("# comment\n");
9640   initial_listsv_len = SvCUR(listsv);
9641  }
9642
9643  nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9644
9645  if (!SIZE_ONLY && POSIXCC(nextvalue))
9646   checkposixcc(pRExC_state);
9647
9648  /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9649  if (UCHARAT(RExC_parse) == ']')
9650   goto charclassloop;
9651
9652 parseit:
9653  while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9654
9655  charclassloop:
9656
9657   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9658
9659   if (!range)
9660    rangebegin = RExC_parse;
9661   if (UTF) {
9662    value = utf8n_to_uvchr((U8*)RExC_parse,
9663         RExC_end - RExC_parse,
9664         &numlen, UTF8_ALLOW_DEFAULT);
9665    RExC_parse += numlen;
9666   }
9667   else
9668    value = UCHARAT(RExC_parse++);
9669
9670   nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9671   if (value == '[' && POSIXCC(nextvalue))
9672    namedclass = regpposixcc(pRExC_state, value);
9673   else if (value == '\\') {
9674    if (UTF) {
9675     value = utf8n_to_uvchr((U8*)RExC_parse,
9676         RExC_end - RExC_parse,
9677         &numlen, UTF8_ALLOW_DEFAULT);
9678     RExC_parse += numlen;
9679    }
9680    else
9681     value = UCHARAT(RExC_parse++);
9682    /* Some compilers cannot handle switching on 64-bit integer
9683    * values, therefore value cannot be an UV.  Yes, this will
9684    * be a problem later if we want switch on Unicode.
9685    * A similar issue a little bit later when switching on
9686    * namedclass. --jhi */
9687    switch ((I32)value) {
9688    case 'w': namedclass = ANYOF_ALNUM; break;
9689    case 'W': namedclass = ANYOF_NALNUM; break;
9690    case 's': namedclass = ANYOF_SPACE; break;
9691    case 'S': namedclass = ANYOF_NSPACE; break;
9692    case 'd': namedclass = ANYOF_DIGIT; break;
9693    case 'D': namedclass = ANYOF_NDIGIT; break;
9694    case 'v': namedclass = ANYOF_VERTWS; break;
9695    case 'V': namedclass = ANYOF_NVERTWS; break;
9696    case 'h': namedclass = ANYOF_HORIZWS; break;
9697    case 'H': namedclass = ANYOF_NHORIZWS; break;
9698    case 'N':  /* Handle \N{NAME} in class */
9699     {
9700      /* We only pay attention to the first char of
9701      multichar strings being returned. I kinda wonder
9702      if this makes sense as it does change the behaviour
9703      from earlier versions, OTOH that behaviour was broken
9704      as well. */
9705      UV v; /* value is register so we cant & it /grrr */
9706      if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
9707       goto parseit;
9708      }
9709      value= v;
9710     }
9711     break;
9712    case 'p':
9713    case 'P':
9714     {
9715     char *e;
9716     if (RExC_parse >= RExC_end)
9717      vFAIL2("Empty \\%c{}", (U8)value);
9718     if (*RExC_parse == '{') {
9719      const U8 c = (U8)value;
9720      e = strchr(RExC_parse++, '}');
9721      if (!e)
9722       vFAIL2("Missing right brace on \\%c{}", c);
9723      while (isSPACE(UCHARAT(RExC_parse)))
9724       RExC_parse++;
9725      if (e == RExC_parse)
9726       vFAIL2("Empty \\%c{}", c);
9727      n = e - RExC_parse;
9728      while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9729       n--;
9730     }
9731     else {
9732      e = RExC_parse;
9733      n = 1;
9734     }
9735     if (!SIZE_ONLY) {
9736      if (UCHARAT(RExC_parse) == '^') {
9737       RExC_parse++;
9738       n--;
9739       value = value == 'p' ? 'P' : 'p'; /* toggle */
9740       while (isSPACE(UCHARAT(RExC_parse))) {
9741        RExC_parse++;
9742        n--;
9743       }
9744      }
9745
9746      /* Add the property name to the list.  If /i matching, give
9747      * a different name which consists of the normal name
9748      * sandwiched between two underscores and '_i'.  The design
9749      * is discussed in the commit message for this. */
9750      Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9751           (value=='p' ? '+' : '!'),
9752           (FOLD) ? "__" : "",
9753           (int)n,
9754           RExC_parse,
9755           (FOLD) ? "_i" : ""
9756          );
9757     }
9758     RExC_parse = e + 1;
9759
9760     /* The \p could match something in the Latin1 range, hence
9761     * something that isn't utf8 */
9762     ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9763     namedclass = ANYOF_MAX;  /* no official name, but it's named */
9764
9765     /* \p means they want Unicode semantics */
9766     RExC_uni_semantics = 1;
9767     }
9768     break;
9769    case 'n': value = '\n';   break;
9770    case 'r': value = '\r';   break;
9771    case 't': value = '\t';   break;
9772    case 'f': value = '\f';   break;
9773    case 'b': value = '\b';   break;
9774    case 'e': value = ASCII_TO_NATIVE('\033');break;
9775    case 'a': value = ASCII_TO_NATIVE('\007');break;
9776    case 'o':
9777     RExC_parse--; /* function expects to be pointed at the 'o' */
9778     {
9779      const char* error_msg;
9780      bool valid = grok_bslash_o(RExC_parse,
9781            &value,
9782            &numlen,
9783            &error_msg,
9784            SIZE_ONLY);
9785      RExC_parse += numlen;
9786      if (! valid) {
9787       vFAIL(error_msg);
9788      }
9789     }
9790     if (PL_encoding && value < 0x100) {
9791      goto recode_encoding;
9792     }
9793     break;
9794    case 'x':
9795     if (*RExC_parse == '{') {
9796      I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9797       | PERL_SCAN_DISALLOW_PREFIX;
9798      char * const e = strchr(RExC_parse++, '}');
9799      if (!e)
9800       vFAIL("Missing right brace on \\x{}");
9801
9802      numlen = e - RExC_parse;
9803      value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9804      RExC_parse = e + 1;
9805     }
9806     else {
9807      I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9808      numlen = 2;
9809      value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9810      RExC_parse += numlen;
9811     }
9812     if (PL_encoding && value < 0x100)
9813      goto recode_encoding;
9814     break;
9815    case 'c':
9816     value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9817     break;
9818    case '0': case '1': case '2': case '3': case '4':
9819    case '5': case '6': case '7':
9820     {
9821      /* Take 1-3 octal digits */
9822      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9823      numlen = 3;
9824      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9825      RExC_parse += numlen;
9826      if (PL_encoding && value < 0x100)
9827       goto recode_encoding;
9828      break;
9829     }
9830    recode_encoding:
9831     if (! RExC_override_recoding) {
9832      SV* enc = PL_encoding;
9833      value = reg_recode((const char)(U8)value, &enc);
9834      if (!enc && SIZE_ONLY)
9835       ckWARNreg(RExC_parse,
9836         "Invalid escape in the specified encoding");
9837      break;
9838     }
9839    default:
9840     /* Allow \_ to not give an error */
9841     if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9842      ckWARN2reg(RExC_parse,
9843        "Unrecognized escape \\%c in character class passed through",
9844        (int)value);
9845     }
9846     break;
9847    }
9848   } /* end of \blah */
9849 #ifdef EBCDIC
9850   else
9851    literal_endpoint++;
9852 #endif
9853
9854   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9855
9856    /* What matches in a locale is not known until runtime, so need to
9857    * (one time per class) allocate extra space to pass to regexec.
9858    * The space will contain a bit for each named class that is to be
9859    * matched against.  This isn't needed for \p{} and pseudo-classes,
9860    * as they are not affected by locale, and hence are dealt with
9861    * separately */
9862    if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9863     need_class = 1;
9864     if (SIZE_ONLY) {
9865      RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9866     }
9867     else {
9868      RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9869      ANYOF_CLASS_ZERO(ret);
9870     }
9871     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9872    }
9873
9874    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
9875    * literal, as is the character that began the false range, i.e.
9876    * the 'a' in the examples */
9877    if (range) {
9878     if (!SIZE_ONLY) {
9879      const int w =
9880       RExC_parse >= rangebegin ?
9881       RExC_parse - rangebegin : 0;
9882      ckWARN4reg(RExC_parse,
9883        "False [] range \"%*.*s\"",
9884        w, w, rangebegin);
9885
9886      stored +=
9887       set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9888      if (prevvalue < 256) {
9889       stored +=
9890       set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
9891      }
9892      else {
9893       nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
9894      }
9895     }
9896
9897     range = 0; /* this was not a true range */
9898    }
9899
9900
9901
9902    if (!SIZE_ONLY) {
9903     const char *what = NULL;
9904     char yesno = 0;
9905
9906     /* Possible truncation here but in some 64-bit environments
9907     * the compiler gets heartburn about switch on 64-bit values.
9908     * A similar issue a little earlier when switching on value.
9909     * --jhi */
9910     switch ((I32)namedclass) {
9911
9912     case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9913     case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9914     case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9915     case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9916     case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9917     case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9918     case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9919     case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9920     case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9921     case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9922     /* \s, \w match all unicode if utf8. */
9923     case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9924     case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9925     case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9926     case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9927     case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9928     case ANYOF_ASCII:
9929      if (LOC)
9930       ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9931      else {
9932       for (value = 0; value < 128; value++)
9933        stored +=
9934        set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9935      }
9936      yesno = '+';
9937      what = NULL; /* Doesn't match outside ascii, so
9938           don't want to add +utf8:: */
9939      break;
9940     case ANYOF_NASCII:
9941      if (LOC)
9942       ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9943      else {
9944       for (value = 128; value < 256; value++)
9945        stored +=
9946        set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9947      }
9948      ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9949      yesno = '!';
9950      what = "ASCII";
9951      break;
9952     case ANYOF_DIGIT:
9953      if (LOC)
9954       ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9955      else {
9956       /* consecutive digits assumed */
9957       for (value = '0'; value <= '9'; value++)
9958        stored +=
9959        set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9960      }
9961      yesno = '+';
9962      what = "Digit";
9963      break;
9964     case ANYOF_NDIGIT:
9965      if (LOC)
9966       ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9967      else {
9968       /* consecutive digits assumed */
9969       for (value = 0; value < '0'; value++)
9970        stored +=
9971        set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9972       for (value = '9' + 1; value < 256; value++)
9973        stored +=
9974        set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9975      }
9976      yesno = '!';
9977      what = "Digit";
9978      if (AT_LEAST_ASCII_RESTRICTED ) {
9979       ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9980      }
9981      break;
9982     case ANYOF_MAX:
9983      /* this is to handle \p and \P */
9984      break;
9985     default:
9986      vFAIL("Invalid [::] class");
9987      break;
9988     }
9989     if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9990      /* Strings such as "+utf8::isWord\n" */
9991      Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9992     }
9993
9994     continue;
9995    }
9996   } /* end of namedclass \blah */
9997
9998   if (range) {
9999    if (prevvalue > (IV)value) /* b-a */ {
10000     const int w = RExC_parse - rangebegin;
10001     Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
10002     range = 0; /* not a valid range */
10003    }
10004   }
10005   else {
10006    prevvalue = value; /* save the beginning of the range */
10007    if (RExC_parse+1 < RExC_end
10008     && *RExC_parse == '-'
10009     && RExC_parse[1] != ']')
10010    {
10011     RExC_parse++;
10012
10013     /* a bad range like \w-, [:word:]- ? */
10014     if (namedclass > OOB_NAMEDCLASS) {
10015      if (ckWARN(WARN_REGEXP)) {
10016       const int w =
10017        RExC_parse >= rangebegin ?
10018        RExC_parse - rangebegin : 0;
10019       vWARN4(RExC_parse,
10020        "False [] range \"%*.*s\"",
10021        w, w, rangebegin);
10022      }
10023      if (!SIZE_ONLY)
10024       stored +=
10025        set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10026     } else
10027      range = 1; /* yeah, it's a range! */
10028     continue; /* but do it the next time */
10029    }
10030   }
10031
10032   /* non-Latin1 code point implies unicode semantics.  Must be set in
10033   * pass1 so is there for the whole of pass 2 */
10034   if (value > 255) {
10035    RExC_uni_semantics = 1;
10036   }
10037
10038   /* now is the next time */
10039   if (!SIZE_ONLY) {
10040    if (prevvalue < 256) {
10041     const IV ceilvalue = value < 256 ? value : 255;
10042     IV i;
10043 #ifdef EBCDIC
10044     /* In EBCDIC [\x89-\x91] should include
10045     * the \x8e but [i-j] should not. */
10046     if (literal_endpoint == 2 &&
10047      ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
10048      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
10049     {
10050      if (isLOWER(prevvalue)) {
10051       for (i = prevvalue; i <= ceilvalue; i++)
10052        if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
10053         stored +=
10054         set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10055        }
10056      } else {
10057       for (i = prevvalue; i <= ceilvalue; i++)
10058        if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
10059         stored +=
10060         set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10061        }
10062      }
10063     }
10064     else
10065 #endif
10066      for (i = prevvalue; i <= ceilvalue; i++) {
10067       stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10068      }
10069   }
10070   if (value > 255) {
10071    const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
10072    const UV natvalue      = NATIVE_TO_UNI(value);
10073    nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
10074   }
10075 #ifdef EBCDIC
10076    literal_endpoint = 0;
10077 #endif
10078   }
10079
10080   range = 0; /* this range (if it was one) is done now */
10081  }
10082
10083
10084
10085  if (SIZE_ONLY)
10086   return ret;
10087  /****** !SIZE_ONLY AFTER HERE *********/
10088
10089  /* If folding and there are code points above 255, we calculate all
10090  * characters that could fold to or from the ones already on the list */
10091  if (FOLD && nonbitmap) {
10092   UV i;
10093
10094   HV* fold_intersection;
10095   UV* fold_list;
10096
10097   /* This is a list of all the characters that participate in folds
10098    * (except marks, etc in multi-char folds */
10099   if (! PL_utf8_foldable) {
10100    SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
10101    PL_utf8_foldable = _swash_to_invlist(swash);
10102   }
10103
10104   /* This is a hash that for a particular fold gives all characters
10105    * that are involved in it */
10106   if (! PL_utf8_foldclosures) {
10107
10108    /* If we were unable to find any folds, then we likely won't be
10109    * able to find the closures.  So just create an empty list.
10110    * Folding will effectively be restricted to the non-Unicode rules
10111    * hard-coded into Perl.  (This case happens legitimately during
10112    * compilation of Perl itself before the Unicode tables are
10113    * generated) */
10114    if (invlist_len(PL_utf8_foldable) == 0) {
10115     PL_utf8_foldclosures = _new_invlist(0);
10116    } else {
10117     /* If the folds haven't been read in, call a fold function
10118      * to force that */
10119     if (! PL_utf8_tofold) {
10120      U8 dummy[UTF8_MAXBYTES+1];
10121      STRLEN dummy_len;
10122      to_utf8_fold((U8*) "A", dummy, &dummy_len);
10123     }
10124     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10125    }
10126   }
10127
10128   /* Only the characters in this class that participate in folds need
10129    * be checked.  Get the intersection of this class and all the
10130    * possible characters that are foldable.  This can quickly narrow
10131    * down a large class */
10132   fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
10133
10134   /* Now look at the foldable characters in this class individually */
10135   fold_list = invlist_array(fold_intersection);
10136   for (i = 0; i < invlist_len(fold_intersection); i++) {
10137    UV j;
10138
10139    /* The next entry is the beginning of the range that is in the
10140    * class */
10141    UV start = fold_list[i++];
10142
10143
10144    /* The next entry is the beginning of the next range, which
10145     * isn't in the class, so the end of the current range is one
10146     * less than that */
10147    UV end = fold_list[i] - 1;
10148
10149    /* Look at every character in the range */
10150    for (j = start; j <= end; j++) {
10151
10152     /* Get its fold */
10153     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10154     STRLEN foldlen;
10155     const UV f =
10156      _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
10157
10158     if (foldlen > (STRLEN)UNISKIP(f)) {
10159
10160      /* Any multicharacter foldings (disallowed in
10161       * lookbehind patterns) require the following
10162       * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10163       * E folds into "pq" and F folds into "rst", all other
10164       * characters fold to single characters.  We save away
10165       * these multicharacter foldings, to be later saved as
10166       * part of the additional "s" data. */
10167      if (! RExC_in_lookbehind) {
10168       U8* loc = foldbuf;
10169       U8* e = foldbuf + foldlen;
10170
10171       /* If any of the folded characters of this are in
10172        * the Latin1 range, tell the regex engine that
10173        * this can match a non-utf8 target string.  The
10174        * only multi-byte fold whose source is in the
10175        * Latin1 range (U+00DF) applies only when the
10176        * target string is utf8, or under unicode rules */
10177       if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10178        while (loc < e) {
10179
10180         /* Can't mix ascii with non- under /aa */
10181         if (MORE_ASCII_RESTRICTED
10182          && (isASCII(*loc) != isASCII(j)))
10183         {
10184          goto end_multi_fold;
10185         }
10186         if (UTF8_IS_INVARIANT(*loc)
10187          || UTF8_IS_DOWNGRADEABLE_START(*loc))
10188         {
10189          /* Can't mix above and below 256 under
10190           * LOC */
10191          if (LOC) {
10192           goto end_multi_fold;
10193          }
10194          ANYOF_FLAGS(ret)
10195            |= ANYOF_NONBITMAP_NON_UTF8;
10196          break;
10197         }
10198         loc += UTF8SKIP(loc);
10199        }
10200       }
10201
10202       add_alternate(&unicode_alternate, foldbuf, foldlen);
10203      end_multi_fold: ;
10204      }
10205
10206      /* This is special-cased, as it is the only letter which
10207      * has both a multi-fold and single-fold in Latin1.  All
10208      * the other chars that have single and multi-folds are
10209      * always in utf8, and the utf8 folding algorithm catches
10210      * them */
10211      if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
10212       stored += set_regclass_bit(pRExC_state,
10213           ret,
10214           LATIN_SMALL_LETTER_SHARP_S,
10215           &l1_fold_invlist, &unicode_alternate);
10216      }
10217     }
10218     else {
10219      /* Single character fold.  Add everything in its fold
10220       * closure to the list that this node should match */
10221      SV** listp;
10222
10223      /* The fold closures data structure is a hash with the
10224       * keys being every character that is folded to, like
10225       * 'k', and the values each an array of everything that
10226       * folds to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
10227      if ((listp = hv_fetch(PL_utf8_foldclosures,
10228          (char *) foldbuf, foldlen, FALSE)))
10229      {
10230       AV* list = (AV*) *listp;
10231       IV k;
10232       for (k = 0; k <= av_len(list); k++) {
10233        SV** c_p = av_fetch(list, k, FALSE);
10234        UV c;
10235        if (c_p == NULL) {
10236         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10237        }
10238        c = SvUV(*c_p);
10239
10240        /* /aa doesn't allow folds between ASCII and
10241         * non-; /l doesn't allow them between above
10242         * and below 256 */
10243        if ((MORE_ASCII_RESTRICTED
10244         && (isASCII(c) != isASCII(j)))
10245          || (LOC && ((c < 256) != (j < 256))))
10246        {
10247         continue;
10248        }
10249
10250        if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10251         stored += set_regclass_bit(pRExC_state,
10252           ret,
10253           (U8) c,
10254           &l1_fold_invlist, &unicode_alternate);
10255        }
10256         /* It may be that the code point is already
10257          * in this range or already in the bitmap,
10258          * in which case we need do nothing */
10259        else if ((c < start || c > end)
10260           && (c > 255
10261            || ! ANYOF_BITMAP_TEST(ret, c)))
10262        {
10263         nonbitmap = add_cp_to_invlist(nonbitmap, c);
10264        }
10265       }
10266      }
10267     }
10268    }
10269   }
10270   invlist_destroy(fold_intersection);
10271  }
10272
10273  /* Combine the two lists into one. */
10274  if (l1_fold_invlist) {
10275   if (nonbitmap) {
10276    nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
10277   }
10278   else {
10279    nonbitmap = l1_fold_invlist;
10280   }
10281  }
10282
10283  /* Here, we have calculated what code points should be in the character
10284  * class.   Now we can see about various optimizations.  Fold calculation
10285  * needs to take place before inversion.  Otherwise /[^k]/i would invert to
10286  * include K, which under /i would match k. */
10287
10288  /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
10289  * set the FOLD flag yet, so this this does optimize those.  It doesn't
10290  * optimize locale.  Doing so perhaps could be done as long as there is
10291  * nothing like \w in it; some thought also would have to be given to the
10292  * interaction with above 0x100 chars */
10293  if (! LOC
10294   && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10295   && ! unicode_alternate
10296   && ! nonbitmap
10297   && SvCUR(listsv) == initial_listsv_len)
10298  {
10299   for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10300    ANYOF_BITMAP(ret)[value] ^= 0xFF;
10301   stored = 256 - stored;
10302
10303   /* The inversion means that everything above 255 is matched; and at the
10304   * same time we clear the invert flag */
10305   ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
10306  }
10307
10308  /* Folding in the bitmap is taken care of above, but not for locale (for
10309  * which we have to wait to see what folding is in effect at runtime), and
10310  * for things not in the bitmap.  Set run-time fold flag for these */
10311  if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
10312   ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10313  }
10314
10315  /* A single character class can be "optimized" into an EXACTish node.
10316  * Note that since we don't currently count how many characters there are
10317  * outside the bitmap, we are XXX missing optimization possibilities for
10318  * them.  This optimization can't happen unless this is a truly single
10319  * character class, which means that it can't be an inversion into a
10320  * many-character class, and there must be no possibility of there being
10321  * things outside the bitmap.  'stored' (only) for locales doesn't include
10322  * \w, etc, so have to make a special test that they aren't present
10323  *
10324  * Similarly A 2-character class of the very special form like [bB] can be
10325  * optimized into an EXACTFish node, but only for non-locales, and for
10326  * characters which only have the two folds; so things like 'fF' and 'Ii'
10327  * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10328  * FI'. */
10329  if (! nonbitmap
10330   && ! unicode_alternate
10331   && SvCUR(listsv) == initial_listsv_len
10332   && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
10333   && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10334        || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10335    || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10336         && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10337         /* If the latest code point has a fold whose
10338         * bit is set, it must be the only other one */
10339         && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10340         && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10341  {
10342   /* Note that the information needed to decide to do this optimization
10343   * is not currently available until the 2nd pass, and that the actually
10344   * used EXACTish node takes less space than the calculated ANYOF node,
10345   * and hence the amount of space calculated in the first pass is larger
10346   * than actually used, so this optimization doesn't gain us any space.
10347   * But an EXACT node is faster than an ANYOF node, and can be combined
10348   * with any adjacent EXACT nodes later by the optimizer for further
10349   * gains.  The speed of executing an EXACTF is similar to an ANYOF
10350   * node, so the optimization advantage comes from the ability to join
10351   * it to adjacent EXACT nodes */
10352
10353   const char * cur_parse= RExC_parse;
10354   U8 op;
10355   RExC_emit = (regnode *)orig_emit;
10356   RExC_parse = (char *)orig_parse;
10357
10358   if (stored == 1) {
10359
10360    /* A locale node with one point can be folded; all the other cases
10361    * with folding will have two points, since we calculate them above
10362    */
10363    if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10364     op = EXACTFL;
10365    }
10366    else {
10367     op = EXACT;
10368    }
10369   }   /* else 2 chars in the bit map: the folds of each other */
10370   else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10371
10372    /* To join adjacent nodes, they must be the exact EXACTish type.
10373    * Try to use the most likely type, by using EXACTFU if the regex
10374    * calls for them, or is required because the character is
10375    * non-ASCII */
10376    op = EXACTFU;
10377   }
10378   else {    /* Otherwise, more likely to be EXACTF type */
10379    op = EXACTF;
10380   }
10381
10382   ret = reg_node(pRExC_state, op);
10383   RExC_parse = (char *)cur_parse;
10384   if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10385    *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10386    *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10387    STR_LEN(ret)= 2;
10388    RExC_emit += STR_SZ(2);
10389   }
10390   else {
10391    *STRING(ret)= (char)value;
10392    STR_LEN(ret)= 1;
10393    RExC_emit += STR_SZ(1);
10394   }
10395   SvREFCNT_dec(listsv);
10396   return ret;
10397  }
10398
10399  if (nonbitmap) {
10400   UV* nonbitmap_array = invlist_array(nonbitmap);
10401   UV nonbitmap_len = invlist_len(nonbitmap);
10402   UV i;
10403
10404   /*  Here have the full list of items to match that aren't in the
10405   *  bitmap.  Convert to the structure that the rest of the code is
10406   *  expecting.   XXX That rest of the code should convert to this
10407   *  structure */
10408   for (i = 0; i < nonbitmap_len; i++) {
10409
10410    /* The next entry is the beginning of the range that is in the
10411    * class */
10412    UV start = nonbitmap_array[i++];
10413    UV end;
10414
10415    /* The next entry is the beginning of the next range, which isn't
10416    * in the class, so the end of the current range is one less than
10417    * that.  But if there is no next range, it means that the range
10418    * begun by 'start' extends to infinity, which for this platform
10419    * ends at UV_MAX */
10420    if (i == nonbitmap_len) {
10421     end = UV_MAX;
10422    }
10423    else {
10424     end = nonbitmap_array[i] - 1;
10425    }
10426
10427    if (start == end) {
10428     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10429    }
10430    else {
10431     /* The \t sets the whole range */
10432     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10433       /* XXX EBCDIC */
10434         start, end);
10435    }
10436   }
10437   invlist_destroy(nonbitmap);
10438  }
10439
10440  if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10441   ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10442   SvREFCNT_dec(listsv);
10443   SvREFCNT_dec(unicode_alternate);
10444  }
10445  else {
10446
10447   AV * const av = newAV();
10448   SV *rv;
10449   /* The 0th element stores the character class description
10450   * in its textual form: used later (regexec.c:Perl_regclass_swash())
10451   * to initialize the appropriate swash (which gets stored in
10452   * the 1st element), and also useful for dumping the regnode.
10453   * The 2nd element stores the multicharacter foldings,
10454   * used later (regexec.c:S_reginclass()). */
10455   av_store(av, 0, listsv);
10456   av_store(av, 1, NULL);
10457
10458   /* Store any computed multi-char folds only if we are allowing
10459   * them */
10460   if (allow_full_fold) {
10461    av_store(av, 2, MUTABLE_SV(unicode_alternate));
10462    if (unicode_alternate) { /* This node is variable length */
10463     OP(ret) = ANYOFV;
10464    }
10465   }
10466   else {
10467    av_store(av, 2, NULL);
10468   }
10469   rv = newRV_noinc(MUTABLE_SV(av));
10470   n = add_data(pRExC_state, 1, "s");
10471   RExC_rxi->data->data[n] = (void*)rv;
10472   ARG_SET(ret, n);
10473  }
10474  return ret;
10475 }
10476 #undef _C_C_T_
10477
10478
10479 /* reg_skipcomment()
10480
10481    Absorbs an /x style # comments from the input stream.
10482    Returns true if there is more text remaining in the stream.
10483    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10484    terminates the pattern without including a newline.
10485
10486    Note its the callers responsibility to ensure that we are
10487    actually in /x mode
10488
10489 */
10490
10491 STATIC bool
10492 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10493 {
10494  bool ended = 0;
10495
10496  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10497
10498  while (RExC_parse < RExC_end)
10499   if (*RExC_parse++ == '\n') {
10500    ended = 1;
10501    break;
10502   }
10503  if (!ended) {
10504   /* we ran off the end of the pattern without ending
10505   the comment, so we have to add an \n when wrapping */
10506   RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10507   return 0;
10508  } else
10509   return 1;
10510 }
10511
10512 /* nextchar()
10513
10514    Advances the parse position, and optionally absorbs
10515    "whitespace" from the inputstream.
10516
10517    Without /x "whitespace" means (?#...) style comments only,
10518    with /x this means (?#...) and # comments and whitespace proper.
10519
10520    Returns the RExC_parse point from BEFORE the scan occurs.
10521
10522    This is the /x friendly way of saying RExC_parse++.
10523 */
10524
10525 STATIC char*
10526 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10527 {
10528  char* const retval = RExC_parse++;
10529
10530  PERL_ARGS_ASSERT_NEXTCHAR;
10531
10532  for (;;) {
10533   if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10534     RExC_parse[2] == '#') {
10535    while (*RExC_parse != ')') {
10536     if (RExC_parse == RExC_end)
10537      FAIL("Sequence (?#... not terminated");
10538     RExC_parse++;
10539    }
10540    RExC_parse++;
10541    continue;
10542   }
10543   if (RExC_flags & RXf_PMf_EXTENDED) {
10544    if (isSPACE(*RExC_parse)) {
10545     RExC_parse++;
10546     continue;
10547    }
10548    else if (*RExC_parse == '#') {
10549     if ( reg_skipcomment( pRExC_state ) )
10550      continue;
10551    }
10552   }
10553   return retval;
10554  }
10555 }
10556
10557 /*
10558 - reg_node - emit a node
10559 */
10560 STATIC regnode *   /* Location. */
10561 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10562 {
10563  dVAR;
10564  register regnode *ptr;
10565  regnode * const ret = RExC_emit;
10566  GET_RE_DEBUG_FLAGS_DECL;
10567
10568  PERL_ARGS_ASSERT_REG_NODE;
10569
10570  if (SIZE_ONLY) {
10571   SIZE_ALIGN(RExC_size);
10572   RExC_size += 1;
10573   return(ret);
10574  }
10575  if (RExC_emit >= RExC_emit_bound)
10576   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10577
10578  NODE_ALIGN_FILL(ret);
10579  ptr = ret;
10580  FILL_ADVANCE_NODE(ptr, op);
10581  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
10582 #ifdef RE_TRACK_PATTERN_OFFSETS
10583  if (RExC_offsets) {         /* MJD */
10584   MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10585    "reg_node", __LINE__,
10586    PL_reg_name[op],
10587    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10588     ? "Overwriting end of array!\n" : "OK",
10589    (UV)(RExC_emit - RExC_emit_start),
10590    (UV)(RExC_parse - RExC_start),
10591    (UV)RExC_offsets[0]));
10592   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10593  }
10594 #endif
10595  RExC_emit = ptr;
10596  return(ret);
10597 }
10598
10599 /*
10600 - reganode - emit a node with an argument
10601 */
10602 STATIC regnode *   /* Location. */
10603 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10604 {
10605  dVAR;
10606  register regnode *ptr;
10607  regnode * const ret = RExC_emit;
10608  GET_RE_DEBUG_FLAGS_DECL;
10609
10610  PERL_ARGS_ASSERT_REGANODE;
10611
10612  if (SIZE_ONLY) {
10613   SIZE_ALIGN(RExC_size);
10614   RExC_size += 2;
10615   /*
10616   We can't do this:
10617
10618   assert(2==regarglen[op]+1);
10619
10620   Anything larger than this has to allocate the extra amount.
10621   If we changed this to be:
10622
10623   RExC_size += (1 + regarglen[op]);
10624
10625   then it wouldn't matter. Its not clear what side effect
10626   might come from that so its not done so far.
10627   -- dmq
10628   */
10629   return(ret);
10630  }
10631  if (RExC_emit >= RExC_emit_bound)
10632   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10633
10634  NODE_ALIGN_FILL(ret);
10635  ptr = ret;
10636  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10637  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
10638 #ifdef RE_TRACK_PATTERN_OFFSETS
10639  if (RExC_offsets) {         /* MJD */
10640   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10641    "reganode",
10642    __LINE__,
10643    PL_reg_name[op],
10644    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10645    "Overwriting end of array!\n" : "OK",
10646    (UV)(RExC_emit - RExC_emit_start),
10647    (UV)(RExC_parse - RExC_start),
10648    (UV)RExC_offsets[0]));
10649   Set_Cur_Node_Offset;
10650  }
10651 #endif
10652  RExC_emit = ptr;
10653  return(ret);
10654 }
10655
10656 /*
10657 - reguni - emit (if appropriate) a Unicode character
10658 */
10659 STATIC STRLEN
10660 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10661 {
10662  dVAR;
10663
10664  PERL_ARGS_ASSERT_REGUNI;
10665
10666  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10667 }
10668
10669 /*
10670 - reginsert - insert an operator in front of already-emitted operand
10671 *
10672 * Means relocating the operand.
10673 */
10674 STATIC void
10675 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10676 {
10677  dVAR;
10678  register regnode *src;
10679  register regnode *dst;
10680  register regnode *place;
10681  const int offset = regarglen[(U8)op];
10682  const int size = NODE_STEP_REGNODE + offset;
10683  GET_RE_DEBUG_FLAGS_DECL;
10684
10685  PERL_ARGS_ASSERT_REGINSERT;
10686  PERL_UNUSED_ARG(depth);
10687 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10688  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10689  if (SIZE_ONLY) {
10690   RExC_size += size;
10691   return;
10692  }
10693
10694  src = RExC_emit;
10695  RExC_emit += size;
10696  dst = RExC_emit;
10697  if (RExC_open_parens) {
10698   int paren;
10699   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10700   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10701    if ( RExC_open_parens[paren] >= opnd ) {
10702     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10703     RExC_open_parens[paren] += size;
10704    } else {
10705     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10706    }
10707    if ( RExC_close_parens[paren] >= opnd ) {
10708     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10709     RExC_close_parens[paren] += size;
10710    } else {
10711     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10712    }
10713   }
10714  }
10715
10716  while (src > opnd) {
10717   StructCopy(--src, --dst, regnode);
10718 #ifdef RE_TRACK_PATTERN_OFFSETS
10719   if (RExC_offsets) {     /* MJD 20010112 */
10720    MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10721     "reg_insert",
10722     __LINE__,
10723     PL_reg_name[op],
10724     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10725      ? "Overwriting end of array!\n" : "OK",
10726     (UV)(src - RExC_emit_start),
10727     (UV)(dst - RExC_emit_start),
10728     (UV)RExC_offsets[0]));
10729    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10730    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10731   }
10732 #endif
10733  }
10734
10735
10736  place = opnd;  /* Op node, where operand used to be. */
10737 #ifdef RE_TRACK_PATTERN_OFFSETS
10738  if (RExC_offsets) {         /* MJD */
10739   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10740    "reginsert",
10741    __LINE__,
10742    PL_reg_name[op],
10743    (UV)(place - RExC_emit_start) > RExC_offsets[0]
10744    ? "Overwriting end of array!\n" : "OK",
10745    (UV)(place - RExC_emit_start),
10746    (UV)(RExC_parse - RExC_start),
10747    (UV)RExC_offsets[0]));
10748   Set_Node_Offset(place, RExC_parse);
10749   Set_Node_Length(place, 1);
10750  }
10751 #endif
10752  src = NEXTOPER(place);
10753  FILL_ADVANCE_NODE(place, op);
10754  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
10755  Zero(src, offset, regnode);
10756 }
10757
10758 /*
10759 - regtail - set the next-pointer at the end of a node chain of p to val.
10760 - SEE ALSO: regtail_study
10761 */
10762 /* TODO: All three parms should be const */
10763 STATIC void
10764 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10765 {
10766  dVAR;
10767  register regnode *scan;
10768  GET_RE_DEBUG_FLAGS_DECL;
10769
10770  PERL_ARGS_ASSERT_REGTAIL;
10771 #ifndef DEBUGGING
10772  PERL_UNUSED_ARG(depth);
10773 #endif
10774
10775  if (SIZE_ONLY)
10776   return;
10777
10778  /* Find last node. */
10779  scan = p;
10780  for (;;) {
10781   regnode * const temp = regnext(scan);
10782   DEBUG_PARSE_r({
10783    SV * const mysv=sv_newmortal();
10784    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10785    regprop(RExC_rx, mysv, scan);
10786    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10787     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10788      (temp == NULL ? "->" : ""),
10789      (temp == NULL ? PL_reg_name[OP(val)] : "")
10790    );
10791   });
10792   if (temp == NULL)
10793    break;
10794   scan = temp;
10795  }
10796
10797  if (reg_off_by_arg[OP(scan)]) {
10798   ARG_SET(scan, val - scan);
10799  }
10800  else {
10801   NEXT_OFF(scan) = val - scan;
10802  }
10803 }
10804
10805 #ifdef DEBUGGING
10806 /*
10807 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10808 - Look for optimizable sequences at the same time.
10809 - currently only looks for EXACT chains.
10810
10811 This is experimental code. The idea is to use this routine to perform
10812 in place optimizations on branches and groups as they are constructed,
10813 with the long term intention of removing optimization from study_chunk so
10814 that it is purely analytical.
10815
10816 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10817 to control which is which.
10818
10819 */
10820 /* TODO: All four parms should be const */
10821
10822 STATIC U8
10823 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10824 {
10825  dVAR;
10826  register regnode *scan;
10827  U8 exact = PSEUDO;
10828 #ifdef EXPERIMENTAL_INPLACESCAN
10829  I32 min = 0;
10830 #endif
10831  GET_RE_DEBUG_FLAGS_DECL;
10832
10833  PERL_ARGS_ASSERT_REGTAIL_STUDY;
10834
10835
10836  if (SIZE_ONLY)
10837   return exact;
10838
10839  /* Find last node. */
10840
10841  scan = p;
10842  for (;;) {
10843   regnode * const temp = regnext(scan);
10844 #ifdef EXPERIMENTAL_INPLACESCAN
10845   if (PL_regkind[OP(scan)] == EXACT)
10846    if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10847     return EXACT;
10848 #endif
10849   if ( exact ) {
10850    switch (OP(scan)) {
10851     case EXACT:
10852     case EXACTF:
10853     case EXACTFA:
10854     case EXACTFU:
10855     case EXACTFL:
10856       if( exact == PSEUDO )
10857        exact= OP(scan);
10858       else if ( exact != OP(scan) )
10859        exact= 0;
10860     case NOTHING:
10861      break;
10862     default:
10863      exact= 0;
10864    }
10865   }
10866   DEBUG_PARSE_r({
10867    SV * const mysv=sv_newmortal();
10868    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10869    regprop(RExC_rx, mysv, scan);
10870    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10871     SvPV_nolen_const(mysv),
10872     REG_NODE_NUM(scan),
10873     PL_reg_name[exact]);
10874   });
10875   if (temp == NULL)
10876    break;
10877   scan = temp;
10878  }
10879  DEBUG_PARSE_r({
10880   SV * const mysv_val=sv_newmortal();
10881   DEBUG_PARSE_MSG("");
10882   regprop(RExC_rx, mysv_val, val);
10883   PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10884      SvPV_nolen_const(mysv_val),
10885      (IV)REG_NODE_NUM(val),
10886      (IV)(val - scan)
10887   );
10888  });
10889  if (reg_off_by_arg[OP(scan)]) {
10890   ARG_SET(scan, val - scan);
10891  }
10892  else {
10893   NEXT_OFF(scan) = val - scan;
10894  }
10895
10896  return exact;
10897 }
10898 #endif
10899
10900 /*
10901  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10902  */
10903 #ifdef DEBUGGING
10904 static void
10905 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10906 {
10907  int bit;
10908  int set=0;
10909  regex_charset cs;
10910
10911  for (bit=0; bit<32; bit++) {
10912   if (flags & (1<<bit)) {
10913    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
10914     continue;
10915    }
10916    if (!set++ && lead)
10917     PerlIO_printf(Perl_debug_log, "%s",lead);
10918    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10919   }
10920  }
10921  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10922    if (!set++ && lead) {
10923     PerlIO_printf(Perl_debug_log, "%s",lead);
10924    }
10925    switch (cs) {
10926     case REGEX_UNICODE_CHARSET:
10927      PerlIO_printf(Perl_debug_log, "UNICODE");
10928      break;
10929     case REGEX_LOCALE_CHARSET:
10930      PerlIO_printf(Perl_debug_log, "LOCALE");
10931      break;
10932     case REGEX_ASCII_RESTRICTED_CHARSET:
10933      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10934      break;
10935     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10936      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10937      break;
10938     default:
10939      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10940      break;
10941    }
10942  }
10943  if (lead)  {
10944   if (set)
10945    PerlIO_printf(Perl_debug_log, "\n");
10946   else
10947    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10948  }
10949 }
10950 #endif
10951
10952 void
10953 Perl_regdump(pTHX_ const regexp *r)
10954 {
10955 #ifdef DEBUGGING
10956  dVAR;
10957  SV * const sv = sv_newmortal();
10958  SV *dsv= sv_newmortal();
10959  RXi_GET_DECL(r,ri);
10960  GET_RE_DEBUG_FLAGS_DECL;
10961
10962  PERL_ARGS_ASSERT_REGDUMP;
10963
10964  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10965
10966  /* Header fields of interest. */
10967  if (r->anchored_substr) {
10968   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10969    RE_SV_DUMPLEN(r->anchored_substr), 30);
10970   PerlIO_printf(Perl_debug_log,
10971      "anchored %s%s at %"IVdf" ",
10972      s, RE_SV_TAIL(r->anchored_substr),
10973      (IV)r->anchored_offset);
10974  } else if (r->anchored_utf8) {
10975   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10976    RE_SV_DUMPLEN(r->anchored_utf8), 30);
10977   PerlIO_printf(Perl_debug_log,
10978      "anchored utf8 %s%s at %"IVdf" ",
10979      s, RE_SV_TAIL(r->anchored_utf8),
10980      (IV)r->anchored_offset);
10981  }
10982  if (r->float_substr) {
10983   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10984    RE_SV_DUMPLEN(r->float_substr), 30);
10985   PerlIO_printf(Perl_debug_log,
10986      "floating %s%s at %"IVdf"..%"UVuf" ",
10987      s, RE_SV_TAIL(r->float_substr),
10988      (IV)r->float_min_offset, (UV)r->float_max_offset);
10989  } else if (r->float_utf8) {
10990   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10991    RE_SV_DUMPLEN(r->float_utf8), 30);
10992   PerlIO_printf(Perl_debug_log,
10993      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10994      s, RE_SV_TAIL(r->float_utf8),
10995      (IV)r->float_min_offset, (UV)r->float_max_offset);
10996  }
10997  if (r->check_substr || r->check_utf8)
10998   PerlIO_printf(Perl_debug_log,
10999      (const char *)
11000      (r->check_substr == r->float_substr
11001      && r->check_utf8 == r->float_utf8
11002      ? "(checking floating" : "(checking anchored"));
11003  if (r->extflags & RXf_NOSCAN)
11004   PerlIO_printf(Perl_debug_log, " noscan");
11005  if (r->extflags & RXf_CHECK_ALL)
11006   PerlIO_printf(Perl_debug_log, " isall");
11007  if (r->check_substr || r->check_utf8)
11008   PerlIO_printf(Perl_debug_log, ") ");
11009
11010  if (ri->regstclass) {
11011   regprop(r, sv, ri->regstclass);
11012   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
11013  }
11014  if (r->extflags & RXf_ANCH) {
11015   PerlIO_printf(Perl_debug_log, "anchored");
11016   if (r->extflags & RXf_ANCH_BOL)
11017    PerlIO_printf(Perl_debug_log, "(BOL)");
11018   if (r->extflags & RXf_ANCH_MBOL)
11019    PerlIO_printf(Perl_debug_log, "(MBOL)");
11020   if (r->extflags & RXf_ANCH_SBOL)
11021    PerlIO_printf(Perl_debug_log, "(SBOL)");
11022   if (r->extflags & RXf_ANCH_GPOS)
11023    PerlIO_printf(Perl_debug_log, "(GPOS)");
11024   PerlIO_putc(Perl_debug_log, ' ');
11025  }
11026  if (r->extflags & RXf_GPOS_SEEN)
11027   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
11028  if (r->intflags & PREGf_SKIP)
11029   PerlIO_printf(Perl_debug_log, "plus ");
11030  if (r->intflags & PREGf_IMPLICIT)
11031   PerlIO_printf(Perl_debug_log, "implicit ");
11032  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
11033  if (r->extflags & RXf_EVAL_SEEN)
11034   PerlIO_printf(Perl_debug_log, "with eval ");
11035  PerlIO_printf(Perl_debug_log, "\n");
11036  DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
11037 #else
11038  PERL_ARGS_ASSERT_REGDUMP;
11039  PERL_UNUSED_CONTEXT;
11040  PERL_UNUSED_ARG(r);
11041 #endif /* DEBUGGING */
11042 }
11043
11044 /*
11045 - regprop - printable representation of opcode
11046 */
11047 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
11048 STMT_START { \
11049   if (do_sep) {                           \
11050    Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
11051    if (flags & ANYOF_INVERT)           \
11052     /*make sure the invert info is in each */ \
11053     sv_catpvs(sv, "^");             \
11054    do_sep = 0;                         \
11055   }                                       \
11056 } STMT_END
11057
11058 void
11059 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
11060 {
11061 #ifdef DEBUGGING
11062  dVAR;
11063  register int k;
11064  RXi_GET_DECL(prog,progi);
11065  GET_RE_DEBUG_FLAGS_DECL;
11066
11067  PERL_ARGS_ASSERT_REGPROP;
11068
11069  sv_setpvs(sv, "");
11070
11071  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
11072   /* It would be nice to FAIL() here, but this may be called from
11073   regexec.c, and it would be hard to supply pRExC_state. */
11074   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
11075  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
11076
11077  k = PL_regkind[OP(o)];
11078
11079  if (k == EXACT) {
11080   sv_catpvs(sv, " ");
11081   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
11082   * is a crude hack but it may be the best for now since
11083   * we have no flag "this EXACTish node was UTF-8"
11084   * --jhi */
11085   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
11086     PERL_PV_ESCAPE_UNI_DETECT |
11087     PERL_PV_ESCAPE_NONASCII   |
11088     PERL_PV_PRETTY_ELLIPSES   |
11089     PERL_PV_PRETTY_LTGT       |
11090     PERL_PV_PRETTY_NOCLEAR
11091     );
11092  } else if (k == TRIE) {
11093   /* print the details of the trie in dumpuntil instead, as
11094   * progi->data isn't available here */
11095   const char op = OP(o);
11096   const U32 n = ARG(o);
11097   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
11098    (reg_ac_data *)progi->data->data[n] :
11099    NULL;
11100   const reg_trie_data * const trie
11101    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
11102
11103   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
11104   DEBUG_TRIE_COMPILE_r(
11105    Perl_sv_catpvf(aTHX_ sv,
11106     "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
11107     (UV)trie->startstate,
11108     (IV)trie->statecount-1, /* -1 because of the unused 0 element */
11109     (UV)trie->wordcount,
11110     (UV)trie->minlen,
11111     (UV)trie->maxlen,
11112     (UV)TRIE_CHARCOUNT(trie),
11113     (UV)trie->uniquecharcount
11114    )
11115   );
11116   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
11117    int i;
11118    int rangestart = -1;
11119    U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
11120    sv_catpvs(sv, "[");
11121    for (i = 0; i <= 256; i++) {
11122     if (i < 256 && BITMAP_TEST(bitmap,i)) {
11123      if (rangestart == -1)
11124       rangestart = i;
11125     } else if (rangestart != -1) {
11126      if (i <= rangestart + 3)
11127       for (; rangestart < i; rangestart++)
11128        put_byte(sv, rangestart);
11129      else {
11130       put_byte(sv, rangestart);
11131       sv_catpvs(sv, "-");
11132       put_byte(sv, i - 1);
11133      }
11134      rangestart = -1;
11135     }
11136    }
11137    sv_catpvs(sv, "]");
11138   }
11139
11140  } else if (k == CURLY) {
11141   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
11142    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
11143   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
11144  }
11145  else if (k == WHILEM && o->flags)   /* Ordinal/of */
11146   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
11147  else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
11148   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
11149   if ( RXp_PAREN_NAMES(prog) ) {
11150    if ( k != REF || (OP(o) < NREF)) {
11151     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
11152     SV **name= av_fetch(list, ARG(o), 0 );
11153     if (name)
11154      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11155    }
11156    else {
11157     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
11158     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
11159     I32 *nums=(I32*)SvPVX(sv_dat);
11160     SV **name= av_fetch(list, nums[0], 0 );
11161     I32 n;
11162     if (name) {
11163      for ( n=0; n<SvIVX(sv_dat); n++ ) {
11164       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
11165          (n ? "," : ""), (IV)nums[n]);
11166      }
11167      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11168     }
11169    }
11170   }
11171  } else if (k == GOSUB)
11172   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
11173  else if (k == VERB) {
11174   if (!o->flags)
11175    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
11176       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
11177  } else if (k == LOGICAL)
11178   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
11179  else if (k == FOLDCHAR)
11180   Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
11181  else if (k == ANYOF) {
11182   int i, rangestart = -1;
11183   const U8 flags = ANYOF_FLAGS(o);
11184   int do_sep = 0;
11185
11186   /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11187   static const char * const anyofs[] = {
11188    "\\w",
11189    "\\W",
11190    "\\s",
11191    "\\S",
11192    "\\d",
11193    "\\D",
11194    "[:alnum:]",
11195    "[:^alnum:]",
11196    "[:alpha:]",
11197    "[:^alpha:]",
11198    "[:ascii:]",
11199    "[:^ascii:]",
11200    "[:cntrl:]",
11201    "[:^cntrl:]",
11202    "[:graph:]",
11203    "[:^graph:]",
11204    "[:lower:]",
11205    "[:^lower:]",
11206    "[:print:]",
11207    "[:^print:]",
11208    "[:punct:]",
11209    "[:^punct:]",
11210    "[:upper:]",
11211    "[:^upper:]",
11212    "[:xdigit:]",
11213    "[:^xdigit:]",
11214    "[:space:]",
11215    "[:^space:]",
11216    "[:blank:]",
11217    "[:^blank:]"
11218   };
11219
11220   if (flags & ANYOF_LOCALE)
11221    sv_catpvs(sv, "{loc}");
11222   if (flags & ANYOF_LOC_NONBITMAP_FOLD)
11223    sv_catpvs(sv, "{i}");
11224   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
11225   if (flags & ANYOF_INVERT)
11226    sv_catpvs(sv, "^");
11227
11228   /* output what the standard cp 0-255 bitmap matches */
11229   for (i = 0; i <= 256; i++) {
11230    if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11231     if (rangestart == -1)
11232      rangestart = i;
11233    } else if (rangestart != -1) {
11234     if (i <= rangestart + 3)
11235      for (; rangestart < i; rangestart++)
11236       put_byte(sv, rangestart);
11237     else {
11238      put_byte(sv, rangestart);
11239      sv_catpvs(sv, "-");
11240      put_byte(sv, i - 1);
11241     }
11242     do_sep = 1;
11243     rangestart = -1;
11244    }
11245   }
11246
11247   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11248   /* output any special charclass tests (used entirely under use locale) */
11249   if (ANYOF_CLASS_TEST_ANY_SET(o))
11250    for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11251     if (ANYOF_CLASS_TEST(o,i)) {
11252      sv_catpv(sv, anyofs[i]);
11253      do_sep = 1;
11254     }
11255
11256   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11257
11258   if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11259    sv_catpvs(sv, "{non-utf8-latin1-all}");
11260   }
11261
11262   /* output information about the unicode matching */
11263   if (flags & ANYOF_UNICODE_ALL)
11264    sv_catpvs(sv, "{unicode_all}");
11265   else if (ANYOF_NONBITMAP(o))
11266    sv_catpvs(sv, "{unicode}");
11267   if (flags & ANYOF_NONBITMAP_NON_UTF8)
11268    sv_catpvs(sv, "{outside bitmap}");
11269
11270   if (ANYOF_NONBITMAP(o)) {
11271    SV *lv;
11272    SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11273
11274    if (lv) {
11275     if (sw) {
11276      U8 s[UTF8_MAXBYTES_CASE+1];
11277
11278      for (i = 0; i <= 256; i++) { /* just the first 256 */
11279       uvchr_to_utf8(s, i);
11280
11281       if (i < 256 && swash_fetch(sw, s, TRUE)) {
11282        if (rangestart == -1)
11283         rangestart = i;
11284       } else if (rangestart != -1) {
11285        if (i <= rangestart + 3)
11286         for (; rangestart < i; rangestart++) {
11287          const U8 * const e = uvchr_to_utf8(s,rangestart);
11288          U8 *p;
11289          for(p = s; p < e; p++)
11290           put_byte(sv, *p);
11291         }
11292        else {
11293         const U8 *e = uvchr_to_utf8(s,rangestart);
11294         U8 *p;
11295         for (p = s; p < e; p++)
11296          put_byte(sv, *p);
11297         sv_catpvs(sv, "-");
11298         e = uvchr_to_utf8(s, i-1);
11299         for (p = s; p < e; p++)
11300          put_byte(sv, *p);
11301         }
11302         rangestart = -1;
11303        }
11304       }
11305
11306      sv_catpvs(sv, "..."); /* et cetera */
11307     }
11308
11309     {
11310      char *s = savesvpv(lv);
11311      char * const origs = s;
11312
11313      while (*s && *s != '\n')
11314       s++;
11315
11316      if (*s == '\n') {
11317       const char * const t = ++s;
11318
11319       while (*s) {
11320        if (*s == '\n')
11321         *s = ' ';
11322        s++;
11323       }
11324       if (s[-1] == ' ')
11325        s[-1] = 0;
11326
11327       sv_catpv(sv, t);
11328      }
11329
11330      Safefree(origs);
11331     }
11332    }
11333   }
11334
11335   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11336  }
11337  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11338   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11339 #else
11340  PERL_UNUSED_CONTEXT;
11341  PERL_UNUSED_ARG(sv);
11342  PERL_UNUSED_ARG(o);
11343  PERL_UNUSED_ARG(prog);
11344 #endif /* DEBUGGING */
11345 }
11346
11347 SV *
11348 Perl_re_intuit_string(pTHX_ REGEXP * const r)
11349 {    /* Assume that RE_INTUIT is set */
11350  dVAR;
11351  struct regexp *const prog = (struct regexp *)SvANY(r);
11352  GET_RE_DEBUG_FLAGS_DECL;
11353
11354  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11355  PERL_UNUSED_CONTEXT;
11356
11357  DEBUG_COMPILE_r(
11358   {
11359    const char * const s = SvPV_nolen_const(prog->check_substr
11360      ? prog->check_substr : prog->check_utf8);
11361
11362    if (!PL_colorset) reginitcolors();
11363    PerlIO_printf(Perl_debug_log,
11364      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11365      PL_colors[4],
11366      prog->check_substr ? "" : "utf8 ",
11367      PL_colors[5],PL_colors[0],
11368      s,
11369      PL_colors[1],
11370      (strlen(s) > 60 ? "..." : ""));
11371   } );
11372
11373  return prog->check_substr ? prog->check_substr : prog->check_utf8;
11374 }
11375
11376 /*
11377    pregfree()
11378
11379    handles refcounting and freeing the perl core regexp structure. When
11380    it is necessary to actually free the structure the first thing it
11381    does is call the 'free' method of the regexp_engine associated to
11382    the regexp, allowing the handling of the void *pprivate; member
11383    first. (This routine is not overridable by extensions, which is why
11384    the extensions free is called first.)
11385
11386    See regdupe and regdupe_internal if you change anything here.
11387 */
11388 #ifndef PERL_IN_XSUB_RE
11389 void
11390 Perl_pregfree(pTHX_ REGEXP *r)
11391 {
11392  SvREFCNT_dec(r);
11393 }
11394
11395 void
11396 Perl_pregfree2(pTHX_ REGEXP *rx)
11397 {
11398  dVAR;
11399  struct regexp *const r = (struct regexp *)SvANY(rx);
11400  GET_RE_DEBUG_FLAGS_DECL;
11401
11402  PERL_ARGS_ASSERT_PREGFREE2;
11403
11404  if (r->mother_re) {
11405   ReREFCNT_dec(r->mother_re);
11406  } else {
11407   CALLREGFREE_PVT(rx); /* free the private data */
11408   SvREFCNT_dec(RXp_PAREN_NAMES(r));
11409  }
11410  if (r->substrs) {
11411   SvREFCNT_dec(r->anchored_substr);
11412   SvREFCNT_dec(r->anchored_utf8);
11413   SvREFCNT_dec(r->float_substr);
11414   SvREFCNT_dec(r->float_utf8);
11415   Safefree(r->substrs);
11416  }
11417  RX_MATCH_COPY_FREE(rx);
11418 #ifdef PERL_OLD_COPY_ON_WRITE
11419  SvREFCNT_dec(r->saved_copy);
11420 #endif
11421  Safefree(r->offs);
11422 }
11423
11424 /*  reg_temp_copy()
11425
11426  This is a hacky workaround to the structural issue of match results
11427  being stored in the regexp structure which is in turn stored in
11428  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11429  could be PL_curpm in multiple contexts, and could require multiple
11430  result sets being associated with the pattern simultaneously, such
11431  as when doing a recursive match with (??{$qr})
11432
11433  The solution is to make a lightweight copy of the regexp structure
11434  when a qr// is returned from the code executed by (??{$qr}) this
11435  lightweight copy doesn't actually own any of its data except for
11436  the starp/end and the actual regexp structure itself.
11437
11438 */
11439
11440
11441 REGEXP *
11442 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11443 {
11444  struct regexp *ret;
11445  struct regexp *const r = (struct regexp *)SvANY(rx);
11446  register const I32 npar = r->nparens+1;
11447
11448  PERL_ARGS_ASSERT_REG_TEMP_COPY;
11449
11450  if (!ret_x)
11451   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11452  ret = (struct regexp *)SvANY(ret_x);
11453
11454  (void)ReREFCNT_inc(rx);
11455  /* We can take advantage of the existing "copied buffer" mechanism in SVs
11456  by pointing directly at the buffer, but flagging that the allocated
11457  space in the copy is zero. As we've just done a struct copy, it's now
11458  a case of zero-ing that, rather than copying the current length.  */
11459  SvPV_set(ret_x, RX_WRAPPED(rx));
11460  SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11461  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11462   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11463  SvLEN_set(ret_x, 0);
11464  SvSTASH_set(ret_x, NULL);
11465  SvMAGIC_set(ret_x, NULL);
11466  Newx(ret->offs, npar, regexp_paren_pair);
11467  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11468  if (r->substrs) {
11469   Newx(ret->substrs, 1, struct reg_substr_data);
11470   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11471
11472   SvREFCNT_inc_void(ret->anchored_substr);
11473   SvREFCNT_inc_void(ret->anchored_utf8);
11474   SvREFCNT_inc_void(ret->float_substr);
11475   SvREFCNT_inc_void(ret->float_utf8);
11476
11477   /* check_substr and check_utf8, if non-NULL, point to either their
11478   anchored or float namesakes, and don't hold a second reference.  */
11479  }
11480  RX_MATCH_COPIED_off(ret_x);
11481 #ifdef PERL_OLD_COPY_ON_WRITE
11482  ret->saved_copy = NULL;
11483 #endif
11484  ret->mother_re = rx;
11485
11486  return ret_x;
11487 }
11488 #endif
11489
11490 /* regfree_internal()
11491
11492    Free the private data in a regexp. This is overloadable by
11493    extensions. Perl takes care of the regexp structure in pregfree(),
11494    this covers the *pprivate pointer which technically perl doesn't
11495    know about, however of course we have to handle the
11496    regexp_internal structure when no extension is in use.
11497
11498    Note this is called before freeing anything in the regexp
11499    structure.
11500  */
11501
11502 void
11503 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11504 {
11505  dVAR;
11506  struct regexp *const r = (struct regexp *)SvANY(rx);
11507  RXi_GET_DECL(r,ri);
11508  GET_RE_DEBUG_FLAGS_DECL;
11509
11510  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11511
11512  DEBUG_COMPILE_r({
11513   if (!PL_colorset)
11514    reginitcolors();
11515   {
11516    SV *dsv= sv_newmortal();
11517    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11518     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11519    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11520     PL_colors[4],PL_colors[5],s);
11521   }
11522  });
11523 #ifdef RE_TRACK_PATTERN_OFFSETS
11524  if (ri->u.offsets)
11525   Safefree(ri->u.offsets);             /* 20010421 MJD */
11526 #endif
11527  if (ri->data) {
11528   int n = ri->data->count;
11529   PAD* new_comppad = NULL;
11530   PAD* old_comppad;
11531   PADOFFSET refcnt;
11532
11533   while (--n >= 0) {
11534   /* If you add a ->what type here, update the comment in regcomp.h */
11535    switch (ri->data->what[n]) {
11536    case 'a':
11537    case 's':
11538    case 'S':
11539    case 'u':
11540     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11541     break;
11542    case 'f':
11543     Safefree(ri->data->data[n]);
11544     break;
11545    case 'p':
11546     new_comppad = MUTABLE_AV(ri->data->data[n]);
11547     break;
11548    case 'o':
11549     if (new_comppad == NULL)
11550      Perl_croak(aTHX_ "panic: pregfree comppad");
11551     PAD_SAVE_LOCAL(old_comppad,
11552      /* Watch out for global destruction's random ordering. */
11553      (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11554     );
11555     OP_REFCNT_LOCK;
11556     refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11557     OP_REFCNT_UNLOCK;
11558     if (!refcnt)
11559      op_free((OP_4tree*)ri->data->data[n]);
11560
11561     PAD_RESTORE_LOCAL(old_comppad);
11562     SvREFCNT_dec(MUTABLE_SV(new_comppad));
11563     new_comppad = NULL;
11564     break;
11565    case 'n':
11566     break;
11567    case 'T':
11568     { /* Aho Corasick add-on structure for a trie node.
11569      Used in stclass optimization only */
11570      U32 refcount;
11571      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11572      OP_REFCNT_LOCK;
11573      refcount = --aho->refcount;
11574      OP_REFCNT_UNLOCK;
11575      if ( !refcount ) {
11576       PerlMemShared_free(aho->states);
11577       PerlMemShared_free(aho->fail);
11578       /* do this last!!!! */
11579       PerlMemShared_free(ri->data->data[n]);
11580       PerlMemShared_free(ri->regstclass);
11581      }
11582     }
11583     break;
11584    case 't':
11585     {
11586      /* trie structure. */
11587      U32 refcount;
11588      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11589      OP_REFCNT_LOCK;
11590      refcount = --trie->refcount;
11591      OP_REFCNT_UNLOCK;
11592      if ( !refcount ) {
11593       PerlMemShared_free(trie->charmap);
11594       PerlMemShared_free(trie->states);
11595       PerlMemShared_free(trie->trans);
11596       if (trie->bitmap)
11597        PerlMemShared_free(trie->bitmap);
11598       if (trie->jump)
11599        PerlMemShared_free(trie->jump);
11600       PerlMemShared_free(trie->wordinfo);
11601       /* do this last!!!! */
11602       PerlMemShared_free(ri->data->data[n]);
11603      }
11604     }
11605     break;
11606    default:
11607     Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11608    }
11609   }
11610   Safefree(ri->data->what);
11611   Safefree(ri->data);
11612  }
11613
11614  Safefree(ri);
11615 }
11616
11617 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11618 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11619 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
11620
11621 /*
11622    re_dup - duplicate a regexp.
11623
11624    This routine is expected to clone a given regexp structure. It is only
11625    compiled under USE_ITHREADS.
11626
11627    After all of the core data stored in struct regexp is duplicated
11628    the regexp_engine.dupe method is used to copy any private data
11629    stored in the *pprivate pointer. This allows extensions to handle
11630    any duplication it needs to do.
11631
11632    See pregfree() and regfree_internal() if you change anything here.
11633 */
11634 #if defined(USE_ITHREADS)
11635 #ifndef PERL_IN_XSUB_RE
11636 void
11637 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11638 {
11639  dVAR;
11640  I32 npar;
11641  const struct regexp *r = (const struct regexp *)SvANY(sstr);
11642  struct regexp *ret = (struct regexp *)SvANY(dstr);
11643
11644  PERL_ARGS_ASSERT_RE_DUP_GUTS;
11645
11646  npar = r->nparens+1;
11647  Newx(ret->offs, npar, regexp_paren_pair);
11648  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11649  if(ret->swap) {
11650   /* no need to copy these */
11651   Newx(ret->swap, npar, regexp_paren_pair);
11652  }
11653
11654  if (ret->substrs) {
11655   /* Do it this way to avoid reading from *r after the StructCopy().
11656   That way, if any of the sv_dup_inc()s dislodge *r from the L1
11657   cache, it doesn't matter.  */
11658   const bool anchored = r->check_substr
11659    ? r->check_substr == r->anchored_substr
11660    : r->check_utf8 == r->anchored_utf8;
11661   Newx(ret->substrs, 1, struct reg_substr_data);
11662   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11663
11664   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11665   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11666   ret->float_substr = sv_dup_inc(ret->float_substr, param);
11667   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11668
11669   /* check_substr and check_utf8, if non-NULL, point to either their
11670   anchored or float namesakes, and don't hold a second reference.  */
11671
11672   if (ret->check_substr) {
11673    if (anchored) {
11674     assert(r->check_utf8 == r->anchored_utf8);
11675     ret->check_substr = ret->anchored_substr;
11676     ret->check_utf8 = ret->anchored_utf8;
11677    } else {
11678     assert(r->check_substr == r->float_substr);
11679     assert(r->check_utf8 == r->float_utf8);
11680     ret->check_substr = ret->float_substr;
11681     ret->check_utf8 = ret->float_utf8;
11682    }
11683   } else if (ret->check_utf8) {
11684    if (anchored) {
11685     ret->check_utf8 = ret->anchored_utf8;
11686    } else {
11687     ret->check_utf8 = ret->float_utf8;
11688    }
11689   }
11690  }
11691
11692  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11693
11694  if (ret->pprivate)
11695   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11696
11697  if (RX_MATCH_COPIED(dstr))
11698   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
11699  else
11700   ret->subbeg = NULL;
11701 #ifdef PERL_OLD_COPY_ON_WRITE
11702  ret->saved_copy = NULL;
11703 #endif
11704
11705  if (ret->mother_re) {
11706   if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11707    /* Our storage points directly to our mother regexp, but that's
11708    1: a buffer in a different thread
11709    2: something we no longer hold a reference on
11710    so we need to copy it locally.  */
11711    /* Note we need to sue SvCUR() on our mother_re, because it, in
11712    turn, may well be pointing to its own mother_re.  */
11713    SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11714         SvCUR(ret->mother_re)+1));
11715    SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11716   }
11717   ret->mother_re      = NULL;
11718  }
11719  ret->gofs = 0;
11720 }
11721 #endif /* PERL_IN_XSUB_RE */
11722
11723 /*
11724    regdupe_internal()
11725
11726    This is the internal complement to regdupe() which is used to copy
11727    the structure pointed to by the *pprivate pointer in the regexp.
11728    This is the core version of the extension overridable cloning hook.
11729    The regexp structure being duplicated will be copied by perl prior
11730    to this and will be provided as the regexp *r argument, however
11731    with the /old/ structures pprivate pointer value. Thus this routine
11732    may override any copying normally done by perl.
11733
11734    It returns a pointer to the new regexp_internal structure.
11735 */
11736
11737 void *
11738 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11739 {
11740  dVAR;
11741  struct regexp *const r = (struct regexp *)SvANY(rx);
11742  regexp_internal *reti;
11743  int len, npar;
11744  RXi_GET_DECL(r,ri);
11745
11746  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11747
11748  npar = r->nparens+1;
11749  len = ProgLen(ri);
11750
11751  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11752  Copy(ri->program, reti->program, len+1, regnode);
11753
11754
11755  reti->regstclass = NULL;
11756
11757  if (ri->data) {
11758   struct reg_data *d;
11759   const int count = ri->data->count;
11760   int i;
11761
11762   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11763     char, struct reg_data);
11764   Newx(d->what, count, U8);
11765
11766   d->count = count;
11767   for (i = 0; i < count; i++) {
11768    d->what[i] = ri->data->what[i];
11769    switch (d->what[i]) {
11770     /* legal options are one of: sSfpontTua
11771     see also regcomp.h and pregfree() */
11772    case 'a': /* actually an AV, but the dup function is identical.  */
11773    case 's':
11774    case 'S':
11775    case 'p': /* actually an AV, but the dup function is identical.  */
11776    case 'u': /* actually an HV, but the dup function is identical.  */
11777     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11778     break;
11779    case 'f':
11780     /* This is cheating. */
11781     Newx(d->data[i], 1, struct regnode_charclass_class);
11782     StructCopy(ri->data->data[i], d->data[i],
11783        struct regnode_charclass_class);
11784     reti->regstclass = (regnode*)d->data[i];
11785     break;
11786    case 'o':
11787     /* Compiled op trees are readonly and in shared memory,
11788     and can thus be shared without duplication. */
11789     OP_REFCNT_LOCK;
11790     d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11791     OP_REFCNT_UNLOCK;
11792     break;
11793    case 'T':
11794     /* Trie stclasses are readonly and can thus be shared
11795     * without duplication. We free the stclass in pregfree
11796     * when the corresponding reg_ac_data struct is freed.
11797     */
11798     reti->regstclass= ri->regstclass;
11799     /* Fall through */
11800    case 't':
11801     OP_REFCNT_LOCK;
11802     ((reg_trie_data*)ri->data->data[i])->refcount++;
11803     OP_REFCNT_UNLOCK;
11804     /* Fall through */
11805    case 'n':
11806     d->data[i] = ri->data->data[i];
11807     break;
11808    default:
11809     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11810    }
11811   }
11812
11813   reti->data = d;
11814  }
11815  else
11816   reti->data = NULL;
11817
11818  reti->name_list_idx = ri->name_list_idx;
11819
11820 #ifdef RE_TRACK_PATTERN_OFFSETS
11821  if (ri->u.offsets) {
11822   Newx(reti->u.offsets, 2*len+1, U32);
11823   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11824  }
11825 #else
11826  SetProgLen(reti,len);
11827 #endif
11828
11829  return (void*)reti;
11830 }
11831
11832 #endif    /* USE_ITHREADS */
11833
11834 #ifndef PERL_IN_XSUB_RE
11835
11836 /*
11837  - regnext - dig the "next" pointer out of a node
11838  */
11839 regnode *
11840 Perl_regnext(pTHX_ register regnode *p)
11841 {
11842  dVAR;
11843  register I32 offset;
11844
11845  if (!p)
11846   return(NULL);
11847
11848  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
11849   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11850  }
11851
11852  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11853  if (offset == 0)
11854   return(NULL);
11855
11856  return(p+offset);
11857 }
11858 #endif
11859
11860 STATIC void
11861 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11862 {
11863  va_list args;
11864  STRLEN l1 = strlen(pat1);
11865  STRLEN l2 = strlen(pat2);
11866  char buf[512];
11867  SV *msv;
11868  const char *message;
11869
11870  PERL_ARGS_ASSERT_RE_CROAK2;
11871
11872  if (l1 > 510)
11873   l1 = 510;
11874  if (l1 + l2 > 510)
11875   l2 = 510 - l1;
11876  Copy(pat1, buf, l1 , char);
11877  Copy(pat2, buf + l1, l2 , char);
11878  buf[l1 + l2] = '\n';
11879  buf[l1 + l2 + 1] = '\0';
11880 #ifdef I_STDARG
11881  /* ANSI variant takes additional second argument */
11882  va_start(args, pat2);
11883 #else
11884  va_start(args);
11885 #endif
11886  msv = vmess(buf, &args);
11887  va_end(args);
11888  message = SvPV_const(msv,l1);
11889  if (l1 > 512)
11890   l1 = 512;
11891  Copy(message, buf, l1 , char);
11892  buf[l1-1] = '\0';   /* Overwrite \n */
11893  Perl_croak(aTHX_ "%s", buf);
11894 }
11895
11896 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
11897
11898 #ifndef PERL_IN_XSUB_RE
11899 void
11900 Perl_save_re_context(pTHX)
11901 {
11902  dVAR;
11903
11904  struct re_save_state *state;
11905
11906  SAVEVPTR(PL_curcop);
11907  SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11908
11909  state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11910  PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11911  SSPUSHUV(SAVEt_RE_STATE);
11912
11913  Copy(&PL_reg_state, state, 1, struct re_save_state);
11914
11915  PL_reg_start_tmp = 0;
11916  PL_reg_start_tmpl = 0;
11917  PL_reg_oldsaved = NULL;
11918  PL_reg_oldsavedlen = 0;
11919  PL_reg_maxiter = 0;
11920  PL_reg_leftiter = 0;
11921  PL_reg_poscache = NULL;
11922  PL_reg_poscache_size = 0;
11923 #ifdef PERL_OLD_COPY_ON_WRITE
11924  PL_nrs = NULL;
11925 #endif
11926
11927  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11928  if (PL_curpm) {
11929   const REGEXP * const rx = PM_GETRE(PL_curpm);
11930   if (rx) {
11931    U32 i;
11932    for (i = 1; i <= RX_NPARENS(rx); i++) {
11933     char digits[TYPE_CHARS(long)];
11934     const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11935     GV *const *const gvp
11936      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11937
11938     if (gvp) {
11939      GV * const gv = *gvp;
11940      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11941       save_scalar(gv);
11942     }
11943    }
11944   }
11945  }
11946 }
11947 #endif
11948
11949 static void
11950 clear_re(pTHX_ void *r)
11951 {
11952  dVAR;
11953  ReREFCNT_dec((REGEXP *)r);
11954 }
11955
11956 #ifdef DEBUGGING
11957
11958 STATIC void
11959 S_put_byte(pTHX_ SV *sv, int c)
11960 {
11961  PERL_ARGS_ASSERT_PUT_BYTE;
11962
11963  /* Our definition of isPRINT() ignores locales, so only bytes that are
11964  not part of UTF-8 are considered printable. I assume that the same
11965  holds for UTF-EBCDIC.
11966  Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11967  which Wikipedia says:
11968
11969  EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11970  ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11971  identical, to the ASCII delete (DEL) or rubout control character.
11972  ) So the old condition can be simplified to !isPRINT(c)  */
11973  if (!isPRINT(c)) {
11974   if (c < 256) {
11975    Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11976   }
11977   else {
11978    Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11979   }
11980  }
11981  else {
11982   const char string = c;
11983   if (c == '-' || c == ']' || c == '\\' || c == '^')
11984    sv_catpvs(sv, "\\");
11985   sv_catpvn(sv, &string, 1);
11986  }
11987 }
11988
11989
11990 #define CLEAR_OPTSTART \
11991  if (optstart) STMT_START { \
11992    DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11993    optstart=NULL; \
11994  } STMT_END
11995
11996 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11997
11998 STATIC const regnode *
11999 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
12000    const regnode *last, const regnode *plast,
12001    SV* sv, I32 indent, U32 depth)
12002 {
12003  dVAR;
12004  register U8 op = PSEUDO; /* Arbitrary non-END op. */
12005  register const regnode *next;
12006  const regnode *optstart= NULL;
12007
12008  RXi_GET_DECL(r,ri);
12009  GET_RE_DEBUG_FLAGS_DECL;
12010
12011  PERL_ARGS_ASSERT_DUMPUNTIL;
12012
12013 #ifdef DEBUG_DUMPUNTIL
12014  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
12015   last ? last-start : 0,plast ? plast-start : 0);
12016 #endif
12017
12018  if (plast && plast < last)
12019   last= plast;
12020
12021  while (PL_regkind[op] != END && (!last || node < last)) {
12022   /* While that wasn't END last time... */
12023   NODE_ALIGN(node);
12024   op = OP(node);
12025   if (op == CLOSE || op == WHILEM)
12026    indent--;
12027   next = regnext((regnode *)node);
12028
12029   /* Where, what. */
12030   if (OP(node) == OPTIMIZED) {
12031    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
12032     optstart = node;
12033    else
12034     goto after_print;
12035   } else
12036    CLEAR_OPTSTART;
12037
12038   regprop(r, sv, node);
12039   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
12040      (int)(2*indent + 1), "", SvPVX_const(sv));
12041
12042   if (OP(node) != OPTIMIZED) {
12043    if (next == NULL)  /* Next ptr. */
12044     PerlIO_printf(Perl_debug_log, " (0)");
12045    else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
12046     PerlIO_printf(Perl_debug_log, " (FAIL)");
12047    else
12048     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
12049    (void)PerlIO_putc(Perl_debug_log, '\n');
12050   }
12051
12052  after_print:
12053   if (PL_regkind[(U8)op] == BRANCHJ) {
12054    assert(next);
12055    {
12056     register const regnode *nnode = (OP(next) == LONGJMP
12057            ? regnext((regnode *)next)
12058            : next);
12059     if (last && nnode > last)
12060      nnode = last;
12061     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
12062    }
12063   }
12064   else if (PL_regkind[(U8)op] == BRANCH) {
12065    assert(next);
12066    DUMPUNTIL(NEXTOPER(node), next);
12067   }
12068   else if ( PL_regkind[(U8)op]  == TRIE ) {
12069    const regnode *this_trie = node;
12070    const char op = OP(node);
12071    const U32 n = ARG(node);
12072    const reg_ac_data * const ac = op>=AHOCORASICK ?
12073    (reg_ac_data *)ri->data->data[n] :
12074    NULL;
12075    const reg_trie_data * const trie =
12076     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
12077 #ifdef DEBUGGING
12078    AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
12079 #endif
12080    const regnode *nextbranch= NULL;
12081    I32 word_idx;
12082    sv_setpvs(sv, "");
12083    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
12084     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
12085
12086     PerlIO_printf(Perl_debug_log, "%*s%s ",
12087     (int)(2*(indent+3)), "",
12088      elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
12089        PL_colors[0], PL_colors[1],
12090        (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
12091        PERL_PV_PRETTY_ELLIPSES    |
12092        PERL_PV_PRETTY_LTGT
12093        )
12094        : "???"
12095     );
12096     if (trie->jump) {
12097      U16 dist= trie->jump[word_idx+1];
12098      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
12099         (UV)((dist ? this_trie + dist : next) - start));
12100      if (dist) {
12101       if (!nextbranch)
12102        nextbranch= this_trie + trie->jump[0];
12103       DUMPUNTIL(this_trie + dist, nextbranch);
12104      }
12105      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
12106       nextbranch= regnext((regnode *)nextbranch);
12107     } else {
12108      PerlIO_printf(Perl_debug_log, "\n");
12109     }
12110    }
12111    if (last && next > last)
12112     node= last;
12113    else
12114     node= next;
12115   }
12116   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
12117    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
12118      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
12119   }
12120   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
12121    assert(next);
12122    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
12123   }
12124   else if ( op == PLUS || op == STAR) {
12125    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
12126   }
12127   else if (PL_regkind[(U8)op] == ANYOF) {
12128    /* arglen 1 + class block */
12129    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
12130      ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
12131    node = NEXTOPER(node);
12132   }
12133   else if (PL_regkind[(U8)op] == EXACT) {
12134    /* Literal string, where present. */
12135    node += NODE_SZ_STR(node) - 1;
12136    node = NEXTOPER(node);
12137   }
12138   else {
12139    node = NEXTOPER(node);
12140    node += regarglen[(U8)op];
12141   }
12142   if (op == CURLYX || op == OPEN)
12143    indent++;
12144  }
12145  CLEAR_OPTSTART;
12146 #ifdef DEBUG_DUMPUNTIL
12147  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
12148 #endif
12149  return node;
12150 }
12151
12152 #endif /* DEBUGGING */
12153
12154 /*
12155  * Local variables:
12156  * c-indentation-style: bsd
12157  * c-basic-offset: 4
12158  * indent-tabs-mode: t
12159  * End:
12160  *
12161  * ex: set ts=8 sts=4 sw=4 noet:
12162  */