src/5012004/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to  pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #undef PERL_IN_XSUB_RE
  76 #define PERL_IN_XSUB_RE 1
  77 #include "perl.h"
  78 #undef PERL_IN_XSUB_RE
  79
  80 #ifndef PERL_IN_XSUB_RE
  81 #include "re_defs.h"
  82 #endif
  83
  84 #define REG_COMP_C
  85 #ifdef PERL_IN_XSUB_RE
  86 #  include "re_comp.h"
  87 #else
  88 #  include "regcomp.h"
  89 #endif
  90
  91 #ifdef op
  92 #undef op
  93 #endif /* op */
  94
  95 #ifdef MSDOS
  96 #  if defined(BUGGY_MSC6)
  97  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  98 #    pragma optimize("a",off)
  99  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 100 #    pragma optimize("w",on )
 101 #  endif /* BUGGY_MSC6 */
 102 #endif /* MSDOS */
 103
 104 #ifndef STATIC
 105 #define STATIC static
 106 #endif
 107
 108 typedef struct RExC_state_t {
 109  U32  flags;   /* are we folding, multilining? */
 110  char *precomp;  /* uncompiled string. */
 111  REGEXP *rx_sv;   /* The SV that is the regexp. */
 112  regexp *rx;                    /* perl core regexp structure */
 113  regexp_internal *rxi;           /* internal data for regexp object pprivate field */
 114  char *start;   /* Start of input for compile */
 115  char *end;   /* End of input for compile */
 116  char *parse;   /* Input-scan pointer. */
 117  I32  whilem_seen;  /* number of WHILEM in this expr */
 118  regnode *emit_start;  /* Start of emitted-code area */
 119  regnode *emit_bound;  /* First regnode outside of the allocated space */
 120  regnode *emit;   /* Code-emit pointer; &regdummy = don't = compiling */
 121  I32  naughty;  /* How bad is this pattern? */
 122  I32  sawback;  /* Did we see \1, ...? */
 123  U32  seen;
 124  I32  size;   /* Code size. */
 125  I32  npar;   /* Capture buffer count, (OPEN). */
 126  I32  cpar;   /* Capture buffer count, (CLOSE). */
 127  I32  nestroot;  /* root parens we are in - used by accept */
 128  I32  extralen;
 129  I32  seen_zerolen;
 130  I32  seen_evals;
 131  regnode **open_parens;  /* pointers to open parens */
 132  regnode **close_parens;  /* pointers to close parens */
 133  regnode *opend;   /* END node in program */
 134  I32  utf8;  /* whether the pattern is utf8 or not */
 135  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 136         /* XXX use this for future optimisation of case
 137         * where pattern must be upgraded to utf8. */
 138  HV  *paren_names;  /* Paren names */
 139
 140  regnode **recurse;  /* Recurse regops */
 141  I32  recurse_count;  /* Number of recurse regops */
 142 #if ADD_TO_REGEXEC
 143  char  *starttry;  /* -Dr: where regtry was called. */
 144 #define RExC_starttry (pRExC_state->starttry)
 145 #endif
 146 #ifdef DEBUGGING
 147  const char  *lastparse;
 148  I32         lastnum;
 149  AV          *paren_name_list;       /* idx -> name */
 150 #define RExC_lastparse (pRExC_state->lastparse)
 151 #define RExC_lastnum (pRExC_state->lastnum)
 152 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 153 #endif
 154 } RExC_state_t;
 155
 156 #define RExC_flags (pRExC_state->flags)
 157 #define RExC_precomp (pRExC_state->precomp)
 158 #define RExC_rx_sv (pRExC_state->rx_sv)
 159 #define RExC_rx  (pRExC_state->rx)
 160 #define RExC_rxi (pRExC_state->rxi)
 161 #define RExC_start (pRExC_state->start)
 162 #define RExC_end (pRExC_state->end)
 163 #define RExC_parse (pRExC_state->parse)
 164 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 165 #ifdef RE_TRACK_PATTERN_OFFSETS
 166 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
 167 #endif
 168 #define RExC_emit (pRExC_state->emit)
 169 #define RExC_emit_start (pRExC_state->emit_start)
 170 #define RExC_emit_bound (pRExC_state->emit_bound)
 171 #define RExC_naughty (pRExC_state->naughty)
 172 #define RExC_sawback (pRExC_state->sawback)
 173 #define RExC_seen (pRExC_state->seen)
 174 #define RExC_size (pRExC_state->size)
 175 #define RExC_npar (pRExC_state->npar)
 176 #define RExC_nestroot   (pRExC_state->nestroot)
 177 #define RExC_extralen (pRExC_state->extralen)
 178 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 179 #define RExC_seen_evals (pRExC_state->seen_evals)
 180 #define RExC_utf8 (pRExC_state->utf8)
 181 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 182 #define RExC_open_parens (pRExC_state->open_parens)
 183 #define RExC_close_parens (pRExC_state->close_parens)
 184 #define RExC_opend (pRExC_state->opend)
 185 #define RExC_paren_names (pRExC_state->paren_names)
 186 #define RExC_recurse (pRExC_state->recurse)
 187 #define RExC_recurse_count (pRExC_state->recurse_count)
 188
 189
 190 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 191 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 192   ((*s) == '{' && regcurly(s)))
 193
 194 #ifdef SPSTART
 195 #undef SPSTART  /* dratted cpp namespace... */
 196 #endif
 197 /*
 198  * Flags to be passed up and down.
 199  */
 200 #define WORST  0 /* Worst case. */
 201 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 202 #define SIMPLE  0x02 /* Simple enough to be STAR/PLUS operand. */
 203 #define SPSTART  0x04 /* Starts with * or +. */
 204 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
 205 #define POSTPONED 0x10    /* (?1),(?&name), (??{...}) or similar */
 206
 207 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 208
 209 /* whether trie related optimizations are enabled */
 210 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 211 #define TRIE_STUDY_OPT
 212 #define FULL_TRIE_STUDY
 213 #define TRIE_STCLASS
 214 #endif
 215
 216
 217
 218 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 219 #define PBITVAL(paren) (1 << ((paren) & 7))
 220 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 221 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 222 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 223
 224
 225 /* About scan_data_t.
 226
 227   During optimisation we recurse through the regexp program performing
 228   various inplace (keyhole style) optimisations. In addition study_chunk
 229   and scan_commit populate this data structure with information about
 230   what strings MUST appear in the pattern. We look for the longest
 231   string that must appear for at a fixed location, and we look for the
 232   longest string that may appear at a floating location. So for instance
 233   in the pattern:
 234
 235  /FOO[xX]A.*B[xX]BAR/
 236
 237   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 238   strings (because they follow a .* construct). study_chunk will identify
 239   both FOO and BAR as being the longest fixed and floating strings respectively.
 240
 241   The strings can be composites, for instance
 242
 243  /(f)(o)(o)/
 244
 245   will result in a composite fixed substring 'foo'.
 246
 247   For each string some basic information is maintained:
 248
 249   - offset or min_offset
 250  This is the position the string must appear at, or not before.
 251  It also implicitly (when combined with minlenp) tells us how many
 252  character must match before the string we are searching.
 253  Likewise when combined with minlenp and the length of the string
 254  tells us how many characters must appear after the string we have
 255  found.
 256
 257   - max_offset
 258  Only used for floating strings. This is the rightmost point that
 259  the string can appear at. Ifset to I32 max it indicates that the
 260  string can occur infinitely far to the right.
 261
 262   - minlenp
 263  A pointer to the minimum length of the pattern that the string
 264  was found inside. This is important as in the case of positive
 265  lookahead or positive lookbehind we can have multiple patterns
 266  involved. Consider
 267
 268  /(?=FOO).*F/
 269
 270  The minimum length of the pattern overall is 3, the minimum length
 271  of the lookahead part is 3, but the minimum length of the part that
 272  will actually match is 1. So 'FOO's minimum length is 3, but the
 273  minimum length for the F is 1. This is important as the minimum length
 274  is used to determine offsets in front of and behind the string being
 275  looked for.  Since strings can be composites this is the length of the
 276  pattern at the time it was commited with a scan_commit. Note that
 277  the length is calculated by study_chunk, so that the minimum lengths
 278  are not known until the full pattern has been compiled, thus the
 279  pointer to the value.
 280
 281   - lookbehind
 282
 283  In the case of lookbehind the string being searched for can be
 284  offset past the start point of the final matching string.
 285  If this value was just blithely removed from the min_offset it would
 286  invalidate some of the calculations for how many chars must match
 287  before or after (as they are derived from min_offset and minlen and
 288  the length of the string being searched for).
 289  When the final pattern is compiled and the data is moved from the
 290  scan_data_t structure into the regexp structure the information
 291  about lookbehind is factored in, with the information that would
 292  have been lost precalculated in the end_shift field for the
 293  associated string.
 294
 295   The fields pos_min and pos_delta are used to store the minimum offset
 296   and the delta to the maximum offset at the current point in the pattern.
 297
 298 */
 299
 300 typedef struct scan_data_t {
 301  /*I32 len_min;      unused */
 302  /*I32 len_delta;    unused */
 303  I32 pos_min;
 304  I32 pos_delta;
 305  SV *last_found;
 306  I32 last_end;     /* min value, <0 unless valid. */
 307  I32 last_start_min;
 308  I32 last_start_max;
 309  SV **longest;     /* Either &l_fixed, or &l_float. */
 310  SV *longest_fixed;      /* longest fixed string found in pattern */
 311  I32 offset_fixed;       /* offset where it starts */
 312  I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 313  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 314  SV *longest_float;      /* longest floating string found in pattern */
 315  I32 offset_float_min;   /* earliest point in string it can appear */
 316  I32 offset_float_max;   /* latest point in string it can appear */
 317  I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 318  I32 lookbehind_float;   /* is the position of the string modified by LB */
 319  I32 flags;
 320  I32 whilem_c;
 321  I32 *last_closep;
 322  struct regnode_charclass_class *start_class;
 323 } scan_data_t;
 324
 325 /*
 326  * Forward declarations for pregcomp()'s friends.
 327  */
 328
 329 static const scan_data_t zero_scan_data =
 330   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 331
 332 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 333 #define SF_BEFORE_SEOL  0x0001
 334 #define SF_BEFORE_MEOL  0x0002
 335 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 336 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 337
 338 #ifdef NO_UNARY_PLUS
 339 #  define SF_FIX_SHIFT_EOL (0+2)
 340 #  define SF_FL_SHIFT_EOL  (0+4)
 341 #else
 342 #  define SF_FIX_SHIFT_EOL (+2)
 343 #  define SF_FL_SHIFT_EOL  (+4)
 344 #endif
 345
 346 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 347 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 348
 349 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 350 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 351 #define SF_IS_INF  0x0040
 352 #define SF_HAS_PAR  0x0080
 353 #define SF_IN_PAR  0x0100
 354 #define SF_HAS_EVAL  0x0200
 355 #define SCF_DO_SUBSTR  0x0400
 356 #define SCF_DO_STCLASS_AND 0x0800
 357 #define SCF_DO_STCLASS_OR 0x1000
 358 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 359 #define SCF_WHILEM_VISITED_POS 0x2000
 360
 361 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 362 #define SCF_SEEN_ACCEPT         0x8000
 363
 364 #define UTF (RExC_utf8 != 0)
 365 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
 366 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
 367
 368 #define OOB_UNICODE  12345678
 369 #define OOB_NAMEDCLASS  -1
 370
 371 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 372 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 373
 374
 375 /* length of regex to show in messages that don't mark a position within */
 376 #define RegexLengthToShowInErrorMessages 127
 377
 378 /*
 379  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 380  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 381  * op/pragma/warn/regcomp.
 382  */
 383 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 384 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 385
 386 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 387
 388 /*
 389  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 390  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 391  * "...".
 392  */
 393 #define _FAIL(code) STMT_START {     \
 394  const char *ellipses = "";      \
 395  IV len = RExC_end - RExC_precomp;     \
 396                   \
 397  if (!SIZE_ONLY)       \
 398   SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 399  if (len > RegexLengthToShowInErrorMessages) {   \
 400   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 401   len = RegexLengthToShowInErrorMessages - 10;   \
 402   ellipses = "...";      \
 403  }         \
 404  code;                                                               \
 405 } STMT_END
 406
 407 #define FAIL(msg) _FAIL(       \
 408  Perl_croak(aTHX_ "%s in regex m/%.*s%s/",     \
 409    msg, (int)len, RExC_precomp, ellipses))
 410
 411 #define FAIL2(msg,arg) _FAIL(       \
 412  Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 413    arg, (int)len, RExC_precomp, ellipses))
 414
 415 /*
 416  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 417  */
 418 #define Simple_vFAIL(m) STMT_START {     \
 419  const IV offset = RExC_parse - RExC_precomp;   \
 420  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 421    m, (int)offset, RExC_precomp, RExC_precomp + offset); \
 422 } STMT_END
 423
 424 /*
 425  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 426  */
 427 #define vFAIL(m) STMT_START {    \
 428  if (!SIZE_ONLY)     \
 429   SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
 430  Simple_vFAIL(m);     \
 431 } STMT_END
 432
 433 /*
 434  * Like Simple_vFAIL(), but accepts two arguments.
 435  */
 436 #define Simple_vFAIL2(m,a1) STMT_START {   \
 437  const IV offset = RExC_parse - RExC_precomp;   \
 438  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,   \
 439    (int)offset, RExC_precomp, RExC_precomp + offset); \
 440 } STMT_END
 441
 442 /*
 443  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 444  */
 445 #define vFAIL2(m,a1) STMT_START {   \
 446  if (!SIZE_ONLY)     \
 447   SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
 448  Simple_vFAIL2(m, a1);    \
 449 } STMT_END
 450
 451
 452 /*
 453  * Like Simple_vFAIL(), but accepts three arguments.
 454  */
 455 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 456  const IV offset = RExC_parse - RExC_precomp;  \
 457  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,  \
 458    (int)offset, RExC_precomp, RExC_precomp + offset); \
 459 } STMT_END
 460
 461 /*
 462  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 463  */
 464 #define vFAIL3(m,a1,a2) STMT_START {   \
 465  if (!SIZE_ONLY)     \
 466   SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
 467  Simple_vFAIL3(m, a1, a2);    \
 468 } STMT_END
 469
 470 /*
 471  * Like Simple_vFAIL(), but accepts four arguments.
 472  */
 473 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 474  const IV offset = RExC_parse - RExC_precomp;  \
 475  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,  \
 476    (int)offset, RExC_precomp, RExC_precomp + offset); \
 477 } STMT_END
 478
 479 #define ckWARNreg(loc,m) STMT_START {     \
 480  const IV offset = loc - RExC_precomp;    \
 481  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 482    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 483 } STMT_END
 484
 485 #define ckWARNregdep(loc,m) STMT_START {    \
 486  const IV offset = loc - RExC_precomp;    \
 487  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 488    m REPORT_LOCATION,      \
 489    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 490 } STMT_END
 491
 492 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 493  const IV offset = loc - RExC_precomp;    \
 494  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 495    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 496 } STMT_END
 497
 498 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 499  const IV offset = loc - RExC_precomp;    \
 500  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 501    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 502 } STMT_END
 503
 504 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 505  const IV offset = loc - RExC_precomp;    \
 506  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 507    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 508 } STMT_END
 509
 510 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 511  const IV offset = loc - RExC_precomp;    \
 512  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 513    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 514 } STMT_END
 515
 516 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 517  const IV offset = loc - RExC_precomp;    \
 518  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 519    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 520 } STMT_END
 521
 522 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 523  const IV offset = loc - RExC_precomp;    \
 524  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 525    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 526 } STMT_END
 527
 528
 529 /* Allow for side effects in s */
 530 #define REGC(c,s) STMT_START {   \
 531  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 532 } STMT_END
 533
 534 /* Macros for recording node offsets.   20001227 mjd@plover.com
 535  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 536  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 537  * Element 0 holds the number n.
 538  * Position is 1 indexed.
 539  */
 540 #ifndef RE_TRACK_PATTERN_OFFSETS
 541 #define Set_Node_Offset_To_R(node,byte)
 542 #define Set_Node_Offset(node,byte)
 543 #define Set_Cur_Node_Offset
 544 #define Set_Node_Length_To_R(node,len)
 545 #define Set_Node_Length(node,len)
 546 #define Set_Node_Cur_Length(node)
 547 #define Node_Offset(n)
 548 #define Node_Length(n)
 549 #define Set_Node_Offset_Length(node,offset,len)
 550 #define ProgLen(ri) ri->u.proglen
 551 #define SetProgLen(ri,x) ri->u.proglen = x
 552 #else
 553 #define ProgLen(ri) ri->u.offsets[0]
 554 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 555 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 556  if (! SIZE_ONLY) {       \
 557   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 558      __LINE__, (int)(node), (int)(byte)));  \
 559   if((node) < 0) {      \
 560    Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 561   } else {       \
 562    RExC_offsets[2*(node)-1] = (byte);    \
 563   }        \
 564  }         \
 565 } STMT_END
 566
 567 #define Set_Node_Offset(node,byte) \
 568  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 569 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 570
 571 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 572  if (! SIZE_ONLY) {       \
 573   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 574     __LINE__, (int)(node), (int)(len)));   \
 575   if((node) < 0) {      \
 576    Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 577   } else {       \
 578    RExC_offsets[2*(node)] = (len);    \
 579   }        \
 580  }         \
 581 } STMT_END
 582
 583 #define Set_Node_Length(node,len) \
 584  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 585 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 586 #define Set_Node_Cur_Length(node) \
 587  Set_Node_Length(node, RExC_parse - parse_start)
 588
 589 /* Get offsets and lengths */
 590 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 591 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 592
 593 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 594  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 595  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 596 } STMT_END
 597 #endif
 598
 599 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 600 #define EXPERIMENTAL_INPLACESCAN
 601 #endif /*RE_TRACK_PATTERN_OFFSETS*/
 602
 603 #define DEBUG_STUDYDATA(str,data,depth)                              \
 604 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 605  PerlIO_printf(Perl_debug_log,                                    \
 606   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 607   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 608   (int)(depth)*2, "",                                          \
 609   (IV)((data)->pos_min),                                       \
 610   (IV)((data)->pos_delta),                                     \
 611   (UV)((data)->flags),                                         \
 612   (IV)((data)->whilem_c),                                      \
 613   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 614   is_inf ? "INF " : ""                                         \
 615  );                                                               \
 616  if ((data)->last_found)                                          \
 617   PerlIO_printf(Perl_debug_log,                                \
 618    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 619    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 620    SvPVX_const((data)->last_found),                         \
 621    (IV)((data)->last_end),                                  \
 622    (IV)((data)->last_start_min),                            \
 623    (IV)((data)->last_start_max),                            \
 624    ((data)->longest &&                                      \
 625    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 626    SvPVX_const((data)->longest_fixed),                      \
 627    (IV)((data)->offset_fixed),                              \
 628    ((data)->longest &&                                      \
 629    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 630    SvPVX_const((data)->longest_float),                      \
 631    (IV)((data)->offset_float_min),                          \
 632    (IV)((data)->offset_float_max)                           \
 633   );                                                           \
 634  PerlIO_printf(Perl_debug_log,"\n");                              \
 635 });
 636
 637 static void clear_re(pTHX_ void *r);
 638
 639 /* Mark that we cannot extend a found fixed substring at this point.
 640    Update the longest found anchored substring and the longest found
 641    floating substrings if needed. */
 642
 643 STATIC void
 644 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 645 {
 646  const STRLEN l = CHR_SVLEN(data->last_found);
 647  const STRLEN old_l = CHR_SVLEN(*data->longest);
 648  GET_RE_DEBUG_FLAGS_DECL;
 649
 650  PERL_ARGS_ASSERT_SCAN_COMMIT;
 651
 652  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 653   SvSetMagicSV(*data->longest, data->last_found);
 654   if (*data->longest == data->longest_fixed) {
 655    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 656    if (data->flags & SF_BEFORE_EOL)
 657     data->flags
 658      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 659    else
 660     data->flags &= ~SF_FIX_BEFORE_EOL;
 661    data->minlen_fixed=minlenp;
 662    data->lookbehind_fixed=0;
 663   }
 664   else { /* *data->longest == data->longest_float */
 665    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 666    data->offset_float_max = (l
 667          ? data->last_start_max
 668          : data->pos_min + data->pos_delta);
 669    if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 670     data->offset_float_max = I32_MAX;
 671    if (data->flags & SF_BEFORE_EOL)
 672     data->flags
 673      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 674    else
 675     data->flags &= ~SF_FL_BEFORE_EOL;
 676    data->minlen_float=minlenp;
 677    data->lookbehind_float=0;
 678   }
 679  }
 680  SvCUR_set(data->last_found, 0);
 681  {
 682   SV * const sv = data->last_found;
 683   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 684    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 685    if (mg)
 686     mg->mg_len = 0;
 687   }
 688  }
 689  data->last_end = -1;
 690  data->flags &= ~SF_BEFORE_EOL;
 691  DEBUG_STUDYDATA("commit: ",data,0);
 692 }
 693
 694 /* Can match anything (initialization) */
 695 STATIC void
 696 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 697 {
 698  PERL_ARGS_ASSERT_CL_ANYTHING;
 699
 700  ANYOF_CLASS_ZERO(cl);
 701  ANYOF_BITMAP_SETALL(cl);
 702  cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 703  if (LOC)
 704   cl->flags |= ANYOF_LOCALE;
 705 }
 706
 707 /* Can match anything (initialization) */
 708 STATIC int
 709 S_cl_is_anything(const struct regnode_charclass_class *cl)
 710 {
 711  int value;
 712
 713  PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 714
 715  for (value = 0; value <= ANYOF_MAX; value += 2)
 716   if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 717    return 1;
 718  if (!(cl->flags & ANYOF_UNICODE_ALL))
 719   return 0;
 720  if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 721   return 0;
 722  return 1;
 723 }
 724
 725 /* Can match anything (initialization) */
 726 STATIC void
 727 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 728 {
 729  PERL_ARGS_ASSERT_CL_INIT;
 730
 731  Zero(cl, 1, struct regnode_charclass_class);
 732  cl->type = ANYOF;
 733  cl_anything(pRExC_state, cl);
 734 }
 735
 736 STATIC void
 737 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 738 {
 739  PERL_ARGS_ASSERT_CL_INIT_ZERO;
 740
 741  Zero(cl, 1, struct regnode_charclass_class);
 742  cl->type = ANYOF;
 743  cl_anything(pRExC_state, cl);
 744  if (LOC)
 745   cl->flags |= ANYOF_LOCALE;
 746 }
 747
 748 /* 'And' a given class with another one.  Can create false positives */
 749 /* We assume that cl is not inverted */
 750 STATIC void
 751 S_cl_and(struct regnode_charclass_class *cl,
 752   const struct regnode_charclass_class *and_with)
 753 {
 754  PERL_ARGS_ASSERT_CL_AND;
 755
 756  assert(and_with->type == ANYOF);
 757  if (!(and_with->flags & ANYOF_CLASS)
 758   && !(cl->flags & ANYOF_CLASS)
 759   && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 760   && !(and_with->flags & ANYOF_FOLD)
 761   && !(cl->flags & ANYOF_FOLD)) {
 762   int i;
 763
 764   if (and_with->flags & ANYOF_INVERT)
 765    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 766     cl->bitmap[i] &= ~and_with->bitmap[i];
 767   else
 768    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 769     cl->bitmap[i] &= and_with->bitmap[i];
 770  } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 771  if (!(and_with->flags & ANYOF_EOS))
 772   cl->flags &= ~ANYOF_EOS;
 773
 774  if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
 775   !(and_with->flags & ANYOF_INVERT)) {
 776   cl->flags &= ~ANYOF_UNICODE_ALL;
 777   cl->flags |= ANYOF_UNICODE;
 778   ARG_SET(cl, ARG(and_with));
 779  }
 780  if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 781   !(and_with->flags & ANYOF_INVERT))
 782   cl->flags &= ~ANYOF_UNICODE_ALL;
 783  if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
 784   !(and_with->flags & ANYOF_INVERT))
 785   cl->flags &= ~ANYOF_UNICODE;
 786 }
 787
 788 /* 'OR' a given class with another one.  Can create false positives */
 789 /* We assume that cl is not inverted */
 790 STATIC void
 791 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 792 {
 793  PERL_ARGS_ASSERT_CL_OR;
 794
 795  if (or_with->flags & ANYOF_INVERT) {
 796   /* We do not use
 797   * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 798   *   <= (B1 | !B2) | (CL1 | !CL2)
 799   * which is wasteful if CL2 is small, but we ignore CL2:
 800   *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 801   * XXXX Can we handle case-fold?  Unclear:
 802   *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 803   *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 804   */
 805   if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 806    && !(or_with->flags & ANYOF_FOLD)
 807    && !(cl->flags & ANYOF_FOLD) ) {
 808    int i;
 809
 810    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 811     cl->bitmap[i] |= ~or_with->bitmap[i];
 812   } /* XXXX: logic is complicated otherwise */
 813   else {
 814    cl_anything(pRExC_state, cl);
 815   }
 816  } else {
 817   /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 818   if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 819    && (!(or_with->flags & ANYOF_FOLD)
 820     || (cl->flags & ANYOF_FOLD)) ) {
 821    int i;
 822
 823    /* OR char bitmap and class bitmap separately */
 824    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 825     cl->bitmap[i] |= or_with->bitmap[i];
 826    if (or_with->flags & ANYOF_CLASS) {
 827     for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 828      cl->classflags[i] |= or_with->classflags[i];
 829     cl->flags |= ANYOF_CLASS;
 830    }
 831   }
 832   else { /* XXXX: logic is complicated, leave it along for a moment. */
 833    cl_anything(pRExC_state, cl);
 834   }
 835  }
 836  if (or_with->flags & ANYOF_EOS)
 837   cl->flags |= ANYOF_EOS;
 838
 839  if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
 840   ARG(cl) != ARG(or_with)) {
 841   cl->flags |= ANYOF_UNICODE_ALL;
 842   cl->flags &= ~ANYOF_UNICODE;
 843  }
 844  if (or_with->flags & ANYOF_UNICODE_ALL) {
 845   cl->flags |= ANYOF_UNICODE_ALL;
 846   cl->flags &= ~ANYOF_UNICODE;
 847  }
 848 }
 849
 850 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 851 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 852 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 853 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 854
 855
 856 #ifdef DEBUGGING
 857 /*
 858    dump_trie(trie,widecharmap,revcharmap)
 859    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 860    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 861
 862    These routines dump out a trie in a somewhat readable format.
 863    The _interim_ variants are used for debugging the interim
 864    tables that are used to generate the final compressed
 865    representation which is what dump_trie expects.
 866
 867    Part of the reason for their existance is to provide a form
 868    of documentation as to how the different representations function.
 869
 870 */
 871
 872 /*
 873   Dumps the final compressed table form of the trie to Perl_debug_log.
 874   Used for debugging make_trie().
 875 */
 876
 877 STATIC void
 878 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 879    AV *revcharmap, U32 depth)
 880 {
 881  U32 state;
 882  SV *sv=sv_newmortal();
 883  int colwidth= widecharmap ? 6 : 4;
 884  GET_RE_DEBUG_FLAGS_DECL;
 885
 886  PERL_ARGS_ASSERT_DUMP_TRIE;
 887
 888  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 889   (int)depth * 2 + 2,"",
 890   "Match","Base","Ofs" );
 891
 892  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 893   SV ** const tmp = av_fetch( revcharmap, state, 0);
 894   if ( tmp ) {
 895    PerlIO_printf( Perl_debug_log, "%*s",
 896     colwidth,
 897     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 898        PL_colors[0], PL_colors[1],
 899        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 900        PERL_PV_ESCAPE_FIRSTCHAR
 901     )
 902    );
 903   }
 904  }
 905  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 906   (int)depth * 2 + 2,"");
 907
 908  for( state = 0 ; state < trie->uniquecharcount ; state++ )
 909   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 910  PerlIO_printf( Perl_debug_log, "\n");
 911
 912  for( state = 1 ; state < trie->statecount ; state++ ) {
 913   const U32 base = trie->states[ state ].trans.base;
 914
 915   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 916
 917   if ( trie->states[ state ].wordnum ) {
 918    PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 919   } else {
 920    PerlIO_printf( Perl_debug_log, "%6s", "" );
 921   }
 922
 923   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 924
 925   if ( base ) {
 926    U32 ofs = 0;
 927
 928    while( ( base + ofs  < trie->uniquecharcount ) ||
 929     ( base + ofs - trie->uniquecharcount < trie->lasttrans
 930      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 931      ofs++;
 932
 933    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 934
 935    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 936     if ( ( base + ofs >= trie->uniquecharcount ) &&
 937      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 938      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 939     {
 940     PerlIO_printf( Perl_debug_log, "%*"UVXf,
 941      colwidth,
 942      (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 943     } else {
 944      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 945     }
 946    }
 947
 948    PerlIO_printf( Perl_debug_log, "]");
 949
 950   }
 951   PerlIO_printf( Perl_debug_log, "\n" );
 952  }
 953 }
 954 /*
 955   Dumps a fully constructed but uncompressed trie in list form.
 956   List tries normally only are used for construction when the number of
 957   possible chars (trie->uniquecharcount) is very high.
 958   Used for debugging make_trie().
 959 */
 960 STATIC void
 961 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 962       HV *widecharmap, AV *revcharmap, U32 next_alloc,
 963       U32 depth)
 964 {
 965  U32 state;
 966  SV *sv=sv_newmortal();
 967  int colwidth= widecharmap ? 6 : 4;
 968  GET_RE_DEBUG_FLAGS_DECL;
 969
 970  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 971
 972  /* print out the table precompression.  */
 973  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
 974   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
 975   "------:-----+-----------------\n" );
 976
 977  for( state=1 ; state < next_alloc ; state ++ ) {
 978   U16 charid;
 979
 980   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
 981    (int)depth * 2 + 2,"", (UV)state  );
 982   if ( ! trie->states[ state ].wordnum ) {
 983    PerlIO_printf( Perl_debug_log, "%5s| ","");
 984   } else {
 985    PerlIO_printf( Perl_debug_log, "W%4x| ",
 986     trie->states[ state ].wordnum
 987    );
 988   }
 989   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
 990    SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
 991    if ( tmp ) {
 992     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
 993      colwidth,
 994      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 995        PL_colors[0], PL_colors[1],
 996        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 997        PERL_PV_ESCAPE_FIRSTCHAR
 998      ) ,
 999      TRIE_LIST_ITEM(state,charid).forid,
1000      (UV)TRIE_LIST_ITEM(state,charid).newstate
1001     );
1002     if (!(charid % 10))
1003      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1004       (int)((depth * 2) + 14), "");
1005    }
1006   }
1007   PerlIO_printf( Perl_debug_log, "\n");
1008  }
1009 }
1010
1011 /*
1012   Dumps a fully constructed but uncompressed trie in table form.
1013   This is the normal DFA style state transition table, with a few
1014   twists to facilitate compression later.
1015   Used for debugging make_trie().
1016 */
1017 STATIC void
1018 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1019       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1020       U32 depth)
1021 {
1022  U32 state;
1023  U16 charid;
1024  SV *sv=sv_newmortal();
1025  int colwidth= widecharmap ? 6 : 4;
1026  GET_RE_DEBUG_FLAGS_DECL;
1027
1028  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1029
1030  /*
1031  print out the table precompression so that we can do a visual check
1032  that they are identical.
1033  */
1034
1035  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1036
1037  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1038   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1039   if ( tmp ) {
1040    PerlIO_printf( Perl_debug_log, "%*s",
1041     colwidth,
1042     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1043        PL_colors[0], PL_colors[1],
1044        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1045        PERL_PV_ESCAPE_FIRSTCHAR
1046     )
1047    );
1048   }
1049  }
1050
1051  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1052
1053  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1054   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1055  }
1056
1057  PerlIO_printf( Perl_debug_log, "\n" );
1058
1059  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1060
1061   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1062    (int)depth * 2 + 2,"",
1063    (UV)TRIE_NODENUM( state ) );
1064
1065   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1066    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1067    if (v)
1068     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1069    else
1070     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1071   }
1072   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1073    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1074   } else {
1075    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1076    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1077   }
1078  }
1079 }
1080
1081 #endif
1082
1083 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1084   startbranch: the first branch in the whole branch sequence
1085   first      : start branch of sequence of branch-exact nodes.
1086    May be the same as startbranch
1087   last       : Thing following the last branch.
1088    May be the same as tail.
1089   tail       : item following the branch sequence
1090   count      : words in the sequence
1091   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1092   depth      : indent depth
1093
1094 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1095
1096 A trie is an N'ary tree where the branches are determined by digital
1097 decomposition of the key. IE, at the root node you look up the 1st character and
1098 follow that branch repeat until you find the end of the branches. Nodes can be
1099 marked as "accepting" meaning they represent a complete word. Eg:
1100
1101   /he|she|his|hers/
1102
1103 would convert into the following structure. Numbers represent states, letters
1104 following numbers represent valid transitions on the letter from that state, if
1105 the number is in square brackets it represents an accepting state, otherwise it
1106 will be in parenthesis.
1107
1108  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1109  |    |
1110  |   (2)
1111  |    |
1112  (1)   +-i->(6)-+-s->[7]
1113  |
1114  +-s->(3)-+-h->(4)-+-e->[5]
1115
1116  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1117
1118 This shows that when matching against the string 'hers' we will begin at state 1
1119 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1120 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1121 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1122 single traverse. We store a mapping from accepting to state to which word was
1123 matched, and then when we have multiple possibilities we try to complete the
1124 rest of the regex in the order in which they occured in the alternation.
1125
1126 The only prior NFA like behaviour that would be changed by the TRIE support is
1127 the silent ignoring of duplicate alternations which are of the form:
1128
1129  / (DUPE|DUPE) X? (?{ ... }) Y /x
1130
1131 Thus EVAL blocks follwing a trie may be called a different number of times with
1132 and without the optimisation. With the optimisations dupes will be silently
1133 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1134 the following demonstrates:
1135
1136  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1137
1138 which prints out 'word' three times, but
1139
1140  'words'=~/(word|word|word)(?{ print $1 })S/
1141
1142 which doesnt print it out at all. This is due to other optimisations kicking in.
1143
1144 Example of what happens on a structural level:
1145
1146 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1147
1148    1: CURLYM[1] {1,32767}(18)
1149    5:   BRANCH(8)
1150    6:     EXACT <ac>(16)
1151    8:   BRANCH(11)
1152    9:     EXACT <ad>(16)
1153   11:   BRANCH(14)
1154   12:     EXACT <ab>(16)
1155   16:   SUCCEED(0)
1156   17:   NOTHING(18)
1157   18: END(0)
1158
1159 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1160 and should turn into:
1161
1162    1: CURLYM[1] {1,32767}(18)
1163    5:   TRIE(16)
1164   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1165   <ac>
1166   <ad>
1167   <ab>
1168   16:   SUCCEED(0)
1169   17:   NOTHING(18)
1170   18: END(0)
1171
1172 Cases where tail != last would be like /(?foo|bar)baz/:
1173
1174    1: BRANCH(4)
1175    2:   EXACT <foo>(8)
1176    4: BRANCH(7)
1177    5:   EXACT <bar>(8)
1178    7: TAIL(8)
1179    8: EXACT <baz>(10)
1180   10: END(0)
1181
1182 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1183 and would end up looking like:
1184
1185  1: TRIE(8)
1186  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1187   <foo>
1188   <bar>
1189    7: TAIL(8)
1190    8: EXACT <baz>(10)
1191   10: END(0)
1192
1193  d = uvuni_to_utf8_flags(d, uv, 0);
1194
1195 is the recommended Unicode-aware way of saying
1196
1197  *(d++) = uv;
1198 */
1199
1200 #define TRIE_STORE_REVCHAR                                                 \
1201  STMT_START {                                                           \
1202   if (UTF) {          \
1203    SV *zlopp = newSV(2);        \
1204    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1205    unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1206    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1207    SvPOK_on(zlopp);         \
1208    SvUTF8_on(zlopp);         \
1209    av_push(revcharmap, zlopp);        \
1210   } else {          \
1211    char ooooff = (char)uvc;            \
1212    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1213   }           \
1214   } STMT_END
1215
1216 #define TRIE_READ_CHAR STMT_START {                                           \
1217  wordlen++;                                                                \
1218  if ( UTF ) {                                                              \
1219   if ( folder ) {                                                       \
1220    if ( foldlen > 0 ) {                                              \
1221    uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1222    foldlen -= len;                                                \
1223    scan += len;                                                   \
1224    len = 0;                                                       \
1225    } else {                                                          \
1226     uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1227     uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1228     foldlen -= UNISKIP( uvc );                                    \
1229     scan = foldbuf + UNISKIP( uvc );                              \
1230    }                                                                 \
1231   } else {                                                              \
1232    uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1233   }                                                                     \
1234  } else {                                                                  \
1235   uvc = (U32)*uc;                                                       \
1236   len = 1;                                                              \
1237  }                                                                         \
1238 } STMT_END
1239
1240
1241
1242 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1243  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1244   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1245   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1246  }                                                           \
1247  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1248  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1249  TRIE_LIST_CUR( state )++;                                   \
1250 } STMT_END
1251
1252 #define TRIE_LIST_NEW(state) STMT_START {                       \
1253  Newxz( trie->states[ state ].trans.list,               \
1254   4, reg_trie_trans_le );                                 \
1255  TRIE_LIST_CUR( state ) = 1;                                \
1256  TRIE_LIST_LEN( state ) = 4;                                \
1257 } STMT_END
1258
1259 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1260  U16 dupe= trie->states[ state ].wordnum;                    \
1261  regnode * const noper_next = regnext( noper );              \
1262                 \
1263  if (trie->wordlen)                                          \
1264   trie->wordlen[ curword ] = wordlen;                     \
1265  DEBUG_r({                                                   \
1266   /* store the word for dumping */                        \
1267   SV* tmp;                                                \
1268   if (OP(noper) != NOTHING)                               \
1269    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1270   else                                                    \
1271    tmp = newSVpvn_utf8( "", 0, UTF );   \
1272   av_push( trie_words, tmp );                             \
1273  });                                                         \
1274                 \
1275  curword++;                                                  \
1276                 \
1277  if ( noper_next < tail ) {                                  \
1278   if (!trie->jump)                                        \
1279    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1280   trie->jump[curword] = (U16)(noper_next - convert);      \
1281   if (!jumper)                                            \
1282    jumper = noper_next;                                \
1283   if (!nextbranch)                                        \
1284    nextbranch= regnext(cur);                           \
1285  }                                                           \
1286                 \
1287  if ( dupe ) {                                               \
1288   /* So it's a dupe. This means we need to maintain a   */\
1289   /* linked-list from the first to the next.            */\
1290   /* we only allocate the nextword buffer when there    */\
1291   /* a dupe, so first time we have to do the allocation */\
1292   if (!trie->nextword)                                    \
1293    trie->nextword = (U16 *)     \
1294     PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1295   while ( trie->nextword[dupe] )                          \
1296    dupe= trie->nextword[dupe];                         \
1297   trie->nextword[dupe]= curword;                          \
1298  } else {                                                    \
1299   /* we haven't inserted this word yet.                */ \
1300   trie->states[ state ].wordnum = curword;                \
1301  }                                                           \
1302 } STMT_END
1303
1304
1305 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1306  ( ( base + charid >=  ucharcount     \
1307   && base + charid < ubound     \
1308   && state == trie->trans[ base - ucharcount + charid ].check \
1309   && trie->trans[ base - ucharcount + charid ].next )  \
1310   ? trie->trans[ base - ucharcount + charid ].next  \
1311   : ( state==1 ? special : 0 )     \
1312  )
1313
1314 #define MADE_TRIE       1
1315 #define MADE_JUMP_TRIE  2
1316 #define MADE_EXACT_TRIE 4
1317
1318 STATIC I32
1319 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1320 {
1321  dVAR;
1322  /* first pass, loop through and scan words */
1323  reg_trie_data *trie;
1324  HV *widecharmap = NULL;
1325  AV *revcharmap = newAV();
1326  regnode *cur;
1327  const U32 uniflags = UTF8_ALLOW_DEFAULT;
1328  STRLEN len = 0;
1329  UV uvc = 0;
1330  U16 curword = 0;
1331  U32 next_alloc = 0;
1332  regnode *jumper = NULL;
1333  regnode *nextbranch = NULL;
1334  regnode *convert = NULL;
1335  /* we just use folder as a flag in utf8 */
1336  const U8 * const folder = ( flags == EXACTF
1337      ? PL_fold
1338      : ( flags == EXACTFL
1339       ? PL_fold_locale
1340       : NULL
1341       )
1342      );
1343
1344 #ifdef DEBUGGING
1345  const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1346  AV *trie_words = NULL;
1347  /* along with revcharmap, this only used during construction but both are
1348  * useful during debugging so we store them in the struct when debugging.
1349  */
1350 #else
1351  const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1352  STRLEN trie_charcount=0;
1353 #endif
1354  SV *re_trie_maxbuff;
1355  GET_RE_DEBUG_FLAGS_DECL;
1356
1357  PERL_ARGS_ASSERT_MAKE_TRIE;
1358 #ifndef DEBUGGING
1359  PERL_UNUSED_ARG(depth);
1360 #endif
1361
1362  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1363  trie->refcount = 1;
1364  trie->startstate = 1;
1365  trie->wordcount = word_count;
1366  RExC_rxi->data->data[ data_slot ] = (void*)trie;
1367  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1368  if (!(UTF && folder))
1369   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1370  DEBUG_r({
1371   trie_words = newAV();
1372  });
1373
1374  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1375  if (!SvIOK(re_trie_maxbuff)) {
1376   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1377  }
1378  DEBUG_OPTIMISE_r({
1379     PerlIO_printf( Perl_debug_log,
1380     "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1381     (int)depth * 2 + 2, "",
1382     REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1383     REG_NODE_NUM(last), REG_NODE_NUM(tail),
1384     (int)depth);
1385  });
1386
1387    /* Find the node we are going to overwrite */
1388  if ( first == startbranch && OP( last ) != BRANCH ) {
1389   /* whole branch chain */
1390   convert = first;
1391  } else {
1392   /* branch sub-chain */
1393   convert = NEXTOPER( first );
1394  }
1395
1396  /*  -- First loop and Setup --
1397
1398  We first traverse the branches and scan each word to determine if it
1399  contains widechars, and how many unique chars there are, this is
1400  important as we have to build a table with at least as many columns as we
1401  have unique chars.
1402
1403  We use an array of integers to represent the character codes 0..255
1404  (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1405  native representation of the character value as the key and IV's for the
1406  coded index.
1407
1408  *TODO* If we keep track of how many times each character is used we can
1409  remap the columns so that the table compression later on is more
1410  efficient in terms of memory by ensuring most common value is in the
1411  middle and the least common are on the outside.  IMO this would be better
1412  than a most to least common mapping as theres a decent chance the most
1413  common letter will share a node with the least common, meaning the node
1414  will not be compressable. With a middle is most common approach the worst
1415  case is when we have the least common nodes twice.
1416
1417  */
1418
1419  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1420   regnode * const noper = NEXTOPER( cur );
1421   const U8 *uc = (U8*)STRING( noper );
1422   const U8 * const e  = uc + STR_LEN( noper );
1423   STRLEN foldlen = 0;
1424   U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1425   const U8 *scan = (U8*)NULL;
1426   U32 wordlen      = 0;         /* required init */
1427   STRLEN chars = 0;
1428   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1429
1430   if (OP(noper) == NOTHING) {
1431    trie->minlen= 0;
1432    continue;
1433   }
1434   if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1435    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1436           regardless of encoding */
1437
1438   for ( ; uc < e ; uc += len ) {
1439    TRIE_CHARCOUNT(trie)++;
1440    TRIE_READ_CHAR;
1441    chars++;
1442    if ( uvc < 256 ) {
1443     if ( !trie->charmap[ uvc ] ) {
1444      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1445      if ( folder )
1446       trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1447      TRIE_STORE_REVCHAR;
1448     }
1449     if ( set_bit ) {
1450      /* store the codepoint in the bitmap, and if its ascii
1451      also store its folded equivelent. */
1452      TRIE_BITMAP_SET(trie,uvc);
1453
1454      /* store the folded codepoint */
1455      if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1456
1457      if ( !UTF ) {
1458       /* store first byte of utf8 representation of
1459       codepoints in the 127 < uvc < 256 range */
1460       if (127 < uvc && uvc < 192) {
1461        TRIE_BITMAP_SET(trie,194);
1462       } else if (191 < uvc ) {
1463        TRIE_BITMAP_SET(trie,195);
1464       /* && uvc < 256 -- we know uvc is < 256 already */
1465       }
1466      }
1467      set_bit = 0; /* We've done our bit :-) */
1468     }
1469    } else {
1470     SV** svpp;
1471     if ( !widecharmap )
1472      widecharmap = newHV();
1473
1474     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1475
1476     if ( !svpp )
1477      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1478
1479     if ( !SvTRUE( *svpp ) ) {
1480      sv_setiv( *svpp, ++trie->uniquecharcount );
1481      TRIE_STORE_REVCHAR;
1482     }
1483    }
1484   }
1485   if( cur == first ) {
1486    trie->minlen=chars;
1487    trie->maxlen=chars;
1488   } else if (chars < trie->minlen) {
1489    trie->minlen=chars;
1490   } else if (chars > trie->maxlen) {
1491    trie->maxlen=chars;
1492   }
1493
1494  } /* end first pass */
1495  DEBUG_TRIE_COMPILE_r(
1496   PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1497     (int)depth * 2 + 2,"",
1498     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1499     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1500     (int)trie->minlen, (int)trie->maxlen )
1501  );
1502  trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1503
1504  /*
1505   We now know what we are dealing with in terms of unique chars and
1506   string sizes so we can calculate how much memory a naive
1507   representation using a flat table  will take. If it's over a reasonable
1508   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1509   conservative but potentially much slower representation using an array
1510   of lists.
1511
1512   At the end we convert both representations into the same compressed
1513   form that will be used in regexec.c for matching with. The latter
1514   is a form that cannot be used to construct with but has memory
1515   properties similar to the list form and access properties similar
1516   to the table form making it both suitable for fast searches and
1517   small enough that its feasable to store for the duration of a program.
1518
1519   See the comment in the code where the compressed table is produced
1520   inplace from the flat tabe representation for an explanation of how
1521   the compression works.
1522
1523  */
1524
1525
1526  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1527   /*
1528    Second Pass -- Array Of Lists Representation
1529
1530    Each state will be represented by a list of charid:state records
1531    (reg_trie_trans_le) the first such element holds the CUR and LEN
1532    points of the allocated array. (See defines above).
1533
1534    We build the initial structure using the lists, and then convert
1535    it into the compressed table form which allows faster lookups
1536    (but cant be modified once converted).
1537   */
1538
1539   STRLEN transcount = 1;
1540
1541   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1542    "%*sCompiling trie using list compiler\n",
1543    (int)depth * 2 + 2, ""));
1544
1545   trie->states = (reg_trie_state *)
1546    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1547         sizeof(reg_trie_state) );
1548   TRIE_LIST_NEW(1);
1549   next_alloc = 2;
1550
1551   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1552
1553    regnode * const noper = NEXTOPER( cur );
1554    U8 *uc           = (U8*)STRING( noper );
1555    const U8 * const e = uc + STR_LEN( noper );
1556    U32 state        = 1;         /* required init */
1557    U16 charid       = 0;         /* sanity init */
1558    U8 *scan         = (U8*)NULL; /* sanity init */
1559    STRLEN foldlen   = 0;         /* required init */
1560    U32 wordlen      = 0;         /* required init */
1561    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1562
1563    if (OP(noper) != NOTHING) {
1564     for ( ; uc < e ; uc += len ) {
1565
1566      TRIE_READ_CHAR;
1567
1568      if ( uvc < 256 ) {
1569       charid = trie->charmap[ uvc ];
1570      } else {
1571       SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1572       if ( !svpp ) {
1573        charid = 0;
1574       } else {
1575        charid=(U16)SvIV( *svpp );
1576       }
1577      }
1578      /* charid is now 0 if we dont know the char read, or nonzero if we do */
1579      if ( charid ) {
1580
1581       U16 check;
1582       U32 newstate = 0;
1583
1584       charid--;
1585       if ( !trie->states[ state ].trans.list ) {
1586        TRIE_LIST_NEW( state );
1587       }
1588       for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1589        if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1590         newstate = TRIE_LIST_ITEM( state, check ).newstate;
1591         break;
1592        }
1593       }
1594       if ( ! newstate ) {
1595        newstate = next_alloc++;
1596        TRIE_LIST_PUSH( state, charid, newstate );
1597        transcount++;
1598       }
1599       state = newstate;
1600      } else {
1601       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1602      }
1603     }
1604    }
1605    TRIE_HANDLE_WORD(state);
1606
1607   } /* end second pass */
1608
1609   /* next alloc is the NEXT state to be allocated */
1610   trie->statecount = next_alloc;
1611   trie->states = (reg_trie_state *)
1612    PerlMemShared_realloc( trie->states,
1613         next_alloc
1614         * sizeof(reg_trie_state) );
1615
1616   /* and now dump it out before we compress it */
1617   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1618               revcharmap, next_alloc,
1619               depth+1)
1620   );
1621
1622   trie->trans = (reg_trie_trans *)
1623    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1624   {
1625    U32 state;
1626    U32 tp = 0;
1627    U32 zp = 0;
1628
1629
1630    for( state=1 ; state < next_alloc ; state ++ ) {
1631     U32 base=0;
1632
1633     /*
1634     DEBUG_TRIE_COMPILE_MORE_r(
1635      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1636     );
1637     */
1638
1639     if (trie->states[state].trans.list) {
1640      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1641      U16 maxid=minid;
1642      U16 idx;
1643
1644      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1645       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1646       if ( forid < minid ) {
1647        minid=forid;
1648       } else if ( forid > maxid ) {
1649        maxid=forid;
1650       }
1651      }
1652      if ( transcount < tp + maxid - minid + 1) {
1653       transcount *= 2;
1654       trie->trans = (reg_trie_trans *)
1655        PerlMemShared_realloc( trie->trans,
1656              transcount
1657              * sizeof(reg_trie_trans) );
1658       Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1659      }
1660      base = trie->uniquecharcount + tp - minid;
1661      if ( maxid == minid ) {
1662       U32 set = 0;
1663       for ( ; zp < tp ; zp++ ) {
1664        if ( ! trie->trans[ zp ].next ) {
1665         base = trie->uniquecharcount + zp - minid;
1666         trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1667         trie->trans[ zp ].check = state;
1668         set = 1;
1669         break;
1670        }
1671       }
1672       if ( !set ) {
1673        trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1674        trie->trans[ tp ].check = state;
1675        tp++;
1676        zp = tp;
1677       }
1678      } else {
1679       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1680        const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1681        trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1682        trie->trans[ tid ].check = state;
1683       }
1684       tp += ( maxid - minid + 1 );
1685      }
1686      Safefree(trie->states[ state ].trans.list);
1687     }
1688     /*
1689     DEBUG_TRIE_COMPILE_MORE_r(
1690      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1691     );
1692     */
1693     trie->states[ state ].trans.base=base;
1694    }
1695    trie->lasttrans = tp + 1;
1696   }
1697  } else {
1698   /*
1699   Second Pass -- Flat Table Representation.
1700
1701   we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1702   We know that we will need Charcount+1 trans at most to store the data
1703   (one row per char at worst case) So we preallocate both structures
1704   assuming worst case.
1705
1706   We then construct the trie using only the .next slots of the entry
1707   structs.
1708
1709   We use the .check field of the first entry of the node  temporarily to
1710   make compression both faster and easier by keeping track of how many non
1711   zero fields are in the node.
1712
1713   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1714   transition.
1715
1716   There are two terms at use here: state as a TRIE_NODEIDX() which is a
1717   number representing the first entry of the node, and state as a
1718   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1719   TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1720   are 2 entrys per node. eg:
1721
1722    A B       A B
1723   1. 2 4    1. 3 7
1724   2. 0 3    3. 0 5
1725   3. 0 0    5. 0 0
1726   4. 0 0    7. 0 0
1727
1728   The table is internally in the right hand, idx form. However as we also
1729   have to deal with the states array which is indexed by nodenum we have to
1730   use TRIE_NODENUM() to convert.
1731
1732   */
1733   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1734    "%*sCompiling trie using table compiler\n",
1735    (int)depth * 2 + 2, ""));
1736
1737   trie->trans = (reg_trie_trans *)
1738    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1739         * trie->uniquecharcount + 1,
1740         sizeof(reg_trie_trans) );
1741   trie->states = (reg_trie_state *)
1742    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1743         sizeof(reg_trie_state) );
1744   next_alloc = trie->uniquecharcount + 1;
1745
1746
1747   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1748
1749    regnode * const noper   = NEXTOPER( cur );
1750    const U8 *uc     = (U8*)STRING( noper );
1751    const U8 * const e = uc + STR_LEN( noper );
1752
1753    U32 state        = 1;         /* required init */
1754
1755    U16 charid       = 0;         /* sanity init */
1756    U32 accept_state = 0;         /* sanity init */
1757    U8 *scan         = (U8*)NULL; /* sanity init */
1758
1759    STRLEN foldlen   = 0;         /* required init */
1760    U32 wordlen      = 0;         /* required init */
1761    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1762
1763    if ( OP(noper) != NOTHING ) {
1764     for ( ; uc < e ; uc += len ) {
1765
1766      TRIE_READ_CHAR;
1767
1768      if ( uvc < 256 ) {
1769       charid = trie->charmap[ uvc ];
1770      } else {
1771       SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1772       charid = svpp ? (U16)SvIV(*svpp) : 0;
1773      }
1774      if ( charid ) {
1775       charid--;
1776       if ( !trie->trans[ state + charid ].next ) {
1777        trie->trans[ state + charid ].next = next_alloc;
1778        trie->trans[ state ].check++;
1779        next_alloc += trie->uniquecharcount;
1780       }
1781       state = trie->trans[ state + charid ].next;
1782      } else {
1783       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1784      }
1785      /* charid is now 0 if we dont know the char read, or nonzero if we do */
1786     }
1787    }
1788    accept_state = TRIE_NODENUM( state );
1789    TRIE_HANDLE_WORD(accept_state);
1790
1791   } /* end second pass */
1792
1793   /* and now dump it out before we compress it */
1794   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1795               revcharmap,
1796               next_alloc, depth+1));
1797
1798   {
1799   /*
1800   * Inplace compress the table.*
1801
1802   For sparse data sets the table constructed by the trie algorithm will
1803   be mostly 0/FAIL transitions or to put it another way mostly empty.
1804   (Note that leaf nodes will not contain any transitions.)
1805
1806   This algorithm compresses the tables by eliminating most such
1807   transitions, at the cost of a modest bit of extra work during lookup:
1808
1809   - Each states[] entry contains a .base field which indicates the
1810   index in the state[] array wheres its transition data is stored.
1811
1812   - If .base is 0 there are no  valid transitions from that node.
1813
1814   - If .base is nonzero then charid is added to it to find an entry in
1815   the trans array.
1816
1817   -If trans[states[state].base+charid].check!=state then the
1818   transition is taken to be a 0/Fail transition. Thus if there are fail
1819   transitions at the front of the node then the .base offset will point
1820   somewhere inside the previous nodes data (or maybe even into a node
1821   even earlier), but the .check field determines if the transition is
1822   valid.
1823
1824   XXX - wrong maybe?
1825   The following process inplace converts the table to the compressed
1826   table: We first do not compress the root node 1,and mark its all its
1827   .check pointers as 1 and set its .base pointer as 1 as well. This
1828   allows to do a DFA construction from the compressed table later, and
1829   ensures that any .base pointers we calculate later are greater than
1830   0.
1831
1832   - We set 'pos' to indicate the first entry of the second node.
1833
1834   - We then iterate over the columns of the node, finding the first and
1835   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1836   and set the .check pointers accordingly, and advance pos
1837   appropriately and repreat for the next node. Note that when we copy
1838   the next pointers we have to convert them from the original
1839   NODEIDX form to NODENUM form as the former is not valid post
1840   compression.
1841
1842   - If a node has no transitions used we mark its base as 0 and do not
1843   advance the pos pointer.
1844
1845   - If a node only has one transition we use a second pointer into the
1846   structure to fill in allocated fail transitions from other states.
1847   This pointer is independent of the main pointer and scans forward
1848   looking for null transitions that are allocated to a state. When it
1849   finds one it writes the single transition into the "hole".  If the
1850   pointer doesnt find one the single transition is appended as normal.
1851
1852   - Once compressed we can Renew/realloc the structures to release the
1853   excess space.
1854
1855   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1856   specifically Fig 3.47 and the associated pseudocode.
1857
1858   demq
1859   */
1860   const U32 laststate = TRIE_NODENUM( next_alloc );
1861   U32 state, charid;
1862   U32 pos = 0, zp=0;
1863   trie->statecount = laststate;
1864
1865   for ( state = 1 ; state < laststate ; state++ ) {
1866    U8 flag = 0;
1867    const U32 stateidx = TRIE_NODEIDX( state );
1868    const U32 o_used = trie->trans[ stateidx ].check;
1869    U32 used = trie->trans[ stateidx ].check;
1870    trie->trans[ stateidx ].check = 0;
1871
1872    for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1873     if ( flag || trie->trans[ stateidx + charid ].next ) {
1874      if ( trie->trans[ stateidx + charid ].next ) {
1875       if (o_used == 1) {
1876        for ( ; zp < pos ; zp++ ) {
1877         if ( ! trie->trans[ zp ].next ) {
1878          break;
1879         }
1880        }
1881        trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1882        trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1883        trie->trans[ zp ].check = state;
1884        if ( ++zp > pos ) pos = zp;
1885        break;
1886       }
1887       used--;
1888      }
1889      if ( !flag ) {
1890       flag = 1;
1891       trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1892      }
1893      trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1894      trie->trans[ pos ].check = state;
1895      pos++;
1896     }
1897    }
1898   }
1899   trie->lasttrans = pos + 1;
1900   trie->states = (reg_trie_state *)
1901    PerlMemShared_realloc( trie->states, laststate
1902         * sizeof(reg_trie_state) );
1903   DEBUG_TRIE_COMPILE_MORE_r(
1904     PerlIO_printf( Perl_debug_log,
1905      "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1906      (int)depth * 2 + 2,"",
1907      (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1908      (IV)next_alloc,
1909      (IV)pos,
1910      ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1911    );
1912
1913   } /* end table compress */
1914  }
1915  DEBUG_TRIE_COMPILE_MORE_r(
1916    PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1917     (int)depth * 2 + 2, "",
1918     (UV)trie->statecount,
1919     (UV)trie->lasttrans)
1920  );
1921  /* resize the trans array to remove unused space */
1922  trie->trans = (reg_trie_trans *)
1923   PerlMemShared_realloc( trie->trans, trie->lasttrans
1924        * sizeof(reg_trie_trans) );
1925
1926  /* and now dump out the compressed format */
1927  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1928
1929  {   /* Modify the program and insert the new TRIE node*/
1930   U8 nodetype =(U8)(flags & 0xFF);
1931   char *str=NULL;
1932
1933 #ifdef DEBUGGING
1934   regnode *optimize = NULL;
1935 #ifdef RE_TRACK_PATTERN_OFFSETS
1936
1937   U32 mjd_offset = 0;
1938   U32 mjd_nodelen = 0;
1939 #endif /* RE_TRACK_PATTERN_OFFSETS */
1940 #endif /* DEBUGGING */
1941   /*
1942   This means we convert either the first branch or the first Exact,
1943   depending on whether the thing following (in 'last') is a branch
1944   or not and whther first is the startbranch (ie is it a sub part of
1945   the alternation or is it the whole thing.)
1946   Assuming its a sub part we conver the EXACT otherwise we convert
1947   the whole branch sequence, including the first.
1948   */
1949   /* Find the node we are going to overwrite */
1950   if ( first != startbranch || OP( last ) == BRANCH ) {
1951    /* branch sub-chain */
1952    NEXT_OFF( first ) = (U16)(last - first);
1953 #ifdef RE_TRACK_PATTERN_OFFSETS
1954    DEBUG_r({
1955     mjd_offset= Node_Offset((convert));
1956     mjd_nodelen= Node_Length((convert));
1957    });
1958 #endif
1959    /* whole branch chain */
1960   }
1961 #ifdef RE_TRACK_PATTERN_OFFSETS
1962   else {
1963    DEBUG_r({
1964     const  regnode *nop = NEXTOPER( convert );
1965     mjd_offset= Node_Offset((nop));
1966     mjd_nodelen= Node_Length((nop));
1967    });
1968   }
1969   DEBUG_OPTIMISE_r(
1970    PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1971     (int)depth * 2 + 2, "",
1972     (UV)mjd_offset, (UV)mjd_nodelen)
1973   );
1974 #endif
1975   /* But first we check to see if there is a common prefix we can
1976   split out as an EXACT and put in front of the TRIE node.  */
1977   trie->startstate= 1;
1978   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
1979    U32 state;
1980    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1981     U32 ofs = 0;
1982     I32 idx = -1;
1983     U32 count = 0;
1984     const U32 base = trie->states[ state ].trans.base;
1985
1986     if ( trie->states[state].wordnum )
1987       count = 1;
1988
1989     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1990      if ( ( base + ofs >= trie->uniquecharcount ) &&
1991       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1992       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1993      {
1994       if ( ++count > 1 ) {
1995        SV **tmp = av_fetch( revcharmap, ofs, 0);
1996        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1997        if ( state == 1 ) break;
1998        if ( count == 2 ) {
1999         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2000         DEBUG_OPTIMISE_r(
2001          PerlIO_printf(Perl_debug_log,
2002           "%*sNew Start State=%"UVuf" Class: [",
2003           (int)depth * 2 + 2, "",
2004           (UV)state));
2005         if (idx >= 0) {
2006          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2007          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2008
2009          TRIE_BITMAP_SET(trie,*ch);
2010          if ( folder )
2011           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2012          DEBUG_OPTIMISE_r(
2013           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2014          );
2015         }
2016        }
2017        TRIE_BITMAP_SET(trie,*ch);
2018        if ( folder )
2019         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2020        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2021       }
2022       idx = ofs;
2023      }
2024     }
2025     if ( count == 1 ) {
2026      SV **tmp = av_fetch( revcharmap, idx, 0);
2027      STRLEN len;
2028      char *ch = SvPV( *tmp, len );
2029      DEBUG_OPTIMISE_r({
2030       SV *sv=sv_newmortal();
2031       PerlIO_printf( Perl_debug_log,
2032        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2033        (int)depth * 2 + 2, "",
2034        (UV)state, (UV)idx,
2035        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2036         PL_colors[0], PL_colors[1],
2037         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2038         PERL_PV_ESCAPE_FIRSTCHAR
2039        )
2040       );
2041      });
2042      if ( state==1 ) {
2043       OP( convert ) = nodetype;
2044       str=STRING(convert);
2045       STR_LEN(convert)=0;
2046      }
2047      STR_LEN(convert) += len;
2048      while (len--)
2049       *str++ = *ch++;
2050     } else {
2051 #ifdef DEBUGGING
2052      if (state>1)
2053       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2054 #endif
2055      break;
2056     }
2057    }
2058    if (str) {
2059     regnode *n = convert+NODE_SZ_STR(convert);
2060     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2061     trie->startstate = state;
2062     trie->minlen -= (state - 1);
2063     trie->maxlen -= (state - 1);
2064 #ifdef DEBUGGING
2065    /* At least the UNICOS C compiler choked on this
2066     * being argument to DEBUG_r(), so let's just have
2067     * it right here. */
2068    if (
2069 #ifdef PERL_EXT_RE_BUILD
2070     1
2071 #else
2072     DEBUG_r_TEST
2073 #endif
2074     ) {
2075     regnode *fix = convert;
2076     U32 word = trie->wordcount;
2077     mjd_nodelen++;
2078     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2079     while( ++fix < n ) {
2080      Set_Node_Offset_Length(fix, 0, 0);
2081     }
2082     while (word--) {
2083      SV ** const tmp = av_fetch( trie_words, word, 0 );
2084      if (tmp) {
2085       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2086        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2087       else
2088        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2089      }
2090     }
2091    }
2092 #endif
2093     if (trie->maxlen) {
2094      convert = n;
2095     } else {
2096      NEXT_OFF(convert) = (U16)(tail - convert);
2097      DEBUG_r(optimize= n);
2098     }
2099    }
2100   }
2101   if (!jumper)
2102    jumper = last;
2103   if ( trie->maxlen ) {
2104    NEXT_OFF( convert ) = (U16)(tail - convert);
2105    ARG_SET( convert, data_slot );
2106    /* Store the offset to the first unabsorbed branch in
2107    jump[0], which is otherwise unused by the jump logic.
2108    We use this when dumping a trie and during optimisation. */
2109    if (trie->jump)
2110     trie->jump[0] = (U16)(nextbranch - convert);
2111
2112    /* XXXX */
2113    if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2114     ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2115    {
2116     OP( convert ) = TRIEC;
2117     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2118     PerlMemShared_free(trie->bitmap);
2119     trie->bitmap= NULL;
2120    } else
2121     OP( convert ) = TRIE;
2122
2123    /* store the type in the flags */
2124    convert->flags = nodetype;
2125    DEBUG_r({
2126    optimize = convert
2127      + NODE_STEP_REGNODE
2128      + regarglen[ OP( convert ) ];
2129    });
2130    /* XXX We really should free up the resource in trie now,
2131     as we won't use them - (which resources?) dmq */
2132   }
2133   /* needed for dumping*/
2134   DEBUG_r(if (optimize) {
2135    regnode *opt = convert;
2136
2137    while ( ++opt < optimize) {
2138     Set_Node_Offset_Length(opt,0,0);
2139    }
2140    /*
2141     Try to clean up some of the debris left after the
2142     optimisation.
2143    */
2144    while( optimize < jumper ) {
2145     mjd_nodelen += Node_Length((optimize));
2146     OP( optimize ) = OPTIMIZED;
2147     Set_Node_Offset_Length(optimize,0,0);
2148     optimize++;
2149    }
2150    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2151   });
2152  } /* end node insert */
2153  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2154  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2155 #ifdef DEBUGGING
2156  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2157  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2158 #else
2159  SvREFCNT_dec(revcharmap);
2160 #endif
2161  return trie->jump
2162   ? MADE_JUMP_TRIE
2163   : trie->startstate>1
2164    ? MADE_EXACT_TRIE
2165    : MADE_TRIE;
2166 }
2167
2168 STATIC void
2169 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2170 {
2171 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2172
2173    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2174    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2175    ISBN 0-201-10088-6
2176
2177    We find the fail state for each state in the trie, this state is the longest proper
2178    suffix of the current states 'word' that is also a proper prefix of another word in our
2179    trie. State 1 represents the word '' and is the thus the default fail state. This allows
2180    the DFA not to have to restart after its tried and failed a word at a given point, it
2181    simply continues as though it had been matching the other word in the first place.
2182    Consider
2183  'abcdgu'=~/abcdefg|cdgu/
2184    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2185    fail, which would bring use to the state representing 'd' in the second word where we would
2186    try 'g' and succeed, prodceding to match 'cdgu'.
2187  */
2188  /* add a fail transition */
2189  const U32 trie_offset = ARG(source);
2190  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2191  U32 *q;
2192  const U32 ucharcount = trie->uniquecharcount;
2193  const U32 numstates = trie->statecount;
2194  const U32 ubound = trie->lasttrans + ucharcount;
2195  U32 q_read = 0;
2196  U32 q_write = 0;
2197  U32 charid;
2198  U32 base = trie->states[ 1 ].trans.base;
2199  U32 *fail;
2200  reg_ac_data *aho;
2201  const U32 data_slot = add_data( pRExC_state, 1, "T" );
2202  GET_RE_DEBUG_FLAGS_DECL;
2203
2204  PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2205 #ifndef DEBUGGING
2206  PERL_UNUSED_ARG(depth);
2207 #endif
2208
2209
2210  ARG_SET( stclass, data_slot );
2211  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2212  RExC_rxi->data->data[ data_slot ] = (void*)aho;
2213  aho->trie=trie_offset;
2214  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2215  Copy( trie->states, aho->states, numstates, reg_trie_state );
2216  Newxz( q, numstates, U32);
2217  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2218  aho->refcount = 1;
2219  fail = aho->fail;
2220  /* initialize fail[0..1] to be 1 so that we always have
2221  a valid final fail state */
2222  fail[ 0 ] = fail[ 1 ] = 1;
2223
2224  for ( charid = 0; charid < ucharcount ; charid++ ) {
2225   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2226   if ( newstate ) {
2227    q[ q_write ] = newstate;
2228    /* set to point at the root */
2229    fail[ q[ q_write++ ] ]=1;
2230   }
2231  }
2232  while ( q_read < q_write) {
2233   const U32 cur = q[ q_read++ % numstates ];
2234   base = trie->states[ cur ].trans.base;
2235
2236   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2237    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2238    if (ch_state) {
2239     U32 fail_state = cur;
2240     U32 fail_base;
2241     do {
2242      fail_state = fail[ fail_state ];
2243      fail_base = aho->states[ fail_state ].trans.base;
2244     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2245
2246     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2247     fail[ ch_state ] = fail_state;
2248     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2249     {
2250       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2251     }
2252     q[ q_write++ % numstates] = ch_state;
2253    }
2254   }
2255  }
2256  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2257  when we fail in state 1, this allows us to use the
2258  charclass scan to find a valid start char. This is based on the principle
2259  that theres a good chance the string being searched contains lots of stuff
2260  that cant be a start char.
2261  */
2262  fail[ 0 ] = fail[ 1 ] = 0;
2263  DEBUG_TRIE_COMPILE_r({
2264   PerlIO_printf(Perl_debug_log,
2265      "%*sStclass Failtable (%"UVuf" states): 0",
2266      (int)(depth * 2), "", (UV)numstates
2267   );
2268   for( q_read=1; q_read<numstates; q_read++ ) {
2269    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2270   }
2271   PerlIO_printf(Perl_debug_log, "\n");
2272  });
2273  Safefree(q);
2274  /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2275 }
2276
2277
2278 /*
2279  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2280  * These need to be revisited when a newer toolchain becomes available.
2281  */
2282 #if defined(__sparc64__) && defined(__GNUC__)
2283 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2284 #       undef  SPARC64_GCC_WORKAROUND
2285 #       define SPARC64_GCC_WORKAROUND 1
2286 #   endif
2287 #endif
2288
2289 #define DEBUG_PEEP(str,scan,depth) \
2290  DEBUG_OPTIMISE_r({if (scan){ \
2291  SV * const mysv=sv_newmortal(); \
2292  regnode *Next = regnext(scan); \
2293  regprop(RExC_rx, mysv, scan); \
2294  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2295  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2296  Next ? (REG_NODE_NUM(Next)) : 0 ); \
2297    }});
2298
2299
2300
2301
2302
2303 #define JOIN_EXACT(scan,min,flags) \
2304  if (PL_regkind[OP(scan)] == EXACT) \
2305   join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2306
2307 STATIC U32
2308 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2309  /* Merge several consecutive EXACTish nodes into one. */
2310  regnode *n = regnext(scan);
2311  U32 stringok = 1;
2312  regnode *next = scan + NODE_SZ_STR(scan);
2313  U32 merged = 0;
2314  U32 stopnow = 0;
2315 #ifdef DEBUGGING
2316  regnode *stop = scan;
2317  GET_RE_DEBUG_FLAGS_DECL;
2318 #else
2319  PERL_UNUSED_ARG(depth);
2320 #endif
2321
2322  PERL_ARGS_ASSERT_JOIN_EXACT;
2323 #ifndef EXPERIMENTAL_INPLACESCAN
2324  PERL_UNUSED_ARG(flags);
2325  PERL_UNUSED_ARG(val);
2326 #endif
2327  DEBUG_PEEP("join",scan,depth);
2328
2329  /* Skip NOTHING, merge EXACT*. */
2330  while (n &&
2331   ( PL_regkind[OP(n)] == NOTHING ||
2332    (stringok && (OP(n) == OP(scan))))
2333   && NEXT_OFF(n)
2334   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2335
2336   if (OP(n) == TAIL || n > next)
2337    stringok = 0;
2338   if (PL_regkind[OP(n)] == NOTHING) {
2339    DEBUG_PEEP("skip:",n,depth);
2340    NEXT_OFF(scan) += NEXT_OFF(n);
2341    next = n + NODE_STEP_REGNODE;
2342 #ifdef DEBUGGING
2343    if (stringok)
2344     stop = n;
2345 #endif
2346    n = regnext(n);
2347   }
2348   else if (stringok) {
2349    const unsigned int oldl = STR_LEN(scan);
2350    regnode * const nnext = regnext(n);
2351
2352    DEBUG_PEEP("merg",n,depth);
2353
2354    merged++;
2355    if (oldl + STR_LEN(n) > U8_MAX)
2356     break;
2357    NEXT_OFF(scan) += NEXT_OFF(n);
2358    STR_LEN(scan) += STR_LEN(n);
2359    next = n + NODE_SZ_STR(n);
2360    /* Now we can overwrite *n : */
2361    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2362 #ifdef DEBUGGING
2363    stop = next - 1;
2364 #endif
2365    n = nnext;
2366    if (stopnow) break;
2367   }
2368
2369 #ifdef EXPERIMENTAL_INPLACESCAN
2370   if (flags && !NEXT_OFF(n)) {
2371    DEBUG_PEEP("atch", val, depth);
2372    if (reg_off_by_arg[OP(n)]) {
2373     ARG_SET(n, val - n);
2374    }
2375    else {
2376     NEXT_OFF(n) = val - n;
2377    }
2378    stopnow = 1;
2379   }
2380 #endif
2381  }
2382
2383  if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2384  /*
2385  Two problematic code points in Unicode casefolding of EXACT nodes:
2386
2387  U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2388  U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2389
2390  which casefold to
2391
2392  Unicode                      UTF-8
2393
2394  U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2395  U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2396
2397  This means that in case-insensitive matching (or "loose matching",
2398  as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2399  length of the above casefolded versions) can match a target string
2400  of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2401  This would rather mess up the minimum length computation.
2402
2403  What we'll do is to look for the tail four bytes, and then peek
2404  at the preceding two bytes to see whether we need to decrease
2405  the minimum length by four (six minus two).
2406
2407  Thanks to the design of UTF-8, there cannot be false matches:
2408  A sequence of valid UTF-8 bytes cannot be a subsequence of
2409  another valid sequence of UTF-8 bytes.
2410
2411  */
2412   char * const s0 = STRING(scan), *s, *t;
2413   char * const s1 = s0 + STR_LEN(scan) - 1;
2414   char * const s2 = s1 - 4;
2415 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2416   const char t0[] = "\xaf\x49\xaf\x42";
2417 #else
2418   const char t0[] = "\xcc\x88\xcc\x81";
2419 #endif
2420   const char * const t1 = t0 + 3;
2421
2422   for (s = s0 + 2;
2423    s < s2 && (t = ninstr(s, s1, t0, t1));
2424    s = t + 4) {
2425 #ifdef EBCDIC
2426    if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2427     ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2428 #else
2429    if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2430     ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2431 #endif
2432     *min -= 4;
2433   }
2434  }
2435
2436 #ifdef DEBUGGING
2437  /* Allow dumping */
2438  n = scan + NODE_SZ_STR(scan);
2439  while (n <= stop) {
2440   if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2441    OP(n) = OPTIMIZED;
2442    NEXT_OFF(n) = 0;
2443   }
2444   n++;
2445  }
2446 #endif
2447  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2448  return stopnow;
2449 }
2450
2451 /* REx optimizer.  Converts nodes into quickier variants "in place".
2452    Finds fixed substrings.  */
2453
2454 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2455    to the position after last scanned or to NULL. */
2456
2457 #define INIT_AND_WITHP \
2458  assert(!and_withp); \
2459  Newx(and_withp,1,struct regnode_charclass_class); \
2460  SAVEFREEPV(and_withp)
2461
2462 /* this is a chain of data about sub patterns we are processing that
2463    need to be handled seperately/specially in study_chunk. Its so
2464    we can simulate recursion without losing state.  */
2465 struct scan_frame;
2466 typedef struct scan_frame {
2467  regnode *last;  /* last node to process in this frame */
2468  regnode *next;  /* next node to process when last is reached */
2469  struct scan_frame *prev; /*previous frame*/
2470  I32 stop; /* what stopparen do we use */
2471 } scan_frame;
2472
2473
2474 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2475
2476 #define CASE_SYNST_FNC(nAmE)                                       \
2477 case nAmE:                                                         \
2478  if (flags & SCF_DO_STCLASS_AND) {                              \
2479    for (value = 0; value < 256; value++)                  \
2480     if (!is_ ## nAmE ## _cp(value))                       \
2481      ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2482  }                                                              \
2483  else {                                                         \
2484    for (value = 0; value < 256; value++)                  \
2485     if (is_ ## nAmE ## _cp(value))                        \
2486      ANYOF_BITMAP_SET(data->start_class, value);    \
2487  }                                                              \
2488  break;                                                         \
2489 case N ## nAmE:                                                    \
2490  if (flags & SCF_DO_STCLASS_AND) {                              \
2491    for (value = 0; value < 256; value++)                   \
2492     if (is_ ## nAmE ## _cp(value))                         \
2493      ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2494  }                                                               \
2495  else {                                                          \
2496    for (value = 0; value < 256; value++)                   \
2497     if (!is_ ## nAmE ## _cp(value))                        \
2498      ANYOF_BITMAP_SET(data->start_class, value);     \
2499  }                                                               \
2500  break
2501
2502
2503
2504 STATIC I32
2505 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2506       I32 *minlenp, I32 *deltap,
2507       regnode *last,
2508       scan_data_t *data,
2509       I32 stopparen,
2510       U8* recursed,
2511       struct regnode_charclass_class *and_withp,
2512       U32 flags, U32 depth)
2513       /* scanp: Start here (read-write). */
2514       /* deltap: Write maxlen-minlen here. */
2515       /* last: Stop before this one. */
2516       /* data: string data about the pattern */
2517       /* stopparen: treat close N as END */
2518       /* recursed: which subroutines have we recursed into */
2519       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2520 {
2521  dVAR;
2522  I32 min = 0, pars = 0, code;
2523  regnode *scan = *scanp, *next;
2524  I32 delta = 0;
2525  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2526  int is_inf_internal = 0;  /* The studied chunk is infinite */
2527  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2528  scan_data_t data_fake;
2529  SV *re_trie_maxbuff = NULL;
2530  regnode *first_non_open = scan;
2531  I32 stopmin = I32_MAX;
2532  scan_frame *frame = NULL;
2533  GET_RE_DEBUG_FLAGS_DECL;
2534
2535  PERL_ARGS_ASSERT_STUDY_CHUNK;
2536
2537 #ifdef DEBUGGING
2538  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2539 #endif
2540
2541  if ( depth == 0 ) {
2542   while (first_non_open && OP(first_non_open) == OPEN)
2543    first_non_open=regnext(first_non_open);
2544  }
2545
2546
2547   fake_study_recurse:
2548  while ( scan && OP(scan) != END && scan < last ){
2549   /* Peephole optimizer: */
2550   DEBUG_STUDYDATA("Peep:", data,depth);
2551   DEBUG_PEEP("Peep",scan,depth);
2552   JOIN_EXACT(scan,&min,0);
2553
2554   /* Follow the next-chain of the current node and optimize
2555   away all the NOTHINGs from it.  */
2556   if (OP(scan) != CURLYX) {
2557    const int max = (reg_off_by_arg[OP(scan)]
2558      ? I32_MAX
2559      /* I32 may be smaller than U16 on CRAYs! */
2560      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2561    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2562    int noff;
2563    regnode *n = scan;
2564
2565    /* Skip NOTHING and LONGJMP. */
2566    while ((n = regnext(n))
2567     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2568      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2569     && off + noff < max)
2570     off += noff;
2571    if (reg_off_by_arg[OP(scan)])
2572     ARG(scan) = off;
2573    else
2574     NEXT_OFF(scan) = off;
2575   }
2576
2577
2578
2579   /* The principal pseudo-switch.  Cannot be a switch, since we
2580   look into several different things.  */
2581   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2582     || OP(scan) == IFTHEN) {
2583    next = regnext(scan);
2584    code = OP(scan);
2585    /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2586
2587    if (OP(next) == code || code == IFTHEN) {
2588     /* NOTE - There is similar code to this block below for handling
2589     TRIE nodes on a re-study.  If you change stuff here check there
2590     too. */
2591     I32 max1 = 0, min1 = I32_MAX, num = 0;
2592     struct regnode_charclass_class accum;
2593     regnode * const startbranch=scan;
2594
2595     if (flags & SCF_DO_SUBSTR)
2596      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2597     if (flags & SCF_DO_STCLASS)
2598      cl_init_zero(pRExC_state, &accum);
2599
2600     while (OP(scan) == code) {
2601      I32 deltanext, minnext, f = 0, fake;
2602      struct regnode_charclass_class this_class;
2603
2604      num++;
2605      data_fake.flags = 0;
2606      if (data) {
2607       data_fake.whilem_c = data->whilem_c;
2608       data_fake.last_closep = data->last_closep;
2609      }
2610      else
2611       data_fake.last_closep = &fake;
2612
2613      data_fake.pos_delta = delta;
2614      next = regnext(scan);
2615      scan = NEXTOPER(scan);
2616      if (code != BRANCH)
2617       scan = NEXTOPER(scan);
2618      if (flags & SCF_DO_STCLASS) {
2619       cl_init(pRExC_state, &this_class);
2620       data_fake.start_class = &this_class;
2621       f = SCF_DO_STCLASS_AND;
2622      }
2623      if (flags & SCF_WHILEM_VISITED_POS)
2624       f |= SCF_WHILEM_VISITED_POS;
2625
2626      /* we suppose the run is continuous, last=next...*/
2627      minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2628           next, &data_fake,
2629           stopparen, recursed, NULL, f,depth+1);
2630      if (min1 > minnext)
2631       min1 = minnext;
2632      if (max1 < minnext + deltanext)
2633       max1 = minnext + deltanext;
2634      if (deltanext == I32_MAX)
2635       is_inf = is_inf_internal = 1;
2636      scan = next;
2637      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2638       pars++;
2639      if (data_fake.flags & SCF_SEEN_ACCEPT) {
2640       if ( stopmin > minnext)
2641        stopmin = min + min1;
2642       flags &= ~SCF_DO_SUBSTR;
2643       if (data)
2644        data->flags |= SCF_SEEN_ACCEPT;
2645      }
2646      if (data) {
2647       if (data_fake.flags & SF_HAS_EVAL)
2648        data->flags |= SF_HAS_EVAL;
2649       data->whilem_c = data_fake.whilem_c;
2650      }
2651      if (flags & SCF_DO_STCLASS)
2652       cl_or(pRExC_state, &accum, &this_class);
2653     }
2654     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2655      min1 = 0;
2656     if (flags & SCF_DO_SUBSTR) {
2657      data->pos_min += min1;
2658      data->pos_delta += max1 - min1;
2659      if (max1 != min1 || is_inf)
2660       data->longest = &(data->longest_float);
2661     }
2662     min += min1;
2663     delta += max1 - min1;
2664     if (flags & SCF_DO_STCLASS_OR) {
2665      cl_or(pRExC_state, data->start_class, &accum);
2666      if (min1) {
2667       cl_and(data->start_class, and_withp);
2668       flags &= ~SCF_DO_STCLASS;
2669      }
2670     }
2671     else if (flags & SCF_DO_STCLASS_AND) {
2672      if (min1) {
2673       cl_and(data->start_class, &accum);
2674       flags &= ~SCF_DO_STCLASS;
2675      }
2676      else {
2677       /* Switch to OR mode: cache the old value of
2678       * data->start_class */
2679       INIT_AND_WITHP;
2680       StructCopy(data->start_class, and_withp,
2681         struct regnode_charclass_class);
2682       flags &= ~SCF_DO_STCLASS_AND;
2683       StructCopy(&accum, data->start_class,
2684         struct regnode_charclass_class);
2685       flags |= SCF_DO_STCLASS_OR;
2686       data->start_class->flags |= ANYOF_EOS;
2687      }
2688     }
2689
2690     if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2691     /* demq.
2692
2693     Assuming this was/is a branch we are dealing with: 'scan' now
2694     points at the item that follows the branch sequence, whatever
2695     it is. We now start at the beginning of the sequence and look
2696     for subsequences of
2697
2698     BRANCH->EXACT=>x1
2699     BRANCH->EXACT=>x2
2700     tail
2701
2702     which would be constructed from a pattern like /A|LIST|OF|WORDS/
2703
2704     If we can find such a subseqence we need to turn the first
2705     element into a trie and then add the subsequent branch exact
2706     strings to the trie.
2707
2708     We have two cases
2709
2710      1. patterns where the whole set of branch can be converted.
2711
2712      2. patterns where only a subset can be converted.
2713
2714     In case 1 we can replace the whole set with a single regop
2715     for the trie. In case 2 we need to keep the start and end
2716     branchs so
2717
2718      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2719      becomes BRANCH TRIE; BRANCH X;
2720
2721     There is an additional case, that being where there is a
2722     common prefix, which gets split out into an EXACT like node
2723     preceding the TRIE node.
2724
2725     If x(1..n)==tail then we can do a simple trie, if not we make
2726     a "jump" trie, such that when we match the appropriate word
2727     we "jump" to the appopriate tail node. Essentailly we turn
2728     a nested if into a case structure of sorts.
2729
2730     */
2731
2732      int made=0;
2733      if (!re_trie_maxbuff) {
2734       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2735       if (!SvIOK(re_trie_maxbuff))
2736        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2737      }
2738      if ( SvIV(re_trie_maxbuff)>=0  ) {
2739       regnode *cur;
2740       regnode *first = (regnode *)NULL;
2741       regnode *last = (regnode *)NULL;
2742       regnode *tail = scan;
2743       U8 optype = 0;
2744       U32 count=0;
2745
2746 #ifdef DEBUGGING
2747       SV * const mysv = sv_newmortal();       /* for dumping */
2748 #endif
2749       /* var tail is used because there may be a TAIL
2750       regop in the way. Ie, the exacts will point to the
2751       thing following the TAIL, but the last branch will
2752       point at the TAIL. So we advance tail. If we
2753       have nested (?:) we may have to move through several
2754       tails.
2755       */
2756
2757       while ( OP( tail ) == TAIL ) {
2758        /* this is the TAIL generated by (?:) */
2759        tail = regnext( tail );
2760       }
2761
2762
2763       DEBUG_OPTIMISE_r({
2764        regprop(RExC_rx, mysv, tail );
2765        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2766         (int)depth * 2 + 2, "",
2767         "Looking for TRIE'able sequences. Tail node is: ",
2768         SvPV_nolen_const( mysv )
2769        );
2770       });
2771
2772       /*
2773
2774       step through the branches, cur represents each
2775       branch, noper is the first thing to be matched
2776       as part of that branch and noper_next is the
2777       regnext() of that node. if noper is an EXACT
2778       and noper_next is the same as scan (our current
2779       position in the regex) then the EXACT branch is
2780       a possible optimization target. Once we have
2781       two or more consequetive such branches we can
2782       create a trie of the EXACT's contents and stich
2783       it in place. If the sequence represents all of
2784       the branches we eliminate the whole thing and
2785       replace it with a single TRIE. If it is a
2786       subsequence then we need to stitch it in. This
2787       means the first branch has to remain, and needs
2788       to be repointed at the item on the branch chain
2789       following the last branch optimized. This could
2790       be either a BRANCH, in which case the
2791       subsequence is internal, or it could be the
2792       item following the branch sequence in which
2793       case the subsequence is at the end.
2794
2795       */
2796
2797       /* dont use tail as the end marker for this traverse */
2798       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2799        regnode * const noper = NEXTOPER( cur );
2800 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2801        regnode * const noper_next = regnext( noper );
2802 #endif
2803
2804        DEBUG_OPTIMISE_r({
2805         regprop(RExC_rx, mysv, cur);
2806         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2807         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2808
2809         regprop(RExC_rx, mysv, noper);
2810         PerlIO_printf( Perl_debug_log, " -> %s",
2811          SvPV_nolen_const(mysv));
2812
2813         if ( noper_next ) {
2814         regprop(RExC_rx, mysv, noper_next );
2815         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2816          SvPV_nolen_const(mysv));
2817         }
2818         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2819         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2820        });
2821        if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2822           : PL_regkind[ OP( noper ) ] == EXACT )
2823         || OP(noper) == NOTHING )
2824 #ifdef NOJUMPTRIE
2825         && noper_next == tail
2826 #endif
2827         && count < U16_MAX)
2828        {
2829         count++;
2830         if ( !first || optype == NOTHING ) {
2831          if (!first) first = cur;
2832          optype = OP( noper );
2833         } else {
2834          last = cur;
2835         }
2836        } else {
2837 /*
2838  Currently we do not believe that the trie logic can
2839  handle case insensitive matching properly when the
2840  pattern is not unicode (thus forcing unicode semantics).
2841
2842  If/when this is fixed the following define can be swapped
2843  in below to fully enable trie logic.
2844
2845 #define TRIE_TYPE_IS_SAFE 1
2846
2847 */
2848 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2849
2850         if ( last && TRIE_TYPE_IS_SAFE ) {
2851          make_trie( pRExC_state,
2852            startbranch, first, cur, tail, count,
2853            optype, depth+1 );
2854         }
2855         if ( PL_regkind[ OP( noper ) ] == EXACT
2856 #ifdef NOJUMPTRIE
2857          && noper_next == tail
2858 #endif
2859         ){
2860          count = 1;
2861          first = cur;
2862          optype = OP( noper );
2863         } else {
2864          count = 0;
2865          first = NULL;
2866          optype = 0;
2867         }
2868         last = NULL;
2869        }
2870       }
2871       DEBUG_OPTIMISE_r({
2872        regprop(RExC_rx, mysv, cur);
2873        PerlIO_printf( Perl_debug_log,
2874        "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2875        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2876
2877       });
2878
2879       if ( last && TRIE_TYPE_IS_SAFE ) {
2880        made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2881 #ifdef TRIE_STUDY_OPT
2882        if ( ((made == MADE_EXACT_TRIE &&
2883         startbranch == first)
2884         || ( first_non_open == first )) &&
2885         depth==0 ) {
2886         flags |= SCF_TRIE_RESTUDY;
2887         if ( startbranch == first
2888          && scan == tail )
2889         {
2890          RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2891         }
2892        }
2893 #endif
2894       }
2895      }
2896
2897     } /* do trie */
2898
2899    }
2900    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2901     scan = NEXTOPER(NEXTOPER(scan));
2902    } else   /* single branch is optimized. */
2903     scan = NEXTOPER(scan);
2904    continue;
2905   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2906    scan_frame *newframe = NULL;
2907    I32 paren;
2908    regnode *start;
2909    regnode *end;
2910
2911    if (OP(scan) != SUSPEND) {
2912    /* set the pointer */
2913     if (OP(scan) == GOSUB) {
2914      paren = ARG(scan);
2915      RExC_recurse[ARG2L(scan)] = scan;
2916      start = RExC_open_parens[paren-1];
2917      end   = RExC_close_parens[paren-1];
2918     } else {
2919      paren = 0;
2920      start = RExC_rxi->program + 1;
2921      end   = RExC_opend;
2922     }
2923     if (!recursed) {
2924      Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2925      SAVEFREEPV(recursed);
2926     }
2927     if (!PAREN_TEST(recursed,paren+1)) {
2928      PAREN_SET(recursed,paren+1);
2929      Newx(newframe,1,scan_frame);
2930     } else {
2931      if (flags & SCF_DO_SUBSTR) {
2932       SCAN_COMMIT(pRExC_state,data,minlenp);
2933       data->longest = &(data->longest_float);
2934      }
2935      is_inf = is_inf_internal = 1;
2936      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2937       cl_anything(pRExC_state, data->start_class);
2938      flags &= ~SCF_DO_STCLASS;
2939     }
2940    } else {
2941     Newx(newframe,1,scan_frame);
2942     paren = stopparen;
2943     start = scan+2;
2944     end = regnext(scan);
2945    }
2946    if (newframe) {
2947     assert(start);
2948     assert(end);
2949     SAVEFREEPV(newframe);
2950     newframe->next = regnext(scan);
2951     newframe->last = last;
2952     newframe->stop = stopparen;
2953     newframe->prev = frame;
2954
2955     frame = newframe;
2956     scan =  start;
2957     stopparen = paren;
2958     last = end;
2959
2960     continue;
2961    }
2962   }
2963   else if (OP(scan) == EXACT) {
2964    I32 l = STR_LEN(scan);
2965    UV uc;
2966    if (UTF) {
2967     const U8 * const s = (U8*)STRING(scan);
2968     l = utf8_length(s, s + l);
2969     uc = utf8_to_uvchr(s, NULL);
2970    } else {
2971     uc = *((U8*)STRING(scan));
2972    }
2973    min += l;
2974    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2975     /* The code below prefers earlier match for fixed
2976     offset, later match for variable offset.  */
2977     if (data->last_end == -1) { /* Update the start info. */
2978      data->last_start_min = data->pos_min;
2979      data->last_start_max = is_inf
2980       ? I32_MAX : data->pos_min + data->pos_delta;
2981     }
2982     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2983     if (UTF)
2984      SvUTF8_on(data->last_found);
2985     {
2986      SV * const sv = data->last_found;
2987      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2988       mg_find(sv, PERL_MAGIC_utf8) : NULL;
2989      if (mg && mg->mg_len >= 0)
2990       mg->mg_len += utf8_length((U8*)STRING(scan),
2991             (U8*)STRING(scan)+STR_LEN(scan));
2992     }
2993     data->last_end = data->pos_min + l;
2994     data->pos_min += l; /* As in the first entry. */
2995     data->flags &= ~SF_BEFORE_EOL;
2996    }
2997    if (flags & SCF_DO_STCLASS_AND) {
2998     /* Check whether it is compatible with what we know already! */
2999     int compat = 1;
3000
3001     if (uc >= 0x100 ||
3002      (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3003      && !ANYOF_BITMAP_TEST(data->start_class, uc)
3004      && (!(data->start_class->flags & ANYOF_FOLD)
3005       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3006      )
3007      compat = 0;
3008     ANYOF_CLASS_ZERO(data->start_class);
3009     ANYOF_BITMAP_ZERO(data->start_class);
3010     if (compat)
3011      ANYOF_BITMAP_SET(data->start_class, uc);
3012     data->start_class->flags &= ~ANYOF_EOS;
3013     if (uc < 0x100)
3014     data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3015    }
3016    else if (flags & SCF_DO_STCLASS_OR) {
3017     /* false positive possible if the class is case-folded */
3018     if (uc < 0x100)
3019      ANYOF_BITMAP_SET(data->start_class, uc);
3020     else
3021      data->start_class->flags |= ANYOF_UNICODE_ALL;
3022     data->start_class->flags &= ~ANYOF_EOS;
3023     cl_and(data->start_class, and_withp);
3024    }
3025    flags &= ~SCF_DO_STCLASS;
3026   }
3027   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3028    I32 l = STR_LEN(scan);
3029    UV uc = *((U8*)STRING(scan));
3030
3031    /* Search for fixed substrings supports EXACT only. */
3032    if (flags & SCF_DO_SUBSTR) {
3033     assert(data);
3034     SCAN_COMMIT(pRExC_state, data, minlenp);
3035    }
3036    if (UTF) {
3037     const U8 * const s = (U8 *)STRING(scan);
3038     l = utf8_length(s, s + l);
3039     uc = utf8_to_uvchr(s, NULL);
3040    }
3041    min += l;
3042    if (flags & SCF_DO_SUBSTR)
3043     data->pos_min += l;
3044    if (flags & SCF_DO_STCLASS_AND) {
3045     /* Check whether it is compatible with what we know already! */
3046     int compat = 1;
3047
3048     if (uc >= 0x100 ||
3049      (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3050      && !ANYOF_BITMAP_TEST(data->start_class, uc)
3051      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3052      compat = 0;
3053     ANYOF_CLASS_ZERO(data->start_class);
3054     ANYOF_BITMAP_ZERO(data->start_class);
3055     if (compat) {
3056      ANYOF_BITMAP_SET(data->start_class, uc);
3057      data->start_class->flags &= ~ANYOF_EOS;
3058      data->start_class->flags |= ANYOF_FOLD;
3059      if (OP(scan) == EXACTFL)
3060       data->start_class->flags |= ANYOF_LOCALE;
3061     }
3062    }
3063    else if (flags & SCF_DO_STCLASS_OR) {
3064     if (data->start_class->flags & ANYOF_FOLD) {
3065      /* false positive possible if the class is case-folded.
3066      Assume that the locale settings are the same... */
3067      if (uc < 0x100)
3068       ANYOF_BITMAP_SET(data->start_class, uc);
3069      data->start_class->flags &= ~ANYOF_EOS;
3070     }
3071     cl_and(data->start_class, and_withp);
3072    }
3073    flags &= ~SCF_DO_STCLASS;
3074   }
3075   else if (strchr((const char*)PL_varies,OP(scan))) {
3076    I32 mincount, maxcount, minnext, deltanext, fl = 0;
3077    I32 f = flags, pos_before = 0;
3078    regnode * const oscan = scan;
3079    struct regnode_charclass_class this_class;
3080    struct regnode_charclass_class *oclass = NULL;
3081    I32 next_is_eval = 0;
3082
3083    switch (PL_regkind[OP(scan)]) {
3084    case WHILEM:  /* End of (?:...)* . */
3085     scan = NEXTOPER(scan);
3086     goto finish;
3087    case PLUS:
3088     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3089      next = NEXTOPER(scan);
3090      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3091       mincount = 1;
3092       maxcount = REG_INFTY;
3093       next = regnext(scan);
3094       scan = NEXTOPER(scan);
3095       goto do_curly;
3096      }
3097     }
3098     if (flags & SCF_DO_SUBSTR)
3099      data->pos_min++;
3100     min++;
3101     /* Fall through. */
3102    case STAR:
3103     if (flags & SCF_DO_STCLASS) {
3104      mincount = 0;
3105      maxcount = REG_INFTY;
3106      next = regnext(scan);
3107      scan = NEXTOPER(scan);
3108      goto do_curly;
3109     }
3110     is_inf = is_inf_internal = 1;
3111     scan = regnext(scan);
3112     if (flags & SCF_DO_SUBSTR) {
3113      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3114      data->longest = &(data->longest_float);
3115     }
3116     goto optimize_curly_tail;
3117    case CURLY:
3118     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3119      && (scan->flags == stopparen))
3120     {
3121      mincount = 1;
3122      maxcount = 1;
3123     } else {
3124      mincount = ARG1(scan);
3125      maxcount = ARG2(scan);
3126     }
3127     next = regnext(scan);
3128     if (OP(scan) == CURLYX) {
3129      I32 lp = (data ? *(data->last_closep) : 0);
3130      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3131     }
3132     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3133     next_is_eval = (OP(scan) == EVAL);
3134    do_curly:
3135     if (flags & SCF_DO_SUBSTR) {
3136      if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3137      pos_before = data->pos_min;
3138     }
3139     if (data) {
3140      fl = data->flags;
3141      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3142      if (is_inf)
3143       data->flags |= SF_IS_INF;
3144     }
3145     if (flags & SCF_DO_STCLASS) {
3146      cl_init(pRExC_state, &this_class);
3147      oclass = data->start_class;
3148      data->start_class = &this_class;
3149      f |= SCF_DO_STCLASS_AND;
3150      f &= ~SCF_DO_STCLASS_OR;
3151     }
3152     /* These are the cases when once a subexpression
3153     fails at a particular position, it cannot succeed
3154     even after backtracking at the enclosing scope.
3155
3156     XXXX what if minimal match and we are at the
3157       initial run of {n,m}? */
3158     if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3159      f &= ~SCF_WHILEM_VISITED_POS;
3160
3161     /* This will finish on WHILEM, setting scan, or on NULL: */
3162     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3163          last, data, stopparen, recursed, NULL,
3164          (mincount == 0
3165           ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3166
3167     if (flags & SCF_DO_STCLASS)
3168      data->start_class = oclass;
3169     if (mincount == 0 || minnext == 0) {
3170      if (flags & SCF_DO_STCLASS_OR) {
3171       cl_or(pRExC_state, data->start_class, &this_class);
3172      }
3173      else if (flags & SCF_DO_STCLASS_AND) {
3174       /* Switch to OR mode: cache the old value of
3175       * data->start_class */
3176       INIT_AND_WITHP;
3177       StructCopy(data->start_class, and_withp,
3178         struct regnode_charclass_class);
3179       flags &= ~SCF_DO_STCLASS_AND;
3180       StructCopy(&this_class, data->start_class,
3181         struct regnode_charclass_class);
3182       flags |= SCF_DO_STCLASS_OR;
3183       data->start_class->flags |= ANYOF_EOS;
3184      }
3185     } else {  /* Non-zero len */
3186      if (flags & SCF_DO_STCLASS_OR) {
3187       cl_or(pRExC_state, data->start_class, &this_class);
3188       cl_and(data->start_class, and_withp);
3189      }
3190      else if (flags & SCF_DO_STCLASS_AND)
3191       cl_and(data->start_class, &this_class);
3192      flags &= ~SCF_DO_STCLASS;
3193     }
3194     if (!scan)   /* It was not CURLYX, but CURLY. */
3195      scan = next;
3196     if ( /* ? quantifier ok, except for (?{ ... }) */
3197      (next_is_eval || !(mincount == 0 && maxcount == 1))
3198      && (minnext == 0) && (deltanext == 0)
3199      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3200      && maxcount <= REG_INFTY/3) /* Complement check for big count */
3201     {
3202      ckWARNreg(RExC_parse,
3203        "Quantifier unexpected on zero-length expression");
3204     }
3205
3206     min += minnext * mincount;
3207     is_inf_internal |= ((maxcount == REG_INFTY
3208          && (minnext + deltanext) > 0)
3209          || deltanext == I32_MAX);
3210     is_inf |= is_inf_internal;
3211     delta += (minnext + deltanext) * maxcount - minnext * mincount;
3212
3213     /* Try powerful optimization CURLYX => CURLYN. */
3214     if (  OP(oscan) == CURLYX && data
3215      && data->flags & SF_IN_PAR
3216      && !(data->flags & SF_HAS_EVAL)
3217      && !deltanext && minnext == 1 ) {
3218      /* Try to optimize to CURLYN.  */
3219      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3220      regnode * const nxt1 = nxt;
3221 #ifdef DEBUGGING
3222      regnode *nxt2;
3223 #endif
3224
3225      /* Skip open. */
3226      nxt = regnext(nxt);
3227      if (!strchr((const char*)PL_simple,OP(nxt))
3228       && !(PL_regkind[OP(nxt)] == EXACT
3229        && STR_LEN(nxt) == 1))
3230       goto nogo;
3231 #ifdef DEBUGGING
3232      nxt2 = nxt;
3233 #endif
3234      nxt = regnext(nxt);
3235      if (OP(nxt) != CLOSE)
3236       goto nogo;
3237      if (RExC_open_parens) {
3238       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3239       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3240      }
3241      /* Now we know that nxt2 is the only contents: */
3242      oscan->flags = (U8)ARG(nxt);
3243      OP(oscan) = CURLYN;
3244      OP(nxt1) = NOTHING; /* was OPEN. */
3245
3246 #ifdef DEBUGGING
3247      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3248      NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3249      NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3250      OP(nxt) = OPTIMIZED; /* was CLOSE. */
3251      OP(nxt + 1) = OPTIMIZED; /* was count. */
3252      NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3253 #endif
3254     }
3255    nogo:
3256
3257     /* Try optimization CURLYX => CURLYM. */
3258     if (  OP(oscan) == CURLYX && data
3259      && !(data->flags & SF_HAS_PAR)
3260      && !(data->flags & SF_HAS_EVAL)
3261      && !deltanext /* atom is fixed width */
3262      && minnext != 0 /* CURLYM can't handle zero width */
3263     ) {
3264      /* XXXX How to optimize if data == 0? */
3265      /* Optimize to a simpler form.  */
3266      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3267      regnode *nxt2;
3268
3269      OP(oscan) = CURLYM;
3270      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3271        && (OP(nxt2) != WHILEM))
3272       nxt = nxt2;
3273      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3274      /* Need to optimize away parenths. */
3275      if (data->flags & SF_IN_PAR) {
3276       /* Set the parenth number.  */
3277       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3278
3279       if (OP(nxt) != CLOSE)
3280        FAIL("Panic opt close");
3281       oscan->flags = (U8)ARG(nxt);
3282       if (RExC_open_parens) {
3283        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3284        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3285       }
3286       OP(nxt1) = OPTIMIZED; /* was OPEN. */
3287       OP(nxt) = OPTIMIZED; /* was CLOSE. */
3288
3289 #ifdef DEBUGGING
3290       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3291       OP(nxt + 1) = OPTIMIZED; /* was count. */
3292       NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3293       NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3294 #endif
3295 #if 0
3296       while ( nxt1 && (OP(nxt1) != WHILEM)) {
3297        regnode *nnxt = regnext(nxt1);
3298
3299        if (nnxt == nxt) {
3300         if (reg_off_by_arg[OP(nxt1)])
3301          ARG_SET(nxt1, nxt2 - nxt1);
3302         else if (nxt2 - nxt1 < U16_MAX)
3303          NEXT_OFF(nxt1) = nxt2 - nxt1;
3304         else
3305          OP(nxt) = NOTHING; /* Cannot beautify */
3306        }
3307        nxt1 = nnxt;
3308       }
3309 #endif
3310       /* Optimize again: */
3311       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3312          NULL, stopparen, recursed, NULL, 0,depth+1);
3313      }
3314      else
3315       oscan->flags = 0;
3316     }
3317     else if ((OP(oscan) == CURLYX)
3318       && (flags & SCF_WHILEM_VISITED_POS)
3319       /* See the comment on a similar expression above.
3320        However, this time it not a subexpression
3321        we care about, but the expression itself. */
3322       && (maxcount == REG_INFTY)
3323       && data && ++data->whilem_c < 16) {
3324      /* This stays as CURLYX, we can put the count/of pair. */
3325      /* Find WHILEM (as in regexec.c) */
3326      regnode *nxt = oscan + NEXT_OFF(oscan);
3327
3328      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3329       nxt += ARG(nxt);
3330      PREVOPER(nxt)->flags = (U8)(data->whilem_c
3331       | (RExC_whilem_seen << 4)); /* On WHILEM */
3332     }
3333     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3334      pars++;
3335     if (flags & SCF_DO_SUBSTR) {
3336      SV *last_str = NULL;
3337      int counted = mincount != 0;
3338
3339      if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3340 #if defined(SPARC64_GCC_WORKAROUND)
3341       I32 b = 0;
3342       STRLEN l = 0;
3343       const char *s = NULL;
3344       I32 old = 0;
3345
3346       if (pos_before >= data->last_start_min)
3347        b = pos_before;
3348       else
3349        b = data->last_start_min;
3350
3351       l = 0;
3352       s = SvPV_const(data->last_found, l);
3353       old = b - data->last_start_min;
3354
3355 #else
3356       I32 b = pos_before >= data->last_start_min
3357        ? pos_before : data->last_start_min;
3358       STRLEN l;
3359       const char * const s = SvPV_const(data->last_found, l);
3360       I32 old = b - data->last_start_min;
3361 #endif
3362
3363       if (UTF)
3364        old = utf8_hop((U8*)s, old) - (U8*)s;
3365
3366       l -= old;
3367       /* Get the added string: */
3368       last_str = newSVpvn_utf8(s  + old, l, UTF);
3369       if (deltanext == 0 && pos_before == b) {
3370        /* What was added is a constant string */
3371        if (mincount > 1) {
3372         SvGROW(last_str, (mincount * l) + 1);
3373         repeatcpy(SvPVX(last_str) + l,
3374           SvPVX_const(last_str), l, mincount - 1);
3375         SvCUR_set(last_str, SvCUR(last_str) * mincount);
3376         /* Add additional parts. */
3377         SvCUR_set(data->last_found,
3378           SvCUR(data->last_found) - l);
3379         sv_catsv(data->last_found, last_str);
3380         {
3381          SV * sv = data->last_found;
3382          MAGIC *mg =
3383           SvUTF8(sv) && SvMAGICAL(sv) ?
3384           mg_find(sv, PERL_MAGIC_utf8) : NULL;
3385          if (mg && mg->mg_len >= 0)
3386           mg->mg_len += CHR_SVLEN(last_str) - l;
3387         }
3388         data->last_end += l * (mincount - 1);
3389        }
3390       } else {
3391        /* start offset must point into the last copy */
3392        data->last_start_min += minnext * (mincount - 1);
3393        data->last_start_max += is_inf ? I32_MAX
3394         : (maxcount - 1) * (minnext + data->pos_delta);
3395       }
3396      }
3397      /* It is counted once already... */
3398      data->pos_min += minnext * (mincount - counted);
3399      data->pos_delta += - counted * deltanext +
3400       (minnext + deltanext) * maxcount - minnext * mincount;
3401      if (mincount != maxcount) {
3402       /* Cannot extend fixed substrings found inside
3403        the group.  */
3404       SCAN_COMMIT(pRExC_state,data,minlenp);
3405       if (mincount && last_str) {
3406        SV * const sv = data->last_found;
3407        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3408         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3409
3410        if (mg)
3411         mg->mg_len = -1;
3412        sv_setsv(sv, last_str);
3413        data->last_end = data->pos_min;
3414        data->last_start_min =
3415         data->pos_min - CHR_SVLEN(last_str);
3416        data->last_start_max = is_inf
3417         ? I32_MAX
3418         : data->pos_min + data->pos_delta
3419         - CHR_SVLEN(last_str);
3420       }
3421       data->longest = &(data->longest_float);
3422      }
3423      SvREFCNT_dec(last_str);
3424     }
3425     if (data && (fl & SF_HAS_EVAL))
3426      data->flags |= SF_HAS_EVAL;
3427    optimize_curly_tail:
3428     if (OP(oscan) != CURLYX) {
3429      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3430       && NEXT_OFF(next))
3431       NEXT_OFF(oscan) += NEXT_OFF(next);
3432     }
3433     continue;
3434    default:   /* REF and CLUMP only? */
3435     if (flags & SCF_DO_SUBSTR) {
3436      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3437      data->longest = &(data->longest_float);
3438     }
3439     is_inf = is_inf_internal = 1;
3440     if (flags & SCF_DO_STCLASS_OR)
3441      cl_anything(pRExC_state, data->start_class);
3442     flags &= ~SCF_DO_STCLASS;
3443     break;
3444    }
3445   }
3446   else if (OP(scan) == LNBREAK) {
3447    if (flags & SCF_DO_STCLASS) {
3448     int value = 0;
3449     data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3450      if (flags & SCF_DO_STCLASS_AND) {
3451      for (value = 0; value < 256; value++)
3452       if (!is_VERTWS_cp(value))
3453        ANYOF_BITMAP_CLEAR(data->start_class, value);
3454     }
3455     else {
3456      for (value = 0; value < 256; value++)
3457       if (is_VERTWS_cp(value))
3458        ANYOF_BITMAP_SET(data->start_class, value);
3459     }
3460     if (flags & SCF_DO_STCLASS_OR)
3461      cl_and(data->start_class, and_withp);
3462     flags &= ~SCF_DO_STCLASS;
3463    }
3464    min += 1;
3465    delta += 1;
3466    if (flags & SCF_DO_SUBSTR) {
3467      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3468      data->pos_min += 1;
3469     data->pos_delta += 1;
3470     data->longest = &(data->longest_float);
3471     }
3472
3473   }
3474   else if (OP(scan) == FOLDCHAR) {
3475    int d = ARG(scan)==0xDF ? 1 : 2;
3476    flags &= ~SCF_DO_STCLASS;
3477    min += 1;
3478    delta += d;
3479    if (flags & SCF_DO_SUBSTR) {
3480     SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3481     data->pos_min += 1;
3482     data->pos_delta += d;
3483     data->longest = &(data->longest_float);
3484    }
3485   }
3486   else if (strchr((const char*)PL_simple,OP(scan))) {
3487    int value = 0;
3488
3489    if (flags & SCF_DO_SUBSTR) {
3490     SCAN_COMMIT(pRExC_state,data,minlenp);
3491     data->pos_min++;
3492    }
3493    min++;
3494    if (flags & SCF_DO_STCLASS) {
3495     data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3496
3497     /* Some of the logic below assumes that switching
3498     locale on will only add false positives. */
3499     switch (PL_regkind[OP(scan)]) {
3500     case SANY:
3501     default:
3502     do_default:
3503      /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3504      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3505       cl_anything(pRExC_state, data->start_class);
3506      break;
3507     case REG_ANY:
3508      if (OP(scan) == SANY)
3509       goto do_default;
3510      if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3511       value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3512         || (data->start_class->flags & ANYOF_CLASS));
3513       cl_anything(pRExC_state, data->start_class);
3514      }
3515      if (flags & SCF_DO_STCLASS_AND || !value)
3516       ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3517      break;
3518     case ANYOF:
3519      if (flags & SCF_DO_STCLASS_AND)
3520       cl_and(data->start_class,
3521        (struct regnode_charclass_class*)scan);
3522      else
3523       cl_or(pRExC_state, data->start_class,
3524        (struct regnode_charclass_class*)scan);
3525      break;
3526     case ALNUM:
3527      if (flags & SCF_DO_STCLASS_AND) {
3528       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3529        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3530        for (value = 0; value < 256; value++)
3531         if (!isALNUM(value))
3532          ANYOF_BITMAP_CLEAR(data->start_class, value);
3533       }
3534      }
3535      else {
3536       if (data->start_class->flags & ANYOF_LOCALE)
3537        ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3538       else {
3539        for (value = 0; value < 256; value++)
3540         if (isALNUM(value))
3541          ANYOF_BITMAP_SET(data->start_class, value);
3542       }
3543      }
3544      break;
3545     case ALNUML:
3546      if (flags & SCF_DO_STCLASS_AND) {
3547       if (data->start_class->flags & ANYOF_LOCALE)
3548        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3549      }
3550      else {
3551       ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3552       data->start_class->flags |= ANYOF_LOCALE;
3553      }
3554      break;
3555     case NALNUM:
3556      if (flags & SCF_DO_STCLASS_AND) {
3557       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3558        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3559        for (value = 0; value < 256; value++)
3560         if (isALNUM(value))
3561          ANYOF_BITMAP_CLEAR(data->start_class, value);
3562       }
3563      }
3564      else {
3565       if (data->start_class->flags & ANYOF_LOCALE)
3566        ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3567       else {
3568        for (value = 0; value < 256; value++)
3569         if (!isALNUM(value))
3570          ANYOF_BITMAP_SET(data->start_class, value);
3571       }
3572      }
3573      break;
3574     case NALNUML:
3575      if (flags & SCF_DO_STCLASS_AND) {
3576       if (data->start_class->flags & ANYOF_LOCALE)
3577        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3578      }
3579      else {
3580       data->start_class->flags |= ANYOF_LOCALE;
3581       ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3582      }
3583      break;
3584     case SPACE:
3585      if (flags & SCF_DO_STCLASS_AND) {
3586       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3587        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3588        for (value = 0; value < 256; value++)
3589         if (!isSPACE(value))
3590          ANYOF_BITMAP_CLEAR(data->start_class, value);
3591       }
3592      }
3593      else {
3594       if (data->start_class->flags & ANYOF_LOCALE)
3595        ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3596       else {
3597        for (value = 0; value < 256; value++)
3598         if (isSPACE(value))
3599          ANYOF_BITMAP_SET(data->start_class, value);
3600       }
3601      }
3602      break;
3603     case SPACEL:
3604      if (flags & SCF_DO_STCLASS_AND) {
3605       if (data->start_class->flags & ANYOF_LOCALE)
3606        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3607      }
3608      else {
3609       data->start_class->flags |= ANYOF_LOCALE;
3610       ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3611      }
3612      break;
3613     case NSPACE:
3614      if (flags & SCF_DO_STCLASS_AND) {
3615       if (!(data->start_class->flags & ANYOF_LOCALE)) {
3616        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3617        for (value = 0; value < 256; value++)
3618         if (isSPACE(value))
3619          ANYOF_BITMAP_CLEAR(data->start_class, value);
3620       }
3621      }
3622      else {
3623       if (data->start_class->flags & ANYOF_LOCALE)
3624        ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3625       else {
3626        for (value = 0; value < 256; value++)
3627         if (!isSPACE(value))
3628          ANYOF_BITMAP_SET(data->start_class, value);
3629       }
3630      }
3631      break;
3632     case NSPACEL:
3633      if (flags & SCF_DO_STCLASS_AND) {
3634       if (data->start_class->flags & ANYOF_LOCALE) {
3635        ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3636        for (value = 0; value < 256; value++)
3637         if (!isSPACE(value))
3638          ANYOF_BITMAP_CLEAR(data->start_class, value);
3639       }
3640      }
3641      else {
3642       data->start_class->flags |= ANYOF_LOCALE;
3643       ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3644      }
3645      break;
3646     case DIGIT:
3647      if (flags & SCF_DO_STCLASS_AND) {
3648       ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3649       for (value = 0; value < 256; value++)
3650        if (!isDIGIT(value))
3651         ANYOF_BITMAP_CLEAR(data->start_class, value);
3652      }
3653      else {
3654       if (data->start_class->flags & ANYOF_LOCALE)
3655        ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3656       else {
3657        for (value = 0; value < 256; value++)
3658         if (isDIGIT(value))
3659          ANYOF_BITMAP_SET(data->start_class, value);
3660       }
3661      }
3662      break;
3663     case NDIGIT:
3664      if (flags & SCF_DO_STCLASS_AND) {
3665       ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3666       for (value = 0; value < 256; value++)
3667        if (isDIGIT(value))
3668         ANYOF_BITMAP_CLEAR(data->start_class, value);
3669      }
3670      else {
3671       if (data->start_class->flags & ANYOF_LOCALE)
3672        ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3673       else {
3674        for (value = 0; value < 256; value++)
3675         if (!isDIGIT(value))
3676          ANYOF_BITMAP_SET(data->start_class, value);
3677       }
3678      }
3679      break;
3680     CASE_SYNST_FNC(VERTWS);
3681     CASE_SYNST_FNC(HORIZWS);
3682
3683     }
3684     if (flags & SCF_DO_STCLASS_OR)
3685      cl_and(data->start_class, and_withp);
3686     flags &= ~SCF_DO_STCLASS;
3687    }
3688   }
3689   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3690    data->flags |= (OP(scan) == MEOL
3691        ? SF_BEFORE_MEOL
3692        : SF_BEFORE_SEOL);
3693   }
3694   else if (  PL_regkind[OP(scan)] == BRANCHJ
3695     /* Lookbehind, or need to calculate parens/evals/stclass: */
3696     && (scan->flags || data || (flags & SCF_DO_STCLASS))
3697     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3698    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3699     || OP(scan) == UNLESSM )
3700    {
3701     /* Negative Lookahead/lookbehind
3702     In this case we can't do fixed string optimisation.
3703     */
3704
3705     I32 deltanext, minnext, fake = 0;
3706     regnode *nscan;
3707     struct regnode_charclass_class intrnl;
3708     int f = 0;
3709
3710     data_fake.flags = 0;
3711     if (data) {
3712      data_fake.whilem_c = data->whilem_c;
3713      data_fake.last_closep = data->last_closep;
3714     }
3715     else
3716      data_fake.last_closep = &fake;
3717     data_fake.pos_delta = delta;
3718     if ( flags & SCF_DO_STCLASS && !scan->flags
3719      && OP(scan) == IFMATCH ) { /* Lookahead */
3720      cl_init(pRExC_state, &intrnl);
3721      data_fake.start_class = &intrnl;
3722      f |= SCF_DO_STCLASS_AND;
3723     }
3724     if (flags & SCF_WHILEM_VISITED_POS)
3725      f |= SCF_WHILEM_VISITED_POS;
3726     next = regnext(scan);
3727     nscan = NEXTOPER(NEXTOPER(scan));
3728     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3729      last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3730     if (scan->flags) {
3731      if (deltanext) {
3732       FAIL("Variable length lookbehind not implemented");
3733      }
3734      else if (minnext > (I32)U8_MAX) {
3735       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3736      }
3737      scan->flags = (U8)minnext;
3738     }
3739     if (data) {
3740      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3741       pars++;
3742      if (data_fake.flags & SF_HAS_EVAL)
3743       data->flags |= SF_HAS_EVAL;
3744      data->whilem_c = data_fake.whilem_c;
3745     }
3746     if (f & SCF_DO_STCLASS_AND) {
3747      if (flags & SCF_DO_STCLASS_OR) {
3748       /* OR before, AND after: ideally we would recurse with
3749       * data_fake to get the AND applied by study of the
3750       * remainder of the pattern, and then derecurse;
3751       * *** HACK *** for now just treat as "no information".
3752       * See [perl #56690].
3753       */
3754       cl_init(pRExC_state, data->start_class);
3755      }  else {
3756       /* AND before and after: combine and continue */
3757       const int was = (data->start_class->flags & ANYOF_EOS);
3758
3759       cl_and(data->start_class, &intrnl);
3760       if (was)
3761        data->start_class->flags |= ANYOF_EOS;
3762      }
3763     }
3764    }
3765 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3766    else {
3767     /* Positive Lookahead/lookbehind
3768     In this case we can do fixed string optimisation,
3769     but we must be careful about it. Note in the case of
3770     lookbehind the positions will be offset by the minimum
3771     length of the pattern, something we won't know about
3772     until after the recurse.
3773     */
3774     I32 deltanext, fake = 0;
3775     regnode *nscan;
3776     struct regnode_charclass_class intrnl;
3777     int f = 0;
3778     /* We use SAVEFREEPV so that when the full compile
3779      is finished perl will clean up the allocated
3780      minlens when its all done. This was we don't
3781      have to worry about freeing them when we know
3782      they wont be used, which would be a pain.
3783     */
3784     I32 *minnextp;
3785     Newx( minnextp, 1, I32 );
3786     SAVEFREEPV(minnextp);
3787
3788     if (data) {
3789      StructCopy(data, &data_fake, scan_data_t);
3790      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3791       f |= SCF_DO_SUBSTR;
3792       if (scan->flags)
3793        SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3794       data_fake.last_found=newSVsv(data->last_found);
3795      }
3796     }
3797     else
3798      data_fake.last_closep = &fake;
3799     data_fake.flags = 0;
3800     data_fake.pos_delta = delta;
3801     if (is_inf)
3802      data_fake.flags |= SF_IS_INF;
3803     if ( flags & SCF_DO_STCLASS && !scan->flags
3804      && OP(scan) == IFMATCH ) { /* Lookahead */
3805      cl_init(pRExC_state, &intrnl);
3806      data_fake.start_class = &intrnl;
3807      f |= SCF_DO_STCLASS_AND;
3808     }
3809     if (flags & SCF_WHILEM_VISITED_POS)
3810      f |= SCF_WHILEM_VISITED_POS;
3811     next = regnext(scan);
3812     nscan = NEXTOPER(NEXTOPER(scan));
3813
3814     *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3815      last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3816     if (scan->flags) {
3817      if (deltanext) {
3818       FAIL("Variable length lookbehind not implemented");
3819      }
3820      else if (*minnextp > (I32)U8_MAX) {
3821       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3822      }
3823      scan->flags = (U8)*minnextp;
3824     }
3825
3826     *minnextp += min;
3827
3828     if (f & SCF_DO_STCLASS_AND) {
3829      const int was = (data->start_class->flags & ANYOF_EOS);
3830
3831      cl_and(data->start_class, &intrnl);
3832      if (was)
3833       data->start_class->flags |= ANYOF_EOS;
3834     }
3835     if (data) {
3836      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3837       pars++;
3838      if (data_fake.flags & SF_HAS_EVAL)
3839       data->flags |= SF_HAS_EVAL;
3840      data->whilem_c = data_fake.whilem_c;
3841      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3842       if (RExC_rx->minlen<*minnextp)
3843        RExC_rx->minlen=*minnextp;
3844       SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3845       SvREFCNT_dec(data_fake.last_found);
3846
3847       if ( data_fake.minlen_fixed != minlenp )
3848       {
3849        data->offset_fixed= data_fake.offset_fixed;
3850        data->minlen_fixed= data_fake.minlen_fixed;
3851        data->lookbehind_fixed+= scan->flags;
3852       }
3853       if ( data_fake.minlen_float != minlenp )
3854       {
3855        data->minlen_float= data_fake.minlen_float;
3856        data->offset_float_min=data_fake.offset_float_min;
3857        data->offset_float_max=data_fake.offset_float_max;
3858        data->lookbehind_float+= scan->flags;
3859       }
3860      }
3861     }
3862
3863
3864    }
3865 #endif
3866   }
3867   else if (OP(scan) == OPEN) {
3868    if (stopparen != (I32)ARG(scan))
3869     pars++;
3870   }
3871   else if (OP(scan) == CLOSE) {
3872    if (stopparen == (I32)ARG(scan)) {
3873     break;
3874    }
3875    if ((I32)ARG(scan) == is_par) {
3876     next = regnext(scan);
3877
3878     if ( next && (OP(next) != WHILEM) && next < last)
3879      is_par = 0;  /* Disable optimization */
3880    }
3881    if (data)
3882     *(data->last_closep) = ARG(scan);
3883   }
3884   else if (OP(scan) == EVAL) {
3885     if (data)
3886      data->flags |= SF_HAS_EVAL;
3887   }
3888   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3889    if (flags & SCF_DO_SUBSTR) {
3890     SCAN_COMMIT(pRExC_state,data,minlenp);
3891     flags &= ~SCF_DO_SUBSTR;
3892    }
3893    if (data && OP(scan)==ACCEPT) {
3894     data->flags |= SCF_SEEN_ACCEPT;
3895     if (stopmin > min)
3896      stopmin = min;
3897    }
3898   }
3899   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3900   {
3901     if (flags & SCF_DO_SUBSTR) {
3902      SCAN_COMMIT(pRExC_state,data,minlenp);
3903      data->longest = &(data->longest_float);
3904     }
3905     is_inf = is_inf_internal = 1;
3906     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3907      cl_anything(pRExC_state, data->start_class);
3908     flags &= ~SCF_DO_STCLASS;
3909   }
3910   else if (OP(scan) == GPOS) {
3911    if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3912     !(delta || is_inf || (data && data->pos_delta)))
3913    {
3914     if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3915      RExC_rx->extflags |= RXf_ANCH_GPOS;
3916     if (RExC_rx->gofs < (U32)min)
3917      RExC_rx->gofs = min;
3918    } else {
3919     RExC_rx->extflags |= RXf_GPOS_FLOAT;
3920     RExC_rx->gofs = 0;
3921    }
3922   }
3923 #ifdef TRIE_STUDY_OPT
3924 #ifdef FULL_TRIE_STUDY
3925   else if (PL_regkind[OP(scan)] == TRIE) {
3926    /* NOTE - There is similar code to this block above for handling
3927    BRANCH nodes on the initial study.  If you change stuff here
3928    check there too. */
3929    regnode *trie_node= scan;
3930    regnode *tail= regnext(scan);
3931    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3932    I32 max1 = 0, min1 = I32_MAX;
3933    struct regnode_charclass_class accum;
3934
3935    if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3936     SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3937    if (flags & SCF_DO_STCLASS)
3938     cl_init_zero(pRExC_state, &accum);
3939
3940    if (!trie->jump) {
3941     min1= trie->minlen;
3942     max1= trie->maxlen;
3943    } else {
3944     const regnode *nextbranch= NULL;
3945     U32 word;
3946
3947     for ( word=1 ; word <= trie->wordcount ; word++)
3948     {
3949      I32 deltanext=0, minnext=0, f = 0, fake;
3950      struct regnode_charclass_class this_class;
3951
3952      data_fake.flags = 0;
3953      if (data) {
3954       data_fake.whilem_c = data->whilem_c;
3955       data_fake.last_closep = data->last_closep;
3956      }
3957      else
3958       data_fake.last_closep = &fake;
3959      data_fake.pos_delta = delta;
3960      if (flags & SCF_DO_STCLASS) {
3961       cl_init(pRExC_state, &this_class);
3962       data_fake.start_class = &this_class;
3963       f = SCF_DO_STCLASS_AND;
3964      }
3965      if (flags & SCF_WHILEM_VISITED_POS)
3966       f |= SCF_WHILEM_VISITED_POS;
3967
3968      if (trie->jump[word]) {
3969       if (!nextbranch)
3970        nextbranch = trie_node + trie->jump[0];
3971       scan= trie_node + trie->jump[word];
3972       /* We go from the jump point to the branch that follows
3973       it. Note this means we need the vestigal unused branches
3974       even though they arent otherwise used.
3975       */
3976       minnext = study_chunk(pRExC_state, &scan, minlenp,
3977        &deltanext, (regnode *)nextbranch, &data_fake,
3978        stopparen, recursed, NULL, f,depth+1);
3979      }
3980      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3981       nextbranch= regnext((regnode*)nextbranch);
3982
3983      if (min1 > (I32)(minnext + trie->minlen))
3984       min1 = minnext + trie->minlen;
3985      if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3986       max1 = minnext + deltanext + trie->maxlen;
3987      if (deltanext == I32_MAX)
3988       is_inf = is_inf_internal = 1;
3989
3990      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3991       pars++;
3992      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3993       if ( stopmin > min + min1)
3994        stopmin = min + min1;
3995       flags &= ~SCF_DO_SUBSTR;
3996       if (data)
3997        data->flags |= SCF_SEEN_ACCEPT;
3998      }
3999      if (data) {
4000       if (data_fake.flags & SF_HAS_EVAL)
4001        data->flags |= SF_HAS_EVAL;
4002       data->whilem_c = data_fake.whilem_c;
4003      }
4004      if (flags & SCF_DO_STCLASS)
4005       cl_or(pRExC_state, &accum, &this_class);
4006     }
4007    }
4008    if (flags & SCF_DO_SUBSTR) {
4009     data->pos_min += min1;
4010     data->pos_delta += max1 - min1;
4011     if (max1 != min1 || is_inf)
4012      data->longest = &(data->longest_float);
4013    }
4014    min += min1;
4015    delta += max1 - min1;
4016    if (flags & SCF_DO_STCLASS_OR) {
4017     cl_or(pRExC_state, data->start_class, &accum);
4018     if (min1) {
4019      cl_and(data->start_class, and_withp);
4020      flags &= ~SCF_DO_STCLASS;
4021     }
4022    }
4023    else if (flags & SCF_DO_STCLASS_AND) {
4024     if (min1) {
4025      cl_and(data->start_class, &accum);
4026      flags &= ~SCF_DO_STCLASS;
4027     }
4028     else {
4029      /* Switch to OR mode: cache the old value of
4030      * data->start_class */
4031      INIT_AND_WITHP;
4032      StructCopy(data->start_class, and_withp,
4033        struct regnode_charclass_class);
4034      flags &= ~SCF_DO_STCLASS_AND;
4035      StructCopy(&accum, data->start_class,
4036        struct regnode_charclass_class);
4037      flags |= SCF_DO_STCLASS_OR;
4038      data->start_class->flags |= ANYOF_EOS;
4039     }
4040    }
4041    scan= tail;
4042    continue;
4043   }
4044 #else
4045   else if (PL_regkind[OP(scan)] == TRIE) {
4046    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4047    U8*bang=NULL;
4048
4049    min += trie->minlen;
4050    delta += (trie->maxlen - trie->minlen);
4051    flags &= ~SCF_DO_STCLASS; /* xxx */
4052    if (flags & SCF_DO_SUBSTR) {
4053      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4054      data->pos_min += trie->minlen;
4055      data->pos_delta += (trie->maxlen - trie->minlen);
4056     if (trie->maxlen != trie->minlen)
4057      data->longest = &(data->longest_float);
4058     }
4059     if (trie->jump) /* no more substrings -- for now /grr*/
4060      flags &= ~SCF_DO_SUBSTR;
4061   }
4062 #endif /* old or new */
4063 #endif /* TRIE_STUDY_OPT */
4064
4065   /* Else: zero-length, ignore. */
4066   scan = regnext(scan);
4067  }
4068  if (frame) {
4069   last = frame->last;
4070   scan = frame->next;
4071   stopparen = frame->stop;
4072   frame = frame->prev;
4073   goto fake_study_recurse;
4074  }
4075
4076   finish:
4077  assert(!frame);
4078  DEBUG_STUDYDATA("pre-fin:",data,depth);
4079
4080  *scanp = scan;
4081  *deltap = is_inf_internal ? I32_MAX : delta;
4082  if (flags & SCF_DO_SUBSTR && is_inf)
4083   data->pos_delta = I32_MAX - data->pos_min;
4084  if (is_par > (I32)U8_MAX)
4085   is_par = 0;
4086  if (is_par && pars==1 && data) {
4087   data->flags |= SF_IN_PAR;
4088   data->flags &= ~SF_HAS_PAR;
4089  }
4090  else if (pars && data) {
4091   data->flags |= SF_HAS_PAR;
4092   data->flags &= ~SF_IN_PAR;
4093  }
4094  if (flags & SCF_DO_STCLASS_OR)
4095   cl_and(data->start_class, and_withp);
4096  if (flags & SCF_TRIE_RESTUDY)
4097   data->flags |=  SCF_TRIE_RESTUDY;
4098
4099  DEBUG_STUDYDATA("post-fin:",data,depth);
4100
4101  return min < stopmin ? min : stopmin;
4102 }
4103
4104 STATIC U32
4105 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4106 {
4107  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4108
4109  PERL_ARGS_ASSERT_ADD_DATA;
4110
4111  Renewc(RExC_rxi->data,
4112   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4113   char, struct reg_data);
4114  if(count)
4115   Renew(RExC_rxi->data->what, count + n, U8);
4116  else
4117   Newx(RExC_rxi->data->what, n, U8);
4118  RExC_rxi->data->count = count + n;
4119  Copy(s, RExC_rxi->data->what + count, n, U8);
4120  return count;
4121 }
4122
4123 /*XXX: todo make this not included in a non debugging perl */
4124 #ifndef PERL_IN_XSUB_RE
4125 void
4126 Perl_reginitcolors(pTHX)
4127 {
4128  dVAR;
4129  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4130  if (s) {
4131   char *t = savepv(s);
4132   int i = 0;
4133   PL_colors[0] = t;
4134   while (++i < 6) {
4135    t = strchr(t, '\t');
4136    if (t) {
4137     *t = '\0';
4138     PL_colors[i] = ++t;
4139    }
4140    else
4141     PL_colors[i] = t = (char *)"";
4142   }
4143  } else {
4144   int i = 0;
4145   while (i < 6)
4146    PL_colors[i++] = (char *)"";
4147  }
4148  PL_colorset = 1;
4149 }
4150 #endif
4151
4152
4153 #ifdef TRIE_STUDY_OPT
4154 #define CHECK_RESTUDY_GOTO                                  \
4155   if (                                                \
4156    (data.flags & SCF_TRIE_RESTUDY)               \
4157    && ! restudied++                              \
4158   )     goto reStudy
4159 #else
4160 #define CHECK_RESTUDY_GOTO
4161 #endif
4162
4163 /*
4164  - pregcomp - compile a regular expression into internal code
4165  *
4166  * We can't allocate space until we know how big the compiled form will be,
4167  * but we can't compile it (and thus know how big it is) until we've got a
4168  * place to put the code.  So we cheat:  we compile it twice, once with code
4169  * generation turned off and size counting turned on, and once "for real".
4170  * This also means that we don't allocate space until we are sure that the
4171  * thing really will compile successfully, and we never have to move the
4172  * code and thus invalidate pointers into it.  (Note that it has to be in
4173  * one piece because free() must be able to free it all.) [NB: not true in perl]
4174  *
4175  * Beware that the optimization-preparation code in here knows about some
4176  * of the structure of the compiled regexp.  [I'll say.]
4177  */
4178
4179
4180
4181 #ifndef PERL_IN_XSUB_RE
4182 #define RE_ENGINE_PTR &reh_regexp_engine
4183 #else
4184 extern const struct regexp_engine my_reg_engine;
4185 #define RE_ENGINE_PTR &my_reg_engine
4186 #endif
4187
4188 #ifndef PERL_IN_XSUB_RE
4189 REGEXP *
4190 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4191 {
4192  dVAR;
4193  HV * const table = GvHV(PL_hintgv);
4194
4195  PERL_ARGS_ASSERT_PREGCOMP;
4196
4197  /* Dispatch a request to compile a regexp to correct
4198  regexp engine. */
4199  if (table) {
4200   SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4201   GET_RE_DEBUG_FLAGS_DECL;
4202   if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4203    const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4204    DEBUG_COMPILE_r({
4205     PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4206      SvIV(*ptr));
4207    });
4208    return CALLREGCOMP_ENG(eng, pattern, flags);
4209   }
4210  }
4211  return Perl_re_compile(aTHX_ pattern, flags);
4212 }
4213 #endif
4214
4215 REGEXP *
4216 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4217 {
4218  dVAR;
4219  REGEXP *rx;
4220  struct regexp *r;
4221  register regexp_internal *ri;
4222  STRLEN plen;
4223  char  *exp = SvPV(pattern, plen);
4224  char* xend = exp + plen;
4225  regnode *scan;
4226  I32 flags;
4227  I32 minlen = 0;
4228  I32 sawplus = 0;
4229  I32 sawopen = 0;
4230  scan_data_t data;
4231  RExC_state_t RExC_state;
4232  RExC_state_t * const pRExC_state = &RExC_state;
4233 #ifdef TRIE_STUDY_OPT
4234  int restudied= 0;
4235  RExC_state_t copyRExC_state;
4236 #endif
4237  GET_RE_DEBUG_FLAGS_DECL;
4238
4239  PERL_ARGS_ASSERT_RE_COMPILE;
4240
4241  DEBUG_r(if (!PL_colorset) reginitcolors());
4242
4243  RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4244
4245  DEBUG_COMPILE_r({
4246   SV *dsv= sv_newmortal();
4247   RE_PV_QUOTED_DECL(s, RExC_utf8,
4248    dsv, exp, plen, 60);
4249   PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4250      PL_colors[4],PL_colors[5],s);
4251  });
4252
4253 redo_first_pass:
4254  RExC_precomp = exp;
4255  RExC_flags = pm_flags;
4256  RExC_sawback = 0;
4257
4258  RExC_seen = 0;
4259  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4260  RExC_seen_evals = 0;
4261  RExC_extralen = 0;
4262
4263  /* First pass: determine size, legality. */
4264  RExC_parse = exp;
4265  RExC_start = exp;
4266  RExC_end = xend;
4267  RExC_naughty = 0;
4268  RExC_npar = 1;
4269  RExC_nestroot = 0;
4270  RExC_size = 0L;
4271  RExC_emit = &PL_regdummy;
4272  RExC_whilem_seen = 0;
4273  RExC_open_parens = NULL;
4274  RExC_close_parens = NULL;
4275  RExC_opend = NULL;
4276  RExC_paren_names = NULL;
4277 #ifdef DEBUGGING
4278  RExC_paren_name_list = NULL;
4279 #endif
4280  RExC_recurse = NULL;
4281  RExC_recurse_count = 0;
4282
4283 #if 0 /* REGC() is (currently) a NOP at the first pass.
4284  * Clever compilers notice this and complain. --jhi */
4285  REGC((U8)REG_MAGIC, (char*)RExC_emit);
4286 #endif
4287  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4288  if (reg(pRExC_state, 0, &flags,1) == NULL) {
4289   RExC_precomp = NULL;
4290   return(NULL);
4291  }
4292  if (RExC_utf8 && !RExC_orig_utf8) {
4293   /* It's possible to write a regexp in ascii that represents Unicode
4294   codepoints outside of the byte range, such as via \x{100}. If we
4295   detect such a sequence we have to convert the entire pattern to utf8
4296   and then recompile, as our sizing calculation will have been based
4297   on 1 byte == 1 character, but we will need to use utf8 to encode
4298   at least some part of the pattern, and therefore must convert the whole
4299   thing.
4300   XXX: somehow figure out how to make this less expensive...
4301   -- dmq */
4302   STRLEN len = plen;
4303   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4304    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4305   exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4306   xend = exp + len;
4307   RExC_orig_utf8 = RExC_utf8;
4308   SAVEFREEPV(exp);
4309   goto redo_first_pass;
4310  }
4311  DEBUG_PARSE_r({
4312   PerlIO_printf(Perl_debug_log,
4313    "Required size %"IVdf" nodes\n"
4314    "Starting second pass (creation)\n",
4315    (IV)RExC_size);
4316   RExC_lastnum=0;
4317   RExC_lastparse=NULL;
4318  });
4319  /* Small enough for pointer-storage convention?
4320  If extralen==0, this means that we will not need long jumps. */
4321  if (RExC_size >= 0x10000L && RExC_extralen)
4322   RExC_size += RExC_extralen;
4323  else
4324   RExC_extralen = 0;
4325  if (RExC_whilem_seen > 15)
4326   RExC_whilem_seen = 15;
4327
4328  /* Allocate space and zero-initialize. Note, the two step process
4329  of zeroing when in debug mode, thus anything assigned has to
4330  happen after that */
4331  rx = (REGEXP*) newSV_type(SVt_REGEXP);
4332  r = (struct regexp*)SvANY(rx);
4333  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4334   char, regexp_internal);
4335  if ( r == NULL || ri == NULL )
4336   FAIL("Regexp out of space");
4337 #ifdef DEBUGGING
4338  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4339  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4340 #else
4341  /* bulk initialize base fields with 0. */
4342  Zero(ri, sizeof(regexp_internal), char);
4343 #endif
4344
4345  /* non-zero initialization begins here */
4346  RXi_SET( r, ri );
4347  r->engine= RE_ENGINE_PTR;
4348  r->extflags = pm_flags;
4349  {
4350   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4351   bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4352   bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4353   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4354        >> RXf_PMf_STD_PMMOD_SHIFT);
4355   const char *fptr = STD_PAT_MODS;        /*"msix"*/
4356   char *p;
4357   const STRLEN wraplen = plen + has_minus + has_p + has_runon
4358    + (sizeof(STD_PAT_MODS) - 1)
4359    + (sizeof("(?:)") - 1);
4360
4361   p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4362   SvCUR_set(rx, wraplen);
4363   SvPOK_on(rx);
4364   SvFLAGS(rx) |= SvUTF8(pattern);
4365   *p++='('; *p++='?';
4366   if (has_p)
4367    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4368   {
4369    char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4370    char *colon = r + 1;
4371    char ch;
4372
4373    while((ch = *fptr++)) {
4374     if(reganch & 1)
4375      *p++ = ch;
4376     else
4377      *r-- = ch;
4378     reganch >>= 1;
4379    }
4380    if(has_minus) {
4381     *r = '-';
4382     p = colon;
4383    }
4384   }
4385
4386   *p++ = ':';
4387   Copy(RExC_precomp, p, plen, char);
4388   assert ((RX_WRAPPED(rx) - p) < 16);
4389   r->pre_prefix = p - RX_WRAPPED(rx);
4390   p += plen;
4391   if (has_runon)
4392    *p++ = '\n';
4393   *p++ = ')';
4394   *p = 0;
4395  }
4396
4397  r->intflags = 0;
4398  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4399
4400  if (RExC_seen & REG_SEEN_RECURSE) {
4401   Newxz(RExC_open_parens, RExC_npar,regnode *);
4402   SAVEFREEPV(RExC_open_parens);
4403   Newxz(RExC_close_parens,RExC_npar,regnode *);
4404   SAVEFREEPV(RExC_close_parens);
4405  }
4406
4407  /* Useful during FAIL. */
4408 #ifdef RE_TRACK_PATTERN_OFFSETS
4409  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4410  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4411       "%s %"UVuf" bytes for offset annotations.\n",
4412       ri->u.offsets ? "Got" : "Couldn't get",
4413       (UV)((2*RExC_size+1) * sizeof(U32))));
4414 #endif
4415  SetProgLen(ri,RExC_size);
4416  RExC_rx_sv = rx;
4417  RExC_rx = r;
4418  RExC_rxi = ri;
4419  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4420
4421  /* Second pass: emit code. */
4422  RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4423  RExC_parse = exp;
4424  RExC_end = xend;
4425  RExC_naughty = 0;
4426  RExC_npar = 1;
4427  RExC_emit_start = ri->program;
4428  RExC_emit = ri->program;
4429  RExC_emit_bound = ri->program + RExC_size + 1;
4430
4431  /* Store the count of eval-groups for security checks: */
4432  RExC_rx->seen_evals = RExC_seen_evals;
4433  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4434  if (reg(pRExC_state, 0, &flags,1) == NULL) {
4435   ReREFCNT_dec(rx);
4436   return(NULL);
4437  }
4438  /* XXXX To minimize changes to RE engine we always allocate
4439  3-units-long substrs field. */
4440  Newx(r->substrs, 1, struct reg_substr_data);
4441  if (RExC_recurse_count) {
4442   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4443   SAVEFREEPV(RExC_recurse);
4444  }
4445
4446 reStudy:
4447  r->minlen = minlen = sawplus = sawopen = 0;
4448  Zero(r->substrs, 1, struct reg_substr_data);
4449
4450 #ifdef TRIE_STUDY_OPT
4451  if (!restudied) {
4452   StructCopy(&zero_scan_data, &data, scan_data_t);
4453   copyRExC_state = RExC_state;
4454  } else {
4455   U32 seen=RExC_seen;
4456   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4457
4458   RExC_state = copyRExC_state;
4459   if (seen & REG_TOP_LEVEL_BRANCHES)
4460    RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4461   else
4462    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4463   if (data.last_found) {
4464    SvREFCNT_dec(data.longest_fixed);
4465    SvREFCNT_dec(data.longest_float);
4466    SvREFCNT_dec(data.last_found);
4467   }
4468   StructCopy(&zero_scan_data, &data, scan_data_t);
4469  }
4470 #else
4471  StructCopy(&zero_scan_data, &data, scan_data_t);
4472 #endif
4473
4474  /* Dig out information for optimizations. */
4475  r->extflags = RExC_flags; /* was pm_op */
4476  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4477
4478  if (UTF)
4479   SvUTF8_on(rx); /* Unicode in it? */
4480  ri->regstclass = NULL;
4481  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4482   r->intflags |= PREGf_NAUGHTY;
4483  scan = ri->program + 1;  /* First BRANCH. */
4484
4485  /* testing for BRANCH here tells us whether there is "must appear"
4486  data in the pattern. If there is then we can use it for optimisations */
4487  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4488   I32 fake;
4489   STRLEN longest_float_length, longest_fixed_length;
4490   struct regnode_charclass_class ch_class; /* pointed to by data */
4491   int stclass_flag;
4492   I32 last_close = 0; /* pointed to by data */
4493   regnode *first= scan;
4494   regnode *first_next= regnext(first);
4495
4496   /*
4497   * Skip introductions and multiplicators >= 1
4498   * so that we can extract the 'meat' of the pattern that must
4499   * match in the large if() sequence following.
4500   * NOTE that EXACT is NOT covered here, as it is normally
4501   * picked up by the optimiser separately.
4502   *
4503   * This is unfortunate as the optimiser isnt handling lookahead
4504   * properly currently.
4505   *
4506   */
4507   while ((OP(first) == OPEN && (sawopen = 1)) ||
4508    /* An OR of *one* alternative - should not happen now. */
4509    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4510    /* for now we can't handle lookbehind IFMATCH*/
4511    (OP(first) == IFMATCH && !first->flags) ||
4512    (OP(first) == PLUS) ||
4513    (OP(first) == MINMOD) ||
4514    /* An {n,m} with n>0 */
4515    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4516    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4517   {
4518     /*
4519     * the only op that could be a regnode is PLUS, all the rest
4520     * will be regnode_1 or regnode_2.
4521     *
4522     */
4523     if (OP(first) == PLUS)
4524      sawplus = 1;
4525     else
4526      first += regarglen[OP(first)];
4527
4528     first = NEXTOPER(first);
4529     first_next= regnext(first);
4530   }
4531
4532   /* Starting-point info. */
4533  again:
4534   DEBUG_PEEP("first:",first,0);
4535   /* Ignore EXACT as we deal with it later. */
4536   if (PL_regkind[OP(first)] == EXACT) {
4537    if (OP(first) == EXACT)
4538     NOOP; /* Empty, get anchored substr later. */
4539    else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4540     ri->regstclass = first;
4541   }
4542 #ifdef TRIE_STCLASS
4543   else if (PL_regkind[OP(first)] == TRIE &&
4544     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4545   {
4546    regnode *trie_op;
4547    /* this can happen only on restudy */
4548    if ( OP(first) == TRIE ) {
4549     struct regnode_1 *trieop = (struct regnode_1 *)
4550      PerlMemShared_calloc(1, sizeof(struct regnode_1));
4551     StructCopy(first,trieop,struct regnode_1);
4552     trie_op=(regnode *)trieop;
4553    } else {
4554     struct regnode_charclass *trieop = (struct regnode_charclass *)
4555      PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4556     StructCopy(first,trieop,struct regnode_charclass);
4557     trie_op=(regnode *)trieop;
4558    }
4559    OP(trie_op)+=2;
4560    make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4561    ri->regstclass = trie_op;
4562   }
4563 #endif
4564   else if (strchr((const char*)PL_simple,OP(first)))
4565    ri->regstclass = first;
4566   else if (PL_regkind[OP(first)] == BOUND ||
4567     PL_regkind[OP(first)] == NBOUND)
4568    ri->regstclass = first;
4569   else if (PL_regkind[OP(first)] == BOL) {
4570    r->extflags |= (OP(first) == MBOL
4571       ? RXf_ANCH_MBOL
4572       : (OP(first) == SBOL
4573        ? RXf_ANCH_SBOL
4574        : RXf_ANCH_BOL));
4575    first = NEXTOPER(first);
4576    goto again;
4577   }
4578   else if (OP(first) == GPOS) {
4579    r->extflags |= RXf_ANCH_GPOS;
4580    first = NEXTOPER(first);
4581    goto again;
4582   }
4583   else if ((!sawopen || !RExC_sawback) &&
4584    (OP(first) == STAR &&
4585    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4586    !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4587   {
4588    /* turn .* into ^.* with an implied $*=1 */
4589    const int type =
4590     (OP(NEXTOPER(first)) == REG_ANY)
4591      ? RXf_ANCH_MBOL
4592      : RXf_ANCH_SBOL;
4593    r->extflags |= type;
4594    r->intflags |= PREGf_IMPLICIT;
4595    first = NEXTOPER(first);
4596    goto again;
4597   }
4598   if (sawplus && (!sawopen || !RExC_sawback)
4599    && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4600    /* x+ must match at the 1st pos of run of x's */
4601    r->intflags |= PREGf_SKIP;
4602
4603   /* Scan is after the zeroth branch, first is atomic matcher. */
4604 #ifdef TRIE_STUDY_OPT
4605   DEBUG_PARSE_r(
4606    if (!restudied)
4607     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4608        (IV)(first - scan + 1))
4609   );
4610 #else
4611   DEBUG_PARSE_r(
4612    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4613     (IV)(first - scan + 1))
4614   );
4615 #endif
4616
4617
4618   /*
4619   * If there's something expensive in the r.e., find the
4620   * longest literal string that must appear and make it the
4621   * regmust.  Resolve ties in favor of later strings, since
4622   * the regstart check works with the beginning of the r.e.
4623   * and avoiding duplication strengthens checking.  Not a
4624   * strong reason, but sufficient in the absence of others.
4625   * [Now we resolve ties in favor of the earlier string if
4626   * it happens that c_offset_min has been invalidated, since the
4627   * earlier string may buy us something the later one won't.]
4628   */
4629
4630   data.longest_fixed = newSVpvs("");
4631   data.longest_float = newSVpvs("");
4632   data.last_found = newSVpvs("");
4633   data.longest = &(data.longest_fixed);
4634   first = scan;
4635   if (!ri->regstclass) {
4636    cl_init(pRExC_state, &ch_class);
4637    data.start_class = &ch_class;
4638    stclass_flag = SCF_DO_STCLASS_AND;
4639   } else    /* XXXX Check for BOUND? */
4640    stclass_flag = 0;
4641   data.last_closep = &last_close;
4642
4643   minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4644    &data, -1, NULL, NULL,
4645    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4646
4647
4648   CHECK_RESTUDY_GOTO;
4649
4650
4651   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4652    && data.last_start_min == 0 && data.last_end > 0
4653    && !RExC_seen_zerolen
4654    && !(RExC_seen & REG_SEEN_VERBARG)
4655    && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4656    r->extflags |= RXf_CHECK_ALL;
4657   scan_commit(pRExC_state, &data,&minlen,0);
4658   SvREFCNT_dec(data.last_found);
4659
4660   /* Note that code very similar to this but for anchored string
4661   follows immediately below, changes may need to be made to both.
4662   Be careful.
4663   */
4664   longest_float_length = CHR_SVLEN(data.longest_float);
4665   if (longest_float_length
4666    || (data.flags & SF_FL_BEFORE_EOL
4667     && (!(data.flags & SF_FL_BEFORE_MEOL)
4668      || (RExC_flags & RXf_PMf_MULTILINE))))
4669   {
4670    I32 t,ml;
4671
4672    if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4673     && data.offset_fixed == data.offset_float_min
4674     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4675      goto remove_float;  /* As in (a)+. */
4676
4677    /* copy the information about the longest float from the reg_scan_data
4678    over to the program. */
4679    if (SvUTF8(data.longest_float)) {
4680     r->float_utf8 = data.longest_float;
4681     r->float_substr = NULL;
4682    } else {
4683     r->float_substr = data.longest_float;
4684     r->float_utf8 = NULL;
4685    }
4686    /* float_end_shift is how many chars that must be matched that
4687    follow this item. We calculate it ahead of time as once the
4688    lookbehind offset is added in we lose the ability to correctly
4689    calculate it.*/
4690    ml = data.minlen_float ? *(data.minlen_float)
4691         : (I32)longest_float_length;
4692    r->float_end_shift = ml - data.offset_float_min
4693     - longest_float_length + (SvTAIL(data.longest_float) != 0)
4694     + data.lookbehind_float;
4695    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4696    r->float_max_offset = data.offset_float_max;
4697    if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4698     r->float_max_offset -= data.lookbehind_float;
4699
4700    t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4701      && (!(data.flags & SF_FL_BEFORE_MEOL)
4702       || (RExC_flags & RXf_PMf_MULTILINE)));
4703    fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4704   }
4705   else {
4706   remove_float:
4707    r->float_substr = r->float_utf8 = NULL;
4708    SvREFCNT_dec(data.longest_float);
4709    longest_float_length = 0;
4710   }
4711
4712   /* Note that code very similar to this but for floating string
4713   is immediately above, changes may need to be made to both.
4714   Be careful.
4715   */
4716   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4717   if (longest_fixed_length
4718    || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4719     && (!(data.flags & SF_FIX_BEFORE_MEOL)
4720      || (RExC_flags & RXf_PMf_MULTILINE))))
4721   {
4722    I32 t,ml;
4723
4724    /* copy the information about the longest fixed
4725    from the reg_scan_data over to the program. */
4726    if (SvUTF8(data.longest_fixed)) {
4727     r->anchored_utf8 = data.longest_fixed;
4728     r->anchored_substr = NULL;
4729    } else {
4730     r->anchored_substr = data.longest_fixed;
4731     r->anchored_utf8 = NULL;
4732    }
4733    /* fixed_end_shift is how many chars that must be matched that
4734    follow this item. We calculate it ahead of time as once the
4735    lookbehind offset is added in we lose the ability to correctly
4736    calculate it.*/
4737    ml = data.minlen_fixed ? *(data.minlen_fixed)
4738         : (I32)longest_fixed_length;
4739    r->anchored_end_shift = ml - data.offset_fixed
4740     - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4741     + data.lookbehind_fixed;
4742    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4743
4744    t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4745     && (!(data.flags & SF_FIX_BEFORE_MEOL)
4746      || (RExC_flags & RXf_PMf_MULTILINE)));
4747    fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4748   }
4749   else {
4750    r->anchored_substr = r->anchored_utf8 = NULL;
4751    SvREFCNT_dec(data.longest_fixed);
4752    longest_fixed_length = 0;
4753   }
4754   if (ri->regstclass
4755    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4756    ri->regstclass = NULL;
4757   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4758    && stclass_flag
4759    && !(data.start_class->flags & ANYOF_EOS)
4760    && !cl_is_anything(data.start_class))
4761   {
4762    const U32 n = add_data(pRExC_state, 1, "f");
4763
4764    Newx(RExC_rxi->data->data[n], 1,
4765     struct regnode_charclass_class);
4766    StructCopy(data.start_class,
4767      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4768      struct regnode_charclass_class);
4769    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4770    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4771    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4772      regprop(r, sv, (regnode*)data.start_class);
4773      PerlIO_printf(Perl_debug_log,
4774          "synthetic stclass \"%s\".\n",
4775          SvPVX_const(sv));});
4776   }
4777
4778   /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4779   if (longest_fixed_length > longest_float_length) {
4780    r->check_end_shift = r->anchored_end_shift;
4781    r->check_substr = r->anchored_substr;
4782    r->check_utf8 = r->anchored_utf8;
4783    r->check_offset_min = r->check_offset_max = r->anchored_offset;
4784    if (r->extflags & RXf_ANCH_SINGLE)
4785     r->extflags |= RXf_NOSCAN;
4786   }
4787   else {
4788    r->check_end_shift = r->float_end_shift;
4789    r->check_substr = r->float_substr;
4790    r->check_utf8 = r->float_utf8;
4791    r->check_offset_min = r->float_min_offset;
4792    r->check_offset_max = r->float_max_offset;
4793   }
4794   /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4795   This should be changed ASAP!  */
4796   if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4797    r->extflags |= RXf_USE_INTUIT;
4798    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4799     r->extflags |= RXf_INTUIT_TAIL;
4800   }
4801   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4802   if ( (STRLEN)minlen < longest_float_length )
4803    minlen= longest_float_length;
4804   if ( (STRLEN)minlen < longest_fixed_length )
4805    minlen= longest_fixed_length;
4806   */
4807  }
4808  else {
4809   /* Several toplevels. Best we can is to set minlen. */
4810   I32 fake;
4811   struct regnode_charclass_class ch_class;
4812   I32 last_close = 0;
4813
4814   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4815
4816   scan = ri->program + 1;
4817   cl_init(pRExC_state, &ch_class);
4818   data.start_class = &ch_class;
4819   data.last_closep = &last_close;
4820
4821
4822   minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4823    &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4824
4825   CHECK_RESTUDY_GOTO;
4826
4827   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4828     = r->float_substr = r->float_utf8 = NULL;
4829   if (!(data.start_class->flags & ANYOF_EOS)
4830    && !cl_is_anything(data.start_class))
4831   {
4832    const U32 n = add_data(pRExC_state, 1, "f");
4833
4834    Newx(RExC_rxi->data->data[n], 1,
4835     struct regnode_charclass_class);
4836    StructCopy(data.start_class,
4837      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4838      struct regnode_charclass_class);
4839    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4840    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4841    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4842      regprop(r, sv, (regnode*)data.start_class);
4843      PerlIO_printf(Perl_debug_log,
4844          "synthetic stclass \"%s\".\n",
4845          SvPVX_const(sv));});
4846   }
4847  }
4848
4849  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4850  the "real" pattern. */
4851  DEBUG_OPTIMISE_r({
4852   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4853      (IV)minlen, (IV)r->minlen);
4854  });
4855  r->minlenret = minlen;
4856  if (r->minlen < minlen)
4857   r->minlen = minlen;
4858
4859  if (RExC_seen & REG_SEEN_GPOS)
4860   r->extflags |= RXf_GPOS_SEEN;
4861  if (RExC_seen & REG_SEEN_LOOKBEHIND)
4862   r->extflags |= RXf_LOOKBEHIND_SEEN;
4863  if (RExC_seen & REG_SEEN_EVAL)
4864   r->extflags |= RXf_EVAL_SEEN;
4865  if (RExC_seen & REG_SEEN_CANY)
4866   r->extflags |= RXf_CANY_SEEN;
4867  if (RExC_seen & REG_SEEN_VERBARG)
4868   r->intflags |= PREGf_VERBARG_SEEN;
4869  if (RExC_seen & REG_SEEN_CUTGROUP)
4870   r->intflags |= PREGf_CUTGROUP_SEEN;
4871  if (RExC_paren_names)
4872   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4873  else
4874   RXp_PAREN_NAMES(r) = NULL;
4875
4876 #ifdef STUPID_PATTERN_CHECKS
4877  if (RX_PRELEN(rx) == 0)
4878   r->extflags |= RXf_NULL;
4879  if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4880   /* XXX: this should happen BEFORE we compile */
4881   r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4882  else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4883   r->extflags |= RXf_WHITE;
4884  else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4885   r->extflags |= RXf_START_ONLY;
4886 #else
4887  if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4888    /* XXX: this should happen BEFORE we compile */
4889    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4890  else {
4891   regnode *first = ri->program + 1;
4892   U8 fop = OP(first);
4893   U8 nop = OP(NEXTOPER(first));
4894
4895   if (PL_regkind[fop] == NOTHING && nop == END)
4896    r->extflags |= RXf_NULL;
4897   else if (PL_regkind[fop] == BOL && nop == END)
4898    r->extflags |= RXf_START_ONLY;
4899   else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4900    r->extflags |= RXf_WHITE;
4901  }
4902 #endif
4903 #ifdef DEBUGGING
4904  if (RExC_paren_names) {
4905   ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4906   ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4907  } else
4908 #endif
4909   ri->name_list_idx = 0;
4910
4911  if (RExC_recurse_count) {
4912   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4913    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4914    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4915   }
4916  }
4917  Newxz(r->offs, RExC_npar, regexp_paren_pair);
4918  /* assume we don't need to swap parens around before we match */
4919
4920  DEBUG_DUMP_r({
4921   PerlIO_printf(Perl_debug_log,"Final program:\n");
4922   regdump(r);
4923  });
4924 #ifdef RE_TRACK_PATTERN_OFFSETS
4925  DEBUG_OFFSETS_r(if (ri->u.offsets) {
4926   const U32 len = ri->u.offsets[0];
4927   U32 i;
4928   GET_RE_DEBUG_FLAGS_DECL;
4929   PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4930   for (i = 1; i <= len; i++) {
4931    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4932     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4933     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4934    }
4935   PerlIO_printf(Perl_debug_log, "\n");
4936  });
4937 #endif
4938  return rx;
4939 }
4940
4941 #undef RE_ENGINE_PTR
4942
4943
4944 SV*
4945 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4946      const U32 flags)
4947 {
4948  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4949
4950  PERL_UNUSED_ARG(value);
4951
4952  if (flags & RXapif_FETCH) {
4953   return reg_named_buff_fetch(rx, key, flags);
4954  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4955   Perl_croak(aTHX_ "%s", PL_no_modify);
4956   return NULL;
4957  } else if (flags & RXapif_EXISTS) {
4958   return reg_named_buff_exists(rx, key, flags)
4959    ? &PL_sv_yes
4960    : &PL_sv_no;
4961  } else if (flags & RXapif_REGNAMES) {
4962   return reg_named_buff_all(rx, flags);
4963  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4964   return reg_named_buff_scalar(rx, flags);
4965  } else {
4966   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4967   return NULL;
4968  }
4969 }
4970
4971 SV*
4972 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4973       const U32 flags)
4974 {
4975  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4976  PERL_UNUSED_ARG(lastkey);
4977
4978  if (flags & RXapif_FIRSTKEY)
4979   return reg_named_buff_firstkey(rx, flags);
4980  else if (flags & RXapif_NEXTKEY)
4981   return reg_named_buff_nextkey(rx, flags);
4982  else {
4983   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4984   return NULL;
4985  }
4986 }
4987
4988 SV*
4989 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4990       const U32 flags)
4991 {
4992  AV *retarray = NULL;
4993  SV *ret;
4994  struct regexp *const rx = (struct regexp *)SvANY(r);
4995
4996  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4997
4998  if (flags & RXapif_ALL)
4999   retarray=newAV();
5000
5001  if (rx && RXp_PAREN_NAMES(rx)) {
5002   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5003   if (he_str) {
5004    IV i;
5005    SV* sv_dat=HeVAL(he_str);
5006    I32 *nums=(I32*)SvPVX(sv_dat);
5007    for ( i=0; i<SvIVX(sv_dat); i++ ) {
5008     if ((I32)(rx->nparens) >= nums[i]
5009      && rx->offs[nums[i]].start != -1
5010      && rx->offs[nums[i]].end != -1)
5011     {
5012      ret = newSVpvs("");
5013      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5014      if (!retarray)
5015       return ret;
5016     } else {
5017      ret = newSVsv(&PL_sv_undef);
5018     }
5019     if (retarray)
5020      av_push(retarray, ret);
5021    }
5022    if (retarray)
5023     return newRV_noinc(MUTABLE_SV(retarray));
5024   }
5025  }
5026  return NULL;
5027 }
5028
5029 bool
5030 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5031       const U32 flags)
5032 {
5033  struct regexp *const rx = (struct regexp *)SvANY(r);
5034
5035  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5036
5037  if (rx && RXp_PAREN_NAMES(rx)) {
5038   if (flags & RXapif_ALL) {
5039    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5040   } else {
5041    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5042    if (sv) {
5043     SvREFCNT_dec(sv);
5044     return TRUE;
5045    } else {
5046     return FALSE;
5047    }
5048   }
5049  } else {
5050   return FALSE;
5051  }
5052 }
5053
5054 SV*
5055 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5056 {
5057  struct regexp *const rx = (struct regexp *)SvANY(r);
5058
5059  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5060
5061  if ( rx && RXp_PAREN_NAMES(rx) ) {
5062   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5063
5064   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5065  } else {
5066   return FALSE;
5067  }
5068 }
5069
5070 SV*
5071 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5072 {
5073  struct regexp *const rx = (struct regexp *)SvANY(r);
5074  GET_RE_DEBUG_FLAGS_DECL;
5075
5076  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5077
5078  if (rx && RXp_PAREN_NAMES(rx)) {
5079   HV *hv = RXp_PAREN_NAMES(rx);
5080   HE *temphe;
5081   while ( (temphe = hv_iternext_flags(hv,0)) ) {
5082    IV i;
5083    IV parno = 0;
5084    SV* sv_dat = HeVAL(temphe);
5085    I32 *nums = (I32*)SvPVX(sv_dat);
5086    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5087     if ((I32)(rx->lastparen) >= nums[i] &&
5088      rx->offs[nums[i]].start != -1 &&
5089      rx->offs[nums[i]].end != -1)
5090     {
5091      parno = nums[i];
5092      break;
5093     }
5094    }
5095    if (parno || flags & RXapif_ALL) {
5096     return newSVhek(HeKEY_hek(temphe));
5097    }
5098   }
5099  }
5100  return NULL;
5101 }
5102
5103 SV*
5104 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5105 {
5106  SV *ret;
5107  AV *av;
5108  I32 length;
5109  struct regexp *const rx = (struct regexp *)SvANY(r);
5110
5111  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5112
5113  if (rx && RXp_PAREN_NAMES(rx)) {
5114   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5115    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5116   } else if (flags & RXapif_ONE) {
5117    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5118    av = MUTABLE_AV(SvRV(ret));
5119    length = av_len(av);
5120    SvREFCNT_dec(ret);
5121    return newSViv(length + 1);
5122   } else {
5123    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5124    return NULL;
5125   }
5126  }
5127  return &PL_sv_undef;
5128 }
5129
5130 SV*
5131 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5132 {
5133  struct regexp *const rx = (struct regexp *)SvANY(r);
5134  AV *av = newAV();
5135
5136  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5137
5138  if (rx && RXp_PAREN_NAMES(rx)) {
5139   HV *hv= RXp_PAREN_NAMES(rx);
5140   HE *temphe;
5141   (void)hv_iterinit(hv);
5142   while ( (temphe = hv_iternext_flags(hv,0)) ) {
5143    IV i;
5144    IV parno = 0;
5145    SV* sv_dat = HeVAL(temphe);
5146    I32 *nums = (I32*)SvPVX(sv_dat);
5147    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5148     if ((I32)(rx->lastparen) >= nums[i] &&
5149      rx->offs[nums[i]].start != -1 &&
5150      rx->offs[nums[i]].end != -1)
5151     {
5152      parno = nums[i];
5153      break;
5154     }
5155    }
5156    if (parno || flags & RXapif_ALL) {
5157     av_push(av, newSVhek(HeKEY_hek(temphe)));
5158    }
5159   }
5160  }
5161
5162  return newRV_noinc(MUTABLE_SV(av));
5163 }
5164
5165 void
5166 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5167        SV * const sv)
5168 {
5169  struct regexp *const rx = (struct regexp *)SvANY(r);
5170  char *s = NULL;
5171  I32 i = 0;
5172  I32 s1, t1;
5173
5174  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5175
5176  if (!rx->subbeg) {
5177   sv_setsv(sv,&PL_sv_undef);
5178   return;
5179  }
5180  else
5181  if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5182   /* $` */
5183   i = rx->offs[0].start;
5184   s = rx->subbeg;
5185  }
5186  else
5187  if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5188   /* $' */
5189   s = rx->subbeg + rx->offs[0].end;
5190   i = rx->sublen - rx->offs[0].end;
5191  }
5192  else
5193  if ( 0 <= paren && paren <= (I32)rx->nparens &&
5194   (s1 = rx->offs[paren].start) != -1 &&
5195   (t1 = rx->offs[paren].end) != -1)
5196  {
5197   /* $& $1 ... */
5198   i = t1 - s1;
5199   s = rx->subbeg + s1;
5200  } else {
5201   sv_setsv(sv,&PL_sv_undef);
5202   return;
5203  }
5204  assert(rx->sublen >= (s - rx->subbeg) + i );
5205  if (i >= 0) {
5206   const int oldtainted = PL_tainted;
5207   TAINT_NOT;
5208   sv_setpvn(sv, s, i);
5209   PL_tainted = oldtainted;
5210   if ( (rx->extflags & RXf_CANY_SEEN)
5211    ? (RXp_MATCH_UTF8(rx)
5212       && (!i || is_utf8_string((U8*)s, i)))
5213    : (RXp_MATCH_UTF8(rx)) )
5214   {
5215    SvUTF8_on(sv);
5216   }
5217   else
5218    SvUTF8_off(sv);
5219   if (PL_tainting) {
5220    if (RXp_MATCH_TAINTED(rx)) {
5221     if (SvTYPE(sv) >= SVt_PVMG) {
5222      MAGIC* const mg = SvMAGIC(sv);
5223      MAGIC* mgt;
5224      PL_tainted = 1;
5225      SvMAGIC_set(sv, mg->mg_moremagic);
5226      SvTAINT(sv);
5227      if ((mgt = SvMAGIC(sv))) {
5228       mg->mg_moremagic = mgt;
5229       SvMAGIC_set(sv, mg);
5230      }
5231     } else {
5232      PL_tainted = 1;
5233      SvTAINT(sv);
5234     }
5235    } else
5236     SvTAINTED_off(sv);
5237   }
5238  } else {
5239   sv_setsv(sv,&PL_sv_undef);
5240   return;
5241  }
5242 }
5243
5244 void
5245 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5246               SV const * const value)
5247 {
5248  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5249
5250  PERL_UNUSED_ARG(rx);
5251  PERL_UNUSED_ARG(paren);
5252  PERL_UNUSED_ARG(value);
5253
5254  if (!PL_localizing)
5255   Perl_croak(aTHX_ "%s", PL_no_modify);
5256 }
5257
5258 I32
5259 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5260        const I32 paren)
5261 {
5262  struct regexp *const rx = (struct regexp *)SvANY(r);
5263  I32 i;
5264  I32 s1, t1;
5265
5266  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5267
5268  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5269   switch (paren) {
5270  /* $` / ${^PREMATCH} */
5271  case RX_BUFF_IDX_PREMATCH:
5272   if (rx->offs[0].start != -1) {
5273       i = rx->offs[0].start;
5274       if (i > 0) {
5275         s1 = 0;
5276         t1 = i;
5277         goto getlen;
5278       }
5279    }
5280   return 0;
5281  /* $' / ${^POSTMATCH} */
5282  case RX_BUFF_IDX_POSTMATCH:
5283    if (rx->offs[0].end != -1) {
5284       i = rx->sublen - rx->offs[0].end;
5285       if (i > 0) {
5286         s1 = rx->offs[0].end;
5287         t1 = rx->sublen;
5288         goto getlen;
5289       }
5290    }
5291   return 0;
5292  /* $& / ${^MATCH}, $1, $2, ... */
5293  default:
5294    if (paren <= (I32)rx->nparens &&
5295    (s1 = rx->offs[paren].start) != -1 &&
5296    (t1 = rx->offs[paren].end) != -1)
5297    {
5298    i = t1 - s1;
5299    goto getlen;
5300   } else {
5301    if (ckWARN(WARN_UNINITIALIZED))
5302     report_uninit((const SV *)sv);
5303    return 0;
5304   }
5305  }
5306   getlen:
5307  if (i > 0 && RXp_MATCH_UTF8(rx)) {
5308   const char * const s = rx->subbeg + s1;
5309   const U8 *ep;
5310   STRLEN el;
5311
5312   i = t1 - s1;
5313   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5314       i = el;
5315  }
5316  return i;
5317 }
5318
5319 SV*
5320 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5321 {
5322  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5323   PERL_UNUSED_ARG(rx);
5324   if (0)
5325    return NULL;
5326   else
5327    return newSVpvs("Regexp");
5328 }
5329
5330 /* Scans the name of a named buffer from the pattern.
5331  * If flags is REG_RSN_RETURN_NULL returns null.
5332  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5333  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5334  * to the parsed name as looked up in the RExC_paren_names hash.
5335  * If there is an error throws a vFAIL().. type exception.
5336  */
5337
5338 #define REG_RSN_RETURN_NULL    0
5339 #define REG_RSN_RETURN_NAME    1
5340 #define REG_RSN_RETURN_DATA    2
5341
5342 STATIC SV*
5343 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5344 {
5345  char *name_start = RExC_parse;
5346
5347  PERL_ARGS_ASSERT_REG_SCAN_NAME;
5348
5349  if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5350   /* skip IDFIRST by using do...while */
5351   if (UTF)
5352    do {
5353     RExC_parse += UTF8SKIP(RExC_parse);
5354    } while (isALNUM_utf8((U8*)RExC_parse));
5355   else
5356    do {
5357     RExC_parse++;
5358    } while (isALNUM(*RExC_parse));
5359  }
5360
5361  if ( flags ) {
5362   SV* sv_name
5363    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5364        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5365   if ( flags == REG_RSN_RETURN_NAME)
5366    return sv_name;
5367   else if (flags==REG_RSN_RETURN_DATA) {
5368    HE *he_str = NULL;
5369    SV *sv_dat = NULL;
5370    if ( ! sv_name )      /* should not happen*/
5371     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5372    if (RExC_paren_names)
5373     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5374    if ( he_str )
5375     sv_dat = HeVAL(he_str);
5376    if ( ! sv_dat )
5377     vFAIL("Reference to nonexistent named group");
5378    return sv_dat;
5379   }
5380   else {
5381    Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5382   }
5383   /* NOT REACHED */
5384  }
5385  return NULL;
5386 }
5387
5388 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5389  int rem=(int)(RExC_end - RExC_parse);                       \
5390  int cut;                                                    \
5391  int num;                                                    \
5392  int iscut=0;                                                \
5393  if (rem>10) {                                               \
5394   rem=10;                                                 \
5395   iscut=1;                                                \
5396  }                                                           \
5397  cut=10-rem;                                                 \
5398  if (RExC_lastparse!=RExC_parse)                             \
5399   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5400    rem, RExC_parse,                                    \
5401    cut + 4,                                            \
5402    iscut ? "..." : "<"                                 \
5403   );                                                      \
5404  else                                                        \
5405   PerlIO_printf(Perl_debug_log,"%16s","");                \
5406                 \
5407  if (SIZE_ONLY)                                              \
5408  num = RExC_size + 1;                                     \
5409  else                                                        \
5410  num=REG_NODE_NUM(RExC_emit);                             \
5411  if (RExC_lastnum!=num)                                      \
5412  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5413  else                                                        \
5414  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5415  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5416   (int)((depth*2)), "",                                   \
5417   (funcname)                                              \
5418  );                                                          \
5419  RExC_lastnum=num;                                           \
5420  RExC_lastparse=RExC_parse;                                  \
5421 })
5422
5423
5424
5425 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5426  DEBUG_PARSE_MSG((funcname));                            \
5427  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5428 })
5429 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5430  DEBUG_PARSE_MSG((funcname));                            \
5431  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5432 })
5433 /*
5434  - reg - regular expression, i.e. main body or parenthesized thing
5435  *
5436  * Caller must absorb opening parenthesis.
5437  *
5438  * Combining parenthesis handling with the base level of regular expression
5439  * is a trifle forced, but the need to tie the tails of the branches to what
5440  * follows makes it hard to avoid.
5441  */
5442 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5443 #ifdef DEBUGGING
5444 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5445 #else
5446 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5447 #endif
5448
5449 STATIC regnode *
5450 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5451  /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5452 {
5453  dVAR;
5454  register regnode *ret;  /* Will be the head of the group. */
5455  register regnode *br;
5456  register regnode *lastbr;
5457  register regnode *ender = NULL;
5458  register I32 parno = 0;
5459  I32 flags;
5460  U32 oregflags = RExC_flags;
5461  bool have_branch = 0;
5462  bool is_open = 0;
5463  I32 freeze_paren = 0;
5464  I32 after_freeze = 0;
5465
5466  /* for (?g), (?gc), and (?o) warnings; warning
5467  about (?c) will warn about (?g) -- japhy    */
5468
5469 #define WASTED_O  0x01
5470 #define WASTED_G  0x02
5471 #define WASTED_C  0x04
5472 #define WASTED_GC (0x02|0x04)
5473  I32 wastedflags = 0x00;
5474
5475  char * parse_start = RExC_parse; /* MJD */
5476  char * const oregcomp_parse = RExC_parse;
5477
5478  GET_RE_DEBUG_FLAGS_DECL;
5479
5480  PERL_ARGS_ASSERT_REG;
5481  DEBUG_PARSE("reg ");
5482
5483  *flagp = 0;    /* Tentatively. */
5484
5485
5486  /* Make an OPEN node, if parenthesized. */
5487  if (paren) {
5488   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5489    char *start_verb = RExC_parse;
5490    STRLEN verb_len = 0;
5491    char *start_arg = NULL;
5492    unsigned char op = 0;
5493    int argok = 1;
5494    int internal_argval = 0; /* internal_argval is only useful if !argok */
5495    while ( *RExC_parse && *RExC_parse != ')' ) {
5496     if ( *RExC_parse == ':' ) {
5497      start_arg = RExC_parse + 1;
5498      break;
5499     }
5500     RExC_parse++;
5501    }
5502    ++start_verb;
5503    verb_len = RExC_parse - start_verb;
5504    if ( start_arg ) {
5505     RExC_parse++;
5506     while ( *RExC_parse && *RExC_parse != ')' )
5507      RExC_parse++;
5508     if ( *RExC_parse != ')' )
5509      vFAIL("Unterminated verb pattern argument");
5510     if ( RExC_parse == start_arg )
5511      start_arg = NULL;
5512    } else {
5513     if ( *RExC_parse != ')' )
5514      vFAIL("Unterminated verb pattern");
5515    }
5516
5517    switch ( *start_verb ) {
5518    case 'A':  /* (*ACCEPT) */
5519     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5520      op = ACCEPT;
5521      internal_argval = RExC_nestroot;
5522     }
5523     break;
5524    case 'C':  /* (*COMMIT) */
5525     if ( memEQs(start_verb,verb_len,"COMMIT") )
5526      op = COMMIT;
5527     break;
5528    case 'F':  /* (*FAIL) */
5529     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5530      op = OPFAIL;
5531      argok = 0;
5532     }
5533     break;
5534    case ':':  /* (*:NAME) */
5535    case 'M':  /* (*MARK:NAME) */
5536     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5537      op = MARKPOINT;
5538      argok = -1;
5539     }
5540     break;
5541    case 'P':  /* (*PRUNE) */
5542     if ( memEQs(start_verb,verb_len,"PRUNE") )
5543      op = PRUNE;
5544     break;
5545    case 'S':   /* (*SKIP) */
5546     if ( memEQs(start_verb,verb_len,"SKIP") )
5547      op = SKIP;
5548     break;
5549    case 'T':  /* (*THEN) */
5550     /* [19:06] <TimToady> :: is then */
5551     if ( memEQs(start_verb,verb_len,"THEN") ) {
5552      op = CUTGROUP;
5553      RExC_seen |= REG_SEEN_CUTGROUP;
5554     }
5555     break;
5556    }
5557    if ( ! op ) {
5558     RExC_parse++;
5559     vFAIL3("Unknown verb pattern '%.*s'",
5560      verb_len, start_verb);
5561    }
5562    if ( argok ) {
5563     if ( start_arg && internal_argval ) {
5564      vFAIL3("Verb pattern '%.*s' may not have an argument",
5565       verb_len, start_verb);
5566     } else if ( argok < 0 && !start_arg ) {
5567      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5568       verb_len, start_verb);
5569     } else {
5570      ret = reganode(pRExC_state, op, internal_argval);
5571      if ( ! internal_argval && ! SIZE_ONLY ) {
5572       if (start_arg) {
5573        SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5574        ARG(ret) = add_data( pRExC_state, 1, "S" );
5575        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5576        ret->flags = 0;
5577       } else {
5578        ret->flags = 1;
5579       }
5580      }
5581     }
5582     if (!internal_argval)
5583      RExC_seen |= REG_SEEN_VERBARG;
5584    } else if ( start_arg ) {
5585     vFAIL3("Verb pattern '%.*s' may not have an argument",
5586       verb_len, start_verb);
5587    } else {
5588     ret = reg_node(pRExC_state, op);
5589    }
5590    nextchar(pRExC_state);
5591    return ret;
5592   } else
5593   if (*RExC_parse == '?') { /* (?...) */
5594    bool is_logical = 0;
5595    const char * const seqstart = RExC_parse;
5596
5597    RExC_parse++;
5598    paren = *RExC_parse++;
5599    ret = NULL;   /* For look-ahead/behind. */
5600    switch (paren) {
5601
5602    case 'P': /* (?P...) variants for those used to PCRE/Python */
5603     paren = *RExC_parse++;
5604     if ( paren == '<')         /* (?P<...>) named capture */
5605      goto named_capture;
5606     else if (paren == '>') {   /* (?P>name) named recursion */
5607      goto named_recursion;
5608     }
5609     else if (paren == '=') {   /* (?P=...)  named backref */
5610      /* this pretty much dupes the code for \k<NAME> in regatom(), if
5611      you change this make sure you change that */
5612      char* name_start = RExC_parse;
5613      U32 num = 0;
5614      SV *sv_dat = reg_scan_name(pRExC_state,
5615       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5616      if (RExC_parse == name_start || *RExC_parse != ')')
5617       vFAIL2("Sequence %.3s... not terminated",parse_start);
5618
5619      if (!SIZE_ONLY) {
5620       num = add_data( pRExC_state, 1, "S" );
5621       RExC_rxi->data->data[num]=(void*)sv_dat;
5622       SvREFCNT_inc_simple_void(sv_dat);
5623      }
5624      RExC_sawback = 1;
5625      ret = reganode(pRExC_state,
5626        (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5627        num);
5628      *flagp |= HASWIDTH;
5629
5630      Set_Node_Offset(ret, parse_start+1);
5631      Set_Node_Cur_Length(ret); /* MJD */
5632
5633      nextchar(pRExC_state);
5634      return ret;
5635     }
5636     RExC_parse++;
5637     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5638     /*NOTREACHED*/
5639    case '<':           /* (?<...) */
5640     if (*RExC_parse == '!')
5641      paren = ',';
5642     else if (*RExC_parse != '=')
5643    named_capture:
5644     {               /* (?<...>) */
5645      char *name_start;
5646      SV *svname;
5647      paren= '>';
5648    case '\'':          /* (?'...') */
5649       name_start= RExC_parse;
5650       svname = reg_scan_name(pRExC_state,
5651        SIZE_ONLY ?  /* reverse test from the others */
5652        REG_RSN_RETURN_NAME :
5653        REG_RSN_RETURN_NULL);
5654      if (RExC_parse == name_start) {
5655       RExC_parse++;
5656       vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5657       /*NOTREACHED*/
5658      }
5659      if (*RExC_parse != paren)
5660       vFAIL2("Sequence (?%c... not terminated",
5661        paren=='>' ? '<' : paren);
5662      if (SIZE_ONLY) {
5663       HE *he_str;
5664       SV *sv_dat = NULL;
5665       if (!svname) /* shouldnt happen */
5666        Perl_croak(aTHX_
5667         "panic: reg_scan_name returned NULL");
5668       if (!RExC_paren_names) {
5669        RExC_paren_names= newHV();
5670        sv_2mortal(MUTABLE_SV(RExC_paren_names));
5671 #ifdef DEBUGGING
5672        RExC_paren_name_list= newAV();
5673        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5674 #endif
5675       }
5676       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5677       if ( he_str )
5678        sv_dat = HeVAL(he_str);
5679       if ( ! sv_dat ) {
5680        /* croak baby croak */
5681        Perl_croak(aTHX_
5682         "panic: paren_name hash element allocation failed");
5683       } else if ( SvPOK(sv_dat) ) {
5684        /* (?|...) can mean we have dupes so scan to check
5685        its already been stored. Maybe a flag indicating
5686        we are inside such a construct would be useful,
5687        but the arrays are likely to be quite small, so
5688        for now we punt -- dmq */
5689        IV count = SvIV(sv_dat);
5690        I32 *pv = (I32*)SvPVX(sv_dat);
5691        IV i;
5692        for ( i = 0 ; i < count ; i++ ) {
5693         if ( pv[i] == RExC_npar ) {
5694          count = 0;
5695          break;
5696         }
5697        }
5698        if ( count ) {
5699         pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5700         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5701         pv[count] = RExC_npar;
5702         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5703        }
5704       } else {
5705        (void)SvUPGRADE(sv_dat,SVt_PVNV);
5706        sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5707        SvIOK_on(sv_dat);
5708        SvIV_set(sv_dat, 1);
5709       }
5710 #ifdef DEBUGGING
5711       if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5712        SvREFCNT_dec(svname);
5713 #endif
5714
5715       /*sv_dump(sv_dat);*/
5716      }
5717      nextchar(pRExC_state);
5718      paren = 1;
5719      goto capturing_parens;
5720     }
5721     RExC_seen |= REG_SEEN_LOOKBEHIND;
5722     RExC_parse++;
5723    case '=':           /* (?=...) */
5724     RExC_seen_zerolen++;
5725       break;
5726    case '!':           /* (?!...) */
5727     RExC_seen_zerolen++;
5728     if (*RExC_parse == ')') {
5729      ret=reg_node(pRExC_state, OPFAIL);
5730      nextchar(pRExC_state);
5731      return ret;
5732     }
5733     break;
5734    case '|':           /* (?|...) */
5735     /* branch reset, behave like a (?:...) except that
5736     buffers in alternations share the same numbers */
5737     paren = ':';
5738     after_freeze = freeze_paren = RExC_npar;
5739     break;
5740    case ':':           /* (?:...) */
5741    case '>':           /* (?>...) */
5742     break;
5743    case '$':           /* (?$...) */
5744    case '@':           /* (?@...) */
5745     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5746     break;
5747    case '#':           /* (?#...) */
5748     while (*RExC_parse && *RExC_parse != ')')
5749      RExC_parse++;
5750     if (*RExC_parse != ')')
5751      FAIL("Sequence (?#... not terminated");
5752     nextchar(pRExC_state);
5753     *flagp = TRYAGAIN;
5754     return NULL;
5755    case '0' :           /* (?0) */
5756    case 'R' :           /* (?R) */
5757     if (*RExC_parse != ')')
5758      FAIL("Sequence (?R) not terminated");
5759     ret = reg_node(pRExC_state, GOSTART);
5760     *flagp |= POSTPONED;
5761     nextchar(pRExC_state);
5762     return ret;
5763     /*notreached*/
5764    { /* named and numeric backreferences */
5765     I32 num;
5766    case '&':            /* (?&NAME) */
5767     parse_start = RExC_parse - 1;
5768    named_recursion:
5769     {
5770       SV *sv_dat = reg_scan_name(pRExC_state,
5771        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5772       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5773     }
5774     goto gen_recurse_regop;
5775     /* NOT REACHED */
5776    case '+':
5777     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5778      RExC_parse++;
5779      vFAIL("Illegal pattern");
5780     }
5781     goto parse_recursion;
5782     /* NOT REACHED*/
5783    case '-': /* (?-1) */
5784     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5785      RExC_parse--; /* rewind to let it be handled later */
5786      goto parse_flags;
5787     }
5788     /*FALLTHROUGH */
5789    case '1': case '2': case '3': case '4': /* (?1) */
5790    case '5': case '6': case '7': case '8': case '9':
5791     RExC_parse--;
5792    parse_recursion:
5793     num = atoi(RExC_parse);
5794     parse_start = RExC_parse - 1; /* MJD */
5795     if (*RExC_parse == '-')
5796      RExC_parse++;
5797     while (isDIGIT(*RExC_parse))
5798       RExC_parse++;
5799     if (*RExC_parse!=')')
5800      vFAIL("Expecting close bracket");
5801
5802    gen_recurse_regop:
5803     if ( paren == '-' ) {
5804      /*
5805      Diagram of capture buffer numbering.
5806      Top line is the normal capture buffer numbers
5807      Botton line is the negative indexing as from
5808      the X (the (?-2))
5809
5810      +   1 2    3 4 5 X          6 7
5811      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5812      -   5 4    3 2 1 X          x x
5813
5814      */
5815      num = RExC_npar + num;
5816      if (num < 1)  {
5817       RExC_parse++;
5818       vFAIL("Reference to nonexistent group");
5819      }
5820     } else if ( paren == '+' ) {
5821      num = RExC_npar + num - 1;
5822     }
5823
5824     ret = reganode(pRExC_state, GOSUB, num);
5825     if (!SIZE_ONLY) {
5826      if (num > (I32)RExC_rx->nparens) {
5827       RExC_parse++;
5828       vFAIL("Reference to nonexistent group");
5829      }
5830      ARG2L_SET( ret, RExC_recurse_count++);
5831      RExC_emit++;
5832      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5833       "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5834     } else {
5835      RExC_size++;
5836      }
5837      RExC_seen |= REG_SEEN_RECURSE;
5838     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5839     Set_Node_Offset(ret, parse_start); /* MJD */
5840
5841     *flagp |= POSTPONED;
5842     nextchar(pRExC_state);
5843     return ret;
5844    } /* named and numeric backreferences */
5845    /* NOT REACHED */
5846
5847    case '?':           /* (??...) */
5848     is_logical = 1;
5849     if (*RExC_parse != '{') {
5850      RExC_parse++;
5851      vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5852      /*NOTREACHED*/
5853     }
5854     *flagp |= POSTPONED;
5855     paren = *RExC_parse++;
5856     /* FALL THROUGH */
5857    case '{':           /* (?{...}) */
5858    {
5859     I32 count = 1;
5860     U32 n = 0;
5861     char c;
5862     char *s = RExC_parse;
5863
5864     RExC_seen_zerolen++;
5865     RExC_seen |= REG_SEEN_EVAL;
5866     while (count && (c = *RExC_parse)) {
5867      if (c == '\\') {
5868       if (RExC_parse[1])
5869        RExC_parse++;
5870      }
5871      else if (c == '{')
5872       count++;
5873      else if (c == '}')
5874       count--;
5875      RExC_parse++;
5876     }
5877     if (*RExC_parse != ')') {
5878      RExC_parse = s;
5879      vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5880     }
5881     if (!SIZE_ONLY) {
5882      PAD *pad;
5883      OP_4tree *sop, *rop;
5884      SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5885
5886      ENTER;
5887      Perl_save_re_context(aTHX);
5888      rop = sv_compile_2op(sv, &sop, "re", &pad);
5889      sop->op_private |= OPpREFCOUNTED;
5890      /* re_dup will OpREFCNT_inc */
5891      OpREFCNT_set(sop, 1);
5892      LEAVE;
5893
5894      n = add_data(pRExC_state, 3, "nop");
5895      RExC_rxi->data->data[n] = (void*)rop;
5896      RExC_rxi->data->data[n+1] = (void*)sop;
5897      RExC_rxi->data->data[n+2] = (void*)pad;
5898      SvREFCNT_dec(sv);
5899     }
5900     else {      /* First pass */
5901      if (PL_reginterp_cnt < ++RExC_seen_evals
5902       && IN_PERL_RUNTIME)
5903       /* No compiled RE interpolated, has runtime
5904       components ===> unsafe.  */
5905       FAIL("Eval-group not allowed at runtime, use re 'eval'");
5906      if (PL_tainting && PL_tainted)
5907       FAIL("Eval-group in insecure regular expression");
5908 #if PERL_VERSION > 8
5909      if (IN_PERL_COMPILETIME)
5910       PL_cv_has_eval = 1;
5911 #endif
5912     }
5913
5914     nextchar(pRExC_state);
5915     if (is_logical) {
5916      ret = reg_node(pRExC_state, LOGICAL);
5917      if (!SIZE_ONLY)
5918       ret->flags = 2;
5919      REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5920      /* deal with the length of this later - MJD */
5921      return ret;
5922     }
5923     ret = reganode(pRExC_state, EVAL, n);
5924     Set_Node_Length(ret, RExC_parse - parse_start + 1);
5925     Set_Node_Offset(ret, parse_start);
5926     return ret;
5927    }
5928    case '(':           /* (?(?{...})...) and (?(?=...)...) */
5929    {
5930     int is_define= 0;
5931     if (RExC_parse[0] == '?') {        /* (?(?...)) */
5932      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5933       || RExC_parse[1] == '<'
5934       || RExC_parse[1] == '{') { /* Lookahead or eval. */
5935       I32 flag;
5936
5937       ret = reg_node(pRExC_state, LOGICAL);
5938       if (!SIZE_ONLY)
5939        ret->flags = 1;
5940       REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5941       goto insert_if;
5942      }
5943     }
5944     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
5945       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5946     {
5947      char ch = RExC_parse[0] == '<' ? '>' : '\'';
5948      char *name_start= RExC_parse++;
5949      U32 num = 0;
5950      SV *sv_dat=reg_scan_name(pRExC_state,
5951       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5952      if (RExC_parse == name_start || *RExC_parse != ch)
5953       vFAIL2("Sequence (?(%c... not terminated",
5954        (ch == '>' ? '<' : ch));
5955      RExC_parse++;
5956      if (!SIZE_ONLY) {
5957       num = add_data( pRExC_state, 1, "S" );
5958       RExC_rxi->data->data[num]=(void*)sv_dat;
5959       SvREFCNT_inc_simple_void(sv_dat);
5960      }
5961      ret = reganode(pRExC_state,NGROUPP,num);
5962      goto insert_if_check_paren;
5963     }
5964     else if (RExC_parse[0] == 'D' &&
5965       RExC_parse[1] == 'E' &&
5966       RExC_parse[2] == 'F' &&
5967       RExC_parse[3] == 'I' &&
5968       RExC_parse[4] == 'N' &&
5969       RExC_parse[5] == 'E')
5970     {
5971      ret = reganode(pRExC_state,DEFINEP,0);
5972      RExC_parse +=6 ;
5973      is_define = 1;
5974      goto insert_if_check_paren;
5975     }
5976     else if (RExC_parse[0] == 'R') {
5977      RExC_parse++;
5978      parno = 0;
5979      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5980       parno = atoi(RExC_parse++);
5981       while (isDIGIT(*RExC_parse))
5982        RExC_parse++;
5983      } else if (RExC_parse[0] == '&') {
5984       SV *sv_dat;
5985       RExC_parse++;
5986       sv_dat = reg_scan_name(pRExC_state,
5987         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5988        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5989      }
5990      ret = reganode(pRExC_state,INSUBP,parno);
5991      goto insert_if_check_paren;
5992     }
5993     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5994      /* (?(1)...) */
5995      char c;
5996      parno = atoi(RExC_parse++);
5997
5998      while (isDIGIT(*RExC_parse))
5999       RExC_parse++;
6000      ret = reganode(pRExC_state, GROUPP, parno);
6001
6002     insert_if_check_paren:
6003      if ((c = *nextchar(pRExC_state)) != ')')
6004       vFAIL("Switch condition not recognized");
6005     insert_if:
6006      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6007      br = regbranch(pRExC_state, &flags, 1,depth+1);
6008      if (br == NULL)
6009       br = reganode(pRExC_state, LONGJMP, 0);
6010      else
6011       REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6012      c = *nextchar(pRExC_state);
6013      if (flags&HASWIDTH)
6014       *flagp |= HASWIDTH;
6015      if (c == '|') {
6016       if (is_define)
6017        vFAIL("(?(DEFINE)....) does not allow branches");
6018       lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6019       regbranch(pRExC_state, &flags, 1,depth+1);
6020       REGTAIL(pRExC_state, ret, lastbr);
6021       if (flags&HASWIDTH)
6022        *flagp |= HASWIDTH;
6023       c = *nextchar(pRExC_state);
6024      }
6025      else
6026       lastbr = NULL;
6027      if (c != ')')
6028       vFAIL("Switch (?(condition)... contains too many branches");
6029      ender = reg_node(pRExC_state, TAIL);
6030      REGTAIL(pRExC_state, br, ender);
6031      if (lastbr) {
6032       REGTAIL(pRExC_state, lastbr, ender);
6033       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6034      }
6035      else
6036       REGTAIL(pRExC_state, ret, ender);
6037      RExC_size++; /* XXX WHY do we need this?!!
6038          For large programs it seems to be required
6039          but I can't figure out why. -- dmq*/
6040      return ret;
6041     }
6042     else {
6043      vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6044     }
6045    }
6046    case 0:
6047     RExC_parse--; /* for vFAIL to print correctly */
6048     vFAIL("Sequence (? incomplete");
6049     break;
6050    default:
6051     --RExC_parse;
6052     parse_flags:      /* (?i) */
6053    {
6054     U32 posflags = 0, negflags = 0;
6055     U32 *flagsp = &posflags;
6056
6057     while (*RExC_parse) {
6058      /* && strchr("iogcmsx", *RExC_parse) */
6059      /* (?g), (?gc) and (?o) are useless here
6060      and must be globally applied -- japhy */
6061      switch (*RExC_parse) {
6062      CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6063      case ONCE_PAT_MOD: /* 'o' */
6064      case GLOBAL_PAT_MOD: /* 'g' */
6065       if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6066        const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6067        if (! (wastedflags & wflagbit) ) {
6068         wastedflags |= wflagbit;
6069         vWARN5(
6070          RExC_parse + 1,
6071          "Useless (%s%c) - %suse /%c modifier",
6072          flagsp == &negflags ? "?-" : "?",
6073          *RExC_parse,
6074          flagsp == &negflags ? "don't " : "",
6075          *RExC_parse
6076         );
6077        }
6078       }
6079       break;
6080
6081      case CONTINUE_PAT_MOD: /* 'c' */
6082       if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6083        if (! (wastedflags & WASTED_C) ) {
6084         wastedflags |= WASTED_GC;
6085         vWARN3(
6086          RExC_parse + 1,
6087          "Useless (%sc) - %suse /gc modifier",
6088          flagsp == &negflags ? "?-" : "?",
6089          flagsp == &negflags ? "don't " : ""
6090         );
6091        }
6092       }
6093       break;
6094      case KEEPCOPY_PAT_MOD: /* 'p' */
6095       if (flagsp == &negflags) {
6096        if (SIZE_ONLY)
6097         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6098       } else {
6099        *flagsp |= RXf_PMf_KEEPCOPY;
6100       }
6101       break;
6102      case '-':
6103       if (flagsp == &negflags) {
6104        RExC_parse++;
6105        vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6106        /*NOTREACHED*/
6107       }
6108       flagsp = &negflags;
6109       wastedflags = 0;  /* reset so (?g-c) warns twice */
6110       break;
6111      case ':':
6112       paren = ':';
6113       /*FALLTHROUGH*/
6114      case ')':
6115       RExC_flags |= posflags;
6116       RExC_flags &= ~negflags;
6117       if (paren != ':') {
6118        oregflags |= posflags;
6119        oregflags &= ~negflags;
6120       }
6121       nextchar(pRExC_state);
6122       if (paren != ':') {
6123        *flagp = TRYAGAIN;
6124        return NULL;
6125       } else {
6126        ret = NULL;
6127        goto parse_rest;
6128       }
6129       /*NOTREACHED*/
6130      default:
6131       RExC_parse++;
6132       vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6133       /*NOTREACHED*/
6134      }
6135      ++RExC_parse;
6136     }
6137    }} /* one for the default block, one for the switch */
6138   }
6139   else {                  /* (...) */
6140   capturing_parens:
6141    parno = RExC_npar;
6142    RExC_npar++;
6143
6144    ret = reganode(pRExC_state, OPEN, parno);
6145    if (!SIZE_ONLY ){
6146     if (!RExC_nestroot)
6147      RExC_nestroot = parno;
6148     if (RExC_seen & REG_SEEN_RECURSE
6149      && !RExC_open_parens[parno-1])
6150     {
6151      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6152       "Setting open paren #%"IVdf" to %d\n",
6153       (IV)parno, REG_NODE_NUM(ret)));
6154      RExC_open_parens[parno-1]= ret;
6155     }
6156    }
6157    Set_Node_Length(ret, 1); /* MJD */
6158    Set_Node_Offset(ret, RExC_parse); /* MJD */
6159    is_open = 1;
6160   }
6161  }
6162  else                        /* ! paren */
6163   ret = NULL;
6164
6165    parse_rest:
6166  /* Pick up the branches, linking them together. */
6167  parse_start = RExC_parse;   /* MJD */
6168  br = regbranch(pRExC_state, &flags, 1,depth+1);
6169
6170  if (freeze_paren) {
6171   if (RExC_npar > after_freeze)
6172    after_freeze = RExC_npar;
6173   RExC_npar = freeze_paren;
6174  }
6175
6176  /*     branch_len = (paren != 0); */
6177
6178  if (br == NULL)
6179   return(NULL);
6180  if (*RExC_parse == '|') {
6181   if (!SIZE_ONLY && RExC_extralen) {
6182    reginsert(pRExC_state, BRANCHJ, br, depth+1);
6183   }
6184   else {                  /* MJD */
6185    reginsert(pRExC_state, BRANCH, br, depth+1);
6186    Set_Node_Length(br, paren != 0);
6187    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6188   }
6189   have_branch = 1;
6190   if (SIZE_ONLY)
6191    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
6192  }
6193  else if (paren == ':') {
6194   *flagp |= flags&SIMPLE;
6195  }
6196  if (is_open) {    /* Starts with OPEN. */
6197   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6198  }
6199  else if (paren != '?')  /* Not Conditional */
6200   ret = br;
6201  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6202  lastbr = br;
6203  while (*RExC_parse == '|') {
6204   if (!SIZE_ONLY && RExC_extralen) {
6205    ender = reganode(pRExC_state, LONGJMP,0);
6206    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6207   }
6208   if (SIZE_ONLY)
6209    RExC_extralen += 2;  /* Account for LONGJMP. */
6210   nextchar(pRExC_state);
6211   if (freeze_paren) {
6212    if (RExC_npar > after_freeze)
6213     after_freeze = RExC_npar;
6214    RExC_npar = freeze_paren;
6215   }
6216   br = regbranch(pRExC_state, &flags, 0, depth+1);
6217
6218   if (br == NULL)
6219    return(NULL);
6220   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6221   lastbr = br;
6222   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6223  }
6224
6225  if (have_branch || paren != ':') {
6226   /* Make a closing node, and hook it on the end. */
6227   switch (paren) {
6228   case ':':
6229    ender = reg_node(pRExC_state, TAIL);
6230    break;
6231   case 1:
6232    ender = reganode(pRExC_state, CLOSE, parno);
6233    if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6234     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6235       "Setting close paren #%"IVdf" to %d\n",
6236       (IV)parno, REG_NODE_NUM(ender)));
6237     RExC_close_parens[parno-1]= ender;
6238     if (RExC_nestroot == parno)
6239      RExC_nestroot = 0;
6240    }
6241    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6242    Set_Node_Length(ender,1); /* MJD */
6243    break;
6244   case '<':
6245   case ',':
6246   case '=':
6247   case '!':
6248    *flagp &= ~HASWIDTH;
6249    /* FALL THROUGH */
6250   case '>':
6251    ender = reg_node(pRExC_state, SUCCEED);
6252    break;
6253   case 0:
6254    ender = reg_node(pRExC_state, END);
6255    if (!SIZE_ONLY) {
6256     assert(!RExC_opend); /* there can only be one! */
6257     RExC_opend = ender;
6258    }
6259    break;
6260   }
6261   REGTAIL(pRExC_state, lastbr, ender);
6262
6263   if (have_branch && !SIZE_ONLY) {
6264    if (depth==1)
6265     RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6266
6267    /* Hook the tails of the branches to the closing node. */
6268    for (br = ret; br; br = regnext(br)) {
6269     const U8 op = PL_regkind[OP(br)];
6270     if (op == BRANCH) {
6271      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6272     }
6273     else if (op == BRANCHJ) {
6274      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6275     }
6276    }
6277   }
6278  }
6279
6280  {
6281   const char *p;
6282   static const char parens[] = "=!<,>";
6283
6284   if (paren && (p = strchr(parens, paren))) {
6285    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6286    int flag = (p - parens) > 1;
6287
6288    if (paren == '>')
6289     node = SUSPEND, flag = 0;
6290    reginsert(pRExC_state, node,ret, depth+1);
6291    Set_Node_Cur_Length(ret);
6292    Set_Node_Offset(ret, parse_start + 1);
6293    ret->flags = flag;
6294    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6295   }
6296  }
6297
6298  /* Check for proper termination. */
6299  if (paren) {
6300   RExC_flags = oregflags;
6301   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6302    RExC_parse = oregcomp_parse;
6303    vFAIL("Unmatched (");
6304   }
6305  }
6306  else if (!paren && RExC_parse < RExC_end) {
6307   if (*RExC_parse == ')') {
6308    RExC_parse++;
6309    vFAIL("Unmatched )");
6310   }
6311   else
6312    FAIL("Junk on end of regexp"); /* "Can't happen". */
6313   /* NOTREACHED */
6314  }
6315  if (after_freeze)
6316   RExC_npar = after_freeze;
6317  return(ret);
6318 }
6319
6320 /*
6321  - regbranch - one alternative of an | operator
6322  *
6323  * Implements the concatenation operator.
6324  */
6325 STATIC regnode *
6326 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6327 {
6328  dVAR;
6329  register regnode *ret;
6330  register regnode *chain = NULL;
6331  register regnode *latest;
6332  I32 flags = 0, c = 0;
6333  GET_RE_DEBUG_FLAGS_DECL;
6334
6335  PERL_ARGS_ASSERT_REGBRANCH;
6336
6337  DEBUG_PARSE("brnc");
6338
6339  if (first)
6340   ret = NULL;
6341  else {
6342   if (!SIZE_ONLY && RExC_extralen)
6343    ret = reganode(pRExC_state, BRANCHJ,0);
6344   else {
6345    ret = reg_node(pRExC_state, BRANCH);
6346    Set_Node_Length(ret, 1);
6347   }
6348  }
6349
6350  if (!first && SIZE_ONLY)
6351   RExC_extralen += 1;   /* BRANCHJ */
6352
6353  *flagp = WORST;   /* Tentatively. */
6354
6355  RExC_parse--;
6356  nextchar(pRExC_state);
6357  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6358   flags &= ~TRYAGAIN;
6359   latest = regpiece(pRExC_state, &flags,depth+1);
6360   if (latest == NULL) {
6361    if (flags & TRYAGAIN)
6362     continue;
6363    return(NULL);
6364   }
6365   else if (ret == NULL)
6366    ret = latest;
6367   *flagp |= flags&(HASWIDTH|POSTPONED);
6368   if (chain == NULL)  /* First piece. */
6369    *flagp |= flags&SPSTART;
6370   else {
6371    RExC_naughty++;
6372    REGTAIL(pRExC_state, chain, latest);
6373   }
6374   chain = latest;
6375   c++;
6376  }
6377  if (chain == NULL) { /* Loop ran zero times. */
6378   chain = reg_node(pRExC_state, NOTHING);
6379   if (ret == NULL)
6380    ret = chain;
6381  }
6382  if (c == 1) {
6383   *flagp |= flags&SIMPLE;
6384  }
6385
6386  return ret;
6387 }
6388
6389 /*
6390  - regpiece - something followed by possible [*+?]
6391  *
6392  * Note that the branching code sequences used for ? and the general cases
6393  * of * and + are somewhat optimized:  they use the same NOTHING node as
6394  * both the endmarker for their branch list and the body of the last branch.
6395  * It might seem that this node could be dispensed with entirely, but the
6396  * endmarker role is not redundant.
6397  */
6398 STATIC regnode *
6399 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6400 {
6401  dVAR;
6402  register regnode *ret;
6403  register char op;
6404  register char *next;
6405  I32 flags;
6406  const char * const origparse = RExC_parse;
6407  I32 min;
6408  I32 max = REG_INFTY;
6409  char *parse_start;
6410  const char *maxpos = NULL;
6411  GET_RE_DEBUG_FLAGS_DECL;
6412
6413  PERL_ARGS_ASSERT_REGPIECE;
6414
6415  DEBUG_PARSE("piec");
6416
6417  ret = regatom(pRExC_state, &flags,depth+1);
6418  if (ret == NULL) {
6419   if (flags & TRYAGAIN)
6420    *flagp |= TRYAGAIN;
6421   return(NULL);
6422  }
6423
6424  op = *RExC_parse;
6425
6426  if (op == '{' && regcurly(RExC_parse)) {
6427   maxpos = NULL;
6428   parse_start = RExC_parse; /* MJD */
6429   next = RExC_parse + 1;
6430   while (isDIGIT(*next) || *next == ',') {
6431    if (*next == ',') {
6432     if (maxpos)
6433      break;
6434     else
6435      maxpos = next;
6436    }
6437    next++;
6438   }
6439   if (*next == '}') {  /* got one */
6440    if (!maxpos)
6441     maxpos = next;
6442    RExC_parse++;
6443    min = atoi(RExC_parse);
6444    if (*maxpos == ',')
6445     maxpos++;
6446    else
6447     maxpos = RExC_parse;
6448    max = atoi(maxpos);
6449    if (!max && *maxpos != '0')
6450     max = REG_INFTY;  /* meaning "infinity" */
6451    else if (max >= REG_INFTY)
6452     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6453    RExC_parse = next;
6454    nextchar(pRExC_state);
6455
6456   do_curly:
6457    if ((flags&SIMPLE)) {
6458     RExC_naughty += 2 + RExC_naughty / 2;
6459     reginsert(pRExC_state, CURLY, ret, depth+1);
6460     Set_Node_Offset(ret, parse_start+1); /* MJD */
6461     Set_Node_Cur_Length(ret);
6462    }
6463    else {
6464     regnode * const w = reg_node(pRExC_state, WHILEM);
6465
6466     w->flags = 0;
6467     REGTAIL(pRExC_state, ret, w);
6468     if (!SIZE_ONLY && RExC_extralen) {
6469      reginsert(pRExC_state, LONGJMP,ret, depth+1);
6470      reginsert(pRExC_state, NOTHING,ret, depth+1);
6471      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6472     }
6473     reginsert(pRExC_state, CURLYX,ret, depth+1);
6474         /* MJD hk */
6475     Set_Node_Offset(ret, parse_start+1);
6476     Set_Node_Length(ret,
6477         op == '{' ? (RExC_parse - parse_start) : 1);
6478
6479     if (!SIZE_ONLY && RExC_extralen)
6480      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6481     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6482     if (SIZE_ONLY)
6483      RExC_whilem_seen++, RExC_extralen += 3;
6484     RExC_naughty += 4 + RExC_naughty; /* compound interest */
6485    }
6486    ret->flags = 0;
6487
6488    if (min > 0)
6489     *flagp = WORST;
6490    if (max > 0)
6491     *flagp |= HASWIDTH;
6492    if (max < min)
6493     vFAIL("Can't do {n,m} with n > m");
6494    if (!SIZE_ONLY) {
6495     ARG1_SET(ret, (U16)min);
6496     ARG2_SET(ret, (U16)max);
6497    }
6498
6499    goto nest_check;
6500   }
6501  }
6502
6503  if (!ISMULT1(op)) {
6504   *flagp = flags;
6505   return(ret);
6506  }
6507
6508 #if 0    /* Now runtime fix should be reliable. */
6509
6510  /* if this is reinstated, don't forget to put this back into perldiag:
6511
6512    =item Regexp *+ operand could be empty at {#} in regex m/%s/
6513
6514   (F) The part of the regexp subject to either the * or + quantifier
6515   could match an empty string. The {#} shows in the regular
6516   expression about where the problem was discovered.
6517
6518  */
6519
6520  if (!(flags&HASWIDTH) && op != '?')
6521  vFAIL("Regexp *+ operand could be empty");
6522 #endif
6523
6524  parse_start = RExC_parse;
6525  nextchar(pRExC_state);
6526
6527  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6528
6529  if (op == '*' && (flags&SIMPLE)) {
6530   reginsert(pRExC_state, STAR, ret, depth+1);
6531   ret->flags = 0;
6532   RExC_naughty += 4;
6533  }
6534  else if (op == '*') {
6535   min = 0;
6536   goto do_curly;
6537  }
6538  else if (op == '+' && (flags&SIMPLE)) {
6539   reginsert(pRExC_state, PLUS, ret, depth+1);
6540   ret->flags = 0;
6541   RExC_naughty += 3;
6542  }
6543  else if (op == '+') {
6544   min = 1;
6545   goto do_curly;
6546  }
6547  else if (op == '?') {
6548   min = 0; max = 1;
6549   goto do_curly;
6550  }
6551   nest_check:
6552  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6553   ckWARN3reg(RExC_parse,
6554     "%.*s matches null string many times",
6555     (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6556     origparse);
6557  }
6558
6559  if (RExC_parse < RExC_end && *RExC_parse == '?') {
6560   nextchar(pRExC_state);
6561   reginsert(pRExC_state, MINMOD, ret, depth+1);
6562   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6563  }
6564 #ifndef REG_ALLOW_MINMOD_SUSPEND
6565  else
6566 #endif
6567  if (RExC_parse < RExC_end && *RExC_parse == '+') {
6568   regnode *ender;
6569   nextchar(pRExC_state);
6570   ender = reg_node(pRExC_state, SUCCEED);
6571   REGTAIL(pRExC_state, ret, ender);
6572   reginsert(pRExC_state, SUSPEND, ret, depth+1);
6573   ret->flags = 0;
6574   ender = reg_node(pRExC_state, TAIL);
6575   REGTAIL(pRExC_state, ret, ender);
6576   /*ret= ender;*/
6577  }
6578
6579  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6580   RExC_parse++;
6581   vFAIL("Nested quantifiers");
6582  }
6583
6584  return(ret);
6585 }
6586
6587
6588 /* reg_namedseq(pRExC_state,UVp)
6589
6590    This is expected to be called by a parser routine that has
6591    recognized '\N' and needs to handle the rest. RExC_parse is
6592    expected to point at the first char following the N at the time
6593    of the call.
6594
6595    The \N may be inside (indicated by valuep not being NULL) or outside a
6596    character class.
6597
6598    \N may begin either a named sequence, or if outside a character class, mean
6599    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6600    attempted to decide which, and in the case of a named sequence converted it
6601    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6602    where c1... are the characters in the sequence.  For single-quoted regexes,
6603    the tokenizer passes the \N sequence through unchanged; this code will not
6604    attempt to determine this nor expand those.  The net effect is that if the
6605    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6606    signals that this \N occurrence means to match a non-newline.
6607
6608    Only the \N{U+...} form should occur in a character class, for the same
6609    reason that '.' inside a character class means to just match a period: it
6610    just doesn't make sense.
6611
6612    If valuep is non-null then it is assumed that we are parsing inside
6613    of a charclass definition and the first codepoint in the resolved
6614    string is returned via *valuep and the routine will return NULL.
6615    In this mode if a multichar string is returned from the charnames
6616    handler, a warning will be issued, and only the first char in the
6617    sequence will be examined. If the string returned is zero length
6618    then the value of *valuep is undefined and NON-NULL will
6619    be returned to indicate failure. (This will NOT be a valid pointer
6620    to a regnode.)
6621
6622    If valuep is null then it is assumed that we are parsing normal text and a
6623    new EXACT node is inserted into the program containing the resolved string,
6624    and a pointer to the new node is returned.  But if the string is zero length
6625    a NOTHING node is emitted instead.
6626
6627    On success RExC_parse is set to the char following the endbrace.
6628    Parsing failures will generate a fatal error via vFAIL(...)
6629  */
6630 STATIC regnode *
6631 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6632 {
6633  char * endbrace;    /* '}' following the name */
6634  regnode *ret = NULL;
6635 #ifdef DEBUGGING
6636  char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6637 #endif
6638  char* p;
6639
6640  GET_RE_DEBUG_FLAGS_DECL;
6641
6642  PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6643
6644  GET_RE_DEBUG_FLAGS;
6645
6646  /* The [^\n] meaning of \N ignores spaces and comments under the /x
6647  * modifier.  The other meaning does not */
6648  p = (RExC_flags & RXf_PMf_EXTENDED)
6649   ? regwhite( pRExC_state, RExC_parse )
6650   : RExC_parse;
6651
6652  /* Disambiguate between \N meaning a named character versus \N meaning
6653  * [^\n].  The former is assumed when it can't be the latter. */
6654  if (*p != '{' || regcurly(p)) {
6655   RExC_parse = p;
6656   if (valuep) {
6657    /* no bare \N in a charclass */
6658    vFAIL("\\N in a character class must be a named character: \\N{...}");
6659   }
6660   nextchar(pRExC_state);
6661   ret = reg_node(pRExC_state, REG_ANY);
6662   *flagp |= HASWIDTH|SIMPLE;
6663   RExC_naughty++;
6664   RExC_parse--;
6665   Set_Node_Length(ret, 1); /* MJD */
6666   return ret;
6667  }
6668
6669  /* Here, we have decided it should be a named sequence */
6670
6671  /* The test above made sure that the next real character is a '{', but
6672  * under the /x modifier, it could be separated by space (or a comment and
6673  * \n) and this is not allowed (for consistency with \x{...} and the
6674  * tokenizer handling of \N{NAME}). */
6675  if (*RExC_parse != '{') {
6676   vFAIL("Missing braces on \\N{}");
6677  }
6678
6679  RExC_parse++; /* Skip past the '{' */
6680
6681  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6682   || ! (endbrace == RExC_parse  /* nothing between the {} */
6683    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6684     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6685  {
6686   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6687   vFAIL("\\N{NAME} must be resolved by the lexer");
6688  }
6689
6690  if (endbrace == RExC_parse) {   /* empty: \N{} */
6691   if (! valuep) {
6692    RExC_parse = endbrace + 1;
6693    return reg_node(pRExC_state,NOTHING);
6694   }
6695
6696   if (SIZE_ONLY) {
6697    ckWARNreg(RExC_parse,
6698      "Ignoring zero length \\N{} in character class"
6699    );
6700    RExC_parse = endbrace + 1;
6701   }
6702   *valuep = 0;
6703   return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6704  }
6705
6706  RExC_utf8 = 1; /* named sequences imply Unicode semantics */
6707  RExC_parse += 2; /* Skip past the 'U+' */
6708
6709  if (valuep) {   /* In a bracketed char class */
6710   /* We only pay attention to the first char of
6711   multichar strings being returned. I kinda wonder
6712   if this makes sense as it does change the behaviour
6713   from earlier versions, OTOH that behaviour was broken
6714   as well. XXX Solution is to recharacterize as
6715   [rest-of-class]|multi1|multi2... */
6716
6717   STRLEN length_of_hex;
6718   I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6719    | PERL_SCAN_DISALLOW_PREFIX
6720    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6721
6722   char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6723   if (endchar < endbrace) {
6724    ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6725   }
6726
6727   length_of_hex = (STRLEN)(endchar - RExC_parse);
6728   *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6729
6730   /* The tokenizer should have guaranteed validity, but it's possible to
6731   * bypass it by using single quoting, so check */
6732   if (length_of_hex == 0
6733    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6734   {
6735    RExC_parse += length_of_hex; /* Includes all the valid */
6736    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6737        ? UTF8SKIP(RExC_parse)
6738        : 1;
6739    /* Guard against malformed utf8 */
6740    if (RExC_parse >= endchar) RExC_parse = endchar;
6741    vFAIL("Invalid hexadecimal number in \\N{U+...}");
6742   }
6743
6744   RExC_parse = endbrace + 1;
6745   if (endchar == endbrace) return NULL;
6746
6747   ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
6748  }
6749  else { /* Not a char class */
6750   char *s;     /* String to put in generated EXACT node */
6751   STRLEN len = 0;     /* Its current length */
6752   char *endchar;     /* Points to '.' or '}' ending cur char in the input
6753        stream */
6754
6755   ret = reg_node(pRExC_state,
6756       (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6757   s= STRING(ret);
6758
6759   /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
6760   * the input which is of the form now 'c1.c2.c3...}' until find the
6761   * ending brace or exeed length 255.  The characters that exceed this
6762   * limit are dropped.  The limit could be relaxed should it become
6763   * desirable by reparsing this as (?:\N{NAME}), so could generate
6764   * multiple EXACT nodes, as is done for just regular input.  But this
6765   * is primarily a named character, and not intended to be a huge long
6766   * string, so 255 bytes should be good enough */
6767   while (1) {
6768    STRLEN length_of_hex;
6769    I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6770        | PERL_SCAN_DISALLOW_PREFIX
6771        | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6772    UV cp;  /* Ord of current character */
6773
6774    /* Code points are separated by dots.  If none, there is only one
6775    * code point, and is terminated by the brace */
6776    endchar = RExC_parse + strcspn(RExC_parse, ".}");
6777
6778    /* The values are Unicode even on EBCDIC machines */
6779    length_of_hex = (STRLEN)(endchar - RExC_parse);
6780    cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6781    if ( length_of_hex == 0
6782     || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6783    {
6784     RExC_parse += length_of_hex;     /* Includes all the valid */
6785     RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6786         ? UTF8SKIP(RExC_parse)
6787         : 1;
6788     /* Guard against malformed utf8 */
6789     if (RExC_parse >= endchar) RExC_parse = endchar;
6790     vFAIL("Invalid hexadecimal number in \\N{U+...}");
6791    }
6792
6793    if (! FOLD) { /* Not folding, just append to the string */
6794     STRLEN unilen;
6795
6796     /* Quit before adding this character if would exceed limit */
6797     if (len + UNISKIP(cp) > U8_MAX) break;
6798
6799     unilen = reguni(pRExC_state, cp, s);
6800     if (unilen > 0) {
6801      s   += unilen;
6802      len += unilen;
6803     }
6804    } else { /* Folding, output the folded equivalent */
6805     STRLEN foldlen,numlen;
6806     U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6807     cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6808
6809     /* Quit before exceeding size limit */
6810     if (len + foldlen > U8_MAX) break;
6811
6812     for (foldbuf = tmpbuf;
6813      foldlen;
6814      foldlen -= numlen)
6815     {
6816      cp = utf8_to_uvchr(foldbuf, &numlen);
6817      if (numlen > 0) {
6818       const STRLEN unilen = reguni(pRExC_state, cp, s);
6819       s       += unilen;
6820       len     += unilen;
6821       /* In EBCDIC the numlen and unilen can differ. */
6822       foldbuf += numlen;
6823       if (numlen >= foldlen)
6824        break;
6825      }
6826      else
6827       break; /* "Can't happen." */
6828     }
6829    }
6830
6831    /* Point to the beginning of the next character in the sequence. */
6832    RExC_parse = endchar + 1;
6833
6834    /* Quit if no more characters */
6835    if (RExC_parse >= endbrace) break;
6836   }
6837
6838
6839   if (SIZE_ONLY) {
6840    if (RExC_parse < endbrace) {
6841     ckWARNreg(RExC_parse - 1,
6842       "Using just the first characters returned by \\N{}");
6843    }
6844
6845    RExC_size += STR_SZ(len);
6846   } else {
6847    STR_LEN(ret) = len;
6848    RExC_emit += STR_SZ(len);
6849   }
6850
6851   RExC_parse = endbrace + 1;
6852
6853   *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6854        with malformed in t/re/pat_advanced.t */
6855   RExC_parse --;
6856   Set_Node_Cur_Length(ret); /* MJD */
6857   nextchar(pRExC_state);
6858  }
6859
6860  return ret;
6861 }
6862
6863
6864 /*
6865  * reg_recode
6866  *
6867  * It returns the code point in utf8 for the value in *encp.
6868  *    value: a code value in the source encoding
6869  *    encp:  a pointer to an Encode object
6870  *
6871  * If the result from Encode is not a single character,
6872  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6873  */
6874 STATIC UV
6875 S_reg_recode(pTHX_ const char value, SV **encp)
6876 {
6877  STRLEN numlen = 1;
6878  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6879  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6880  const STRLEN newlen = SvCUR(sv);
6881  UV uv = UNICODE_REPLACEMENT;
6882
6883  PERL_ARGS_ASSERT_REG_RECODE;
6884
6885  if (newlen)
6886   uv = SvUTF8(sv)
6887    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6888    : *(U8*)s;
6889
6890  if (!newlen || numlen != newlen) {
6891   uv = UNICODE_REPLACEMENT;
6892   *encp = NULL;
6893  }
6894  return uv;
6895 }
6896
6897
6898 /*
6899  - regatom - the lowest level
6900
6901    Try to identify anything special at the start of the pattern. If there
6902    is, then handle it as required. This may involve generating a single regop,
6903    such as for an assertion; or it may involve recursing, such as to
6904    handle a () structure.
6905
6906    If the string doesn't start with something special then we gobble up
6907    as much literal text as we can.
6908
6909    Once we have been able to handle whatever type of thing started the
6910    sequence, we return.
6911
6912    Note: we have to be careful with escapes, as they can be both literal
6913    and special, and in the case of \10 and friends can either, depending
6914    on context. Specifically there are two seperate switches for handling
6915    escape sequences, with the one for handling literal escapes requiring
6916    a dummy entry for all of the special escapes that are actually handled
6917    by the other.
6918 */
6919
6920 STATIC regnode *
6921 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6922 {
6923  dVAR;
6924  register regnode *ret = NULL;
6925  I32 flags;
6926  char *parse_start = RExC_parse;
6927  GET_RE_DEBUG_FLAGS_DECL;
6928  DEBUG_PARSE("atom");
6929  *flagp = WORST;  /* Tentatively. */
6930
6931  PERL_ARGS_ASSERT_REGATOM;
6932
6933 tryagain:
6934  switch ((U8)*RExC_parse) {
6935  case '^':
6936   RExC_seen_zerolen++;
6937   nextchar(pRExC_state);
6938   if (RExC_flags & RXf_PMf_MULTILINE)
6939    ret = reg_node(pRExC_state, MBOL);
6940   else if (RExC_flags & RXf_PMf_SINGLELINE)
6941    ret = reg_node(pRExC_state, SBOL);
6942   else
6943    ret = reg_node(pRExC_state, BOL);
6944   Set_Node_Length(ret, 1); /* MJD */
6945   break;
6946  case '$':
6947   nextchar(pRExC_state);
6948   if (*RExC_parse)
6949    RExC_seen_zerolen++;
6950   if (RExC_flags & RXf_PMf_MULTILINE)
6951    ret = reg_node(pRExC_state, MEOL);
6952   else if (RExC_flags & RXf_PMf_SINGLELINE)
6953    ret = reg_node(pRExC_state, SEOL);
6954   else
6955    ret = reg_node(pRExC_state, EOL);
6956   Set_Node_Length(ret, 1); /* MJD */
6957   break;
6958  case '.':
6959   nextchar(pRExC_state);
6960   if (RExC_flags & RXf_PMf_SINGLELINE)
6961    ret = reg_node(pRExC_state, SANY);
6962   else
6963    ret = reg_node(pRExC_state, REG_ANY);
6964   *flagp |= HASWIDTH|SIMPLE;
6965   RExC_naughty++;
6966   Set_Node_Length(ret, 1); /* MJD */
6967   break;
6968  case '[':
6969  {
6970   char * const oregcomp_parse = ++RExC_parse;
6971   ret = regclass(pRExC_state,depth+1);
6972   if (*RExC_parse != ']') {
6973    RExC_parse = oregcomp_parse;
6974    vFAIL("Unmatched [");
6975   }
6976   nextchar(pRExC_state);
6977   *flagp |= HASWIDTH|SIMPLE;
6978   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6979   break;
6980  }
6981  case '(':
6982   nextchar(pRExC_state);
6983   ret = reg(pRExC_state, 1, &flags,depth+1);
6984   if (ret == NULL) {
6985     if (flags & TRYAGAIN) {
6986      if (RExC_parse == RExC_end) {
6987       /* Make parent create an empty node if needed. */
6988       *flagp |= TRYAGAIN;
6989       return(NULL);
6990      }
6991      goto tryagain;
6992     }
6993     return(NULL);
6994   }
6995   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6996   break;
6997  case '|':
6998  case ')':
6999   if (flags & TRYAGAIN) {
7000    *flagp |= TRYAGAIN;
7001    return NULL;
7002   }
7003   vFAIL("Internal urp");
7004         /* Supposed to be caught earlier. */
7005   break;
7006  case '{':
7007   if (!regcurly(RExC_parse)) {
7008    RExC_parse++;
7009    goto defchar;
7010   }
7011   /* FALL THROUGH */
7012  case '?':
7013  case '+':
7014  case '*':
7015   RExC_parse++;
7016   vFAIL("Quantifier follows nothing");
7017   break;
7018  case 0xDF:
7019  case 0xC3:
7020  case 0xCE:
7021   do_foldchar:
7022   if (!LOC && FOLD) {
7023    U32 len,cp;
7024    len=0; /* silence a spurious compiler warning */
7025    if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7026     *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7027     RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7028     ret = reganode(pRExC_state, FOLDCHAR, cp);
7029     Set_Node_Length(ret, 1); /* MJD */
7030     nextchar(pRExC_state); /* kill whitespace under /x */
7031     return ret;
7032    }
7033   }
7034   goto outer_default;
7035  case '\\':
7036   /* Special Escapes
7037
7038   This switch handles escape sequences that resolve to some kind
7039   of special regop and not to literal text. Escape sequnces that
7040   resolve to literal text are handled below in the switch marked
7041   "Literal Escapes".
7042
7043   Every entry in this switch *must* have a corresponding entry
7044   in the literal escape switch. However, the opposite is not
7045   required, as the default for this switch is to jump to the
7046   literal text handling code.
7047   */
7048   switch ((U8)*++RExC_parse) {
7049   case 0xDF:
7050   case 0xC3:
7051   case 0xCE:
7052     goto do_foldchar;
7053   /* Special Escapes */
7054   case 'A':
7055    RExC_seen_zerolen++;
7056    ret = reg_node(pRExC_state, SBOL);
7057    *flagp |= SIMPLE;
7058    goto finish_meta_pat;
7059   case 'G':
7060    ret = reg_node(pRExC_state, GPOS);
7061    RExC_seen |= REG_SEEN_GPOS;
7062    *flagp |= SIMPLE;
7063    goto finish_meta_pat;
7064   case 'K':
7065    RExC_seen_zerolen++;
7066    ret = reg_node(pRExC_state, KEEPS);
7067    *flagp |= SIMPLE;
7068    /* XXX:dmq : disabling in-place substitution seems to
7069    * be necessary here to avoid cases of memory corruption, as
7070    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7071    */
7072    RExC_seen |= REG_SEEN_LOOKBEHIND;
7073    goto finish_meta_pat;
7074   case 'Z':
7075    ret = reg_node(pRExC_state, SEOL);
7076    *flagp |= SIMPLE;
7077    RExC_seen_zerolen++;  /* Do not optimize RE away */
7078    goto finish_meta_pat;
7079   case 'z':
7080    ret = reg_node(pRExC_state, EOS);
7081    *flagp |= SIMPLE;
7082    RExC_seen_zerolen++;  /* Do not optimize RE away */
7083    goto finish_meta_pat;
7084   case 'C':
7085    ret = reg_node(pRExC_state, CANY);
7086    RExC_seen |= REG_SEEN_CANY;
7087    *flagp |= HASWIDTH|SIMPLE;
7088    goto finish_meta_pat;
7089   case 'X':
7090    ret = reg_node(pRExC_state, CLUMP);
7091    *flagp |= HASWIDTH;
7092    goto finish_meta_pat;
7093   case 'w':
7094    ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
7095    *flagp |= HASWIDTH|SIMPLE;
7096    goto finish_meta_pat;
7097   case 'W':
7098    ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
7099    *flagp |= HASWIDTH|SIMPLE;
7100    goto finish_meta_pat;
7101   case 'b':
7102    RExC_seen_zerolen++;
7103    RExC_seen |= REG_SEEN_LOOKBEHIND;
7104    ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
7105    *flagp |= SIMPLE;
7106    goto finish_meta_pat;
7107   case 'B':
7108    RExC_seen_zerolen++;
7109    RExC_seen |= REG_SEEN_LOOKBEHIND;
7110    ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
7111    *flagp |= SIMPLE;
7112    goto finish_meta_pat;
7113   case 's':
7114    ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
7115    *flagp |= HASWIDTH|SIMPLE;
7116    goto finish_meta_pat;
7117   case 'S':
7118    ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
7119    *flagp |= HASWIDTH|SIMPLE;
7120    goto finish_meta_pat;
7121   case 'd':
7122    ret = reg_node(pRExC_state, DIGIT);
7123    *flagp |= HASWIDTH|SIMPLE;
7124    goto finish_meta_pat;
7125   case 'D':
7126    ret = reg_node(pRExC_state, NDIGIT);
7127    *flagp |= HASWIDTH|SIMPLE;
7128    goto finish_meta_pat;
7129   case 'R':
7130    ret = reg_node(pRExC_state, LNBREAK);
7131    *flagp |= HASWIDTH|SIMPLE;
7132    goto finish_meta_pat;
7133   case 'h':
7134    ret = reg_node(pRExC_state, HORIZWS);
7135    *flagp |= HASWIDTH|SIMPLE;
7136    goto finish_meta_pat;
7137   case 'H':
7138    ret = reg_node(pRExC_state, NHORIZWS);
7139    *flagp |= HASWIDTH|SIMPLE;
7140    goto finish_meta_pat;
7141   case 'v':
7142    ret = reg_node(pRExC_state, VERTWS);
7143    *flagp |= HASWIDTH|SIMPLE;
7144    goto finish_meta_pat;
7145   case 'V':
7146    ret = reg_node(pRExC_state, NVERTWS);
7147    *flagp |= HASWIDTH|SIMPLE;
7148   finish_meta_pat:
7149    nextchar(pRExC_state);
7150    Set_Node_Length(ret, 2); /* MJD */
7151    break;
7152   case 'p':
7153   case 'P':
7154    {
7155     char* const oldregxend = RExC_end;
7156 #ifdef DEBUGGING
7157     char* parse_start = RExC_parse - 2;
7158 #endif
7159
7160     if (RExC_parse[1] == '{') {
7161     /* a lovely hack--pretend we saw [\pX] instead */
7162      RExC_end = strchr(RExC_parse, '}');
7163      if (!RExC_end) {
7164       const U8 c = (U8)*RExC_parse;
7165       RExC_parse += 2;
7166       RExC_end = oldregxend;
7167       vFAIL2("Missing right brace on \\%c{}", c);
7168      }
7169      RExC_end++;
7170     }
7171     else {
7172      RExC_end = RExC_parse + 2;
7173      if (RExC_end > oldregxend)
7174       RExC_end = oldregxend;
7175     }
7176     RExC_parse--;
7177
7178     ret = regclass(pRExC_state,depth+1);
7179
7180     RExC_end = oldregxend;
7181     RExC_parse--;
7182
7183     Set_Node_Offset(ret, parse_start + 2);
7184     Set_Node_Cur_Length(ret);
7185     nextchar(pRExC_state);
7186     *flagp |= HASWIDTH|SIMPLE;
7187    }
7188    break;
7189   case 'N':
7190    /* Handle \N and \N{NAME} here and not below because it can be
7191    multicharacter. join_exact() will join them up later on.
7192    Also this makes sure that things like /\N{BLAH}+/ and
7193    \N{BLAH} being multi char Just Happen. dmq*/
7194    ++RExC_parse;
7195    ret= reg_namedseq(pRExC_state, NULL, flagp);
7196    break;
7197   case 'k':    /* Handle \k<NAME> and \k'NAME' */
7198   parse_named_seq:
7199   {
7200    char ch= RExC_parse[1];
7201    if (ch != '<' && ch != '\'' && ch != '{') {
7202     RExC_parse++;
7203     vFAIL2("Sequence %.2s... not terminated",parse_start);
7204    } else {
7205     /* this pretty much dupes the code for (?P=...) in reg(), if
7206     you change this make sure you change that */
7207     char* name_start = (RExC_parse += 2);
7208     U32 num = 0;
7209     SV *sv_dat = reg_scan_name(pRExC_state,
7210      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7211     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7212     if (RExC_parse == name_start || *RExC_parse != ch)
7213      vFAIL2("Sequence %.3s... not terminated",parse_start);
7214
7215     if (!SIZE_ONLY) {
7216      num = add_data( pRExC_state, 1, "S" );
7217      RExC_rxi->data->data[num]=(void*)sv_dat;
7218      SvREFCNT_inc_simple_void(sv_dat);
7219     }
7220
7221     RExC_sawback = 1;
7222     ret = reganode(pRExC_state,
7223       (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7224       num);
7225     *flagp |= HASWIDTH;
7226
7227     /* override incorrect value set in reganode MJD */
7228     Set_Node_Offset(ret, parse_start+1);
7229     Set_Node_Cur_Length(ret); /* MJD */
7230     nextchar(pRExC_state);
7231
7232    }
7233    break;
7234   }
7235   case 'g':
7236   case '1': case '2': case '3': case '4':
7237   case '5': case '6': case '7': case '8': case '9':
7238    {
7239     I32 num;
7240     bool isg = *RExC_parse == 'g';
7241     bool isrel = 0;
7242     bool hasbrace = 0;
7243     if (isg) {
7244      RExC_parse++;
7245      if (*RExC_parse == '{') {
7246       RExC_parse++;
7247       hasbrace = 1;
7248      }
7249      if (*RExC_parse == '-') {
7250       RExC_parse++;
7251       isrel = 1;
7252      }
7253      if (hasbrace && !isDIGIT(*RExC_parse)) {
7254       if (isrel) RExC_parse--;
7255       RExC_parse -= 2;
7256       goto parse_named_seq;
7257     }   }
7258     num = atoi(RExC_parse);
7259     if (isg && num == 0)
7260      vFAIL("Reference to invalid group 0");
7261     if (isrel) {
7262      num = RExC_npar - num;
7263      if (num < 1)
7264       vFAIL("Reference to nonexistent or unclosed group");
7265     }
7266     if (!isg && num > 9 && num >= RExC_npar)
7267      goto defchar;
7268     else {
7269      char * const parse_start = RExC_parse - 1; /* MJD */
7270      while (isDIGIT(*RExC_parse))
7271       RExC_parse++;
7272      if (parse_start == RExC_parse - 1)
7273       vFAIL("Unterminated \\g... pattern");
7274      if (hasbrace) {
7275       if (*RExC_parse != '}')
7276        vFAIL("Unterminated \\g{...} pattern");
7277       RExC_parse++;
7278      }
7279      if (!SIZE_ONLY) {
7280       if (num > (I32)RExC_rx->nparens)
7281        vFAIL("Reference to nonexistent group");
7282      }
7283      RExC_sawback = 1;
7284      ret = reganode(pRExC_state,
7285         (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7286         num);
7287      *flagp |= HASWIDTH;
7288
7289      /* override incorrect value set in reganode MJD */
7290      Set_Node_Offset(ret, parse_start+1);
7291      Set_Node_Cur_Length(ret); /* MJD */
7292      RExC_parse--;
7293      nextchar(pRExC_state);
7294     }
7295    }
7296    break;
7297   case '\0':
7298    if (RExC_parse >= RExC_end)
7299     FAIL("Trailing \\");
7300    /* FALL THROUGH */
7301   default:
7302    /* Do not generate "unrecognized" warnings here, we fall
7303    back into the quick-grab loop below */
7304    parse_start--;
7305    goto defchar;
7306   }
7307   break;
7308
7309  case '#':
7310   if (RExC_flags & RXf_PMf_EXTENDED) {
7311    if ( reg_skipcomment( pRExC_state ) )
7312     goto tryagain;
7313   }
7314   /* FALL THROUGH */
7315
7316  default:
7317   outer_default:{
7318    register STRLEN len;
7319    register UV ender;
7320    register char *p;
7321    char *s;
7322    STRLEN foldlen;
7323    U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7324
7325    parse_start = RExC_parse - 1;
7326
7327    RExC_parse++;
7328
7329   defchar:
7330    ender = 0;
7331    ret = reg_node(pRExC_state,
7332       (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7333    s = STRING(ret);
7334    for (len = 0, p = RExC_parse - 1;
7335    len < 127 && p < RExC_end;
7336    len++)
7337    {
7338     char * const oldp = p;
7339
7340     if (RExC_flags & RXf_PMf_EXTENDED)
7341      p = regwhite( pRExC_state, p );
7342     switch ((U8)*p) {
7343     case 0xDF:
7344     case 0xC3:
7345     case 0xCE:
7346       if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7347         goto normal_default;
7348     case '^':
7349     case '$':
7350     case '.':
7351     case '[':
7352     case '(':
7353     case ')':
7354     case '|':
7355      goto loopdone;
7356     case '\\':
7357      /* Literal Escapes Switch
7358
7359      This switch is meant to handle escape sequences that
7360      resolve to a literal character.
7361
7362      Every escape sequence that represents something
7363      else, like an assertion or a char class, is handled
7364      in the switch marked 'Special Escapes' above in this
7365      routine, but also has an entry here as anything that
7366      isn't explicitly mentioned here will be treated as
7367      an unescaped equivalent literal.
7368      */
7369
7370      switch ((U8)*++p) {
7371      /* These are all the special escapes. */
7372       case 0xDF:
7373       case 0xC3:
7374       case 0xCE:
7375        if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7376          goto normal_default;
7377      case 'A':             /* Start assertion */
7378      case 'b': case 'B':   /* Word-boundary assertion*/
7379      case 'C':             /* Single char !DANGEROUS! */
7380      case 'd': case 'D':   /* digit class */
7381      case 'g': case 'G':   /* generic-backref, pos assertion */
7382      case 'h': case 'H':   /* HORIZWS */
7383      case 'k': case 'K':   /* named backref, keep marker */
7384      case 'N':             /* named char sequence */
7385      case 'p': case 'P':   /* Unicode property */
7386        case 'R':   /* LNBREAK */
7387      case 's': case 'S':   /* space class */
7388      case 'v': case 'V':   /* VERTWS */
7389      case 'w': case 'W':   /* word class */
7390      case 'X':             /* eXtended Unicode "combining character sequence" */
7391      case 'z': case 'Z':   /* End of line/string assertion */
7392       --p;
7393       goto loopdone;
7394
7395      /* Anything after here is an escape that resolves to a
7396      literal. (Except digits, which may or may not)
7397      */
7398      case 'n':
7399       ender = '\n';
7400       p++;
7401       break;
7402      case 'r':
7403       ender = '\r';
7404       p++;
7405       break;
7406      case 't':
7407       ender = '\t';
7408       p++;
7409       break;
7410      case 'f':
7411       ender = '\f';
7412       p++;
7413       break;
7414      case 'e':
7415       ender = ASCII_TO_NATIVE('\033');
7416       p++;
7417       break;
7418      case 'a':
7419       ender = ASCII_TO_NATIVE('\007');
7420       p++;
7421       break;
7422      case 'x':
7423       if (*++p == '{') {
7424        char* const e = strchr(p, '}');
7425
7426        if (!e) {
7427         RExC_parse = p + 1;
7428         vFAIL("Missing right brace on \\x{}");
7429        }
7430        else {
7431         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7432          | PERL_SCAN_DISALLOW_PREFIX;
7433         STRLEN numlen = e - p - 1;
7434         ender = grok_hex(p + 1, &numlen, &flags, NULL);
7435         if (ender > 0xff)
7436          RExC_utf8 = 1;
7437         p = e + 1;
7438        }
7439       }
7440       else {
7441        I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7442        STRLEN numlen = 2;
7443        ender = grok_hex(p, &numlen, &flags, NULL);
7444        p += numlen;
7445       }
7446       if (PL_encoding && ender < 0x100)
7447        goto recode_encoding;
7448       break;
7449      case 'c':
7450       p++;
7451       ender = UCHARAT(p++);
7452       ender = toCTRL(ender);
7453       break;
7454      case '0': case '1': case '2': case '3':case '4':
7455      case '5': case '6': case '7': case '8':case '9':
7456       if (*p == '0' ||
7457       (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7458        I32 flags = 0;
7459        STRLEN numlen = 3;
7460        ender = grok_oct(p, &numlen, &flags, NULL);
7461
7462        /* An octal above 0xff is interpreted differently
7463        * depending on if the re is in utf8 or not.  If it
7464        * is in utf8, the value will be itself, otherwise
7465        * it is interpreted as modulo 0x100.  It has been
7466        * decided to discourage the use of octal above the
7467        * single-byte range.  For now, warn only when
7468        * it ends up modulo */
7469        if (SIZE_ONLY && ender >= 0x100
7470          && ! UTF && ! PL_encoding) {
7471         ckWARNregdep(p, "Use of octal value above 377 is deprecated");
7472        }
7473        p += numlen;
7474       }
7475       else {
7476        --p;
7477        goto loopdone;
7478       }
7479       if (PL_encoding && ender < 0x100)
7480        goto recode_encoding;
7481       break;
7482      recode_encoding:
7483       {
7484        SV* enc = PL_encoding;
7485        ender = reg_recode((const char)(U8)ender, &enc);
7486        if (!enc && SIZE_ONLY)
7487         ckWARNreg(p, "Invalid escape in the specified encoding");
7488        RExC_utf8 = 1;
7489       }
7490       break;
7491      case '\0':
7492       if (p >= RExC_end)
7493        FAIL("Trailing \\");
7494       /* FALL THROUGH */
7495      default:
7496       if (!SIZE_ONLY&& isALPHA(*p))
7497        ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7498       goto normal_default;
7499      }
7500      break;
7501     default:
7502     normal_default:
7503      if (UTF8_IS_START(*p) && UTF) {
7504       STRLEN numlen;
7505       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7506            &numlen, UTF8_ALLOW_DEFAULT);
7507       p += numlen;
7508      }
7509      else
7510       ender = *p++;
7511      break;
7512     }
7513     if ( RExC_flags & RXf_PMf_EXTENDED)
7514      p = regwhite( pRExC_state, p );
7515     if (UTF && FOLD) {
7516      /* Prime the casefolded buffer. */
7517      ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7518     }
7519     if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7520      if (len)
7521       p = oldp;
7522      else if (UTF) {
7523       if (FOLD) {
7524        /* Emit all the Unicode characters. */
7525        STRLEN numlen;
7526        for (foldbuf = tmpbuf;
7527         foldlen;
7528         foldlen -= numlen) {
7529         ender = utf8_to_uvchr(foldbuf, &numlen);
7530         if (numlen > 0) {
7531           const STRLEN unilen = reguni(pRExC_state, ender, s);
7532           s       += unilen;
7533           len     += unilen;
7534           /* In EBCDIC the numlen
7535           * and unilen can differ. */
7536           foldbuf += numlen;
7537           if (numlen >= foldlen)
7538            break;
7539         }
7540         else
7541           break; /* "Can't happen." */
7542        }
7543       }
7544       else {
7545        const STRLEN unilen = reguni(pRExC_state, ender, s);
7546        if (unilen > 0) {
7547         s   += unilen;
7548         len += unilen;
7549        }
7550       }
7551      }
7552      else {
7553       len++;
7554       REGC((char)ender, s++);
7555      }
7556      break;
7557     }
7558     if (UTF) {
7559      if (FOLD) {
7560       /* Emit all the Unicode characters. */
7561       STRLEN numlen;
7562       for (foldbuf = tmpbuf;
7563        foldlen;
7564        foldlen -= numlen) {
7565        ender = utf8_to_uvchr(foldbuf, &numlen);
7566        if (numlen > 0) {
7567          const STRLEN unilen = reguni(pRExC_state, ender, s);
7568          len     += unilen;
7569          s       += unilen;
7570          /* In EBCDIC the numlen
7571          * and unilen can differ. */
7572          foldbuf += numlen;
7573          if (numlen >= foldlen)
7574           break;
7575        }
7576        else
7577          break;
7578       }
7579      }
7580      else {
7581       const STRLEN unilen = reguni(pRExC_state, ender, s);
7582       if (unilen > 0) {
7583        s   += unilen;
7584        len += unilen;
7585       }
7586      }
7587      len--;
7588     }
7589     else
7590      REGC((char)ender, s++);
7591    }
7592   loopdone:
7593    RExC_parse = p - 1;
7594    Set_Node_Cur_Length(ret); /* MJD */
7595    nextchar(pRExC_state);
7596    {
7597     /* len is STRLEN which is unsigned, need to copy to signed */
7598     IV iv = len;
7599     if (iv < 0)
7600      vFAIL("Internal disaster");
7601    }
7602    if (len > 0)
7603     *flagp |= HASWIDTH;
7604    if (len == 1 && UNI_IS_INVARIANT(ender))
7605     *flagp |= SIMPLE;
7606
7607    if (SIZE_ONLY)
7608     RExC_size += STR_SZ(len);
7609    else {
7610     STR_LEN(ret) = len;
7611     RExC_emit += STR_SZ(len);
7612    }
7613   }
7614   break;
7615  }
7616
7617  return(ret);
7618 }
7619
7620 STATIC char *
7621 S_regwhite( RExC_state_t *pRExC_state, char *p )
7622 {
7623  const char *e = RExC_end;
7624
7625  PERL_ARGS_ASSERT_REGWHITE;
7626
7627  while (p < e) {
7628   if (isSPACE(*p))
7629    ++p;
7630   else if (*p == '#') {
7631    bool ended = 0;
7632    do {
7633     if (*p++ == '\n') {
7634      ended = 1;
7635      break;
7636     }
7637    } while (p < e);
7638    if (!ended)
7639     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7640   }
7641   else
7642    break;
7643  }
7644  return p;
7645 }
7646
7647 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7648    Character classes ([:foo:]) can also be negated ([:^foo:]).
7649    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7650    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7651    but trigger failures because they are currently unimplemented. */
7652
7653 #define POSIXCC_DONE(c)   ((c) == ':')
7654 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7655 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7656
7657 STATIC I32
7658 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7659 {
7660  dVAR;
7661  I32 namedclass = OOB_NAMEDCLASS;
7662
7663  PERL_ARGS_ASSERT_REGPPOSIXCC;
7664
7665  if (value == '[' && RExC_parse + 1 < RExC_end &&
7666   /* I smell either [: or [= or [. -- POSIX has been here, right? */
7667   POSIXCC(UCHARAT(RExC_parse))) {
7668   const char c = UCHARAT(RExC_parse);
7669   char* const s = RExC_parse++;
7670
7671   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7672    RExC_parse++;
7673   if (RExC_parse == RExC_end)
7674    /* Grandfather lone [:, [=, [. */
7675    RExC_parse = s;
7676   else {
7677    const char* const t = RExC_parse++; /* skip over the c */
7678    assert(*t == c);
7679
7680    if (UCHARAT(RExC_parse) == ']') {
7681     const char *posixcc = s + 1;
7682     RExC_parse++; /* skip over the ending ] */
7683
7684     if (*s == ':') {
7685      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7686      const I32 skip = t - posixcc;
7687
7688      /* Initially switch on the length of the name.  */
7689      switch (skip) {
7690      case 4:
7691       if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7692        namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7693       break;
7694      case 5:
7695       /* Names all of length 5.  */
7696       /* alnum alpha ascii blank cntrl digit graph lower
7697       print punct space upper  */
7698       /* Offset 4 gives the best switch position.  */
7699       switch (posixcc[4]) {
7700       case 'a':
7701        if (memEQ(posixcc, "alph", 4)) /* alpha */
7702         namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7703        break;
7704       case 'e':
7705        if (memEQ(posixcc, "spac", 4)) /* space */
7706         namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7707        break;
7708       case 'h':
7709        if (memEQ(posixcc, "grap", 4)) /* graph */
7710         namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7711        break;
7712       case 'i':
7713        if (memEQ(posixcc, "asci", 4)) /* ascii */
7714         namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7715        break;
7716       case 'k':
7717        if (memEQ(posixcc, "blan", 4)) /* blank */
7718         namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7719        break;
7720       case 'l':
7721        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7722         namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7723        break;
7724       case 'm':
7725        if (memEQ(posixcc, "alnu", 4)) /* alnum */
7726         namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7727        break;
7728       case 'r':
7729        if (memEQ(posixcc, "lowe", 4)) /* lower */
7730         namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7731        else if (memEQ(posixcc, "uppe", 4)) /* upper */
7732         namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7733        break;
7734       case 't':
7735        if (memEQ(posixcc, "digi", 4)) /* digit */
7736         namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7737        else if (memEQ(posixcc, "prin", 4)) /* print */
7738         namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7739        else if (memEQ(posixcc, "punc", 4)) /* punct */
7740         namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7741        break;
7742       }
7743       break;
7744      case 6:
7745       if (memEQ(posixcc, "xdigit", 6))
7746        namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7747       break;
7748      }
7749
7750      if (namedclass == OOB_NAMEDCLASS)
7751       Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7752          t - s - 1, s + 1);
7753      assert (posixcc[skip] == ':');
7754      assert (posixcc[skip+1] == ']');
7755     } else if (!SIZE_ONLY) {
7756      /* [[=foo=]] and [[.foo.]] are still future. */
7757
7758      /* adjust RExC_parse so the warning shows after
7759      the class closes */
7760      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7761       RExC_parse++;
7762      Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7763     }
7764    } else {
7765     /* Maternal grandfather:
7766     * "[:" ending in ":" but not in ":]" */
7767     RExC_parse = s;
7768    }
7769   }
7770  }
7771
7772  return namedclass;
7773 }
7774
7775 STATIC void
7776 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7777 {
7778  dVAR;
7779
7780  PERL_ARGS_ASSERT_CHECKPOSIXCC;
7781
7782  if (POSIXCC(UCHARAT(RExC_parse))) {
7783   const char *s = RExC_parse;
7784   const char  c = *s++;
7785
7786   while (isALNUM(*s))
7787    s++;
7788   if (*s && c == *s && s[1] == ']') {
7789    ckWARN3reg(s+2,
7790      "POSIX syntax [%c %c] belongs inside character classes",
7791      c, c);
7792
7793    /* [[=foo=]] and [[.foo.]] are still future. */
7794    if (POSIXCC_NOTYET(c)) {
7795     /* adjust RExC_parse so the error shows after
7796     the class closes */
7797     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7798      NOOP;
7799     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7800    }
7801   }
7802  }
7803 }
7804
7805
7806 #define _C_C_T_(NAME,TEST,WORD)                         \
7807 ANYOF_##NAME:                                           \
7808  if (LOC)                                            \
7809   ANYOF_CLASS_SET(ret, ANYOF_##NAME);             \
7810  else {                                              \
7811   for (value = 0; value < 256; value++)           \
7812    if (TEST)                                   \
7813     ANYOF_BITMAP_SET(ret, value);           \
7814  }                                                   \
7815  yesno = '+';                                        \
7816  what = WORD;                                        \
7817  break;                                              \
7818 case ANYOF_N##NAME:                                     \
7819  if (LOC)                                            \
7820   ANYOF_CLASS_SET(ret, ANYOF_N##NAME);            \
7821  else {                                              \
7822   for (value = 0; value < 256; value++)           \
7823    if (!TEST)                                  \
7824     ANYOF_BITMAP_SET(ret, value);           \
7825  }                                                   \
7826  yesno = '!';                                        \
7827  what = WORD;                                        \
7828  break
7829
7830 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
7831 ANYOF_##NAME:                                           \
7832   for (value = 0; value < 256; value++)           \
7833    if (TEST)                                   \
7834     ANYOF_BITMAP_SET(ret, value);           \
7835  yesno = '+';                                        \
7836  what = WORD;                                        \
7837  break;                                              \
7838 case ANYOF_N##NAME:                                     \
7839   for (value = 0; value < 256; value++)           \
7840    if (!TEST)                                  \
7841     ANYOF_BITMAP_SET(ret, value);           \
7842  yesno = '!';                                        \
7843  what = WORD;                                        \
7844  break
7845
7846 /*
7847    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7848    so that it is possible to override the option here without having to
7849    rebuild the entire core. as we are required to do if we change regcomp.h
7850    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7851 */
7852 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7853 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7854 #endif
7855
7856 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7857 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7858 #else
7859 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7860 #endif
7861
7862 /*
7863    parse a class specification and produce either an ANYOF node that
7864    matches the pattern or if the pattern matches a single char only and
7865    that char is < 256 and we are case insensitive then we produce an
7866    EXACT node instead.
7867 */
7868
7869 STATIC regnode *
7870 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7871 {
7872  dVAR;
7873  register UV nextvalue;
7874  register IV prevvalue = OOB_UNICODE;
7875  register IV range = 0;
7876  UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7877  register regnode *ret;
7878  STRLEN numlen;
7879  IV namedclass;
7880  char *rangebegin = NULL;
7881  bool need_class = 0;
7882  SV *listsv = NULL;
7883  UV n;
7884  bool optimize_invert   = TRUE;
7885  AV* unicode_alternate  = NULL;
7886 #ifdef EBCDIC
7887  UV literal_endpoint = 0;
7888 #endif
7889  UV stored = 0;  /* number of chars stored in the class */
7890
7891  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7892   case we need to change the emitted regop to an EXACT. */
7893  const char * orig_parse = RExC_parse;
7894  GET_RE_DEBUG_FLAGS_DECL;
7895
7896  PERL_ARGS_ASSERT_REGCLASS;
7897 #ifndef DEBUGGING
7898  PERL_UNUSED_ARG(depth);
7899 #endif
7900
7901  DEBUG_PARSE("clas");
7902
7903  /* Assume we are going to generate an ANYOF node. */
7904  ret = reganode(pRExC_state, ANYOF, 0);
7905
7906  if (!SIZE_ONLY)
7907   ANYOF_FLAGS(ret) = 0;
7908
7909  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7910   RExC_naughty++;
7911   RExC_parse++;
7912   if (!SIZE_ONLY)
7913    ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7914  }
7915
7916  if (SIZE_ONLY) {
7917   RExC_size += ANYOF_SKIP;
7918   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7919  }
7920  else {
7921   RExC_emit += ANYOF_SKIP;
7922   if (FOLD)
7923    ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7924   if (LOC)
7925    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7926   ANYOF_BITMAP_ZERO(ret);
7927   listsv = newSVpvs("# comment\n");
7928  }
7929
7930  nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7931
7932  if (!SIZE_ONLY && POSIXCC(nextvalue))
7933   checkposixcc(pRExC_state);
7934
7935  /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7936  if (UCHARAT(RExC_parse) == ']')
7937   goto charclassloop;
7938
7939 parseit:
7940  while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7941
7942  charclassloop:
7943
7944   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7945
7946   if (!range)
7947    rangebegin = RExC_parse;
7948   if (UTF) {
7949    value = utf8n_to_uvchr((U8*)RExC_parse,
7950         RExC_end - RExC_parse,
7951         &numlen, UTF8_ALLOW_DEFAULT);
7952    RExC_parse += numlen;
7953   }
7954   else
7955    value = UCHARAT(RExC_parse++);
7956
7957   nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7958   if (value == '[' && POSIXCC(nextvalue))
7959    namedclass = regpposixcc(pRExC_state, value);
7960   else if (value == '\\') {
7961    if (UTF) {
7962     value = utf8n_to_uvchr((U8*)RExC_parse,
7963         RExC_end - RExC_parse,
7964         &numlen, UTF8_ALLOW_DEFAULT);
7965     RExC_parse += numlen;
7966    }
7967    else
7968     value = UCHARAT(RExC_parse++);
7969    /* Some compilers cannot handle switching on 64-bit integer
7970    * values, therefore value cannot be an UV.  Yes, this will
7971    * be a problem later if we want switch on Unicode.
7972    * A similar issue a little bit later when switching on
7973    * namedclass. --jhi */
7974    switch ((I32)value) {
7975    case 'w': namedclass = ANYOF_ALNUM; break;
7976    case 'W': namedclass = ANYOF_NALNUM; break;
7977    case 's': namedclass = ANYOF_SPACE; break;
7978    case 'S': namedclass = ANYOF_NSPACE; break;
7979    case 'd': namedclass = ANYOF_DIGIT; break;
7980    case 'D': namedclass = ANYOF_NDIGIT; break;
7981    case 'v': namedclass = ANYOF_VERTWS; break;
7982    case 'V': namedclass = ANYOF_NVERTWS; break;
7983    case 'h': namedclass = ANYOF_HORIZWS; break;
7984    case 'H': namedclass = ANYOF_NHORIZWS; break;
7985    case 'N':  /* Handle \N{NAME} in class */
7986     {
7987      /* We only pay attention to the first char of
7988      multichar strings being returned. I kinda wonder
7989      if this makes sense as it does change the behaviour
7990      from earlier versions, OTOH that behaviour was broken
7991      as well. */
7992      UV v; /* value is register so we cant & it /grrr */
7993      if (reg_namedseq(pRExC_state, &v, NULL)) {
7994       goto parseit;
7995      }
7996      value= v;
7997     }
7998     break;
7999    case 'p':
8000    case 'P':
8001     {
8002     char *e;
8003     if (RExC_parse >= RExC_end)
8004      vFAIL2("Empty \\%c{}", (U8)value);
8005     if (*RExC_parse == '{') {
8006      const U8 c = (U8)value;
8007      e = strchr(RExC_parse++, '}');
8008      if (!e)
8009       vFAIL2("Missing right brace on \\%c{}", c);
8010      while (isSPACE(UCHARAT(RExC_parse)))
8011       RExC_parse++;
8012      if (e == RExC_parse)
8013       vFAIL2("Empty \\%c{}", c);
8014      n = e - RExC_parse;
8015      while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8016       n--;
8017     }
8018     else {
8019      e = RExC_parse;
8020      n = 1;
8021     }
8022     if (!SIZE_ONLY) {
8023      if (UCHARAT(RExC_parse) == '^') {
8024       RExC_parse++;
8025       n--;
8026       value = value == 'p' ? 'P' : 'p'; /* toggle */
8027       while (isSPACE(UCHARAT(RExC_parse))) {
8028        RExC_parse++;
8029        n--;
8030       }
8031      }
8032      Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8033       (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8034     }
8035     RExC_parse = e + 1;
8036     ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8037     namedclass = ANYOF_MAX;  /* no official name, but it's named */
8038     }
8039     break;
8040    case 'n': value = '\n';   break;
8041    case 'r': value = '\r';   break;
8042    case 't': value = '\t';   break;
8043    case 'f': value = '\f';   break;
8044    case 'b': value = '\b';   break;
8045    case 'e': value = ASCII_TO_NATIVE('\033');break;
8046    case 'a': value = ASCII_TO_NATIVE('\007');break;
8047    case 'x':
8048     if (*RExC_parse == '{') {
8049      I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8050       | PERL_SCAN_DISALLOW_PREFIX;
8051      char * const e = strchr(RExC_parse++, '}');
8052      if (!e)
8053       vFAIL("Missing right brace on \\x{}");
8054
8055      numlen = e - RExC_parse;
8056      value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8057      RExC_parse = e + 1;
8058     }
8059     else {
8060      I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8061      numlen = 2;
8062      value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8063      RExC_parse += numlen;
8064     }
8065     if (PL_encoding && value < 0x100)
8066      goto recode_encoding;
8067     break;
8068    case 'c':
8069     value = UCHARAT(RExC_parse++);
8070     value = toCTRL(value);
8071     break;
8072    case '0': case '1': case '2': case '3': case '4':
8073    case '5': case '6': case '7': case '8': case '9':
8074     {
8075      I32 flags = 0;
8076      numlen = 3;
8077      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8078      RExC_parse += numlen;
8079      if (PL_encoding && value < 0x100)
8080       goto recode_encoding;
8081      break;
8082     }
8083    recode_encoding:
8084     {
8085      SV* enc = PL_encoding;
8086      value = reg_recode((const char)(U8)value, &enc);
8087      if (!enc && SIZE_ONLY)
8088       ckWARNreg(RExC_parse,
8089         "Invalid escape in the specified encoding");
8090      break;
8091     }
8092    default:
8093     if (!SIZE_ONLY && isALPHA(value))
8094      ckWARN2reg(RExC_parse,
8095        "Unrecognized escape \\%c in character class passed through",
8096        (int)value);
8097     break;
8098    }
8099   } /* end of \blah */
8100 #ifdef EBCDIC
8101   else
8102    literal_endpoint++;
8103 #endif
8104
8105   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8106
8107    if (!SIZE_ONLY && !need_class)
8108     ANYOF_CLASS_ZERO(ret);
8109
8110    need_class = 1;
8111
8112    /* a bad range like a-\d, a-[:digit:] ? */
8113    if (range) {
8114     if (!SIZE_ONLY) {
8115      const int w =
8116       RExC_parse >= rangebegin ?
8117       RExC_parse - rangebegin : 0;
8118      ckWARN4reg(RExC_parse,
8119        "False [] range \"%*.*s\"",
8120        w, w, rangebegin);
8121
8122      if (prevvalue < 256) {
8123       ANYOF_BITMAP_SET(ret, prevvalue);
8124       ANYOF_BITMAP_SET(ret, '-');
8125      }
8126      else {
8127       ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8128       Perl_sv_catpvf(aTHX_ listsv,
8129          "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8130      }
8131     }
8132
8133     range = 0; /* this was not a true range */
8134    }
8135
8136
8137
8138    if (!SIZE_ONLY) {
8139     const char *what = NULL;
8140     char yesno = 0;
8141
8142     if (namedclass > OOB_NAMEDCLASS)
8143      optimize_invert = FALSE;
8144     /* Possible truncation here but in some 64-bit environments
8145     * the compiler gets heartburn about switch on 64-bit values.
8146     * A similar issue a little earlier when switching on value.
8147     * --jhi */
8148     switch ((I32)namedclass) {
8149
8150     case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8151     case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8152     case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8153     case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8154     case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8155     case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8156     case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8157     case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8158     case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8159     case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8160 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8161     case _C_C_T_(ALNUM, isALNUM(value), "Word");
8162     case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8163 #else
8164     case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8165     case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8166 #endif
8167     case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8168     case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8169     case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8170     case ANYOF_ASCII:
8171      if (LOC)
8172       ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8173      else {
8174 #ifndef EBCDIC
8175       for (value = 0; value < 128; value++)
8176        ANYOF_BITMAP_SET(ret, value);
8177 #else  /* EBCDIC */
8178       for (value = 0; value < 256; value++) {
8179        if (isASCII(value))
8180         ANYOF_BITMAP_SET(ret, value);
8181       }
8182 #endif /* EBCDIC */
8183      }
8184      yesno = '+';
8185      what = "ASCII";
8186      break;
8187     case ANYOF_NASCII:
8188      if (LOC)
8189       ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8190      else {
8191 #ifndef EBCDIC
8192       for (value = 128; value < 256; value++)
8193        ANYOF_BITMAP_SET(ret, value);
8194 #else  /* EBCDIC */
8195       for (value = 0; value < 256; value++) {
8196        if (!isASCII(value))
8197         ANYOF_BITMAP_SET(ret, value);
8198       }
8199 #endif /* EBCDIC */
8200      }
8201      yesno = '!';
8202      what = "ASCII";
8203      break;
8204     case ANYOF_DIGIT:
8205      if (LOC)
8206       ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8207      else {
8208       /* consecutive digits assumed */
8209       for (value = '0'; value <= '9'; value++)
8210        ANYOF_BITMAP_SET(ret, value);
8211      }
8212      yesno = '+';
8213      what = POSIX_CC_UNI_NAME("Digit");
8214      break;
8215     case ANYOF_NDIGIT:
8216      if (LOC)
8217       ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8218      else {
8219       /* consecutive digits assumed */
8220       for (value = 0; value < '0'; value++)
8221        ANYOF_BITMAP_SET(ret, value);
8222       for (value = '9' + 1; value < 256; value++)
8223        ANYOF_BITMAP_SET(ret, value);
8224      }
8225      yesno = '!';
8226      what = POSIX_CC_UNI_NAME("Digit");
8227      break;
8228     case ANYOF_MAX:
8229      /* this is to handle \p and \P */
8230      break;
8231     default:
8232      vFAIL("Invalid [::] class");
8233      break;
8234     }
8235     if (what) {
8236      /* Strings such as "+utf8::isWord\n" */
8237      Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8238     }
8239     if (LOC)
8240      ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8241     continue;
8242    }
8243   } /* end of namedclass \blah */
8244
8245   if (range) {
8246    if (prevvalue > (IV)value) /* b-a */ {
8247     const int w = RExC_parse - rangebegin;
8248     Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8249     range = 0; /* not a valid range */
8250    }
8251   }
8252   else {
8253    prevvalue = value; /* save the beginning of the range */
8254    if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8255     RExC_parse[1] != ']') {
8256     RExC_parse++;
8257
8258     /* a bad range like \w-, [:word:]- ? */
8259     if (namedclass > OOB_NAMEDCLASS) {
8260      if (ckWARN(WARN_REGEXP)) {
8261       const int w =
8262        RExC_parse >= rangebegin ?
8263        RExC_parse - rangebegin : 0;
8264       vWARN4(RExC_parse,
8265        "False [] range \"%*.*s\"",
8266        w, w, rangebegin);
8267      }
8268      if (!SIZE_ONLY)
8269       ANYOF_BITMAP_SET(ret, '-');
8270     } else
8271      range = 1; /* yeah, it's a range! */
8272     continue; /* but do it the next time */
8273    }
8274   }
8275
8276   /* now is the next time */
8277   /*stored += (value - prevvalue + 1);*/
8278   if (!SIZE_ONLY) {
8279    if (prevvalue < 256) {
8280     const IV ceilvalue = value < 256 ? value : 255;
8281     IV i;
8282 #ifdef EBCDIC
8283     /* In EBCDIC [\x89-\x91] should include
8284     * the \x8e but [i-j] should not. */
8285     if (literal_endpoint == 2 &&
8286      ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8287      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8288     {
8289      if (isLOWER(prevvalue)) {
8290       for (i = prevvalue; i <= ceilvalue; i++)
8291        if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8292         stored++;
8293         ANYOF_BITMAP_SET(ret, i);
8294        }
8295      } else {
8296       for (i = prevvalue; i <= ceilvalue; i++)
8297        if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8298         stored++;
8299         ANYOF_BITMAP_SET(ret, i);
8300        }
8301      }
8302     }
8303     else
8304 #endif
8305      for (i = prevvalue; i <= ceilvalue; i++) {
8306       if (!ANYOF_BITMAP_TEST(ret,i)) {
8307        stored++;
8308        ANYOF_BITMAP_SET(ret, i);
8309       }
8310      }
8311   }
8312   if (value > 255 || UTF) {
8313     const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8314     const UV natvalue      = NATIVE_TO_UNI(value);
8315     stored+=2; /* can't optimize this class */
8316     ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8317     if (prevnatvalue < natvalue) { /* what about > ? */
8318      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8319         prevnatvalue, natvalue);
8320     }
8321     else if (prevnatvalue == natvalue) {
8322      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8323      if (FOLD) {
8324       U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8325       STRLEN foldlen;
8326       const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8327
8328 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8329       if (RExC_precomp[0] == ':' &&
8330        RExC_precomp[1] == '[' &&
8331        (f == 0xDF || f == 0x92)) {
8332        f = NATIVE_TO_UNI(f);
8333       }
8334 #endif
8335       /* If folding and foldable and a single
8336       * character, insert also the folded version
8337       * to the charclass. */
8338       if (f != value) {
8339 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8340        if ((RExC_precomp[0] == ':' &&
8341         RExC_precomp[1] == '[' &&
8342         (f == 0xA2 &&
8343         (value == 0xFB05 || value == 0xFB06))) ?
8344         foldlen == ((STRLEN)UNISKIP(f) - 1) :
8345         foldlen == (STRLEN)UNISKIP(f) )
8346 #else
8347        if (foldlen == (STRLEN)UNISKIP(f))
8348 #endif
8349         Perl_sv_catpvf(aTHX_ listsv,
8350             "%04"UVxf"\n", f);
8351        else {
8352         /* Any multicharacter foldings
8353         * require the following transform:
8354         * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8355         * where E folds into "pq" and F folds
8356         * into "rst", all other characters
8357         * fold to single characters.  We save
8358         * away these multicharacter foldings,
8359         * to be later saved as part of the
8360         * additional "s" data. */
8361         SV *sv;
8362
8363         if (!unicode_alternate)
8364          unicode_alternate = newAV();
8365         sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8366              TRUE);
8367         av_push(unicode_alternate, sv);
8368        }
8369       }
8370
8371       /* If folding and the value is one of the Greek
8372       * sigmas insert a few more sigmas to make the
8373       * folding rules of the sigmas to work right.
8374       * Note that not all the possible combinations
8375       * are handled here: some of them are handled
8376       * by the standard folding rules, and some of
8377       * them (literal or EXACTF cases) are handled
8378       * during runtime in regexec.c:S_find_byclass(). */
8379       if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8380        Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8381            (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8382        Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8383            (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8384       }
8385       else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8386        Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8387            (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8388      }
8389     }
8390    }
8391 #ifdef EBCDIC
8392    literal_endpoint = 0;
8393 #endif
8394   }
8395
8396   range = 0; /* this range (if it was one) is done now */
8397  }
8398
8399  if (need_class) {
8400   ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8401   if (SIZE_ONLY)
8402    RExC_size += ANYOF_CLASS_ADD_SKIP;
8403   else
8404    RExC_emit += ANYOF_CLASS_ADD_SKIP;
8405  }
8406
8407
8408  if (SIZE_ONLY)
8409   return ret;
8410  /****** !SIZE_ONLY AFTER HERE *********/
8411
8412  if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8413   && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8414  ) {
8415   /* optimize single char class to an EXACT node
8416   but *only* when its not a UTF/high char  */
8417   const char * cur_parse= RExC_parse;
8418   RExC_emit = (regnode *)orig_emit;
8419   RExC_parse = (char *)orig_parse;
8420   ret = reg_node(pRExC_state,
8421      (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8422   RExC_parse = (char *)cur_parse;
8423   *STRING(ret)= (char)value;
8424   STR_LEN(ret)= 1;
8425   RExC_emit += STR_SZ(1);
8426   SvREFCNT_dec(listsv);
8427   return ret;
8428  }
8429  /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8430  if ( /* If the only flag is folding (plus possibly inversion). */
8431   ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8432  ) {
8433   for (value = 0; value < 256; ++value) {
8434    if (ANYOF_BITMAP_TEST(ret, value)) {
8435     UV fold = PL_fold[value];
8436
8437     if (fold != value)
8438      ANYOF_BITMAP_SET(ret, fold);
8439    }
8440   }
8441   ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8442  }
8443
8444  /* optimize inverted simple patterns (e.g. [^a-z]) */
8445  if (optimize_invert &&
8446   /* If the only flag is inversion. */
8447   (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8448   for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8449    ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8450   ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8451  }
8452  {
8453   AV * const av = newAV();
8454   SV *rv;
8455   /* The 0th element stores the character class description
8456   * in its textual form: used later (regexec.c:Perl_regclass_swash())
8457   * to initialize the appropriate swash (which gets stored in
8458   * the 1st element), and also useful for dumping the regnode.
8459   * The 2nd element stores the multicharacter foldings,
8460   * used later (regexec.c:S_reginclass()). */
8461   av_store(av, 0, listsv);
8462   av_store(av, 1, NULL);
8463   av_store(av, 2, MUTABLE_SV(unicode_alternate));
8464   rv = newRV_noinc(MUTABLE_SV(av));
8465   n = add_data(pRExC_state, 1, "s");
8466   RExC_rxi->data->data[n] = (void*)rv;
8467   ARG_SET(ret, n);
8468  }
8469  return ret;
8470 }
8471 #undef _C_C_T_
8472
8473
8474 /* reg_skipcomment()
8475
8476    Absorbs an /x style # comments from the input stream.
8477    Returns true if there is more text remaining in the stream.
8478    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8479    terminates the pattern without including a newline.
8480
8481    Note its the callers responsibility to ensure that we are
8482    actually in /x mode
8483
8484 */
8485
8486 STATIC bool
8487 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8488 {
8489  bool ended = 0;
8490
8491  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8492
8493  while (RExC_parse < RExC_end)
8494   if (*RExC_parse++ == '\n') {
8495    ended = 1;
8496    break;
8497   }
8498  if (!ended) {
8499   /* we ran off the end of the pattern without ending
8500   the comment, so we have to add an \n when wrapping */
8501   RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8502   return 0;
8503  } else
8504   return 1;
8505 }
8506
8507 /* nextchar()
8508
8509    Advance that parse position, and optionall absorbs
8510    "whitespace" from the inputstream.
8511
8512    Without /x "whitespace" means (?#...) style comments only,
8513    with /x this means (?#...) and # comments and whitespace proper.
8514
8515    Returns the RExC_parse point from BEFORE the scan occurs.
8516
8517    This is the /x friendly way of saying RExC_parse++.
8518 */
8519
8520 STATIC char*
8521 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8522 {
8523  char* const retval = RExC_parse++;
8524
8525  PERL_ARGS_ASSERT_NEXTCHAR;
8526
8527  for (;;) {
8528   if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8529     RExC_parse[2] == '#') {
8530    while (*RExC_parse != ')') {
8531     if (RExC_parse == RExC_end)
8532      FAIL("Sequence (?#... not terminated");
8533     RExC_parse++;
8534    }
8535    RExC_parse++;
8536    continue;
8537   }
8538   if (RExC_flags & RXf_PMf_EXTENDED) {
8539    if (isSPACE(*RExC_parse)) {
8540     RExC_parse++;
8541     continue;
8542    }
8543    else if (*RExC_parse == '#') {
8544     if ( reg_skipcomment( pRExC_state ) )
8545      continue;
8546    }
8547   }
8548   return retval;
8549  }
8550 }
8551
8552 /*
8553 - reg_node - emit a node
8554 */
8555 STATIC regnode *   /* Location. */
8556 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8557 {
8558  dVAR;
8559  register regnode *ptr;
8560  regnode * const ret = RExC_emit;
8561  GET_RE_DEBUG_FLAGS_DECL;
8562
8563  PERL_ARGS_ASSERT_REG_NODE;
8564
8565  if (SIZE_ONLY) {
8566   SIZE_ALIGN(RExC_size);
8567   RExC_size += 1;
8568   return(ret);
8569  }
8570  if (RExC_emit >= RExC_emit_bound)
8571   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8572
8573  NODE_ALIGN_FILL(ret);
8574  ptr = ret;
8575  FILL_ADVANCE_NODE(ptr, op);
8576  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8577 #ifdef RE_TRACK_PATTERN_OFFSETS
8578  if (RExC_offsets) {         /* MJD */
8579   MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8580    "reg_node", __LINE__,
8581    PL_reg_name[op],
8582    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8583     ? "Overwriting end of array!\n" : "OK",
8584    (UV)(RExC_emit - RExC_emit_start),
8585    (UV)(RExC_parse - RExC_start),
8586    (UV)RExC_offsets[0]));
8587   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8588  }
8589 #endif
8590  RExC_emit = ptr;
8591  return(ret);
8592 }
8593
8594 /*
8595 - reganode - emit a node with an argument
8596 */
8597 STATIC regnode *   /* Location. */
8598 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8599 {
8600  dVAR;
8601  register regnode *ptr;
8602  regnode * const ret = RExC_emit;
8603  GET_RE_DEBUG_FLAGS_DECL;
8604
8605  PERL_ARGS_ASSERT_REGANODE;
8606
8607  if (SIZE_ONLY) {
8608   SIZE_ALIGN(RExC_size);
8609   RExC_size += 2;
8610   /*
8611   We can't do this:
8612
8613   assert(2==regarglen[op]+1);
8614
8615   Anything larger than this has to allocate the extra amount.
8616   If we changed this to be:
8617
8618   RExC_size += (1 + regarglen[op]);
8619
8620   then it wouldn't matter. Its not clear what side effect
8621   might come from that so its not done so far.
8622   -- dmq
8623   */
8624   return(ret);
8625  }
8626  if (RExC_emit >= RExC_emit_bound)
8627   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8628
8629  NODE_ALIGN_FILL(ret);
8630  ptr = ret;
8631  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8632  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8633 #ifdef RE_TRACK_PATTERN_OFFSETS
8634  if (RExC_offsets) {         /* MJD */
8635   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8636    "reganode",
8637    __LINE__,
8638    PL_reg_name[op],
8639    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8640    "Overwriting end of array!\n" : "OK",
8641    (UV)(RExC_emit - RExC_emit_start),
8642    (UV)(RExC_parse - RExC_start),
8643    (UV)RExC_offsets[0]));
8644   Set_Cur_Node_Offset;
8645  }
8646 #endif
8647  RExC_emit = ptr;
8648  return(ret);
8649 }
8650
8651 /*
8652 - reguni - emit (if appropriate) a Unicode character
8653 */
8654 STATIC STRLEN
8655 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8656 {
8657  dVAR;
8658
8659  PERL_ARGS_ASSERT_REGUNI;
8660
8661  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8662 }
8663
8664 /*
8665 - reginsert - insert an operator in front of already-emitted operand
8666 *
8667 * Means relocating the operand.
8668 */
8669 STATIC void
8670 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8671 {
8672  dVAR;
8673  register regnode *src;
8674  register regnode *dst;
8675  register regnode *place;
8676  const int offset = regarglen[(U8)op];
8677  const int size = NODE_STEP_REGNODE + offset;
8678  GET_RE_DEBUG_FLAGS_DECL;
8679
8680  PERL_ARGS_ASSERT_REGINSERT;
8681  PERL_UNUSED_ARG(depth);
8682 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8683  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8684  if (SIZE_ONLY) {
8685   RExC_size += size;
8686   return;
8687  }
8688
8689  src = RExC_emit;
8690  RExC_emit += size;
8691  dst = RExC_emit;
8692  if (RExC_open_parens) {
8693   int paren;
8694   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8695   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8696    if ( RExC_open_parens[paren] >= opnd ) {
8697     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8698     RExC_open_parens[paren] += size;
8699    } else {
8700     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8701    }
8702    if ( RExC_close_parens[paren] >= opnd ) {
8703     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8704     RExC_close_parens[paren] += size;
8705    } else {
8706     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8707    }
8708   }
8709  }
8710
8711  while (src > opnd) {
8712   StructCopy(--src, --dst, regnode);
8713 #ifdef RE_TRACK_PATTERN_OFFSETS
8714   if (RExC_offsets) {     /* MJD 20010112 */
8715    MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8716     "reg_insert",
8717     __LINE__,
8718     PL_reg_name[op],
8719     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8720      ? "Overwriting end of array!\n" : "OK",
8721     (UV)(src - RExC_emit_start),
8722     (UV)(dst - RExC_emit_start),
8723     (UV)RExC_offsets[0]));
8724    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8725    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8726   }
8727 #endif
8728  }
8729
8730
8731  place = opnd;  /* Op node, where operand used to be. */
8732 #ifdef RE_TRACK_PATTERN_OFFSETS
8733  if (RExC_offsets) {         /* MJD */
8734   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8735    "reginsert",
8736    __LINE__,
8737    PL_reg_name[op],
8738    (UV)(place - RExC_emit_start) > RExC_offsets[0]
8739    ? "Overwriting end of array!\n" : "OK",
8740    (UV)(place - RExC_emit_start),
8741    (UV)(RExC_parse - RExC_start),
8742    (UV)RExC_offsets[0]));
8743   Set_Node_Offset(place, RExC_parse);
8744   Set_Node_Length(place, 1);
8745  }
8746 #endif
8747  src = NEXTOPER(place);
8748  FILL_ADVANCE_NODE(place, op);
8749  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
8750  Zero(src, offset, regnode);
8751 }
8752
8753 /*
8754 - regtail - set the next-pointer at the end of a node chain of p to val.
8755 - SEE ALSO: regtail_study
8756 */
8757 /* TODO: All three parms should be const */
8758 STATIC void
8759 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8760 {
8761  dVAR;
8762  register regnode *scan;
8763  GET_RE_DEBUG_FLAGS_DECL;
8764
8765  PERL_ARGS_ASSERT_REGTAIL;
8766 #ifndef DEBUGGING
8767  PERL_UNUSED_ARG(depth);
8768 #endif
8769
8770  if (SIZE_ONLY)
8771   return;
8772
8773  /* Find last node. */
8774  scan = p;
8775  for (;;) {
8776   regnode * const temp = regnext(scan);
8777   DEBUG_PARSE_r({
8778    SV * const mysv=sv_newmortal();
8779    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8780    regprop(RExC_rx, mysv, scan);
8781    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8782     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8783      (temp == NULL ? "->" : ""),
8784      (temp == NULL ? PL_reg_name[OP(val)] : "")
8785    );
8786   });
8787   if (temp == NULL)
8788    break;
8789   scan = temp;
8790  }
8791
8792  if (reg_off_by_arg[OP(scan)]) {
8793   ARG_SET(scan, val - scan);
8794  }
8795  else {
8796   NEXT_OFF(scan) = val - scan;
8797  }
8798 }
8799
8800 #ifdef DEBUGGING
8801 /*
8802 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8803 - Look for optimizable sequences at the same time.
8804 - currently only looks for EXACT chains.
8805
8806 This is expermental code. The idea is to use this routine to perform
8807 in place optimizations on branches and groups as they are constructed,
8808 with the long term intention of removing optimization from study_chunk so
8809 that it is purely analytical.
8810
8811 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8812 to control which is which.
8813
8814 */
8815 /* TODO: All four parms should be const */
8816
8817 STATIC U8
8818 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8819 {
8820  dVAR;
8821  register regnode *scan;
8822  U8 exact = PSEUDO;
8823 #ifdef EXPERIMENTAL_INPLACESCAN
8824  I32 min = 0;
8825 #endif
8826  GET_RE_DEBUG_FLAGS_DECL;
8827
8828  PERL_ARGS_ASSERT_REGTAIL_STUDY;
8829
8830
8831  if (SIZE_ONLY)
8832   return exact;
8833
8834  /* Find last node. */
8835
8836  scan = p;
8837  for (;;) {
8838   regnode * const temp = regnext(scan);
8839 #ifdef EXPERIMENTAL_INPLACESCAN
8840   if (PL_regkind[OP(scan)] == EXACT)
8841    if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8842     return EXACT;
8843 #endif
8844   if ( exact ) {
8845    switch (OP(scan)) {
8846     case EXACT:
8847     case EXACTF:
8848     case EXACTFL:
8849       if( exact == PSEUDO )
8850        exact= OP(scan);
8851       else if ( exact != OP(scan) )
8852        exact= 0;
8853     case NOTHING:
8854      break;
8855     default:
8856      exact= 0;
8857    }
8858   }
8859   DEBUG_PARSE_r({
8860    SV * const mysv=sv_newmortal();
8861    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8862    regprop(RExC_rx, mysv, scan);
8863    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8864     SvPV_nolen_const(mysv),
8865     REG_NODE_NUM(scan),
8866     PL_reg_name[exact]);
8867   });
8868   if (temp == NULL)
8869    break;
8870   scan = temp;
8871  }
8872  DEBUG_PARSE_r({
8873   SV * const mysv_val=sv_newmortal();
8874   DEBUG_PARSE_MSG("");
8875   regprop(RExC_rx, mysv_val, val);
8876   PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8877      SvPV_nolen_const(mysv_val),
8878      (IV)REG_NODE_NUM(val),
8879      (IV)(val - scan)
8880   );
8881  });
8882  if (reg_off_by_arg[OP(scan)]) {
8883   ARG_SET(scan, val - scan);
8884  }
8885  else {
8886   NEXT_OFF(scan) = val - scan;
8887  }
8888
8889  return exact;
8890 }
8891 #endif
8892
8893 /*
8894  - regcurly - a little FSA that accepts {\d+,?\d*}
8895  */
8896 #ifndef PERL_IN_XSUB_RE
8897 I32
8898 Perl_regcurly(register const char *s)
8899 {
8900  PERL_ARGS_ASSERT_REGCURLY;
8901
8902  if (*s++ != '{')
8903   return FALSE;
8904  if (!isDIGIT(*s))
8905   return FALSE;
8906  while (isDIGIT(*s))
8907   s++;
8908  if (*s == ',')
8909   s++;
8910  while (isDIGIT(*s))
8911   s++;
8912  if (*s != '}')
8913   return FALSE;
8914  return TRUE;
8915 }
8916 #endif
8917
8918 /*
8919  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8920  */
8921 #ifdef DEBUGGING
8922 static void
8923 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8924 {
8925  int bit;
8926  int set=0;
8927
8928  for (bit=0; bit<32; bit++) {
8929   if (flags & (1<<bit)) {
8930    if (!set++ && lead)
8931     PerlIO_printf(Perl_debug_log, "%s",lead);
8932    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8933   }
8934  }
8935  if (lead)  {
8936   if (set)
8937    PerlIO_printf(Perl_debug_log, "\n");
8938   else
8939    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8940  }
8941 }
8942 #endif
8943
8944 void
8945 Perl_regdump(pTHX_ const regexp *r)
8946 {
8947 #ifdef DEBUGGING
8948  dVAR;
8949  SV * const sv = sv_newmortal();
8950  SV *dsv= sv_newmortal();
8951  RXi_GET_DECL(r,ri);
8952  GET_RE_DEBUG_FLAGS_DECL;
8953
8954  PERL_ARGS_ASSERT_REGDUMP;
8955
8956  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8957
8958  /* Header fields of interest. */
8959  if (r->anchored_substr) {
8960   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8961    RE_SV_DUMPLEN(r->anchored_substr), 30);
8962   PerlIO_printf(Perl_debug_log,
8963      "anchored %s%s at %"IVdf" ",
8964      s, RE_SV_TAIL(r->anchored_substr),
8965      (IV)r->anchored_offset);
8966  } else if (r->anchored_utf8) {
8967   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8968    RE_SV_DUMPLEN(r->anchored_utf8), 30);
8969   PerlIO_printf(Perl_debug_log,
8970      "anchored utf8 %s%s at %"IVdf" ",
8971      s, RE_SV_TAIL(r->anchored_utf8),
8972      (IV)r->anchored_offset);
8973  }
8974  if (r->float_substr) {
8975   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8976    RE_SV_DUMPLEN(r->float_substr), 30);
8977   PerlIO_printf(Perl_debug_log,
8978      "floating %s%s at %"IVdf"..%"UVuf" ",
8979      s, RE_SV_TAIL(r->float_substr),
8980      (IV)r->float_min_offset, (UV)r->float_max_offset);
8981  } else if (r->float_utf8) {
8982   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8983    RE_SV_DUMPLEN(r->float_utf8), 30);
8984   PerlIO_printf(Perl_debug_log,
8985      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8986      s, RE_SV_TAIL(r->float_utf8),
8987      (IV)r->float_min_offset, (UV)r->float_max_offset);
8988  }
8989  if (r->check_substr || r->check_utf8)
8990   PerlIO_printf(Perl_debug_log,
8991      (const char *)
8992      (r->check_substr == r->float_substr
8993      && r->check_utf8 == r->float_utf8
8994      ? "(checking floating" : "(checking anchored"));
8995  if (r->extflags & RXf_NOSCAN)
8996   PerlIO_printf(Perl_debug_log, " noscan");
8997  if (r->extflags & RXf_CHECK_ALL)
8998   PerlIO_printf(Perl_debug_log, " isall");
8999  if (r->check_substr || r->check_utf8)
9000   PerlIO_printf(Perl_debug_log, ") ");
9001
9002  if (ri->regstclass) {
9003   regprop(r, sv, ri->regstclass);
9004   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9005  }
9006  if (r->extflags & RXf_ANCH) {
9007   PerlIO_printf(Perl_debug_log, "anchored");
9008   if (r->extflags & RXf_ANCH_BOL)
9009    PerlIO_printf(Perl_debug_log, "(BOL)");
9010   if (r->extflags & RXf_ANCH_MBOL)
9011    PerlIO_printf(Perl_debug_log, "(MBOL)");
9012   if (r->extflags & RXf_ANCH_SBOL)
9013    PerlIO_printf(Perl_debug_log, "(SBOL)");
9014   if (r->extflags & RXf_ANCH_GPOS)
9015    PerlIO_printf(Perl_debug_log, "(GPOS)");
9016   PerlIO_putc(Perl_debug_log, ' ');
9017  }
9018  if (r->extflags & RXf_GPOS_SEEN)
9019   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9020  if (r->intflags & PREGf_SKIP)
9021   PerlIO_printf(Perl_debug_log, "plus ");
9022  if (r->intflags & PREGf_IMPLICIT)
9023   PerlIO_printf(Perl_debug_log, "implicit ");
9024  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9025  if (r->extflags & RXf_EVAL_SEEN)
9026   PerlIO_printf(Perl_debug_log, "with eval ");
9027  PerlIO_printf(Perl_debug_log, "\n");
9028  DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9029 #else
9030  PERL_ARGS_ASSERT_REGDUMP;
9031  PERL_UNUSED_CONTEXT;
9032  PERL_UNUSED_ARG(r);
9033 #endif /* DEBUGGING */
9034 }
9035
9036 /*
9037 - regprop - printable representation of opcode
9038 */
9039 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9040 STMT_START { \
9041   if (do_sep) {                           \
9042    Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9043    if (flags & ANYOF_INVERT)           \
9044     /*make sure the invert info is in each */ \
9045     sv_catpvs(sv, "^");             \
9046    do_sep = 0;                         \
9047   }                                       \
9048 } STMT_END
9049
9050 void
9051 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9052 {
9053 #ifdef DEBUGGING
9054  dVAR;
9055  register int k;
9056  RXi_GET_DECL(prog,progi);
9057  GET_RE_DEBUG_FLAGS_DECL;
9058
9059  PERL_ARGS_ASSERT_REGPROP;
9060
9061  sv_setpvs(sv, "");
9062
9063  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
9064   /* It would be nice to FAIL() here, but this may be called from
9065   regexec.c, and it would be hard to supply pRExC_state. */
9066   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9067  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9068
9069  k = PL_regkind[OP(o)];
9070
9071  if (k == EXACT) {
9072   sv_catpvs(sv, " ");
9073   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9074   * is a crude hack but it may be the best for now since
9075   * we have no flag "this EXACTish node was UTF-8"
9076   * --jhi */
9077   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9078     PERL_PV_ESCAPE_UNI_DETECT |
9079     PERL_PV_PRETTY_ELLIPSES   |
9080     PERL_PV_PRETTY_LTGT       |
9081     PERL_PV_PRETTY_NOCLEAR
9082     );
9083  } else if (k == TRIE) {
9084   /* print the details of the trie in dumpuntil instead, as
9085   * progi->data isn't available here */
9086   const char op = OP(o);
9087   const U32 n = ARG(o);
9088   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9089    (reg_ac_data *)progi->data->data[n] :
9090    NULL;
9091   const reg_trie_data * const trie
9092    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9093
9094   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9095   DEBUG_TRIE_COMPILE_r(
9096    Perl_sv_catpvf(aTHX_ sv,
9097     "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9098     (UV)trie->startstate,
9099     (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9100     (UV)trie->wordcount,
9101     (UV)trie->minlen,
9102     (UV)trie->maxlen,
9103     (UV)TRIE_CHARCOUNT(trie),
9104     (UV)trie->uniquecharcount
9105    )
9106   );
9107   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9108    int i;
9109    int rangestart = -1;
9110    U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9111    sv_catpvs(sv, "[");
9112    for (i = 0; i <= 256; i++) {
9113     if (i < 256 && BITMAP_TEST(bitmap,i)) {
9114      if (rangestart == -1)
9115       rangestart = i;
9116     } else if (rangestart != -1) {
9117      if (i <= rangestart + 3)
9118       for (; rangestart < i; rangestart++)
9119        put_byte(sv, rangestart);
9120      else {
9121       put_byte(sv, rangestart);
9122       sv_catpvs(sv, "-");
9123       put_byte(sv, i - 1);
9124      }
9125      rangestart = -1;
9126     }
9127    }
9128    sv_catpvs(sv, "]");
9129   }
9130
9131  } else if (k == CURLY) {
9132   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9133    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9134   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9135  }
9136  else if (k == WHILEM && o->flags)   /* Ordinal/of */
9137   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9138  else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9139   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9140   if ( RXp_PAREN_NAMES(prog) ) {
9141    if ( k != REF || OP(o) < NREF) {
9142     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9143     SV **name= av_fetch(list, ARG(o), 0 );
9144     if (name)
9145      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9146    }
9147    else {
9148     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9149     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9150     I32 *nums=(I32*)SvPVX(sv_dat);
9151     SV **name= av_fetch(list, nums[0], 0 );
9152     I32 n;
9153     if (name) {
9154      for ( n=0; n<SvIVX(sv_dat); n++ ) {
9155       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9156          (n ? "," : ""), (IV)nums[n]);
9157      }
9158      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9159     }
9160    }
9161   }
9162  } else if (k == GOSUB)
9163   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9164  else if (k == VERB) {
9165   if (!o->flags)
9166    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9167       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9168  } else if (k == LOGICAL)
9169   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9170  else if (k == FOLDCHAR)
9171   Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9172  else if (k == ANYOF) {
9173   int i, rangestart = -1;
9174   const U8 flags = ANYOF_FLAGS(o);
9175   int do_sep = 0;
9176
9177   /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9178   static const char * const anyofs[] = {
9179    "\\w",
9180    "\\W",
9181    "\\s",
9182    "\\S",
9183    "\\d",
9184    "\\D",
9185    "[:alnum:]",
9186    "[:^alnum:]",
9187    "[:alpha:]",
9188    "[:^alpha:]",
9189    "[:ascii:]",
9190    "[:^ascii:]",
9191    "[:cntrl:]",
9192    "[:^cntrl:]",
9193    "[:graph:]",
9194    "[:^graph:]",
9195    "[:lower:]",
9196    "[:^lower:]",
9197    "[:print:]",
9198    "[:^print:]",
9199    "[:punct:]",
9200    "[:^punct:]",
9201    "[:upper:]",
9202    "[:^upper:]",
9203    "[:xdigit:]",
9204    "[:^xdigit:]",
9205    "[:space:]",
9206    "[:^space:]",
9207    "[:blank:]",
9208    "[:^blank:]"
9209   };
9210
9211   if (flags & ANYOF_LOCALE)
9212    sv_catpvs(sv, "{loc}");
9213   if (flags & ANYOF_FOLD)
9214    sv_catpvs(sv, "{i}");
9215   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9216   if (flags & ANYOF_INVERT)
9217    sv_catpvs(sv, "^");
9218
9219   /* output what the standard cp 0-255 bitmap matches */
9220   for (i = 0; i <= 256; i++) {
9221    if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9222     if (rangestart == -1)
9223      rangestart = i;
9224    } else if (rangestart != -1) {
9225     if (i <= rangestart + 3)
9226      for (; rangestart < i; rangestart++)
9227       put_byte(sv, rangestart);
9228     else {
9229      put_byte(sv, rangestart);
9230      sv_catpvs(sv, "-");
9231      put_byte(sv, i - 1);
9232     }
9233     do_sep = 1;
9234     rangestart = -1;
9235    }
9236   }
9237
9238   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9239   /* output any special charclass tests (used mostly under use locale) */
9240   if (o->flags & ANYOF_CLASS)
9241    for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9242     if (ANYOF_CLASS_TEST(o,i)) {
9243      sv_catpv(sv, anyofs[i]);
9244      do_sep = 1;
9245     }
9246
9247   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9248
9249   /* output information about the unicode matching */
9250   if (flags & ANYOF_UNICODE)
9251    sv_catpvs(sv, "{unicode}");
9252   else if (flags & ANYOF_UNICODE_ALL)
9253    sv_catpvs(sv, "{unicode_all}");
9254
9255   {
9256    SV *lv;
9257    SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9258
9259    if (lv) {
9260     if (sw) {
9261      U8 s[UTF8_MAXBYTES_CASE+1];
9262
9263      for (i = 0; i <= 256; i++) { /* just the first 256 */
9264       uvchr_to_utf8(s, i);
9265
9266       if (i < 256 && swash_fetch(sw, s, TRUE)) {
9267        if (rangestart == -1)
9268         rangestart = i;
9269       } else if (rangestart != -1) {
9270        if (i <= rangestart + 3)
9271         for (; rangestart < i; rangestart++) {
9272          const U8 * const e = uvchr_to_utf8(s,rangestart);
9273          U8 *p;
9274          for(p = s; p < e; p++)
9275           put_byte(sv, *p);
9276         }
9277        else {
9278         const U8 *e = uvchr_to_utf8(s,rangestart);
9279         U8 *p;
9280         for (p = s; p < e; p++)
9281          put_byte(sv, *p);
9282         sv_catpvs(sv, "-");
9283         e = uvchr_to_utf8(s, i-1);
9284         for (p = s; p < e; p++)
9285          put_byte(sv, *p);
9286         }
9287         rangestart = -1;
9288        }
9289       }
9290
9291      sv_catpvs(sv, "..."); /* et cetera */
9292     }
9293
9294     {
9295      char *s = savesvpv(lv);
9296      char * const origs = s;
9297
9298      while (*s && *s != '\n')
9299       s++;
9300
9301      if (*s == '\n') {
9302       const char * const t = ++s;
9303
9304       while (*s) {
9305        if (*s == '\n')
9306         *s = ' ';
9307        s++;
9308       }
9309       if (s[-1] == ' ')
9310        s[-1] = 0;
9311
9312       sv_catpv(sv, t);
9313      }
9314
9315      Safefree(origs);
9316     }
9317    }
9318   }
9319
9320   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9321  }
9322  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9323   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9324 #else
9325  PERL_UNUSED_CONTEXT;
9326  PERL_UNUSED_ARG(sv);
9327  PERL_UNUSED_ARG(o);
9328  PERL_UNUSED_ARG(prog);
9329 #endif /* DEBUGGING */
9330 }
9331
9332 SV *
9333 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9334 {    /* Assume that RE_INTUIT is set */
9335  dVAR;
9336  struct regexp *const prog = (struct regexp *)SvANY(r);
9337  GET_RE_DEBUG_FLAGS_DECL;
9338
9339  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9340  PERL_UNUSED_CONTEXT;
9341
9342  DEBUG_COMPILE_r(
9343   {
9344    const char * const s = SvPV_nolen_const(prog->check_substr
9345      ? prog->check_substr : prog->check_utf8);
9346
9347    if (!PL_colorset) reginitcolors();
9348    PerlIO_printf(Perl_debug_log,
9349      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9350      PL_colors[4],
9351      prog->check_substr ? "" : "utf8 ",
9352      PL_colors[5],PL_colors[0],
9353      s,
9354      PL_colors[1],
9355      (strlen(s) > 60 ? "..." : ""));
9356   } );
9357
9358  return prog->check_substr ? prog->check_substr : prog->check_utf8;
9359 }
9360
9361 /*
9362    pregfree()
9363
9364    handles refcounting and freeing the perl core regexp structure. When
9365    it is necessary to actually free the structure the first thing it
9366    does is call the 'free' method of the regexp_engine associated to to
9367    the regexp, allowing the handling of the void *pprivate; member
9368    first. (This routine is not overridable by extensions, which is why
9369    the extensions free is called first.)
9370
9371    See regdupe and regdupe_internal if you change anything here.
9372 */
9373 #ifndef PERL_IN_XSUB_RE
9374 void
9375 Perl_pregfree(pTHX_ REGEXP *r)
9376 {
9377  SvREFCNT_dec(r);
9378 }
9379
9380 void
9381 Perl_pregfree2(pTHX_ REGEXP *rx)
9382 {
9383  dVAR;
9384  struct regexp *const r = (struct regexp *)SvANY(rx);
9385  GET_RE_DEBUG_FLAGS_DECL;
9386
9387  PERL_ARGS_ASSERT_PREGFREE2;
9388
9389  if (r->mother_re) {
9390   ReREFCNT_dec(r->mother_re);
9391  } else {
9392   CALLREGFREE_PVT(rx); /* free the private data */
9393   SvREFCNT_dec(RXp_PAREN_NAMES(r));
9394  }
9395  if (r->substrs) {
9396   SvREFCNT_dec(r->anchored_substr);
9397   SvREFCNT_dec(r->anchored_utf8);
9398   SvREFCNT_dec(r->float_substr);
9399   SvREFCNT_dec(r->float_utf8);
9400   Safefree(r->substrs);
9401  }
9402  RX_MATCH_COPY_FREE(rx);
9403 #ifdef PERL_OLD_COPY_ON_WRITE
9404  SvREFCNT_dec(r->saved_copy);
9405 #endif
9406  Safefree(r->offs);
9407 }
9408
9409 /*  reg_temp_copy()
9410
9411  This is a hacky workaround to the structural issue of match results
9412  being stored in the regexp structure which is in turn stored in
9413  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9414  could be PL_curpm in multiple contexts, and could require multiple
9415  result sets being associated with the pattern simultaneously, such
9416  as when doing a recursive match with (??{$qr})
9417
9418  The solution is to make a lightweight copy of the regexp structure
9419  when a qr// is returned from the code executed by (??{$qr}) this
9420  lightweight copy doesnt actually own any of its data except for
9421  the starp/end and the actual regexp structure itself.
9422
9423 */
9424
9425
9426 REGEXP *
9427 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9428 {
9429  struct regexp *ret;
9430  struct regexp *const r = (struct regexp *)SvANY(rx);
9431  register const I32 npar = r->nparens+1;
9432
9433  PERL_ARGS_ASSERT_REG_TEMP_COPY;
9434
9435  if (!ret_x)
9436   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9437  ret = (struct regexp *)SvANY(ret_x);
9438
9439  (void)ReREFCNT_inc(rx);
9440  /* We can take advantage of the existing "copied buffer" mechanism in SVs
9441  by pointing directly at the buffer, but flagging that the allocated
9442  space in the copy is zero. As we've just done a struct copy, it's now
9443  a case of zero-ing that, rather than copying the current length.  */
9444  SvPV_set(ret_x, RX_WRAPPED(rx));
9445  SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9446  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9447   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9448  SvLEN_set(ret_x, 0);
9449  SvSTASH_set(ret_x, NULL);
9450  SvMAGIC_set(ret_x, NULL);
9451  Newx(ret->offs, npar, regexp_paren_pair);
9452  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9453  if (r->substrs) {
9454   Newx(ret->substrs, 1, struct reg_substr_data);
9455   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9456
9457   SvREFCNT_inc_void(ret->anchored_substr);
9458   SvREFCNT_inc_void(ret->anchored_utf8);
9459   SvREFCNT_inc_void(ret->float_substr);
9460   SvREFCNT_inc_void(ret->float_utf8);
9461
9462   /* check_substr and check_utf8, if non-NULL, point to either their
9463   anchored or float namesakes, and don't hold a second reference.  */
9464  }
9465  RX_MATCH_COPIED_off(ret_x);
9466 #ifdef PERL_OLD_COPY_ON_WRITE
9467  ret->saved_copy = NULL;
9468 #endif
9469  ret->mother_re = rx;
9470
9471  return ret_x;
9472 }
9473 #endif
9474
9475 /* regfree_internal()
9476
9477    Free the private data in a regexp. This is overloadable by
9478    extensions. Perl takes care of the regexp structure in pregfree(),
9479    this covers the *pprivate pointer which technically perldoesnt
9480    know about, however of course we have to handle the
9481    regexp_internal structure when no extension is in use.
9482
9483    Note this is called before freeing anything in the regexp
9484    structure.
9485  */
9486
9487 void
9488 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9489 {
9490  dVAR;
9491  struct regexp *const r = (struct regexp *)SvANY(rx);
9492  RXi_GET_DECL(r,ri);
9493  GET_RE_DEBUG_FLAGS_DECL;
9494
9495  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9496
9497  DEBUG_COMPILE_r({
9498   if (!PL_colorset)
9499    reginitcolors();
9500   {
9501    SV *dsv= sv_newmortal();
9502    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9503     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9504    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9505     PL_colors[4],PL_colors[5],s);
9506   }
9507  });
9508 #ifdef RE_TRACK_PATTERN_OFFSETS
9509  if (ri->u.offsets)
9510   Safefree(ri->u.offsets);             /* 20010421 MJD */
9511 #endif
9512  if (ri->data) {
9513   int n = ri->data->count;
9514   PAD* new_comppad = NULL;
9515   PAD* old_comppad;
9516   PADOFFSET refcnt;
9517
9518   while (--n >= 0) {
9519   /* If you add a ->what type here, update the comment in regcomp.h */
9520    switch (ri->data->what[n]) {
9521    case 's':
9522    case 'S':
9523    case 'u':
9524     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9525     break;
9526    case 'f':
9527     Safefree(ri->data->data[n]);
9528     break;
9529    case 'p':
9530     new_comppad = MUTABLE_AV(ri->data->data[n]);
9531     break;
9532    case 'o':
9533     if (new_comppad == NULL)
9534      Perl_croak(aTHX_ "panic: pregfree comppad");
9535     PAD_SAVE_LOCAL(old_comppad,
9536      /* Watch out for global destruction's random ordering. */
9537      (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9538     );
9539     OP_REFCNT_LOCK;
9540     refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9541     OP_REFCNT_UNLOCK;
9542     if (!refcnt)
9543      op_free((OP_4tree*)ri->data->data[n]);
9544
9545     PAD_RESTORE_LOCAL(old_comppad);
9546     SvREFCNT_dec(MUTABLE_SV(new_comppad));
9547     new_comppad = NULL;
9548     break;
9549    case 'n':
9550     break;
9551    case 'T':
9552     { /* Aho Corasick add-on structure for a trie node.
9553      Used in stclass optimization only */
9554      U32 refcount;
9555      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9556      OP_REFCNT_LOCK;
9557      refcount = --aho->refcount;
9558      OP_REFCNT_UNLOCK;
9559      if ( !refcount ) {
9560       PerlMemShared_free(aho->states);
9561       PerlMemShared_free(aho->fail);
9562       /* do this last!!!! */
9563       PerlMemShared_free(ri->data->data[n]);
9564       PerlMemShared_free(ri->regstclass);
9565      }
9566     }
9567     break;
9568    case 't':
9569     {
9570      /* trie structure. */
9571      U32 refcount;
9572      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9573      OP_REFCNT_LOCK;
9574      refcount = --trie->refcount;
9575      OP_REFCNT_UNLOCK;
9576      if ( !refcount ) {
9577       PerlMemShared_free(trie->charmap);
9578       PerlMemShared_free(trie->states);
9579       PerlMemShared_free(trie->trans);
9580       if (trie->bitmap)
9581        PerlMemShared_free(trie->bitmap);
9582       if (trie->wordlen)
9583        PerlMemShared_free(trie->wordlen);
9584       if (trie->jump)
9585        PerlMemShared_free(trie->jump);
9586       if (trie->nextword)
9587        PerlMemShared_free(trie->nextword);
9588       /* do this last!!!! */
9589       PerlMemShared_free(ri->data->data[n]);
9590      }
9591     }
9592     break;
9593    default:
9594     Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9595    }
9596   }
9597   Safefree(ri->data->what);
9598   Safefree(ri->data);
9599  }
9600
9601  Safefree(ri);
9602 }
9603
9604 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9605 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9606 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9607 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9608
9609 /*
9610    re_dup - duplicate a regexp.
9611
9612    This routine is expected to clone a given regexp structure. It is only
9613    compiled under USE_ITHREADS.
9614
9615    After all of the core data stored in struct regexp is duplicated
9616    the regexp_engine.dupe method is used to copy any private data
9617    stored in the *pprivate pointer. This allows extensions to handle
9618    any duplication it needs to do.
9619
9620    See pregfree() and regfree_internal() if you change anything here.
9621 */
9622 #if defined(USE_ITHREADS)
9623 #ifndef PERL_IN_XSUB_RE
9624 void
9625 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9626 {
9627  dVAR;
9628  I32 npar;
9629  const struct regexp *r = (const struct regexp *)SvANY(sstr);
9630  struct regexp *ret = (struct regexp *)SvANY(dstr);
9631
9632  PERL_ARGS_ASSERT_RE_DUP_GUTS;
9633
9634  npar = r->nparens+1;
9635  Newx(ret->offs, npar, regexp_paren_pair);
9636  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9637  if(ret->swap) {
9638   /* no need to copy these */
9639   Newx(ret->swap, npar, regexp_paren_pair);
9640  }
9641
9642  if (ret->substrs) {
9643   /* Do it this way to avoid reading from *r after the StructCopy().
9644   That way, if any of the sv_dup_inc()s dislodge *r from the L1
9645   cache, it doesn't matter.  */
9646   const bool anchored = r->check_substr
9647    ? r->check_substr == r->anchored_substr
9648    : r->check_utf8 == r->anchored_utf8;
9649   Newx(ret->substrs, 1, struct reg_substr_data);
9650   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9651
9652   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9653   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9654   ret->float_substr = sv_dup_inc(ret->float_substr, param);
9655   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9656
9657   /* check_substr and check_utf8, if non-NULL, point to either their
9658   anchored or float namesakes, and don't hold a second reference.  */
9659
9660   if (ret->check_substr) {
9661    if (anchored) {
9662     assert(r->check_utf8 == r->anchored_utf8);
9663     ret->check_substr = ret->anchored_substr;
9664     ret->check_utf8 = ret->anchored_utf8;
9665    } else {
9666     assert(r->check_substr == r->float_substr);
9667     assert(r->check_utf8 == r->float_utf8);
9668     ret->check_substr = ret->float_substr;
9669     ret->check_utf8 = ret->float_utf8;
9670    }
9671   } else if (ret->check_utf8) {
9672    if (anchored) {
9673     ret->check_utf8 = ret->anchored_utf8;
9674    } else {
9675     ret->check_utf8 = ret->float_utf8;
9676    }
9677   }
9678  }
9679
9680  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9681
9682  if (ret->pprivate)
9683   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9684
9685  if (RX_MATCH_COPIED(dstr))
9686   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
9687  else
9688   ret->subbeg = NULL;
9689 #ifdef PERL_OLD_COPY_ON_WRITE
9690  ret->saved_copy = NULL;
9691 #endif
9692
9693  if (ret->mother_re) {
9694   if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9695    /* Our storage points directly to our mother regexp, but that's
9696    1: a buffer in a different thread
9697    2: something we no longer hold a reference on
9698    so we need to copy it locally.  */
9699    /* Note we need to sue SvCUR() on our mother_re, because it, in
9700    turn, may well be pointing to its own mother_re.  */
9701    SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9702         SvCUR(ret->mother_re)+1));
9703    SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9704   }
9705   ret->mother_re      = NULL;
9706  }
9707  ret->gofs = 0;
9708 }
9709 #endif /* PERL_IN_XSUB_RE */
9710
9711 /*
9712    regdupe_internal()
9713
9714    This is the internal complement to regdupe() which is used to copy
9715    the structure pointed to by the *pprivate pointer in the regexp.
9716    This is the core version of the extension overridable cloning hook.
9717    The regexp structure being duplicated will be copied by perl prior
9718    to this and will be provided as the regexp *r argument, however
9719    with the /old/ structures pprivate pointer value. Thus this routine
9720    may override any copying normally done by perl.
9721
9722    It returns a pointer to the new regexp_internal structure.
9723 */
9724
9725 void *
9726 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9727 {
9728  dVAR;
9729  struct regexp *const r = (struct regexp *)SvANY(rx);
9730  regexp_internal *reti;
9731  int len, npar;
9732  RXi_GET_DECL(r,ri);
9733
9734  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9735
9736  npar = r->nparens+1;
9737  len = ProgLen(ri);
9738
9739  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9740  Copy(ri->program, reti->program, len+1, regnode);
9741
9742
9743  reti->regstclass = NULL;
9744
9745  if (ri->data) {
9746   struct reg_data *d;
9747   const int count = ri->data->count;
9748   int i;
9749
9750   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9751     char, struct reg_data);
9752   Newx(d->what, count, U8);
9753
9754   d->count = count;
9755   for (i = 0; i < count; i++) {
9756    d->what[i] = ri->data->what[i];
9757    switch (d->what[i]) {
9758     /* legal options are one of: sSfpontTu
9759     see also regcomp.h and pregfree() */
9760    case 's':
9761    case 'S':
9762    case 'p': /* actually an AV, but the dup function is identical.  */
9763    case 'u': /* actually an HV, but the dup function is identical.  */
9764     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9765     break;
9766    case 'f':
9767     /* This is cheating. */
9768     Newx(d->data[i], 1, struct regnode_charclass_class);
9769     StructCopy(ri->data->data[i], d->data[i],
9770        struct regnode_charclass_class);
9771     reti->regstclass = (regnode*)d->data[i];
9772     break;
9773    case 'o':
9774     /* Compiled op trees are readonly and in shared memory,
9775     and can thus be shared without duplication. */
9776     OP_REFCNT_LOCK;
9777     d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9778     OP_REFCNT_UNLOCK;
9779     break;
9780    case 'T':
9781     /* Trie stclasses are readonly and can thus be shared
9782     * without duplication. We free the stclass in pregfree
9783     * when the corresponding reg_ac_data struct is freed.
9784     */
9785     reti->regstclass= ri->regstclass;
9786     /* Fall through */
9787    case 't':
9788     OP_REFCNT_LOCK;
9789     ((reg_trie_data*)ri->data->data[i])->refcount++;
9790     OP_REFCNT_UNLOCK;
9791     /* Fall through */
9792    case 'n':
9793     d->data[i] = ri->data->data[i];
9794     break;
9795    default:
9796     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9797    }
9798   }
9799
9800   reti->data = d;
9801  }
9802  else
9803   reti->data = NULL;
9804
9805  reti->name_list_idx = ri->name_list_idx;
9806
9807 #ifdef RE_TRACK_PATTERN_OFFSETS
9808  if (ri->u.offsets) {
9809   Newx(reti->u.offsets, 2*len+1, U32);
9810   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9811  }
9812 #else
9813  SetProgLen(reti,len);
9814 #endif
9815
9816  return (void*)reti;
9817 }
9818
9819 #endif    /* USE_ITHREADS */
9820
9821 #ifndef PERL_IN_XSUB_RE
9822
9823 /*
9824  - regnext - dig the "next" pointer out of a node
9825  */
9826 regnode *
9827 Perl_regnext(pTHX_ register regnode *p)
9828 {
9829  dVAR;
9830  register I32 offset;
9831
9832  if (!p)
9833   return(NULL);
9834
9835  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9836  if (offset == 0)
9837   return(NULL);
9838
9839  return(p+offset);
9840 }
9841 #endif
9842
9843 STATIC void
9844 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9845 {
9846  va_list args;
9847  STRLEN l1 = strlen(pat1);
9848  STRLEN l2 = strlen(pat2);
9849  char buf[512];
9850  SV *msv;
9851  const char *message;
9852
9853  PERL_ARGS_ASSERT_RE_CROAK2;
9854
9855  if (l1 > 510)
9856   l1 = 510;
9857  if (l1 + l2 > 510)
9858   l2 = 510 - l1;
9859  Copy(pat1, buf, l1 , char);
9860  Copy(pat2, buf + l1, l2 , char);
9861  buf[l1 + l2] = '\n';
9862  buf[l1 + l2 + 1] = '\0';
9863 #ifdef I_STDARG
9864  /* ANSI variant takes additional second argument */
9865  va_start(args, pat2);
9866 #else
9867  va_start(args);
9868 #endif
9869  msv = vmess(buf, &args);
9870  va_end(args);
9871  message = SvPV_const(msv,l1);
9872  if (l1 > 512)
9873   l1 = 512;
9874  Copy(message, buf, l1 , char);
9875  buf[l1-1] = '\0';   /* Overwrite \n */
9876  Perl_croak(aTHX_ "%s", buf);
9877 }
9878
9879 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
9880
9881 #ifndef PERL_IN_XSUB_RE
9882 void
9883 Perl_save_re_context(pTHX)
9884 {
9885  dVAR;
9886
9887  struct re_save_state *state;
9888
9889  SAVEVPTR(PL_curcop);
9890  SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9891
9892  state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9893  PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9894  SSPUSHINT(SAVEt_RE_STATE);
9895
9896  Copy(&PL_reg_state, state, 1, struct re_save_state);
9897
9898  PL_reg_start_tmp = 0;
9899  PL_reg_start_tmpl = 0;
9900  PL_reg_oldsaved = NULL;
9901  PL_reg_oldsavedlen = 0;
9902  PL_reg_maxiter = 0;
9903  PL_reg_leftiter = 0;
9904  PL_reg_poscache = NULL;
9905  PL_reg_poscache_size = 0;
9906 #ifdef PERL_OLD_COPY_ON_WRITE
9907  PL_nrs = NULL;
9908 #endif
9909
9910  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9911  if (PL_curpm) {
9912   const REGEXP * const rx = PM_GETRE(PL_curpm);
9913   if (rx) {
9914    U32 i;
9915    for (i = 1; i <= RX_NPARENS(rx); i++) {
9916     char digits[TYPE_CHARS(long)];
9917     const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9918     GV *const *const gvp
9919      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9920
9921     if (gvp) {
9922      GV * const gv = *gvp;
9923      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9924       save_scalar(gv);
9925     }
9926    }
9927   }
9928  }
9929 }
9930 #endif
9931
9932 static void
9933 clear_re(pTHX_ void *r)
9934 {
9935  dVAR;
9936  ReREFCNT_dec((REGEXP *)r);
9937 }
9938
9939 #ifdef DEBUGGING
9940
9941 STATIC void
9942 S_put_byte(pTHX_ SV *sv, int c)
9943 {
9944  PERL_ARGS_ASSERT_PUT_BYTE;
9945
9946  /* Our definition of isPRINT() ignores locales, so only bytes that are
9947  not part of UTF-8 are considered printable. I assume that the same
9948  holds for UTF-EBCDIC.
9949  Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9950  which Wikipedia says:
9951
9952  EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9953  ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9954  identical, to the ASCII delete (DEL) or rubout control character.
9955  ) So the old condition can be simplified to !isPRINT(c)  */
9956  if (!isPRINT(c))
9957   Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9958  else {
9959   const char string = c;
9960   if (c == '-' || c == ']' || c == '\\' || c == '^')
9961    sv_catpvs(sv, "\\");
9962   sv_catpvn(sv, &string, 1);
9963  }
9964 }
9965
9966
9967 #define CLEAR_OPTSTART \
9968  if (optstart) STMT_START { \
9969    DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9970    optstart=NULL; \
9971  } STMT_END
9972
9973 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9974
9975 STATIC const regnode *
9976 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9977    const regnode *last, const regnode *plast,
9978    SV* sv, I32 indent, U32 depth)
9979 {
9980  dVAR;
9981  register U8 op = PSEUDO; /* Arbitrary non-END op. */
9982  register const regnode *next;
9983  const regnode *optstart= NULL;
9984
9985  RXi_GET_DECL(r,ri);
9986  GET_RE_DEBUG_FLAGS_DECL;
9987
9988  PERL_ARGS_ASSERT_DUMPUNTIL;
9989
9990 #ifdef DEBUG_DUMPUNTIL
9991  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9992   last ? last-start : 0,plast ? plast-start : 0);
9993 #endif
9994
9995  if (plast && plast < last)
9996   last= plast;
9997
9998  while (PL_regkind[op] != END && (!last || node < last)) {
9999   /* While that wasn't END last time... */
10000   NODE_ALIGN(node);
10001   op = OP(node);
10002   if (op == CLOSE || op == WHILEM)
10003    indent--;
10004   next = regnext((regnode *)node);
10005
10006   /* Where, what. */
10007   if (OP(node) == OPTIMIZED) {
10008    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10009     optstart = node;
10010    else
10011     goto after_print;
10012   } else
10013    CLEAR_OPTSTART;
10014
10015   regprop(r, sv, node);
10016   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10017      (int)(2*indent + 1), "", SvPVX_const(sv));
10018
10019   if (OP(node) != OPTIMIZED) {
10020    if (next == NULL)  /* Next ptr. */
10021     PerlIO_printf(Perl_debug_log, " (0)");
10022    else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10023     PerlIO_printf(Perl_debug_log, " (FAIL)");
10024    else
10025     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10026    (void)PerlIO_putc(Perl_debug_log, '\n');
10027   }
10028
10029  after_print:
10030   if (PL_regkind[(U8)op] == BRANCHJ) {
10031    assert(next);
10032    {
10033     register const regnode *nnode = (OP(next) == LONGJMP
10034            ? regnext((regnode *)next)
10035            : next);
10036     if (last && nnode > last)
10037      nnode = last;
10038     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10039    }
10040   }
10041   else if (PL_regkind[(U8)op] == BRANCH) {
10042    assert(next);
10043    DUMPUNTIL(NEXTOPER(node), next);
10044   }
10045   else if ( PL_regkind[(U8)op]  == TRIE ) {
10046    const regnode *this_trie = node;
10047    const char op = OP(node);
10048    const U32 n = ARG(node);
10049    const reg_ac_data * const ac = op>=AHOCORASICK ?
10050    (reg_ac_data *)ri->data->data[n] :
10051    NULL;
10052    const reg_trie_data * const trie =
10053     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10054 #ifdef DEBUGGING
10055    AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10056 #endif
10057    const regnode *nextbranch= NULL;
10058    I32 word_idx;
10059    sv_setpvs(sv, "");
10060    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10061     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10062
10063     PerlIO_printf(Perl_debug_log, "%*s%s ",
10064     (int)(2*(indent+3)), "",
10065      elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10066        PL_colors[0], PL_colors[1],
10067        (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10068        PERL_PV_PRETTY_ELLIPSES    |
10069        PERL_PV_PRETTY_LTGT
10070        )
10071        : "???"
10072     );
10073     if (trie->jump) {
10074      U16 dist= trie->jump[word_idx+1];
10075      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10076         (UV)((dist ? this_trie + dist : next) - start));
10077      if (dist) {
10078       if (!nextbranch)
10079        nextbranch= this_trie + trie->jump[0];
10080       DUMPUNTIL(this_trie + dist, nextbranch);
10081      }
10082      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10083       nextbranch= regnext((regnode *)nextbranch);
10084     } else {
10085      PerlIO_printf(Perl_debug_log, "\n");
10086     }
10087    }
10088    if (last && next > last)
10089     node= last;
10090    else
10091     node= next;
10092   }
10093   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10094    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10095      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10096   }
10097   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10098    assert(next);
10099    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10100   }
10101   else if ( op == PLUS || op == STAR) {
10102    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10103   }
10104   else if (op == ANYOF) {
10105    /* arglen 1 + class block */
10106    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10107      ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10108    node = NEXTOPER(node);
10109   }
10110   else if (PL_regkind[(U8)op] == EXACT) {
10111    /* Literal string, where present. */
10112    node += NODE_SZ_STR(node) - 1;
10113    node = NEXTOPER(node);
10114   }
10115   else {
10116    node = NEXTOPER(node);
10117    node += regarglen[(U8)op];
10118   }
10119   if (op == CURLYX || op == OPEN)
10120    indent++;
10121  }
10122  CLEAR_OPTSTART;
10123 #ifdef DEBUG_DUMPUNTIL
10124  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10125 #endif
10126  return node;
10127 }
10128
10129 #endif /* DEBUGGING */
10130
10131 /*
10132  * Local variables:
10133  * c-indentation-style: bsd
10134  * c-basic-offset: 4
10135  * indent-tabs-mode: t
10136  * End:
10137  *
10138  * ex: set ts=8 sts=4 sw=4 noet:
10139  */