src/5013005/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to  pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #ifdef op
  89 #undef op
  90 #endif /* op */
  91
  92 #ifdef MSDOS
  93 #  if defined(BUGGY_MSC6)
  94  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  95 #    pragma optimize("a",off)
  96  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  97 #    pragma optimize("w",on )
  98 #  endif /* BUGGY_MSC6 */
  99 #endif /* MSDOS */
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105 typedef struct RExC_state_t {
 106     U32         flags;                  /* are we folding, multilining? */
 107     char        *precomp;               /* uncompiled string. */
 108     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 109     regexp      *rx;                    /* perl core regexp structure */
 110     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 111     char        *start;                 /* Start of input for compile */
 112     char        *end;                   /* End of input for compile */
 113     char        *parse;                 /* Input-scan pointer. */
 114     I32         whilem_seen;            /* number of WHILEM in this expr */
 115     regnode     *emit_start;            /* Start of emitted-code area */
 116     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 117     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 118     I32         naughty;                /* How bad is this pattern? */
 119     I32         sawback;                /* Did we see \1, ...? */
 120     U32         seen;
 121     I32         size;                   /* Code size. */
 122     I32         npar;                   /* Capture buffer count, (OPEN). */
 123     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 124     I32         nestroot;               /* root parens we are in - used by accept */
 125     I32         extralen;
 126     I32         seen_zerolen;
 127     I32         seen_evals;
 128     regnode     **open_parens;          /* pointers to open parens */
 129     regnode     **close_parens;         /* pointers to close parens */
 130     regnode     *opend;                 /* END node in program */
 131     I32         utf8;           /* whether the pattern is utf8 or not */
 132     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 133                                 /* XXX use this for future optimisation of case
 134                                  * where pattern must be upgraded to utf8. */
 135     HV          *paren_names;           /* Paren names */
 136
 137     regnode     **recurse;              /* Recurse regops */
 138     I32         recurse_count;          /* Number of recurse regops */
 139 #if ADD_TO_REGEXEC
 140     char        *starttry;              /* -Dr: where regtry was called. */
 141 #define RExC_starttry   (pRExC_state->starttry)
 142 #endif
 143 #ifdef DEBUGGING
 144     const char  *lastparse;
 145     I32         lastnum;
 146     AV          *paren_name_list;       /* idx -> name */
 147 #define RExC_lastparse  (pRExC_state->lastparse)
 148 #define RExC_lastnum    (pRExC_state->lastnum)
 149 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 150 #endif
 151 } RExC_state_t;
 152
 153 #define RExC_flags      (pRExC_state->flags)
 154 #define RExC_precomp    (pRExC_state->precomp)
 155 #define RExC_rx_sv      (pRExC_state->rx_sv)
 156 #define RExC_rx         (pRExC_state->rx)
 157 #define RExC_rxi        (pRExC_state->rxi)
 158 #define RExC_start      (pRExC_state->start)
 159 #define RExC_end        (pRExC_state->end)
 160 #define RExC_parse      (pRExC_state->parse)
 161 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 162 #ifdef RE_TRACK_PATTERN_OFFSETS
 163 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 164 #endif
 165 #define RExC_emit       (pRExC_state->emit)
 166 #define RExC_emit_start (pRExC_state->emit_start)
 167 #define RExC_emit_bound (pRExC_state->emit_bound)
 168 #define RExC_naughty    (pRExC_state->naughty)
 169 #define RExC_sawback    (pRExC_state->sawback)
 170 #define RExC_seen       (pRExC_state->seen)
 171 #define RExC_size       (pRExC_state->size)
 172 #define RExC_npar       (pRExC_state->npar)
 173 #define RExC_nestroot   (pRExC_state->nestroot)
 174 #define RExC_extralen   (pRExC_state->extralen)
 175 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 176 #define RExC_seen_evals (pRExC_state->seen_evals)
 177 #define RExC_utf8       (pRExC_state->utf8)
 178 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 179 #define RExC_open_parens        (pRExC_state->open_parens)
 180 #define RExC_close_parens       (pRExC_state->close_parens)
 181 #define RExC_opend      (pRExC_state->opend)
 182 #define RExC_paren_names        (pRExC_state->paren_names)
 183 #define RExC_recurse    (pRExC_state->recurse)
 184 #define RExC_recurse_count      (pRExC_state->recurse_count)
 185
 186
 187 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 188 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 189         ((*s) == '{' && regcurly(s)))
 190
 191 #ifdef SPSTART
 192 #undef SPSTART          /* dratted cpp namespace... */
 193 #endif
 194 /*
 195  * Flags to be passed up and down.
 196  */
 197 #define WORST           0       /* Worst case. */
 198 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 199
 200 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 201  * character, and if utf8, must be invariant. */
 202 #define SIMPLE          0x02
 203 #define SPSTART         0x04    /* Starts with * or +. */
 204 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 205 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 206
 207 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 208
 209 /* whether trie related optimizations are enabled */
 210 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 211 #define TRIE_STUDY_OPT
 212 #define FULL_TRIE_STUDY
 213 #define TRIE_STCLASS
 214 #endif
 215
 216
 217
 218 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 219 #define PBITVAL(paren) (1 << ((paren) & 7))
 220 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 221 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 222 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 223
 224 /* If not already in utf8, do a longjmp back to the beginning */
 225 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 226 #define REQUIRE_UTF8    STMT_START {                                       \
 227                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 228                         } STMT_END
 229
 230 /* About scan_data_t.
 231
 232   During optimisation we recurse through the regexp program performing
 233   various inplace (keyhole style) optimisations. In addition study_chunk
 234   and scan_commit populate this data structure with information about
 235   what strings MUST appear in the pattern. We look for the longest
 236   string that must appear for at a fixed location, and we look for the
 237   longest string that may appear at a floating location. So for instance
 238   in the pattern:
 239
 240     /FOO[xX]A.*B[xX]BAR/
 241
 242   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 243   strings (because they follow a .* construct). study_chunk will identify
 244   both FOO and BAR as being the longest fixed and floating strings respectively.
 245
 246   The strings can be composites, for instance
 247
 248      /(f)(o)(o)/
 249
 250   will result in a composite fixed substring 'foo'.
 251
 252   For each string some basic information is maintained:
 253
 254   - offset or min_offset
 255     This is the position the string must appear at, or not before.
 256     It also implicitly (when combined with minlenp) tells us how many
 257     character must match before the string we are searching.
 258     Likewise when combined with minlenp and the length of the string
 259     tells us how many characters must appear after the string we have
 260     found.
 261
 262   - max_offset
 263     Only used for floating strings. This is the rightmost point that
 264     the string can appear at. Ifset to I32 max it indicates that the
 265     string can occur infinitely far to the right.
 266
 267   - minlenp
 268     A pointer to the minimum length of the pattern that the string
 269     was found inside. This is important as in the case of positive
 270     lookahead or positive lookbehind we can have multiple patterns
 271     involved. Consider
 272
 273     /(?=FOO).*F/
 274
 275     The minimum length of the pattern overall is 3, the minimum length
 276     of the lookahead part is 3, but the minimum length of the part that
 277     will actually match is 1. So 'FOO's minimum length is 3, but the
 278     minimum length for the F is 1. This is important as the minimum length
 279     is used to determine offsets in front of and behind the string being
 280     looked for.  Since strings can be composites this is the length of the
 281     pattern at the time it was commited with a scan_commit. Note that
 282     the length is calculated by study_chunk, so that the minimum lengths
 283     are not known until the full pattern has been compiled, thus the
 284     pointer to the value.
 285
 286   - lookbehind
 287
 288     In the case of lookbehind the string being searched for can be
 289     offset past the start point of the final matching string.
 290     If this value was just blithely removed from the min_offset it would
 291     invalidate some of the calculations for how many chars must match
 292     before or after (as they are derived from min_offset and minlen and
 293     the length of the string being searched for).
 294     When the final pattern is compiled and the data is moved from the
 295     scan_data_t structure into the regexp structure the information
 296     about lookbehind is factored in, with the information that would
 297     have been lost precalculated in the end_shift field for the
 298     associated string.
 299
 300   The fields pos_min and pos_delta are used to store the minimum offset
 301   and the delta to the maximum offset at the current point in the pattern.
 302
 303 */
 304
 305 typedef struct scan_data_t {
 306     /*I32 len_min;      unused */
 307     /*I32 len_delta;    unused */
 308     I32 pos_min;
 309     I32 pos_delta;
 310     SV *last_found;
 311     I32 last_end;           /* min value, <0 unless valid. */
 312     I32 last_start_min;
 313     I32 last_start_max;
 314     SV **longest;           /* Either &l_fixed, or &l_float. */
 315     SV *longest_fixed;      /* longest fixed string found in pattern */
 316     I32 offset_fixed;       /* offset where it starts */
 317     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 318     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 319     SV *longest_float;      /* longest floating string found in pattern */
 320     I32 offset_float_min;   /* earliest point in string it can appear */
 321     I32 offset_float_max;   /* latest point in string it can appear */
 322     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 323     I32 lookbehind_float;   /* is the position of the string modified by LB */
 324     I32 flags;
 325     I32 whilem_c;
 326     I32 *last_closep;
 327     struct regnode_charclass_class *start_class;
 328 } scan_data_t;
 329
 330 /*
 331  * Forward declarations for pregcomp()'s friends.
 332  */
 333
 334 static const scan_data_t zero_scan_data =
 335   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 336
 337 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 338 #define SF_BEFORE_SEOL          0x0001
 339 #define SF_BEFORE_MEOL          0x0002
 340 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 341 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 342
 343 #ifdef NO_UNARY_PLUS
 344 #  define SF_FIX_SHIFT_EOL      (0+2)
 345 #  define SF_FL_SHIFT_EOL               (0+4)
 346 #else
 347 #  define SF_FIX_SHIFT_EOL      (+2)
 348 #  define SF_FL_SHIFT_EOL               (+4)
 349 #endif
 350
 351 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 352 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 353
 354 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 355 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 356 #define SF_IS_INF               0x0040
 357 #define SF_HAS_PAR              0x0080
 358 #define SF_IN_PAR               0x0100
 359 #define SF_HAS_EVAL             0x0200
 360 #define SCF_DO_SUBSTR           0x0400
 361 #define SCF_DO_STCLASS_AND      0x0800
 362 #define SCF_DO_STCLASS_OR       0x1000
 363 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 364 #define SCF_WHILEM_VISITED_POS  0x2000
 365
 366 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 367 #define SCF_SEEN_ACCEPT         0x8000
 368
 369 #define UTF (RExC_utf8 != 0)
 370 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
 371 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
 372
 373 #define OOB_UNICODE             12345678
 374 #define OOB_NAMEDCLASS          -1
 375
 376 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 377 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 378
 379
 380 /* length of regex to show in messages that don't mark a position within */
 381 #define RegexLengthToShowInErrorMessages 127
 382
 383 /*
 384  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 385  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 386  * op/pragma/warn/regcomp.
 387  */
 388 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 389 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 390
 391 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 392
 393 /*
 394  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 395  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 396  * "...".
 397  */
 398 #define _FAIL(code) STMT_START {                                        \
 399     const char *ellipses = "";                                          \
 400     IV len = RExC_end - RExC_precomp;                                   \
 401                                                                         \
 402     if (!SIZE_ONLY)                                                     \
 403         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 404     if (len > RegexLengthToShowInErrorMessages) {                       \
 405         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 406         len = RegexLengthToShowInErrorMessages - 10;                    \
 407         ellipses = "...";                                               \
 408     }                                                                   \
 409     code;                                                               \
 410 } STMT_END
 411
 412 #define FAIL(msg) _FAIL(                            \
 413     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 414             msg, (int)len, RExC_precomp, ellipses))
 415
 416 #define FAIL2(msg,arg) _FAIL(                       \
 417     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 418             arg, (int)len, RExC_precomp, ellipses))
 419
 420 /*
 421  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 422  */
 423 #define Simple_vFAIL(m) STMT_START {                                    \
 424     const IV offset = RExC_parse - RExC_precomp;                        \
 425     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 426             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 427 } STMT_END
 428
 429 /*
 430  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 431  */
 432 #define vFAIL(m) STMT_START {                           \
 433     if (!SIZE_ONLY)                                     \
 434         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 435     Simple_vFAIL(m);                                    \
 436 } STMT_END
 437
 438 /*
 439  * Like Simple_vFAIL(), but accepts two arguments.
 440  */
 441 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 442     const IV offset = RExC_parse - RExC_precomp;                        \
 443     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 444             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 445 } STMT_END
 446
 447 /*
 448  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 449  */
 450 #define vFAIL2(m,a1) STMT_START {                       \
 451     if (!SIZE_ONLY)                                     \
 452         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 453     Simple_vFAIL2(m, a1);                               \
 454 } STMT_END
 455
 456
 457 /*
 458  * Like Simple_vFAIL(), but accepts three arguments.
 459  */
 460 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 461     const IV offset = RExC_parse - RExC_precomp;                \
 462     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 463             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 464 } STMT_END
 465
 466 /*
 467  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 468  */
 469 #define vFAIL3(m,a1,a2) STMT_START {                    \
 470     if (!SIZE_ONLY)                                     \
 471         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 472     Simple_vFAIL3(m, a1, a2);                           \
 473 } STMT_END
 474
 475 /*
 476  * Like Simple_vFAIL(), but accepts four arguments.
 477  */
 478 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 479     const IV offset = RExC_parse - RExC_precomp;                \
 480     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 481             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 482 } STMT_END
 483
 484 #define ckWARNreg(loc,m) STMT_START {                                   \
 485     const IV offset = loc - RExC_precomp;                               \
 486     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 487             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 488 } STMT_END
 489
 490 #define ckWARNregdep(loc,m) STMT_START {                                \
 491     const IV offset = loc - RExC_precomp;                               \
 492     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 493             m REPORT_LOCATION,                                          \
 494             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 495 } STMT_END
 496
 497 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 498     const IV offset = loc - RExC_precomp;                               \
 499     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 500             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 501 } STMT_END
 502
 503 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 504     const IV offset = loc - RExC_precomp;                               \
 505     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 506             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 507 } STMT_END
 508
 509 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 510     const IV offset = loc - RExC_precomp;                               \
 511     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 512             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 513 } STMT_END
 514
 515 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 516     const IV offset = loc - RExC_precomp;                               \
 517     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 518             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 519 } STMT_END
 520
 521 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 522     const IV offset = loc - RExC_precomp;                               \
 523     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 524             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 525 } STMT_END
 526
 527 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 528     const IV offset = loc - RExC_precomp;                               \
 529     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 530             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 531 } STMT_END
 532
 533
 534 /* Allow for side effects in s */
 535 #define REGC(c,s) STMT_START {                  \
 536     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 537 } STMT_END
 538
 539 /* Macros for recording node offsets.   20001227 mjd@plover.com
 540  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 541  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 542  * Element 0 holds the number n.
 543  * Position is 1 indexed.
 544  */
 545 #ifndef RE_TRACK_PATTERN_OFFSETS
 546 #define Set_Node_Offset_To_R(node,byte)
 547 #define Set_Node_Offset(node,byte)
 548 #define Set_Cur_Node_Offset
 549 #define Set_Node_Length_To_R(node,len)
 550 #define Set_Node_Length(node,len)
 551 #define Set_Node_Cur_Length(node)
 552 #define Node_Offset(n)
 553 #define Node_Length(n)
 554 #define Set_Node_Offset_Length(node,offset,len)
 555 #define ProgLen(ri) ri->u.proglen
 556 #define SetProgLen(ri,x) ri->u.proglen = x
 557 #else
 558 #define ProgLen(ri) ri->u.offsets[0]
 559 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 560 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 561     if (! SIZE_ONLY) {                                                  \
 562         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 563                     __LINE__, (int)(node), (int)(byte)));               \
 564         if((node) < 0) {                                                \
 565             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 566         } else {                                                        \
 567             RExC_offsets[2*(node)-1] = (byte);                          \
 568         }                                                               \
 569     }                                                                   \
 570 } STMT_END
 571
 572 #define Set_Node_Offset(node,byte) \
 573     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 574 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 575
 576 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 577     if (! SIZE_ONLY) {                                                  \
 578         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 579                 __LINE__, (int)(node), (int)(len)));                    \
 580         if((node) < 0) {                                                \
 581             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 582         } else {                                                        \
 583             RExC_offsets[2*(node)] = (len);                             \
 584         }                                                               \
 585     }                                                                   \
 586 } STMT_END
 587
 588 #define Set_Node_Length(node,len) \
 589     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 590 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 591 #define Set_Node_Cur_Length(node) \
 592     Set_Node_Length(node, RExC_parse - parse_start)
 593
 594 /* Get offsets and lengths */
 595 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 596 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 597
 598 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 599     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 600     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 601 } STMT_END
 602 #endif
 603
 604 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 605 #define EXPERIMENTAL_INPLACESCAN
 606 #endif /*RE_TRACK_PATTERN_OFFSETS*/
 607
 608 #define DEBUG_STUDYDATA(str,data,depth)                              \
 609 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 610     PerlIO_printf(Perl_debug_log,                                    \
 611         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 612         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 613         (int)(depth)*2, "",                                          \
 614         (IV)((data)->pos_min),                                       \
 615         (IV)((data)->pos_delta),                                     \
 616         (UV)((data)->flags),                                         \
 617         (IV)((data)->whilem_c),                                      \
 618         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 619         is_inf ? "INF " : ""                                         \
 620     );                                                               \
 621     if ((data)->last_found)                                          \
 622         PerlIO_printf(Perl_debug_log,                                \
 623             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 624             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 625             SvPVX_const((data)->last_found),                         \
 626             (IV)((data)->last_end),                                  \
 627             (IV)((data)->last_start_min),                            \
 628             (IV)((data)->last_start_max),                            \
 629             ((data)->longest &&                                      \
 630              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 631             SvPVX_const((data)->longest_fixed),                      \
 632             (IV)((data)->offset_fixed),                              \
 633             ((data)->longest &&                                      \
 634              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 635             SvPVX_const((data)->longest_float),                      \
 636             (IV)((data)->offset_float_min),                          \
 637             (IV)((data)->offset_float_max)                           \
 638         );                                                           \
 639     PerlIO_printf(Perl_debug_log,"\n");                              \
 640 });
 641
 642 static void clear_re(pTHX_ void *r);
 643
 644 /* Mark that we cannot extend a found fixed substring at this point.
 645    Update the longest found anchored substring and the longest found
 646    floating substrings if needed. */
 647
 648 STATIC void
 649 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 650 {
 651     const STRLEN l = CHR_SVLEN(data->last_found);
 652     const STRLEN old_l = CHR_SVLEN(*data->longest);
 653     GET_RE_DEBUG_FLAGS_DECL;
 654
 655     PERL_ARGS_ASSERT_SCAN_COMMIT;
 656
 657     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 658         SvSetMagicSV(*data->longest, data->last_found);
 659         if (*data->longest == data->longest_fixed) {
 660             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 661             if (data->flags & SF_BEFORE_EOL)
 662                 data->flags
 663                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 664             else
 665                 data->flags &= ~SF_FIX_BEFORE_EOL;
 666             data->minlen_fixed=minlenp;
 667             data->lookbehind_fixed=0;
 668         }
 669         else { /* *data->longest == data->longest_float */
 670             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 671             data->offset_float_max = (l
 672                                       ? data->last_start_max
 673                                       : data->pos_min + data->pos_delta);
 674             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 675                 data->offset_float_max = I32_MAX;
 676             if (data->flags & SF_BEFORE_EOL)
 677                 data->flags
 678                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 679             else
 680                 data->flags &= ~SF_FL_BEFORE_EOL;
 681             data->minlen_float=minlenp;
 682             data->lookbehind_float=0;
 683         }
 684     }
 685     SvCUR_set(data->last_found, 0);
 686     {
 687         SV * const sv = data->last_found;
 688         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 689             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 690             if (mg)
 691                 mg->mg_len = 0;
 692         }
 693     }
 694     data->last_end = -1;
 695     data->flags &= ~SF_BEFORE_EOL;
 696     DEBUG_STUDYDATA("commit: ",data,0);
 697 }
 698
 699 /* Can match anything (initialization) */
 700 STATIC void
 701 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 702 {
 703     PERL_ARGS_ASSERT_CL_ANYTHING;
 704
 705     ANYOF_CLASS_ZERO(cl);
 706     ANYOF_BITMAP_SETALL(cl);
 707     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 708     if (LOC)
 709         cl->flags |= ANYOF_LOCALE;
 710 }
 711
 712 /* Can match anything (initialization) */
 713 STATIC int
 714 S_cl_is_anything(const struct regnode_charclass_class *cl)
 715 {
 716     int value;
 717
 718     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 719
 720     for (value = 0; value <= ANYOF_MAX; value += 2)
 721         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 722             return 1;
 723     if (!(cl->flags & ANYOF_UNICODE_ALL))
 724         return 0;
 725     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 726         return 0;
 727     return 1;
 728 }
 729
 730 /* Can match anything (initialization) */
 731 STATIC void
 732 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 733 {
 734     PERL_ARGS_ASSERT_CL_INIT;
 735
 736     Zero(cl, 1, struct regnode_charclass_class);
 737     cl->type = ANYOF;
 738     cl_anything(pRExC_state, cl);
 739 }
 740
 741 STATIC void
 742 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 743 {
 744     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 745
 746     Zero(cl, 1, struct regnode_charclass_class);
 747     cl->type = ANYOF;
 748     cl_anything(pRExC_state, cl);
 749     if (LOC)
 750         cl->flags |= ANYOF_LOCALE;
 751 }
 752
 753 /* 'And' a given class with another one.  Can create false positives */
 754 /* We assume that cl is not inverted */
 755 STATIC void
 756 S_cl_and(struct regnode_charclass_class *cl,
 757         const struct regnode_charclass_class *and_with)
 758 {
 759     PERL_ARGS_ASSERT_CL_AND;
 760
 761     assert(and_with->type == ANYOF);
 762     if (!(and_with->flags & ANYOF_CLASS)
 763         && !(cl->flags & ANYOF_CLASS)
 764         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 765         && !(and_with->flags & ANYOF_FOLD)
 766         && !(cl->flags & ANYOF_FOLD)) {
 767         int i;
 768
 769         if (and_with->flags & ANYOF_INVERT)
 770             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 771                 cl->bitmap[i] &= ~and_with->bitmap[i];
 772         else
 773             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 774                 cl->bitmap[i] &= and_with->bitmap[i];
 775     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 776     if (!(and_with->flags & ANYOF_EOS))
 777         cl->flags &= ~ANYOF_EOS;
 778
 779     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
 780         !(and_with->flags & ANYOF_INVERT)) {
 781         cl->flags &= ~ANYOF_UNICODE_ALL;
 782         cl->flags |= ANYOF_UNICODE;
 783         ARG_SET(cl, ARG(and_with));
 784     }
 785     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 786         !(and_with->flags & ANYOF_INVERT))
 787         cl->flags &= ~ANYOF_UNICODE_ALL;
 788     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
 789         !(and_with->flags & ANYOF_INVERT))
 790         cl->flags &= ~ANYOF_UNICODE;
 791 }
 792
 793 /* 'OR' a given class with another one.  Can create false positives */
 794 /* We assume that cl is not inverted */
 795 STATIC void
 796 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 797 {
 798     PERL_ARGS_ASSERT_CL_OR;
 799
 800     if (or_with->flags & ANYOF_INVERT) {
 801         /* We do not use
 802          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 803          *   <= (B1 | !B2) | (CL1 | !CL2)
 804          * which is wasteful if CL2 is small, but we ignore CL2:
 805          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 806          * XXXX Can we handle case-fold?  Unclear:
 807          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 808          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 809          */
 810         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 811              && !(or_with->flags & ANYOF_FOLD)
 812              && !(cl->flags & ANYOF_FOLD) ) {
 813             int i;
 814
 815             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 816                 cl->bitmap[i] |= ~or_with->bitmap[i];
 817         } /* XXXX: logic is complicated otherwise */
 818         else {
 819             cl_anything(pRExC_state, cl);
 820         }
 821     } else {
 822         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 823         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 824              && (!(or_with->flags & ANYOF_FOLD)
 825                  || (cl->flags & ANYOF_FOLD)) ) {
 826             int i;
 827
 828             /* OR char bitmap and class bitmap separately */
 829             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 830                 cl->bitmap[i] |= or_with->bitmap[i];
 831             if (or_with->flags & ANYOF_CLASS) {
 832                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 833                     cl->classflags[i] |= or_with->classflags[i];
 834                 cl->flags |= ANYOF_CLASS;
 835             }
 836         }
 837         else { /* XXXX: logic is complicated, leave it along for a moment. */
 838             cl_anything(pRExC_state, cl);
 839         }
 840     }
 841     if (or_with->flags & ANYOF_EOS)
 842         cl->flags |= ANYOF_EOS;
 843
 844     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
 845         ARG(cl) != ARG(or_with)) {
 846         cl->flags |= ANYOF_UNICODE_ALL;
 847         cl->flags &= ~ANYOF_UNICODE;
 848     }
 849     if (or_with->flags & ANYOF_UNICODE_ALL) {
 850         cl->flags |= ANYOF_UNICODE_ALL;
 851         cl->flags &= ~ANYOF_UNICODE;
 852     }
 853 }
 854
 855 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 856 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 857 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 858 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 859
 860
 861 #ifdef DEBUGGING
 862 /*
 863    dump_trie(trie,widecharmap,revcharmap)
 864    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 865    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 866
 867    These routines dump out a trie in a somewhat readable format.
 868    The _interim_ variants are used for debugging the interim
 869    tables that are used to generate the final compressed
 870    representation which is what dump_trie expects.
 871
 872    Part of the reason for their existance is to provide a form
 873    of documentation as to how the different representations function.
 874
 875 */
 876
 877 /*
 878   Dumps the final compressed table form of the trie to Perl_debug_log.
 879   Used for debugging make_trie().
 880 */
 881
 882 STATIC void
 883 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 884             AV *revcharmap, U32 depth)
 885 {
 886     U32 state;
 887     SV *sv=sv_newmortal();
 888     int colwidth= widecharmap ? 6 : 4;
 889     U16 word;
 890     GET_RE_DEBUG_FLAGS_DECL;
 891
 892     PERL_ARGS_ASSERT_DUMP_TRIE;
 893
 894     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 895         (int)depth * 2 + 2,"",
 896         "Match","Base","Ofs" );
 897
 898     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 899         SV ** const tmp = av_fetch( revcharmap, state, 0);
 900         if ( tmp ) {
 901             PerlIO_printf( Perl_debug_log, "%*s",
 902                 colwidth,
 903                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 904                             PL_colors[0], PL_colors[1],
 905                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 906                             PERL_PV_ESCAPE_FIRSTCHAR
 907                 )
 908             );
 909         }
 910     }
 911     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 912         (int)depth * 2 + 2,"");
 913
 914     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 915         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 916     PerlIO_printf( Perl_debug_log, "\n");
 917
 918     for( state = 1 ; state < trie->statecount ; state++ ) {
 919         const U32 base = trie->states[ state ].trans.base;
 920
 921         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 922
 923         if ( trie->states[ state ].wordnum ) {
 924             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 925         } else {
 926             PerlIO_printf( Perl_debug_log, "%6s", "" );
 927         }
 928
 929         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 930
 931         if ( base ) {
 932             U32 ofs = 0;
 933
 934             while( ( base + ofs  < trie->uniquecharcount ) ||
 935                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 936                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 937                     ofs++;
 938
 939             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 940
 941             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 942                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 943                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 944                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 945                 {
 946                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 947                     colwidth,
 948                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 949                 } else {
 950                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 951                 }
 952             }
 953
 954             PerlIO_printf( Perl_debug_log, "]");
 955
 956         }
 957         PerlIO_printf( Perl_debug_log, "\n" );
 958     }
 959     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 960     for (word=1; word <= trie->wordcount; word++) {
 961         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 962             (int)word, (int)(trie->wordinfo[word].prev),
 963             (int)(trie->wordinfo[word].len));
 964     }
 965     PerlIO_printf(Perl_debug_log, "\n" );
 966 }
 967 /*
 968   Dumps a fully constructed but uncompressed trie in list form.
 969   List tries normally only are used for construction when the number of
 970   possible chars (trie->uniquecharcount) is very high.
 971   Used for debugging make_trie().
 972 */
 973 STATIC void
 974 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 975                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
 976                          U32 depth)
 977 {
 978     U32 state;
 979     SV *sv=sv_newmortal();
 980     int colwidth= widecharmap ? 6 : 4;
 981     GET_RE_DEBUG_FLAGS_DECL;
 982
 983     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 984
 985     /* print out the table precompression.  */
 986     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
 987         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
 988         "------:-----+-----------------\n" );
 989
 990     for( state=1 ; state < next_alloc ; state ++ ) {
 991         U16 charid;
 992
 993         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
 994             (int)depth * 2 + 2,"", (UV)state  );
 995         if ( ! trie->states[ state ].wordnum ) {
 996             PerlIO_printf( Perl_debug_log, "%5s| ","");
 997         } else {
 998             PerlIO_printf( Perl_debug_log, "W%4x| ",
 999                 trie->states[ state ].wordnum
1000             );
1001         }
1002         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1003             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1004             if ( tmp ) {
1005                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1006                     colwidth,
1007                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1008                             PL_colors[0], PL_colors[1],
1009                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1010                             PERL_PV_ESCAPE_FIRSTCHAR
1011                     ) ,
1012                     TRIE_LIST_ITEM(state,charid).forid,
1013                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1014                 );
1015                 if (!(charid % 10))
1016                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1017                         (int)((depth * 2) + 14), "");
1018             }
1019         }
1020         PerlIO_printf( Perl_debug_log, "\n");
1021     }
1022 }
1023
1024 /*
1025   Dumps a fully constructed but uncompressed trie in table form.
1026   This is the normal DFA style state transition table, with a few
1027   twists to facilitate compression later.
1028   Used for debugging make_trie().
1029 */
1030 STATIC void
1031 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1032                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1033                           U32 depth)
1034 {
1035     U32 state;
1036     U16 charid;
1037     SV *sv=sv_newmortal();
1038     int colwidth= widecharmap ? 6 : 4;
1039     GET_RE_DEBUG_FLAGS_DECL;
1040
1041     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1042
1043     /*
1044        print out the table precompression so that we can do a visual check
1045        that they are identical.
1046      */
1047
1048     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1049
1050     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1051         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1052         if ( tmp ) {
1053             PerlIO_printf( Perl_debug_log, "%*s",
1054                 colwidth,
1055                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1056                             PL_colors[0], PL_colors[1],
1057                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1058                             PERL_PV_ESCAPE_FIRSTCHAR
1059                 )
1060             );
1061         }
1062     }
1063
1064     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1065
1066     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1067         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1068     }
1069
1070     PerlIO_printf( Perl_debug_log, "\n" );
1071
1072     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1073
1074         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1075             (int)depth * 2 + 2,"",
1076             (UV)TRIE_NODENUM( state ) );
1077
1078         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1079             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1080             if (v)
1081                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1082             else
1083                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1084         }
1085         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1086             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1087         } else {
1088             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1089             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1090         }
1091     }
1092 }
1093
1094 #endif
1095
1096
1097 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1098   startbranch: the first branch in the whole branch sequence
1099   first      : start branch of sequence of branch-exact nodes.
1100                May be the same as startbranch
1101   last       : Thing following the last branch.
1102                May be the same as tail.
1103   tail       : item following the branch sequence
1104   count      : words in the sequence
1105   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1106   depth      : indent depth
1107
1108 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1109
1110 A trie is an N'ary tree where the branches are determined by digital
1111 decomposition of the key. IE, at the root node you look up the 1st character and
1112 follow that branch repeat until you find the end of the branches. Nodes can be
1113 marked as "accepting" meaning they represent a complete word. Eg:
1114
1115   /he|she|his|hers/
1116
1117 would convert into the following structure. Numbers represent states, letters
1118 following numbers represent valid transitions on the letter from that state, if
1119 the number is in square brackets it represents an accepting state, otherwise it
1120 will be in parenthesis.
1121
1122       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1123       |    |
1124       |   (2)
1125       |    |
1126      (1)   +-i->(6)-+-s->[7]
1127       |
1128       +-s->(3)-+-h->(4)-+-e->[5]
1129
1130       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1131
1132 This shows that when matching against the string 'hers' we will begin at state 1
1133 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1134 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1135 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1136 single traverse. We store a mapping from accepting to state to which word was
1137 matched, and then when we have multiple possibilities we try to complete the
1138 rest of the regex in the order in which they occured in the alternation.
1139
1140 The only prior NFA like behaviour that would be changed by the TRIE support is
1141 the silent ignoring of duplicate alternations which are of the form:
1142
1143  / (DUPE|DUPE) X? (?{ ... }) Y /x
1144
1145 Thus EVAL blocks follwing a trie may be called a different number of times with
1146 and without the optimisation. With the optimisations dupes will be silently
1147 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1148 the following demonstrates:
1149
1150  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1151
1152 which prints out 'word' three times, but
1153
1154  'words'=~/(word|word|word)(?{ print $1 })S/
1155
1156 which doesnt print it out at all. This is due to other optimisations kicking in.
1157
1158 Example of what happens on a structural level:
1159
1160 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1161
1162    1: CURLYM[1] {1,32767}(18)
1163    5:   BRANCH(8)
1164    6:     EXACT <ac>(16)
1165    8:   BRANCH(11)
1166    9:     EXACT <ad>(16)
1167   11:   BRANCH(14)
1168   12:     EXACT <ab>(16)
1169   16:   SUCCEED(0)
1170   17:   NOTHING(18)
1171   18: END(0)
1172
1173 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1174 and should turn into:
1175
1176    1: CURLYM[1] {1,32767}(18)
1177    5:   TRIE(16)
1178         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1179           <ac>
1180           <ad>
1181           <ab>
1182   16:   SUCCEED(0)
1183   17:   NOTHING(18)
1184   18: END(0)
1185
1186 Cases where tail != last would be like /(?foo|bar)baz/:
1187
1188    1: BRANCH(4)
1189    2:   EXACT <foo>(8)
1190    4: BRANCH(7)
1191    5:   EXACT <bar>(8)
1192    7: TAIL(8)
1193    8: EXACT <baz>(10)
1194   10: END(0)
1195
1196 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1197 and would end up looking like:
1198
1199     1: TRIE(8)
1200       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1201         <foo>
1202         <bar>
1203    7: TAIL(8)
1204    8: EXACT <baz>(10)
1205   10: END(0)
1206
1207     d = uvuni_to_utf8_flags(d, uv, 0);
1208
1209 is the recommended Unicode-aware way of saying
1210
1211     *(d++) = uv;
1212 */
1213
1214 #define TRIE_STORE_REVCHAR                                                 \
1215     STMT_START {                                                           \
1216         if (UTF) {                                                         \
1217             SV *zlopp = newSV(2);                                          \
1218             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1219             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1220             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1221             SvPOK_on(zlopp);                                               \
1222             SvUTF8_on(zlopp);                                              \
1223             av_push(revcharmap, zlopp);                                    \
1224         } else {                                                           \
1225             char ooooff = (char)uvc;                                               \
1226             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1227         }                                                                  \
1228         } STMT_END
1229
1230 #define TRIE_READ_CHAR STMT_START {                                           \
1231     wordlen++;                                                                \
1232     if ( UTF ) {                                                              \
1233         if ( folder ) {                                                       \
1234             if ( foldlen > 0 ) {                                              \
1235                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1236                foldlen -= len;                                                \
1237                scan += len;                                                   \
1238                len = 0;                                                       \
1239             } else {                                                          \
1240                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1241                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1242                 foldlen -= UNISKIP( uvc );                                    \
1243                 scan = foldbuf + UNISKIP( uvc );                              \
1244             }                                                                 \
1245         } else {                                                              \
1246             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1247         }                                                                     \
1248     } else {                                                                  \
1249         uvc = (U32)*uc;                                                       \
1250         len = 1;                                                              \
1251     }                                                                         \
1252 } STMT_END
1253
1254
1255
1256 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1257     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1258         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1259         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1260     }                                                           \
1261     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1262     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1263     TRIE_LIST_CUR( state )++;                                   \
1264 } STMT_END
1265
1266 #define TRIE_LIST_NEW(state) STMT_START {                       \
1267     Newxz( trie->states[ state ].trans.list,               \
1268         4, reg_trie_trans_le );                                 \
1269      TRIE_LIST_CUR( state ) = 1;                                \
1270      TRIE_LIST_LEN( state ) = 4;                                \
1271 } STMT_END
1272
1273 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1274     U16 dupe= trie->states[ state ].wordnum;                    \
1275     regnode * const noper_next = regnext( noper );              \
1276                                                                 \
1277     DEBUG_r({                                                   \
1278         /* store the word for dumping */                        \
1279         SV* tmp;                                                \
1280         if (OP(noper) != NOTHING)                               \
1281             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1282         else                                                    \
1283             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1284         av_push( trie_words, tmp );                             \
1285     });                                                         \
1286                                                                 \
1287     curword++;                                                  \
1288     trie->wordinfo[curword].prev   = 0;                         \
1289     trie->wordinfo[curword].len    = wordlen;                   \
1290     trie->wordinfo[curword].accept = state;                     \
1291                                                                 \
1292     if ( noper_next < tail ) {                                  \
1293         if (!trie->jump)                                        \
1294             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1295         trie->jump[curword] = (U16)(noper_next - convert);      \
1296         if (!jumper)                                            \
1297             jumper = noper_next;                                \
1298         if (!nextbranch)                                        \
1299             nextbranch= regnext(cur);                           \
1300     }                                                           \
1301                                                                 \
1302     if ( dupe ) {                                               \
1303         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1304         /* chain, so that when the bits of chain are later    */\
1305         /* linked together, the dups appear in the chain      */\
1306         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1307         trie->wordinfo[dupe].prev = curword;                    \
1308     } else {                                                    \
1309         /* we haven't inserted this word yet.                */ \
1310         trie->states[ state ].wordnum = curword;                \
1311     }                                                           \
1312 } STMT_END
1313
1314
1315 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1316      ( ( base + charid >=  ucharcount                                   \
1317          && base + charid < ubound                                      \
1318          && state == trie->trans[ base - ucharcount + charid ].check    \
1319          && trie->trans[ base - ucharcount + charid ].next )            \
1320            ? trie->trans[ base - ucharcount + charid ].next             \
1321            : ( state==1 ? special : 0 )                                 \
1322       )
1323
1324 #define MADE_TRIE       1
1325 #define MADE_JUMP_TRIE  2
1326 #define MADE_EXACT_TRIE 4
1327
1328 STATIC I32
1329 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1330 {
1331     dVAR;
1332     /* first pass, loop through and scan words */
1333     reg_trie_data *trie;
1334     HV *widecharmap = NULL;
1335     AV *revcharmap = newAV();
1336     regnode *cur;
1337     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1338     STRLEN len = 0;
1339     UV uvc = 0;
1340     U16 curword = 0;
1341     U32 next_alloc = 0;
1342     regnode *jumper = NULL;
1343     regnode *nextbranch = NULL;
1344     regnode *convert = NULL;
1345     U32 *prev_states; /* temp array mapping each state to previous one */
1346     /* we just use folder as a flag in utf8 */
1347     const U8 * const folder = ( flags == EXACTF
1348                        ? PL_fold
1349                        : ( flags == EXACTFL
1350                            ? PL_fold_locale
1351                            : NULL
1352                          )
1353                      );
1354
1355 #ifdef DEBUGGING
1356     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1357     AV *trie_words = NULL;
1358     /* along with revcharmap, this only used during construction but both are
1359      * useful during debugging so we store them in the struct when debugging.
1360      */
1361 #else
1362     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1363     STRLEN trie_charcount=0;
1364 #endif
1365     SV *re_trie_maxbuff;
1366     GET_RE_DEBUG_FLAGS_DECL;
1367
1368     PERL_ARGS_ASSERT_MAKE_TRIE;
1369 #ifndef DEBUGGING
1370     PERL_UNUSED_ARG(depth);
1371 #endif
1372
1373     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1374     trie->refcount = 1;
1375     trie->startstate = 1;
1376     trie->wordcount = word_count;
1377     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1378     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1379     if (!(UTF && folder))
1380         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1381     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1382                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1383
1384     DEBUG_r({
1385         trie_words = newAV();
1386     });
1387
1388     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1389     if (!SvIOK(re_trie_maxbuff)) {
1390         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1391     }
1392     DEBUG_OPTIMISE_r({
1393                 PerlIO_printf( Perl_debug_log,
1394                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1395                   (int)depth * 2 + 2, "",
1396                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1397                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1398                   (int)depth);
1399     });
1400
1401    /* Find the node we are going to overwrite */
1402     if ( first == startbranch && OP( last ) != BRANCH ) {
1403         /* whole branch chain */
1404         convert = first;
1405     } else {
1406         /* branch sub-chain */
1407         convert = NEXTOPER( first );
1408     }
1409
1410     /*  -- First loop and Setup --
1411
1412        We first traverse the branches and scan each word to determine if it
1413        contains widechars, and how many unique chars there are, this is
1414        important as we have to build a table with at least as many columns as we
1415        have unique chars.
1416
1417        We use an array of integers to represent the character codes 0..255
1418        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1419        native representation of the character value as the key and IV's for the
1420        coded index.
1421
1422        *TODO* If we keep track of how many times each character is used we can
1423        remap the columns so that the table compression later on is more
1424        efficient in terms of memory by ensuring most common value is in the
1425        middle and the least common are on the outside.  IMO this would be better
1426        than a most to least common mapping as theres a decent chance the most
1427        common letter will share a node with the least common, meaning the node
1428        will not be compressable. With a middle is most common approach the worst
1429        case is when we have the least common nodes twice.
1430
1431      */
1432
1433     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1434         regnode * const noper = NEXTOPER( cur );
1435         const U8 *uc = (U8*)STRING( noper );
1436         const U8 * const e  = uc + STR_LEN( noper );
1437         STRLEN foldlen = 0;
1438         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1439         const U8 *scan = (U8*)NULL;
1440         U32 wordlen      = 0;         /* required init */
1441         STRLEN chars = 0;
1442         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1443
1444         if (OP(noper) == NOTHING) {
1445             trie->minlen= 0;
1446             continue;
1447         }
1448         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1449             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1450                                           regardless of encoding */
1451
1452         for ( ; uc < e ; uc += len ) {
1453             TRIE_CHARCOUNT(trie)++;
1454             TRIE_READ_CHAR;
1455             chars++;
1456             if ( uvc < 256 ) {
1457                 if ( !trie->charmap[ uvc ] ) {
1458                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1459                     if ( folder )
1460                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1461                     TRIE_STORE_REVCHAR;
1462                 }
1463                 if ( set_bit ) {
1464                     /* store the codepoint in the bitmap, and if its ascii
1465                        also store its folded equivelent. */
1466                     TRIE_BITMAP_SET(trie,uvc);
1467
1468                     /* store the folded codepoint */
1469                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1470
1471                     if ( !UTF ) {
1472                         /* store first byte of utf8 representation of
1473                            codepoints in the 127 < uvc < 256 range */
1474                         if (127 < uvc && uvc < 192) {
1475                             TRIE_BITMAP_SET(trie,194);
1476                         } else if (191 < uvc ) {
1477                             TRIE_BITMAP_SET(trie,195);
1478                         /* && uvc < 256 -- we know uvc is < 256 already */
1479                         }
1480                     }
1481                     set_bit = 0; /* We've done our bit :-) */
1482                 }
1483             } else {
1484                 SV** svpp;
1485                 if ( !widecharmap )
1486                     widecharmap = newHV();
1487
1488                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1489
1490                 if ( !svpp )
1491                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1492
1493                 if ( !SvTRUE( *svpp ) ) {
1494                     sv_setiv( *svpp, ++trie->uniquecharcount );
1495                     TRIE_STORE_REVCHAR;
1496                 }
1497             }
1498         }
1499         if( cur == first ) {
1500             trie->minlen=chars;
1501             trie->maxlen=chars;
1502         } else if (chars < trie->minlen) {
1503             trie->minlen=chars;
1504         } else if (chars > trie->maxlen) {
1505             trie->maxlen=chars;
1506         }
1507
1508     } /* end first pass */
1509     DEBUG_TRIE_COMPILE_r(
1510         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1511                 (int)depth * 2 + 2,"",
1512                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1513                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1514                 (int)trie->minlen, (int)trie->maxlen )
1515     );
1516
1517     /*
1518         We now know what we are dealing with in terms of unique chars and
1519         string sizes so we can calculate how much memory a naive
1520         representation using a flat table  will take. If it's over a reasonable
1521         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1522         conservative but potentially much slower representation using an array
1523         of lists.
1524
1525         At the end we convert both representations into the same compressed
1526         form that will be used in regexec.c for matching with. The latter
1527         is a form that cannot be used to construct with but has memory
1528         properties similar to the list form and access properties similar
1529         to the table form making it both suitable for fast searches and
1530         small enough that its feasable to store for the duration of a program.
1531
1532         See the comment in the code where the compressed table is produced
1533         inplace from the flat tabe representation for an explanation of how
1534         the compression works.
1535
1536     */
1537
1538
1539     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1540     prev_states[1] = 0;
1541
1542     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1543         /*
1544             Second Pass -- Array Of Lists Representation
1545
1546             Each state will be represented by a list of charid:state records
1547             (reg_trie_trans_le) the first such element holds the CUR and LEN
1548             points of the allocated array. (See defines above).
1549
1550             We build the initial structure using the lists, and then convert
1551             it into the compressed table form which allows faster lookups
1552             (but cant be modified once converted).
1553         */
1554
1555         STRLEN transcount = 1;
1556
1557         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1558             "%*sCompiling trie using list compiler\n",
1559             (int)depth * 2 + 2, ""));
1560
1561         trie->states = (reg_trie_state *)
1562             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1563                                   sizeof(reg_trie_state) );
1564         TRIE_LIST_NEW(1);
1565         next_alloc = 2;
1566
1567         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1568
1569             regnode * const noper = NEXTOPER( cur );
1570             U8 *uc           = (U8*)STRING( noper );
1571             const U8 * const e = uc + STR_LEN( noper );
1572             U32 state        = 1;         /* required init */
1573             U16 charid       = 0;         /* sanity init */
1574             U8 *scan         = (U8*)NULL; /* sanity init */
1575             STRLEN foldlen   = 0;         /* required init */
1576             U32 wordlen      = 0;         /* required init */
1577             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1578
1579             if (OP(noper) != NOTHING) {
1580                 for ( ; uc < e ; uc += len ) {
1581
1582                     TRIE_READ_CHAR;
1583
1584                     if ( uvc < 256 ) {
1585                         charid = trie->charmap[ uvc ];
1586                     } else {
1587                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1588                         if ( !svpp ) {
1589                             charid = 0;
1590                         } else {
1591                             charid=(U16)SvIV( *svpp );
1592                         }
1593                     }
1594                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1595                     if ( charid ) {
1596
1597                         U16 check;
1598                         U32 newstate = 0;
1599
1600                         charid--;
1601                         if ( !trie->states[ state ].trans.list ) {
1602                             TRIE_LIST_NEW( state );
1603                         }
1604                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1605                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1606                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1607                                 break;
1608                             }
1609                         }
1610                         if ( ! newstate ) {
1611                             newstate = next_alloc++;
1612                             prev_states[newstate] = state;
1613                             TRIE_LIST_PUSH( state, charid, newstate );
1614                             transcount++;
1615                         }
1616                         state = newstate;
1617                     } else {
1618                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1619                     }
1620                 }
1621             }
1622             TRIE_HANDLE_WORD(state);
1623
1624         } /* end second pass */
1625
1626         /* next alloc is the NEXT state to be allocated */
1627         trie->statecount = next_alloc;
1628         trie->states = (reg_trie_state *)
1629             PerlMemShared_realloc( trie->states,
1630                                    next_alloc
1631                                    * sizeof(reg_trie_state) );
1632
1633         /* and now dump it out before we compress it */
1634         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1635                                                          revcharmap, next_alloc,
1636                                                          depth+1)
1637         );
1638
1639         trie->trans = (reg_trie_trans *)
1640             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1641         {
1642             U32 state;
1643             U32 tp = 0;
1644             U32 zp = 0;
1645
1646
1647             for( state=1 ; state < next_alloc ; state ++ ) {
1648                 U32 base=0;
1649
1650                 /*
1651                 DEBUG_TRIE_COMPILE_MORE_r(
1652                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1653                 );
1654                 */
1655
1656                 if (trie->states[state].trans.list) {
1657                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1658                     U16 maxid=minid;
1659                     U16 idx;
1660
1661                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1662                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1663                         if ( forid < minid ) {
1664                             minid=forid;
1665                         } else if ( forid > maxid ) {
1666                             maxid=forid;
1667                         }
1668                     }
1669                     if ( transcount < tp + maxid - minid + 1) {
1670                         transcount *= 2;
1671                         trie->trans = (reg_trie_trans *)
1672                             PerlMemShared_realloc( trie->trans,
1673                                                      transcount
1674                                                      * sizeof(reg_trie_trans) );
1675                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1676                     }
1677                     base = trie->uniquecharcount + tp - minid;
1678                     if ( maxid == minid ) {
1679                         U32 set = 0;
1680                         for ( ; zp < tp ; zp++ ) {
1681                             if ( ! trie->trans[ zp ].next ) {
1682                                 base = trie->uniquecharcount + zp - minid;
1683                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1684                                 trie->trans[ zp ].check = state;
1685                                 set = 1;
1686                                 break;
1687                             }
1688                         }
1689                         if ( !set ) {
1690                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1691                             trie->trans[ tp ].check = state;
1692                             tp++;
1693                             zp = tp;
1694                         }
1695                     } else {
1696                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1697                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1698                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1699                             trie->trans[ tid ].check = state;
1700                         }
1701                         tp += ( maxid - minid + 1 );
1702                     }
1703                     Safefree(trie->states[ state ].trans.list);
1704                 }
1705                 /*
1706                 DEBUG_TRIE_COMPILE_MORE_r(
1707                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1708                 );
1709                 */
1710                 trie->states[ state ].trans.base=base;
1711             }
1712             trie->lasttrans = tp + 1;
1713         }
1714     } else {
1715         /*
1716            Second Pass -- Flat Table Representation.
1717
1718            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1719            We know that we will need Charcount+1 trans at most to store the data
1720            (one row per char at worst case) So we preallocate both structures
1721            assuming worst case.
1722
1723            We then construct the trie using only the .next slots of the entry
1724            structs.
1725
1726            We use the .check field of the first entry of the node  temporarily to
1727            make compression both faster and easier by keeping track of how many non
1728            zero fields are in the node.
1729
1730            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1731            transition.
1732
1733            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1734            number representing the first entry of the node, and state as a
1735            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1736            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1737            are 2 entrys per node. eg:
1738
1739              A B       A B
1740           1. 2 4    1. 3 7
1741           2. 0 3    3. 0 5
1742           3. 0 0    5. 0 0
1743           4. 0 0    7. 0 0
1744
1745            The table is internally in the right hand, idx form. However as we also
1746            have to deal with the states array which is indexed by nodenum we have to
1747            use TRIE_NODENUM() to convert.
1748
1749         */
1750         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1751             "%*sCompiling trie using table compiler\n",
1752             (int)depth * 2 + 2, ""));
1753
1754         trie->trans = (reg_trie_trans *)
1755             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1756                                   * trie->uniquecharcount + 1,
1757                                   sizeof(reg_trie_trans) );
1758         trie->states = (reg_trie_state *)
1759             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1760                                   sizeof(reg_trie_state) );
1761         next_alloc = trie->uniquecharcount + 1;
1762
1763
1764         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1765
1766             regnode * const noper   = NEXTOPER( cur );
1767             const U8 *uc     = (U8*)STRING( noper );
1768             const U8 * const e = uc + STR_LEN( noper );
1769
1770             U32 state        = 1;         /* required init */
1771
1772             U16 charid       = 0;         /* sanity init */
1773             U32 accept_state = 0;         /* sanity init */
1774             U8 *scan         = (U8*)NULL; /* sanity init */
1775
1776             STRLEN foldlen   = 0;         /* required init */
1777             U32 wordlen      = 0;         /* required init */
1778             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1779
1780             if ( OP(noper) != NOTHING ) {
1781                 for ( ; uc < e ; uc += len ) {
1782
1783                     TRIE_READ_CHAR;
1784
1785                     if ( uvc < 256 ) {
1786                         charid = trie->charmap[ uvc ];
1787                     } else {
1788                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1789                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1790                     }
1791                     if ( charid ) {
1792                         charid--;
1793                         if ( !trie->trans[ state + charid ].next ) {
1794                             trie->trans[ state + charid ].next = next_alloc;
1795                             trie->trans[ state ].check++;
1796                             prev_states[TRIE_NODENUM(next_alloc)]
1797                                     = TRIE_NODENUM(state);
1798                             next_alloc += trie->uniquecharcount;
1799                         }
1800                         state = trie->trans[ state + charid ].next;
1801                     } else {
1802                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1803                     }
1804                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1805                 }
1806             }
1807             accept_state = TRIE_NODENUM( state );
1808             TRIE_HANDLE_WORD(accept_state);
1809
1810         } /* end second pass */
1811
1812         /* and now dump it out before we compress it */
1813         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1814                                                           revcharmap,
1815                                                           next_alloc, depth+1));
1816
1817         {
1818         /*
1819            * Inplace compress the table.*
1820
1821            For sparse data sets the table constructed by the trie algorithm will
1822            be mostly 0/FAIL transitions or to put it another way mostly empty.
1823            (Note that leaf nodes will not contain any transitions.)
1824
1825            This algorithm compresses the tables by eliminating most such
1826            transitions, at the cost of a modest bit of extra work during lookup:
1827
1828            - Each states[] entry contains a .base field which indicates the
1829            index in the state[] array wheres its transition data is stored.
1830
1831            - If .base is 0 there are no  valid transitions from that node.
1832
1833            - If .base is nonzero then charid is added to it to find an entry in
1834            the trans array.
1835
1836            -If trans[states[state].base+charid].check!=state then the
1837            transition is taken to be a 0/Fail transition. Thus if there are fail
1838            transitions at the front of the node then the .base offset will point
1839            somewhere inside the previous nodes data (or maybe even into a node
1840            even earlier), but the .check field determines if the transition is
1841            valid.
1842
1843            XXX - wrong maybe?
1844            The following process inplace converts the table to the compressed
1845            table: We first do not compress the root node 1,and mark its all its
1846            .check pointers as 1 and set its .base pointer as 1 as well. This
1847            allows to do a DFA construction from the compressed table later, and
1848            ensures that any .base pointers we calculate later are greater than
1849            0.
1850
1851            - We set 'pos' to indicate the first entry of the second node.
1852
1853            - We then iterate over the columns of the node, finding the first and
1854            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1855            and set the .check pointers accordingly, and advance pos
1856            appropriately and repreat for the next node. Note that when we copy
1857            the next pointers we have to convert them from the original
1858            NODEIDX form to NODENUM form as the former is not valid post
1859            compression.
1860
1861            - If a node has no transitions used we mark its base as 0 and do not
1862            advance the pos pointer.
1863
1864            - If a node only has one transition we use a second pointer into the
1865            structure to fill in allocated fail transitions from other states.
1866            This pointer is independent of the main pointer and scans forward
1867            looking for null transitions that are allocated to a state. When it
1868            finds one it writes the single transition into the "hole".  If the
1869            pointer doesnt find one the single transition is appended as normal.
1870
1871            - Once compressed we can Renew/realloc the structures to release the
1872            excess space.
1873
1874            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1875            specifically Fig 3.47 and the associated pseudocode.
1876
1877            demq
1878         */
1879         const U32 laststate = TRIE_NODENUM( next_alloc );
1880         U32 state, charid;
1881         U32 pos = 0, zp=0;
1882         trie->statecount = laststate;
1883
1884         for ( state = 1 ; state < laststate ; state++ ) {
1885             U8 flag = 0;
1886             const U32 stateidx = TRIE_NODEIDX( state );
1887             const U32 o_used = trie->trans[ stateidx ].check;
1888             U32 used = trie->trans[ stateidx ].check;
1889             trie->trans[ stateidx ].check = 0;
1890
1891             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1892                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1893                     if ( trie->trans[ stateidx + charid ].next ) {
1894                         if (o_used == 1) {
1895                             for ( ; zp < pos ; zp++ ) {
1896                                 if ( ! trie->trans[ zp ].next ) {
1897                                     break;
1898                                 }
1899                             }
1900                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1901                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1902                             trie->trans[ zp ].check = state;
1903                             if ( ++zp > pos ) pos = zp;
1904                             break;
1905                         }
1906                         used--;
1907                     }
1908                     if ( !flag ) {
1909                         flag = 1;
1910                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1911                     }
1912                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1913                     trie->trans[ pos ].check = state;
1914                     pos++;
1915                 }
1916             }
1917         }
1918         trie->lasttrans = pos + 1;
1919         trie->states = (reg_trie_state *)
1920             PerlMemShared_realloc( trie->states, laststate
1921                                    * sizeof(reg_trie_state) );
1922         DEBUG_TRIE_COMPILE_MORE_r(
1923                 PerlIO_printf( Perl_debug_log,
1924                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1925                     (int)depth * 2 + 2,"",
1926                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1927                     (IV)next_alloc,
1928                     (IV)pos,
1929                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1930             );
1931
1932         } /* end table compress */
1933     }
1934     DEBUG_TRIE_COMPILE_MORE_r(
1935             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1936                 (int)depth * 2 + 2, "",
1937                 (UV)trie->statecount,
1938                 (UV)trie->lasttrans)
1939     );
1940     /* resize the trans array to remove unused space */
1941     trie->trans = (reg_trie_trans *)
1942         PerlMemShared_realloc( trie->trans, trie->lasttrans
1943                                * sizeof(reg_trie_trans) );
1944
1945     {   /* Modify the program and insert the new TRIE node*/
1946         U8 nodetype =(U8)(flags & 0xFF);
1947         char *str=NULL;
1948
1949 #ifdef DEBUGGING
1950         regnode *optimize = NULL;
1951 #ifdef RE_TRACK_PATTERN_OFFSETS
1952
1953         U32 mjd_offset = 0;
1954         U32 mjd_nodelen = 0;
1955 #endif /* RE_TRACK_PATTERN_OFFSETS */
1956 #endif /* DEBUGGING */
1957         /*
1958            This means we convert either the first branch or the first Exact,
1959            depending on whether the thing following (in 'last') is a branch
1960            or not and whther first is the startbranch (ie is it a sub part of
1961            the alternation or is it the whole thing.)
1962            Assuming its a sub part we conver the EXACT otherwise we convert
1963            the whole branch sequence, including the first.
1964          */
1965         /* Find the node we are going to overwrite */
1966         if ( first != startbranch || OP( last ) == BRANCH ) {
1967             /* branch sub-chain */
1968             NEXT_OFF( first ) = (U16)(last - first);
1969 #ifdef RE_TRACK_PATTERN_OFFSETS
1970             DEBUG_r({
1971                 mjd_offset= Node_Offset((convert));
1972                 mjd_nodelen= Node_Length((convert));
1973             });
1974 #endif
1975             /* whole branch chain */
1976         }
1977 #ifdef RE_TRACK_PATTERN_OFFSETS
1978         else {
1979             DEBUG_r({
1980                 const  regnode *nop = NEXTOPER( convert );
1981                 mjd_offset= Node_Offset((nop));
1982                 mjd_nodelen= Node_Length((nop));
1983             });
1984         }
1985         DEBUG_OPTIMISE_r(
1986             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1987                 (int)depth * 2 + 2, "",
1988                 (UV)mjd_offset, (UV)mjd_nodelen)
1989         );
1990 #endif
1991         /* But first we check to see if there is a common prefix we can
1992            split out as an EXACT and put in front of the TRIE node.  */
1993         trie->startstate= 1;
1994         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
1995             U32 state;
1996             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1997                 U32 ofs = 0;
1998                 I32 idx = -1;
1999                 U32 count = 0;
2000                 const U32 base = trie->states[ state ].trans.base;
2001
2002                 if ( trie->states[state].wordnum )
2003                         count = 1;
2004
2005                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2006                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2007                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2008                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2009                     {
2010                         if ( ++count > 1 ) {
2011                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2012                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2013                             if ( state == 1 ) break;
2014                             if ( count == 2 ) {
2015                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2016                                 DEBUG_OPTIMISE_r(
2017                                     PerlIO_printf(Perl_debug_log,
2018                                         "%*sNew Start State=%"UVuf" Class: [",
2019                                         (int)depth * 2 + 2, "",
2020                                         (UV)state));
2021                                 if (idx >= 0) {
2022                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2023                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2024
2025                                     TRIE_BITMAP_SET(trie,*ch);
2026                                     if ( folder )
2027                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2028                                     DEBUG_OPTIMISE_r(
2029                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2030                                     );
2031                                 }
2032                             }
2033                             TRIE_BITMAP_SET(trie,*ch);
2034                             if ( folder )
2035                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2036                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2037                         }
2038                         idx = ofs;
2039                     }
2040                 }
2041                 if ( count == 1 ) {
2042                     SV **tmp = av_fetch( revcharmap, idx, 0);
2043                     STRLEN len;
2044                     char *ch = SvPV( *tmp, len );
2045                     DEBUG_OPTIMISE_r({
2046                         SV *sv=sv_newmortal();
2047                         PerlIO_printf( Perl_debug_log,
2048                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2049                             (int)depth * 2 + 2, "",
2050                             (UV)state, (UV)idx,
2051                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2052                                 PL_colors[0], PL_colors[1],
2053                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2054                                 PERL_PV_ESCAPE_FIRSTCHAR
2055                             )
2056                         );
2057                     });
2058                     if ( state==1 ) {
2059                         OP( convert ) = nodetype;
2060                         str=STRING(convert);
2061                         STR_LEN(convert)=0;
2062                     }
2063                     STR_LEN(convert) += len;
2064                     while (len--)
2065                         *str++ = *ch++;
2066                 } else {
2067 #ifdef DEBUGGING
2068                     if (state>1)
2069                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2070 #endif
2071                     break;
2072                 }
2073             }
2074             trie->prefixlen = (state-1);
2075             if (str) {
2076                 regnode *n = convert+NODE_SZ_STR(convert);
2077                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2078                 trie->startstate = state;
2079                 trie->minlen -= (state - 1);
2080                 trie->maxlen -= (state - 1);
2081 #ifdef DEBUGGING
2082                /* At least the UNICOS C compiler choked on this
2083                 * being argument to DEBUG_r(), so let's just have
2084                 * it right here. */
2085                if (
2086 #ifdef PERL_EXT_RE_BUILD
2087                    1
2088 #else
2089                    DEBUG_r_TEST
2090 #endif
2091                    ) {
2092                    regnode *fix = convert;
2093                    U32 word = trie->wordcount;
2094                    mjd_nodelen++;
2095                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2096                    while( ++fix < n ) {
2097                        Set_Node_Offset_Length(fix, 0, 0);
2098                    }
2099                    while (word--) {
2100                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2101                        if (tmp) {
2102                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2103                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2104                            else
2105                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2106                        }
2107                    }
2108                }
2109 #endif
2110                 if (trie->maxlen) {
2111                     convert = n;
2112                 } else {
2113                     NEXT_OFF(convert) = (U16)(tail - convert);
2114                     DEBUG_r(optimize= n);
2115                 }
2116             }
2117         }
2118         if (!jumper)
2119             jumper = last;
2120         if ( trie->maxlen ) {
2121             NEXT_OFF( convert ) = (U16)(tail - convert);
2122             ARG_SET( convert, data_slot );
2123             /* Store the offset to the first unabsorbed branch in
2124                jump[0], which is otherwise unused by the jump logic.
2125                We use this when dumping a trie and during optimisation. */
2126             if (trie->jump)
2127                 trie->jump[0] = (U16)(nextbranch - convert);
2128
2129             /* XXXX */
2130             if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2131                  ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2132             {
2133                 OP( convert ) = TRIEC;
2134                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2135                 PerlMemShared_free(trie->bitmap);
2136                 trie->bitmap= NULL;
2137             } else
2138                 OP( convert ) = TRIE;
2139
2140             /* store the type in the flags */
2141             convert->flags = nodetype;
2142             DEBUG_r({
2143             optimize = convert
2144                       + NODE_STEP_REGNODE
2145                       + regarglen[ OP( convert ) ];
2146             });
2147             /* XXX We really should free up the resource in trie now,
2148                    as we won't use them - (which resources?) dmq */
2149         }
2150         /* needed for dumping*/
2151         DEBUG_r(if (optimize) {
2152             regnode *opt = convert;
2153
2154             while ( ++opt < optimize) {
2155                 Set_Node_Offset_Length(opt,0,0);
2156             }
2157             /*
2158                 Try to clean up some of the debris left after the
2159                 optimisation.
2160              */
2161             while( optimize < jumper ) {
2162                 mjd_nodelen += Node_Length((optimize));
2163                 OP( optimize ) = OPTIMIZED;
2164                 Set_Node_Offset_Length(optimize,0,0);
2165                 optimize++;
2166             }
2167             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2168         });
2169     } /* end node insert */
2170
2171     /*  Finish populating the prev field of the wordinfo array.  Walk back
2172      *  from each accept state until we find another accept state, and if
2173      *  so, point the first word's .prev field at the second word. If the
2174      *  second already has a .prev field set, stop now. This will be the
2175      *  case either if we've already processed that word's accept state,
2176      *  or that that state had multiple words, and the overspill words
2177      *  were already linked up earlier.
2178      */
2179     {
2180         U16 word;
2181         U32 state;
2182         U16 prev;
2183
2184         for (word=1; word <= trie->wordcount; word++) {
2185             prev = 0;
2186             if (trie->wordinfo[word].prev)
2187                 continue;
2188             state = trie->wordinfo[word].accept;
2189             while (state) {
2190                 state = prev_states[state];
2191                 if (!state)
2192                     break;
2193                 prev = trie->states[state].wordnum;
2194                 if (prev)
2195                     break;
2196             }
2197             trie->wordinfo[word].prev = prev;
2198         }
2199         Safefree(prev_states);
2200     }
2201
2202
2203     /* and now dump out the compressed format */
2204     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2205
2206     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2207 #ifdef DEBUGGING
2208     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2209     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2210 #else
2211     SvREFCNT_dec(revcharmap);
2212 #endif
2213     return trie->jump
2214            ? MADE_JUMP_TRIE
2215            : trie->startstate>1
2216              ? MADE_EXACT_TRIE
2217              : MADE_TRIE;
2218 }
2219
2220 STATIC void
2221 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2222 {
2223 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2224
2225    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2226    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2227    ISBN 0-201-10088-6
2228
2229    We find the fail state for each state in the trie, this state is the longest proper
2230    suffix of the current states 'word' that is also a proper prefix of another word in our
2231    trie. State 1 represents the word '' and is the thus the default fail state. This allows
2232    the DFA not to have to restart after its tried and failed a word at a given point, it
2233    simply continues as though it had been matching the other word in the first place.
2234    Consider
2235       'abcdgu'=~/abcdefg|cdgu/
2236    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2237    fail, which would bring use to the state representing 'd' in the second word where we would
2238    try 'g' and succeed, prodceding to match 'cdgu'.
2239  */
2240  /* add a fail transition */
2241     const U32 trie_offset = ARG(source);
2242     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2243     U32 *q;
2244     const U32 ucharcount = trie->uniquecharcount;
2245     const U32 numstates = trie->statecount;
2246     const U32 ubound = trie->lasttrans + ucharcount;
2247     U32 q_read = 0;
2248     U32 q_write = 0;
2249     U32 charid;
2250     U32 base = trie->states[ 1 ].trans.base;
2251     U32 *fail;
2252     reg_ac_data *aho;
2253     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2254     GET_RE_DEBUG_FLAGS_DECL;
2255
2256     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2257 #ifndef DEBUGGING
2258     PERL_UNUSED_ARG(depth);
2259 #endif
2260
2261
2262     ARG_SET( stclass, data_slot );
2263     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2264     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2265     aho->trie=trie_offset;
2266     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2267     Copy( trie->states, aho->states, numstates, reg_trie_state );
2268     Newxz( q, numstates, U32);
2269     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2270     aho->refcount = 1;
2271     fail = aho->fail;
2272     /* initialize fail[0..1] to be 1 so that we always have
2273        a valid final fail state */
2274     fail[ 0 ] = fail[ 1 ] = 1;
2275
2276     for ( charid = 0; charid < ucharcount ; charid++ ) {
2277         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2278         if ( newstate ) {
2279             q[ q_write ] = newstate;
2280             /* set to point at the root */
2281             fail[ q[ q_write++ ] ]=1;
2282         }
2283     }
2284     while ( q_read < q_write) {
2285         const U32 cur = q[ q_read++ % numstates ];
2286         base = trie->states[ cur ].trans.base;
2287
2288         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2289             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2290             if (ch_state) {
2291                 U32 fail_state = cur;
2292                 U32 fail_base;
2293                 do {
2294                     fail_state = fail[ fail_state ];
2295                     fail_base = aho->states[ fail_state ].trans.base;
2296                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2297
2298                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2299                 fail[ ch_state ] = fail_state;
2300                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2301                 {
2302                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2303                 }
2304                 q[ q_write++ % numstates] = ch_state;
2305             }
2306         }
2307     }
2308     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2309        when we fail in state 1, this allows us to use the
2310        charclass scan to find a valid start char. This is based on the principle
2311        that theres a good chance the string being searched contains lots of stuff
2312        that cant be a start char.
2313      */
2314     fail[ 0 ] = fail[ 1 ] = 0;
2315     DEBUG_TRIE_COMPILE_r({
2316         PerlIO_printf(Perl_debug_log,
2317                       "%*sStclass Failtable (%"UVuf" states): 0",
2318                       (int)(depth * 2), "", (UV)numstates
2319         );
2320         for( q_read=1; q_read<numstates; q_read++ ) {
2321             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2322         }
2323         PerlIO_printf(Perl_debug_log, "\n");
2324     });
2325     Safefree(q);
2326     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2327 }
2328
2329
2330 /*
2331  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2332  * These need to be revisited when a newer toolchain becomes available.
2333  */
2334 #if defined(__sparc64__) && defined(__GNUC__)
2335 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2336 #       undef  SPARC64_GCC_WORKAROUND
2337 #       define SPARC64_GCC_WORKAROUND 1
2338 #   endif
2339 #endif
2340
2341 #define DEBUG_PEEP(str,scan,depth) \
2342     DEBUG_OPTIMISE_r({if (scan){ \
2343        SV * const mysv=sv_newmortal(); \
2344        regnode *Next = regnext(scan); \
2345        regprop(RExC_rx, mysv, scan); \
2346        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2347        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2348        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2349    }});
2350
2351
2352
2353
2354
2355 #define JOIN_EXACT(scan,min,flags) \
2356     if (PL_regkind[OP(scan)] == EXACT) \
2357         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2358
2359 STATIC U32
2360 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2361     /* Merge several consecutive EXACTish nodes into one. */
2362     regnode *n = regnext(scan);
2363     U32 stringok = 1;
2364     regnode *next = scan + NODE_SZ_STR(scan);
2365     U32 merged = 0;
2366     U32 stopnow = 0;
2367 #ifdef DEBUGGING
2368     regnode *stop = scan;
2369     GET_RE_DEBUG_FLAGS_DECL;
2370 #else
2371     PERL_UNUSED_ARG(depth);
2372 #endif
2373
2374     PERL_ARGS_ASSERT_JOIN_EXACT;
2375 #ifndef EXPERIMENTAL_INPLACESCAN
2376     PERL_UNUSED_ARG(flags);
2377     PERL_UNUSED_ARG(val);
2378 #endif
2379     DEBUG_PEEP("join",scan,depth);
2380
2381     /* Skip NOTHING, merge EXACT*. */
2382     while (n &&
2383            ( PL_regkind[OP(n)] == NOTHING ||
2384              (stringok && (OP(n) == OP(scan))))
2385            && NEXT_OFF(n)
2386            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2387
2388         if (OP(n) == TAIL || n > next)
2389             stringok = 0;
2390         if (PL_regkind[OP(n)] == NOTHING) {
2391             DEBUG_PEEP("skip:",n,depth);
2392             NEXT_OFF(scan) += NEXT_OFF(n);
2393             next = n + NODE_STEP_REGNODE;
2394 #ifdef DEBUGGING
2395             if (stringok)
2396                 stop = n;
2397 #endif
2398             n = regnext(n);
2399         }
2400         else if (stringok) {
2401             const unsigned int oldl = STR_LEN(scan);
2402             regnode * const nnext = regnext(n);
2403
2404             DEBUG_PEEP("merg",n,depth);
2405
2406             merged++;
2407             if (oldl + STR_LEN(n) > U8_MAX)
2408                 break;
2409             NEXT_OFF(scan) += NEXT_OFF(n);
2410             STR_LEN(scan) += STR_LEN(n);
2411             next = n + NODE_SZ_STR(n);
2412             /* Now we can overwrite *n : */
2413             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2414 #ifdef DEBUGGING
2415             stop = next - 1;
2416 #endif
2417             n = nnext;
2418             if (stopnow) break;
2419         }
2420
2421 #ifdef EXPERIMENTAL_INPLACESCAN
2422         if (flags && !NEXT_OFF(n)) {
2423             DEBUG_PEEP("atch", val, depth);
2424             if (reg_off_by_arg[OP(n)]) {
2425                 ARG_SET(n, val - n);
2426             }
2427             else {
2428                 NEXT_OFF(n) = val - n;
2429             }
2430             stopnow = 1;
2431         }
2432 #endif
2433     }
2434
2435     if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2436     /*
2437     Two problematic code points in Unicode casefolding of EXACT nodes:
2438
2439     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2440     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2441
2442     which casefold to
2443
2444     Unicode                      UTF-8
2445
2446     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2447     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2448
2449     This means that in case-insensitive matching (or "loose matching",
2450     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2451     length of the above casefolded versions) can match a target string
2452     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2453     This would rather mess up the minimum length computation.
2454
2455     What we'll do is to look for the tail four bytes, and then peek
2456     at the preceding two bytes to see whether we need to decrease
2457     the minimum length by four (six minus two).
2458
2459     Thanks to the design of UTF-8, there cannot be false matches:
2460     A sequence of valid UTF-8 bytes cannot be a subsequence of
2461     another valid sequence of UTF-8 bytes.
2462
2463     */
2464          char * const s0 = STRING(scan), *s, *t;
2465          char * const s1 = s0 + STR_LEN(scan) - 1;
2466          char * const s2 = s1 - 4;
2467 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2468          const char t0[] = "\xaf\x49\xaf\x42";
2469 #else
2470          const char t0[] = "\xcc\x88\xcc\x81";
2471 #endif
2472          const char * const t1 = t0 + 3;
2473
2474          for (s = s0 + 2;
2475               s < s2 && (t = ninstr(s, s1, t0, t1));
2476               s = t + 4) {
2477 #ifdef EBCDIC
2478               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2479                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2480 #else
2481               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2482                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2483 #endif
2484                    *min -= 4;
2485          }
2486     }
2487
2488 #ifdef DEBUGGING
2489     /* Allow dumping */
2490     n = scan + NODE_SZ_STR(scan);
2491     while (n <= stop) {
2492         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2493             OP(n) = OPTIMIZED;
2494             NEXT_OFF(n) = 0;
2495         }
2496         n++;
2497     }
2498 #endif
2499     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2500     return stopnow;
2501 }
2502
2503 /* REx optimizer.  Converts nodes into quickier variants "in place".
2504    Finds fixed substrings.  */
2505
2506 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2507    to the position after last scanned or to NULL. */
2508
2509 #define INIT_AND_WITHP \
2510     assert(!and_withp); \
2511     Newx(and_withp,1,struct regnode_charclass_class); \
2512     SAVEFREEPV(and_withp)
2513
2514 /* this is a chain of data about sub patterns we are processing that
2515    need to be handled seperately/specially in study_chunk. Its so
2516    we can simulate recursion without losing state.  */
2517 struct scan_frame;
2518 typedef struct scan_frame {
2519     regnode *last;  /* last node to process in this frame */
2520     regnode *next;  /* next node to process when last is reached */
2521     struct scan_frame *prev; /*previous frame*/
2522     I32 stop; /* what stopparen do we use */
2523 } scan_frame;
2524
2525
2526 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2527
2528 #define CASE_SYNST_FNC(nAmE)                                       \
2529 case nAmE:                                                         \
2530     if (flags & SCF_DO_STCLASS_AND) {                              \
2531             for (value = 0; value < 256; value++)                  \
2532                 if (!is_ ## nAmE ## _cp(value))                       \
2533                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2534     }                                                              \
2535     else {                                                         \
2536             for (value = 0; value < 256; value++)                  \
2537                 if (is_ ## nAmE ## _cp(value))                        \
2538                     ANYOF_BITMAP_SET(data->start_class, value);    \
2539     }                                                              \
2540     break;                                                         \
2541 case N ## nAmE:                                                    \
2542     if (flags & SCF_DO_STCLASS_AND) {                              \
2543             for (value = 0; value < 256; value++)                   \
2544                 if (is_ ## nAmE ## _cp(value))                         \
2545                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2546     }                                                               \
2547     else {                                                          \
2548             for (value = 0; value < 256; value++)                   \
2549                 if (!is_ ## nAmE ## _cp(value))                        \
2550                     ANYOF_BITMAP_SET(data->start_class, value);     \
2551     }                                                               \
2552     break
2553
2554
2555
2556 STATIC I32
2557 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2558                         I32 *minlenp, I32 *deltap,
2559                         regnode *last,
2560                         scan_data_t *data,
2561                         I32 stopparen,
2562                         U8* recursed,
2563                         struct regnode_charclass_class *and_withp,
2564                         U32 flags, U32 depth)
2565                         /* scanp: Start here (read-write). */
2566                         /* deltap: Write maxlen-minlen here. */
2567                         /* last: Stop before this one. */
2568                         /* data: string data about the pattern */
2569                         /* stopparen: treat close N as END */
2570                         /* recursed: which subroutines have we recursed into */
2571                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2572 {
2573     dVAR;
2574     I32 min = 0, pars = 0, code;
2575     regnode *scan = *scanp, *next;
2576     I32 delta = 0;
2577     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2578     int is_inf_internal = 0;            /* The studied chunk is infinite */
2579     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2580     scan_data_t data_fake;
2581     SV *re_trie_maxbuff = NULL;
2582     regnode *first_non_open = scan;
2583     I32 stopmin = I32_MAX;
2584     scan_frame *frame = NULL;
2585     GET_RE_DEBUG_FLAGS_DECL;
2586
2587     PERL_ARGS_ASSERT_STUDY_CHUNK;
2588
2589 #ifdef DEBUGGING
2590     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2591 #endif
2592
2593     if ( depth == 0 ) {
2594         while (first_non_open && OP(first_non_open) == OPEN)
2595             first_non_open=regnext(first_non_open);
2596     }
2597
2598
2599   fake_study_recurse:
2600     while ( scan && OP(scan) != END && scan < last ){
2601         /* Peephole optimizer: */
2602         DEBUG_STUDYDATA("Peep:", data,depth);
2603         DEBUG_PEEP("Peep",scan,depth);
2604         JOIN_EXACT(scan,&min,0);
2605
2606         /* Follow the next-chain of the current node and optimize
2607            away all the NOTHINGs from it.  */
2608         if (OP(scan) != CURLYX) {
2609             const int max = (reg_off_by_arg[OP(scan)]
2610                        ? I32_MAX
2611                        /* I32 may be smaller than U16 on CRAYs! */
2612                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2613             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2614             int noff;
2615             regnode *n = scan;
2616
2617             /* Skip NOTHING and LONGJMP. */
2618             while ((n = regnext(n))
2619                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2620                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2621                    && off + noff < max)
2622                 off += noff;
2623             if (reg_off_by_arg[OP(scan)])
2624                 ARG(scan) = off;
2625             else
2626                 NEXT_OFF(scan) = off;
2627         }
2628
2629
2630
2631         /* The principal pseudo-switch.  Cannot be a switch, since we
2632            look into several different things.  */
2633         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2634                    || OP(scan) == IFTHEN) {
2635             next = regnext(scan);
2636             code = OP(scan);
2637             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2638
2639             if (OP(next) == code || code == IFTHEN) {
2640                 /* NOTE - There is similar code to this block below for handling
2641                    TRIE nodes on a re-study.  If you change stuff here check there
2642                    too. */
2643                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2644                 struct regnode_charclass_class accum;
2645                 regnode * const startbranch=scan;
2646
2647                 if (flags & SCF_DO_SUBSTR)
2648                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2649                 if (flags & SCF_DO_STCLASS)
2650                     cl_init_zero(pRExC_state, &accum);
2651
2652                 while (OP(scan) == code) {
2653                     I32 deltanext, minnext, f = 0, fake;
2654                     struct regnode_charclass_class this_class;
2655
2656                     num++;
2657                     data_fake.flags = 0;
2658                     if (data) {
2659                         data_fake.whilem_c = data->whilem_c;
2660                         data_fake.last_closep = data->last_closep;
2661                     }
2662                     else
2663                         data_fake.last_closep = &fake;
2664
2665                     data_fake.pos_delta = delta;
2666                     next = regnext(scan);
2667                     scan = NEXTOPER(scan);
2668                     if (code != BRANCH)
2669                         scan = NEXTOPER(scan);
2670                     if (flags & SCF_DO_STCLASS) {
2671                         cl_init(pRExC_state, &this_class);
2672                         data_fake.start_class = &this_class;
2673                         f = SCF_DO_STCLASS_AND;
2674                     }
2675                     if (flags & SCF_WHILEM_VISITED_POS)
2676                         f |= SCF_WHILEM_VISITED_POS;
2677
2678                     /* we suppose the run is continuous, last=next...*/
2679                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2680                                           next, &data_fake,
2681                                           stopparen, recursed, NULL, f,depth+1);
2682                     if (min1 > minnext)
2683                         min1 = minnext;
2684                     if (max1 < minnext + deltanext)
2685                         max1 = minnext + deltanext;
2686                     if (deltanext == I32_MAX)
2687                         is_inf = is_inf_internal = 1;
2688                     scan = next;
2689                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2690                         pars++;
2691                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2692                         if ( stopmin > minnext)
2693                             stopmin = min + min1;
2694                         flags &= ~SCF_DO_SUBSTR;
2695                         if (data)
2696                             data->flags |= SCF_SEEN_ACCEPT;
2697                     }
2698                     if (data) {
2699                         if (data_fake.flags & SF_HAS_EVAL)
2700                             data->flags |= SF_HAS_EVAL;
2701                         data->whilem_c = data_fake.whilem_c;
2702                     }
2703                     if (flags & SCF_DO_STCLASS)
2704                         cl_or(pRExC_state, &accum, &this_class);
2705                 }
2706                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2707                     min1 = 0;
2708                 if (flags & SCF_DO_SUBSTR) {
2709                     data->pos_min += min1;
2710                     data->pos_delta += max1 - min1;
2711                     if (max1 != min1 || is_inf)
2712                         data->longest = &(data->longest_float);
2713                 }
2714                 min += min1;
2715                 delta += max1 - min1;
2716                 if (flags & SCF_DO_STCLASS_OR) {
2717                     cl_or(pRExC_state, data->start_class, &accum);
2718                     if (min1) {
2719                         cl_and(data->start_class, and_withp);
2720                         flags &= ~SCF_DO_STCLASS;
2721                     }
2722                 }
2723                 else if (flags & SCF_DO_STCLASS_AND) {
2724                     if (min1) {
2725                         cl_and(data->start_class, &accum);
2726                         flags &= ~SCF_DO_STCLASS;
2727                     }
2728                     else {
2729                         /* Switch to OR mode: cache the old value of
2730                          * data->start_class */
2731                         INIT_AND_WITHP;
2732                         StructCopy(data->start_class, and_withp,
2733                                    struct regnode_charclass_class);
2734                         flags &= ~SCF_DO_STCLASS_AND;
2735                         StructCopy(&accum, data->start_class,
2736                                    struct regnode_charclass_class);
2737                         flags |= SCF_DO_STCLASS_OR;
2738                         data->start_class->flags |= ANYOF_EOS;
2739                     }
2740                 }
2741
2742                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2743                 /* demq.
2744
2745                    Assuming this was/is a branch we are dealing with: 'scan' now
2746                    points at the item that follows the branch sequence, whatever
2747                    it is. We now start at the beginning of the sequence and look
2748                    for subsequences of
2749
2750                    BRANCH->EXACT=>x1
2751                    BRANCH->EXACT=>x2
2752                    tail
2753
2754                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2755
2756                    If we can find such a subseqence we need to turn the first
2757                    element into a trie and then add the subsequent branch exact
2758                    strings to the trie.
2759
2760                    We have two cases
2761
2762                      1. patterns where the whole set of branch can be converted.
2763
2764                      2. patterns where only a subset can be converted.
2765
2766                    In case 1 we can replace the whole set with a single regop
2767                    for the trie. In case 2 we need to keep the start and end
2768                    branchs so
2769
2770                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2771                      becomes BRANCH TRIE; BRANCH X;
2772
2773                   There is an additional case, that being where there is a
2774                   common prefix, which gets split out into an EXACT like node
2775                   preceding the TRIE node.
2776
2777                   If x(1..n)==tail then we can do a simple trie, if not we make
2778                   a "jump" trie, such that when we match the appropriate word
2779                   we "jump" to the appopriate tail node. Essentailly we turn
2780                   a nested if into a case structure of sorts.
2781
2782                 */
2783
2784                     int made=0;
2785                     if (!re_trie_maxbuff) {
2786                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2787                         if (!SvIOK(re_trie_maxbuff))
2788                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2789                     }
2790                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2791                         regnode *cur;
2792                         regnode *first = (regnode *)NULL;
2793                         regnode *last = (regnode *)NULL;
2794                         regnode *tail = scan;
2795                         U8 optype = 0;
2796                         U32 count=0;
2797
2798 #ifdef DEBUGGING
2799                         SV * const mysv = sv_newmortal();       /* for dumping */
2800 #endif
2801                         /* var tail is used because there may be a TAIL
2802                            regop in the way. Ie, the exacts will point to the
2803                            thing following the TAIL, but the last branch will
2804                            point at the TAIL. So we advance tail. If we
2805                            have nested (?:) we may have to move through several
2806                            tails.
2807                          */
2808
2809                         while ( OP( tail ) == TAIL ) {
2810                             /* this is the TAIL generated by (?:) */
2811                             tail = regnext( tail );
2812                         }
2813
2814
2815                         DEBUG_OPTIMISE_r({
2816                             regprop(RExC_rx, mysv, tail );
2817                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2818                                 (int)depth * 2 + 2, "",
2819                                 "Looking for TRIE'able sequences. Tail node is: ",
2820                                 SvPV_nolen_const( mysv )
2821                             );
2822                         });
2823
2824                         /*
2825
2826                            step through the branches, cur represents each
2827                            branch, noper is the first thing to be matched
2828                            as part of that branch and noper_next is the
2829                            regnext() of that node. if noper is an EXACT
2830                            and noper_next is the same as scan (our current
2831                            position in the regex) then the EXACT branch is
2832                            a possible optimization target. Once we have
2833                            two or more consequetive such branches we can
2834                            create a trie of the EXACT's contents and stich
2835                            it in place. If the sequence represents all of
2836                            the branches we eliminate the whole thing and
2837                            replace it with a single TRIE. If it is a
2838                            subsequence then we need to stitch it in. This
2839                            means the first branch has to remain, and needs
2840                            to be repointed at the item on the branch chain
2841                            following the last branch optimized. This could
2842                            be either a BRANCH, in which case the
2843                            subsequence is internal, or it could be the
2844                            item following the branch sequence in which
2845                            case the subsequence is at the end.
2846
2847                         */
2848
2849                         /* dont use tail as the end marker for this traverse */
2850                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2851                             regnode * const noper = NEXTOPER( cur );
2852 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2853                             regnode * const noper_next = regnext( noper );
2854 #endif
2855
2856                             DEBUG_OPTIMISE_r({
2857                                 regprop(RExC_rx, mysv, cur);
2858                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2859                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2860
2861                                 regprop(RExC_rx, mysv, noper);
2862                                 PerlIO_printf( Perl_debug_log, " -> %s",
2863                                     SvPV_nolen_const(mysv));
2864
2865                                 if ( noper_next ) {
2866                                   regprop(RExC_rx, mysv, noper_next );
2867                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2868                                     SvPV_nolen_const(mysv));
2869                                 }
2870                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2871                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2872                             });
2873                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2874                                          : PL_regkind[ OP( noper ) ] == EXACT )
2875                                   || OP(noper) == NOTHING )
2876 #ifdef NOJUMPTRIE
2877                                   && noper_next == tail
2878 #endif
2879                                   && count < U16_MAX)
2880                             {
2881                                 count++;
2882                                 if ( !first || optype == NOTHING ) {
2883                                     if (!first) first = cur;
2884                                     optype = OP( noper );
2885                                 } else {
2886                                     last = cur;
2887                                 }
2888                             } else {
2889 /*
2890     Currently we do not believe that the trie logic can
2891     handle case insensitive matching properly when the
2892     pattern is not unicode (thus forcing unicode semantics).
2893
2894     If/when this is fixed the following define can be swapped
2895     in below to fully enable trie logic.
2896
2897 #define TRIE_TYPE_IS_SAFE 1
2898
2899 */
2900 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2901
2902                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2903                                     make_trie( pRExC_state,
2904                                             startbranch, first, cur, tail, count,
2905                                             optype, depth+1 );
2906                                 }
2907                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2908 #ifdef NOJUMPTRIE
2909                                      && noper_next == tail
2910 #endif
2911                                 ){
2912                                     count = 1;
2913                                     first = cur;
2914                                     optype = OP( noper );
2915                                 } else {
2916                                     count = 0;
2917                                     first = NULL;
2918                                     optype = 0;
2919                                 }
2920                                 last = NULL;
2921                             }
2922                         }
2923                         DEBUG_OPTIMISE_r({
2924                             regprop(RExC_rx, mysv, cur);
2925                             PerlIO_printf( Perl_debug_log,
2926                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2927                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2928
2929                         });
2930
2931                         if ( last && TRIE_TYPE_IS_SAFE ) {
2932                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2933 #ifdef TRIE_STUDY_OPT
2934                             if ( ((made == MADE_EXACT_TRIE &&
2935                                  startbranch == first)
2936                                  || ( first_non_open == first )) &&
2937                                  depth==0 ) {
2938                                 flags |= SCF_TRIE_RESTUDY;
2939                                 if ( startbranch == first
2940                                      && scan == tail )
2941                                 {
2942                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2943                                 }
2944                             }
2945 #endif
2946                         }
2947                     }
2948
2949                 } /* do trie */
2950
2951             }
2952             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2953                 scan = NEXTOPER(NEXTOPER(scan));
2954             } else                      /* single branch is optimized. */
2955                 scan = NEXTOPER(scan);
2956             continue;
2957         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2958             scan_frame *newframe = NULL;
2959             I32 paren;
2960             regnode *start;
2961             regnode *end;
2962
2963             if (OP(scan) != SUSPEND) {
2964             /* set the pointer */
2965                 if (OP(scan) == GOSUB) {
2966                     paren = ARG(scan);
2967                     RExC_recurse[ARG2L(scan)] = scan;
2968                     start = RExC_open_parens[paren-1];
2969                     end   = RExC_close_parens[paren-1];
2970                 } else {
2971                     paren = 0;
2972                     start = RExC_rxi->program + 1;
2973                     end   = RExC_opend;
2974                 }
2975                 if (!recursed) {
2976                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2977                     SAVEFREEPV(recursed);
2978                 }
2979                 if (!PAREN_TEST(recursed,paren+1)) {
2980                     PAREN_SET(recursed,paren+1);
2981                     Newx(newframe,1,scan_frame);
2982                 } else {
2983                     if (flags & SCF_DO_SUBSTR) {
2984                         SCAN_COMMIT(pRExC_state,data,minlenp);
2985                         data->longest = &(data->longest_float);
2986                     }
2987                     is_inf = is_inf_internal = 1;
2988                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2989                         cl_anything(pRExC_state, data->start_class);
2990                     flags &= ~SCF_DO_STCLASS;
2991                 }
2992             } else {
2993                 Newx(newframe,1,scan_frame);
2994                 paren = stopparen;
2995                 start = scan+2;
2996                 end = regnext(scan);
2997             }
2998             if (newframe) {
2999                 assert(start);
3000                 assert(end);
3001                 SAVEFREEPV(newframe);
3002                 newframe->next = regnext(scan);
3003                 newframe->last = last;
3004                 newframe->stop = stopparen;
3005                 newframe->prev = frame;
3006
3007                 frame = newframe;
3008                 scan =  start;
3009                 stopparen = paren;
3010                 last = end;
3011
3012                 continue;
3013             }
3014         }
3015         else if (OP(scan) == EXACT) {
3016             I32 l = STR_LEN(scan);
3017             UV uc;
3018             if (UTF) {
3019                 const U8 * const s = (U8*)STRING(scan);
3020                 l = utf8_length(s, s + l);
3021                 uc = utf8_to_uvchr(s, NULL);
3022             } else {
3023                 uc = *((U8*)STRING(scan));
3024             }
3025             min += l;
3026             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3027                 /* The code below prefers earlier match for fixed
3028                    offset, later match for variable offset.  */
3029                 if (data->last_end == -1) { /* Update the start info. */
3030                     data->last_start_min = data->pos_min;
3031                     data->last_start_max = is_inf
3032                         ? I32_MAX : data->pos_min + data->pos_delta;
3033                 }
3034                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3035                 if (UTF)
3036                     SvUTF8_on(data->last_found);
3037                 {
3038                     SV * const sv = data->last_found;
3039                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3040                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3041                     if (mg && mg->mg_len >= 0)
3042                         mg->mg_len += utf8_length((U8*)STRING(scan),
3043                                                   (U8*)STRING(scan)+STR_LEN(scan));
3044                 }
3045                 data->last_end = data->pos_min + l;
3046                 data->pos_min += l; /* As in the first entry. */
3047                 data->flags &= ~SF_BEFORE_EOL;
3048             }
3049             if (flags & SCF_DO_STCLASS_AND) {
3050                 /* Check whether it is compatible with what we know already! */
3051                 int compat = 1;
3052
3053                 if (uc >= 0x100 ||
3054                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3055                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3056                     && (!(data->start_class->flags & ANYOF_FOLD)
3057                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3058                     )
3059                     compat = 0;
3060                 ANYOF_CLASS_ZERO(data->start_class);
3061                 ANYOF_BITMAP_ZERO(data->start_class);
3062                 if (compat)
3063                     ANYOF_BITMAP_SET(data->start_class, uc);
3064                 data->start_class->flags &= ~ANYOF_EOS;
3065                 if (uc < 0x100)
3066                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3067             }
3068             else if (flags & SCF_DO_STCLASS_OR) {
3069                 /* false positive possible if the class is case-folded */
3070                 if (uc < 0x100)
3071                     ANYOF_BITMAP_SET(data->start_class, uc);
3072                 else
3073                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3074                 data->start_class->flags &= ~ANYOF_EOS;
3075                 cl_and(data->start_class, and_withp);
3076             }
3077             flags &= ~SCF_DO_STCLASS;
3078         }
3079         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3080             I32 l = STR_LEN(scan);
3081             UV uc = *((U8*)STRING(scan));
3082
3083             /* Search for fixed substrings supports EXACT only. */
3084             if (flags & SCF_DO_SUBSTR) {
3085                 assert(data);
3086                 SCAN_COMMIT(pRExC_state, data, minlenp);
3087             }
3088             if (UTF) {
3089                 const U8 * const s = (U8 *)STRING(scan);
3090                 l = utf8_length(s, s + l);
3091                 uc = utf8_to_uvchr(s, NULL);
3092             }
3093             min += l;
3094             if (flags & SCF_DO_SUBSTR)
3095                 data->pos_min += l;
3096             if (flags & SCF_DO_STCLASS_AND) {
3097                 /* Check whether it is compatible with what we know already! */
3098                 int compat = 1;
3099
3100                 if (uc >= 0x100 ||
3101                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3102                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3103                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3104                     compat = 0;
3105                 ANYOF_CLASS_ZERO(data->start_class);
3106                 ANYOF_BITMAP_ZERO(data->start_class);
3107                 if (compat) {
3108                     ANYOF_BITMAP_SET(data->start_class, uc);
3109                     data->start_class->flags &= ~ANYOF_EOS;
3110                     data->start_class->flags |= ANYOF_FOLD;
3111                     if (OP(scan) == EXACTFL)
3112                         data->start_class->flags |= ANYOF_LOCALE;
3113                 }
3114             }
3115             else if (flags & SCF_DO_STCLASS_OR) {
3116                 if (data->start_class->flags & ANYOF_FOLD) {
3117                     /* false positive possible if the class is case-folded.
3118                        Assume that the locale settings are the same... */
3119                     if (uc < 0x100)
3120                         ANYOF_BITMAP_SET(data->start_class, uc);
3121                     data->start_class->flags &= ~ANYOF_EOS;
3122                 }
3123                 cl_and(data->start_class, and_withp);
3124             }
3125             flags &= ~SCF_DO_STCLASS;
3126         }
3127         else if (REGNODE_VARIES(OP(scan))) {
3128             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3129             I32 f = flags, pos_before = 0;
3130             regnode * const oscan = scan;
3131             struct regnode_charclass_class this_class;
3132             struct regnode_charclass_class *oclass = NULL;
3133             I32 next_is_eval = 0;
3134
3135             switch (PL_regkind[OP(scan)]) {
3136             case WHILEM:                /* End of (?:...)* . */
3137                 scan = NEXTOPER(scan);
3138                 goto finish;
3139             case PLUS:
3140                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3141                     next = NEXTOPER(scan);
3142                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3143                         mincount = 1;
3144                         maxcount = REG_INFTY;
3145                         next = regnext(scan);
3146                         scan = NEXTOPER(scan);
3147                         goto do_curly;
3148                     }
3149                 }
3150                 if (flags & SCF_DO_SUBSTR)
3151                     data->pos_min++;
3152                 min++;
3153                 /* Fall through. */
3154             case STAR:
3155                 if (flags & SCF_DO_STCLASS) {
3156                     mincount = 0;
3157                     maxcount = REG_INFTY;
3158                     next = regnext(scan);
3159                     scan = NEXTOPER(scan);
3160                     goto do_curly;
3161                 }
3162                 is_inf = is_inf_internal = 1;
3163                 scan = regnext(scan);
3164                 if (flags & SCF_DO_SUBSTR) {
3165                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3166                     data->longest = &(data->longest_float);
3167                 }
3168                 goto optimize_curly_tail;
3169             case CURLY:
3170                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3171                     && (scan->flags == stopparen))
3172                 {
3173                     mincount = 1;
3174                     maxcount = 1;
3175                 } else {
3176                     mincount = ARG1(scan);
3177                     maxcount = ARG2(scan);
3178                 }
3179                 next = regnext(scan);
3180                 if (OP(scan) == CURLYX) {
3181                     I32 lp = (data ? *(data->last_closep) : 0);
3182                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3183                 }
3184                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3185                 next_is_eval = (OP(scan) == EVAL);
3186               do_curly:
3187                 if (flags & SCF_DO_SUBSTR) {
3188                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3189                     pos_before = data->pos_min;
3190                 }
3191                 if (data) {
3192                     fl = data->flags;
3193                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3194                     if (is_inf)
3195                         data->flags |= SF_IS_INF;
3196                 }
3197                 if (flags & SCF_DO_STCLASS) {
3198                     cl_init(pRExC_state, &this_class);
3199                     oclass = data->start_class;
3200                     data->start_class = &this_class;
3201                     f |= SCF_DO_STCLASS_AND;
3202                     f &= ~SCF_DO_STCLASS_OR;
3203                 }
3204                 /* These are the cases when once a subexpression
3205                    fails at a particular position, it cannot succeed
3206                    even after backtracking at the enclosing scope.
3207
3208                    XXXX what if minimal match and we are at the
3209                         initial run of {n,m}? */
3210                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3211                     f &= ~SCF_WHILEM_VISITED_POS;
3212
3213                 /* This will finish on WHILEM, setting scan, or on NULL: */
3214                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3215                                       last, data, stopparen, recursed, NULL,
3216                                       (mincount == 0
3217                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3218
3219                 if (flags & SCF_DO_STCLASS)
3220                     data->start_class = oclass;
3221                 if (mincount == 0 || minnext == 0) {
3222                     if (flags & SCF_DO_STCLASS_OR) {
3223                         cl_or(pRExC_state, data->start_class, &this_class);
3224                     }
3225                     else if (flags & SCF_DO_STCLASS_AND) {
3226                         /* Switch to OR mode: cache the old value of
3227                          * data->start_class */
3228                         INIT_AND_WITHP;
3229                         StructCopy(data->start_class, and_withp,
3230                                    struct regnode_charclass_class);
3231                         flags &= ~SCF_DO_STCLASS_AND;
3232                         StructCopy(&this_class, data->start_class,
3233                                    struct regnode_charclass_class);
3234                         flags |= SCF_DO_STCLASS_OR;
3235                         data->start_class->flags |= ANYOF_EOS;
3236                     }
3237                 } else {                /* Non-zero len */
3238                     if (flags & SCF_DO_STCLASS_OR) {
3239                         cl_or(pRExC_state, data->start_class, &this_class);
3240                         cl_and(data->start_class, and_withp);
3241                     }
3242                     else if (flags & SCF_DO_STCLASS_AND)
3243                         cl_and(data->start_class, &this_class);
3244                     flags &= ~SCF_DO_STCLASS;
3245                 }
3246                 if (!scan)              /* It was not CURLYX, but CURLY. */
3247                     scan = next;
3248                 if ( /* ? quantifier ok, except for (?{ ... }) */
3249                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3250                     && (minnext == 0) && (deltanext == 0)
3251                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3252                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3253                 {
3254                     ckWARNreg(RExC_parse,
3255                               "Quantifier unexpected on zero-length expression");
3256                 }
3257
3258                 min += minnext * mincount;
3259                 is_inf_internal |= ((maxcount == REG_INFTY
3260                                      && (minnext + deltanext) > 0)
3261                                     || deltanext == I32_MAX);
3262                 is_inf |= is_inf_internal;
3263                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3264
3265                 /* Try powerful optimization CURLYX => CURLYN. */
3266                 if (  OP(oscan) == CURLYX && data
3267                       && data->flags & SF_IN_PAR
3268                       && !(data->flags & SF_HAS_EVAL)
3269                       && !deltanext && minnext == 1 ) {
3270                     /* Try to optimize to CURLYN.  */
3271                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3272                     regnode * const nxt1 = nxt;
3273 #ifdef DEBUGGING
3274                     regnode *nxt2;
3275 #endif
3276
3277                     /* Skip open. */
3278                     nxt = regnext(nxt);
3279                     if (!REGNODE_SIMPLE(OP(nxt))
3280                         && !(PL_regkind[OP(nxt)] == EXACT
3281                              && STR_LEN(nxt) == 1))
3282                         goto nogo;
3283 #ifdef DEBUGGING
3284                     nxt2 = nxt;
3285 #endif
3286                     nxt = regnext(nxt);
3287                     if (OP(nxt) != CLOSE)
3288                         goto nogo;
3289                     if (RExC_open_parens) {
3290                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3291                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3292                     }
3293                     /* Now we know that nxt2 is the only contents: */
3294                     oscan->flags = (U8)ARG(nxt);
3295                     OP(oscan) = CURLYN;
3296                     OP(nxt1) = NOTHING; /* was OPEN. */
3297
3298 #ifdef DEBUGGING
3299                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3300                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3301                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3302                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3303                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3304                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3305 #endif
3306                 }
3307               nogo:
3308
3309                 /* Try optimization CURLYX => CURLYM. */
3310                 if (  OP(oscan) == CURLYX && data
3311                       && !(data->flags & SF_HAS_PAR)
3312                       && !(data->flags & SF_HAS_EVAL)
3313                       && !deltanext     /* atom is fixed width */
3314                       && minnext != 0   /* CURLYM can't handle zero width */
3315                 ) {
3316                     /* XXXX How to optimize if data == 0? */
3317                     /* Optimize to a simpler form.  */
3318                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3319                     regnode *nxt2;
3320
3321                     OP(oscan) = CURLYM;
3322                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3323                             && (OP(nxt2) != WHILEM))
3324                         nxt = nxt2;
3325                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3326                     /* Need to optimize away parenths. */
3327                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3328                         /* Set the parenth number.  */
3329                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3330
3331                         oscan->flags = (U8)ARG(nxt);
3332                         if (RExC_open_parens) {
3333                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3334                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3335                         }
3336                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3337                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3338
3339 #ifdef DEBUGGING
3340                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3341                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3342                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3343                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3344 #endif
3345 #if 0
3346                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3347                             regnode *nnxt = regnext(nxt1);
3348
3349                             if (nnxt == nxt) {
3350                                 if (reg_off_by_arg[OP(nxt1)])
3351                                     ARG_SET(nxt1, nxt2 - nxt1);
3352                                 else if (nxt2 - nxt1 < U16_MAX)
3353                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3354                                 else
3355                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3356                             }
3357                             nxt1 = nnxt;
3358                         }
3359 #endif
3360                         /* Optimize again: */
3361                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3362                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3363                     }
3364                     else
3365                         oscan->flags = 0;
3366                 }
3367                 else if ((OP(oscan) == CURLYX)
3368                          && (flags & SCF_WHILEM_VISITED_POS)
3369                          /* See the comment on a similar expression above.
3370                             However, this time it not a subexpression
3371                             we care about, but the expression itself. */
3372                          && (maxcount == REG_INFTY)
3373                          && data && ++data->whilem_c < 16) {
3374                     /* This stays as CURLYX, we can put the count/of pair. */
3375                     /* Find WHILEM (as in regexec.c) */
3376                     regnode *nxt = oscan + NEXT_OFF(oscan);
3377
3378                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3379                         nxt += ARG(nxt);
3380                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3381                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3382                 }
3383                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3384                     pars++;
3385                 if (flags & SCF_DO_SUBSTR) {
3386                     SV *last_str = NULL;
3387                     int counted = mincount != 0;
3388
3389                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3390 #if defined(SPARC64_GCC_WORKAROUND)
3391                         I32 b = 0;
3392                         STRLEN l = 0;
3393                         const char *s = NULL;
3394                         I32 old = 0;
3395
3396                         if (pos_before >= data->last_start_min)
3397                             b = pos_before;
3398                         else
3399                             b = data->last_start_min;
3400
3401                         l = 0;
3402                         s = SvPV_const(data->last_found, l);
3403                         old = b - data->last_start_min;
3404
3405 #else
3406                         I32 b = pos_before >= data->last_start_min
3407                             ? pos_before : data->last_start_min;
3408                         STRLEN l;
3409                         const char * const s = SvPV_const(data->last_found, l);
3410                         I32 old = b - data->last_start_min;
3411 #endif
3412
3413                         if (UTF)
3414                             old = utf8_hop((U8*)s, old) - (U8*)s;
3415
3416                         l -= old;
3417                         /* Get the added string: */
3418                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3419                         if (deltanext == 0 && pos_before == b) {
3420                             /* What was added is a constant string */
3421                             if (mincount > 1) {
3422                                 SvGROW(last_str, (mincount * l) + 1);
3423                                 repeatcpy(SvPVX(last_str) + l,
3424                                           SvPVX_const(last_str), l, mincount - 1);
3425                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3426                                 /* Add additional parts. */
3427                                 SvCUR_set(data->last_found,
3428                                           SvCUR(data->last_found) - l);
3429                                 sv_catsv(data->last_found, last_str);
3430                                 {
3431                                     SV * sv = data->last_found;
3432                                     MAGIC *mg =
3433                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3434                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3435                                     if (mg && mg->mg_len >= 0)
3436                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3437                                 }
3438                                 data->last_end += l * (mincount - 1);
3439                             }
3440                         } else {
3441                             /* start offset must point into the last copy */
3442                             data->last_start_min += minnext * (mincount - 1);
3443                             data->last_start_max += is_inf ? I32_MAX
3444                                 : (maxcount - 1) * (minnext + data->pos_delta);
3445                         }
3446                     }
3447                     /* It is counted once already... */
3448                     data->pos_min += minnext * (mincount - counted);
3449                     data->pos_delta += - counted * deltanext +
3450                         (minnext + deltanext) * maxcount - minnext * mincount;
3451                     if (mincount != maxcount) {
3452                          /* Cannot extend fixed substrings found inside
3453                             the group.  */
3454                         SCAN_COMMIT(pRExC_state,data,minlenp);
3455                         if (mincount && last_str) {
3456                             SV * const sv = data->last_found;
3457                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3458                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3459
3460                             if (mg)
3461                                 mg->mg_len = -1;
3462                             sv_setsv(sv, last_str);
3463                             data->last_end = data->pos_min;
3464                             data->last_start_min =
3465                                 data->pos_min - CHR_SVLEN(last_str);
3466                             data->last_start_max = is_inf
3467                                 ? I32_MAX
3468                                 : data->pos_min + data->pos_delta
3469                                 - CHR_SVLEN(last_str);
3470                         }
3471                         data->longest = &(data->longest_float);
3472                     }
3473                     SvREFCNT_dec(last_str);
3474                 }
3475                 if (data && (fl & SF_HAS_EVAL))
3476                     data->flags |= SF_HAS_EVAL;
3477               optimize_curly_tail:
3478                 if (OP(oscan) != CURLYX) {
3479                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3480                            && NEXT_OFF(next))
3481                         NEXT_OFF(oscan) += NEXT_OFF(next);
3482                 }
3483                 continue;
3484             default:                    /* REF and CLUMP only? */
3485                 if (flags & SCF_DO_SUBSTR) {
3486                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3487                     data->longest = &(data->longest_float);
3488                 }
3489                 is_inf = is_inf_internal = 1;
3490                 if (flags & SCF_DO_STCLASS_OR)
3491                     cl_anything(pRExC_state, data->start_class);
3492                 flags &= ~SCF_DO_STCLASS;
3493                 break;
3494             }
3495         }
3496         else if (OP(scan) == LNBREAK) {
3497             if (flags & SCF_DO_STCLASS) {
3498                 int value = 0;
3499                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3500                 if (flags & SCF_DO_STCLASS_AND) {
3501                     for (value = 0; value < 256; value++)
3502                         if (!is_VERTWS_cp(value))
3503                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3504                 }
3505                 else {
3506                     for (value = 0; value < 256; value++)
3507                         if (is_VERTWS_cp(value))
3508                             ANYOF_BITMAP_SET(data->start_class, value);
3509                 }
3510                 if (flags & SCF_DO_STCLASS_OR)
3511                     cl_and(data->start_class, and_withp);
3512                 flags &= ~SCF_DO_STCLASS;
3513             }
3514             min += 1;
3515             delta += 1;
3516             if (flags & SCF_DO_SUBSTR) {
3517                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3518                 data->pos_min += 1;
3519                 data->pos_delta += 1;
3520                 data->longest = &(data->longest_float);
3521             }
3522
3523         }
3524         else if (OP(scan) == FOLDCHAR) {
3525             int d = ARG(scan)==0xDF ? 1 : 2;
3526             flags &= ~SCF_DO_STCLASS;
3527             min += 1;
3528             delta += d;
3529             if (flags & SCF_DO_SUBSTR) {
3530                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3531                 data->pos_min += 1;
3532                 data->pos_delta += d;
3533                 data->longest = &(data->longest_float);
3534             }
3535         }
3536         else if (REGNODE_SIMPLE(OP(scan))) {
3537             int value = 0;
3538
3539             if (flags & SCF_DO_SUBSTR) {
3540                 SCAN_COMMIT(pRExC_state,data,minlenp);
3541                 data->pos_min++;
3542             }
3543             min++;
3544             if (flags & SCF_DO_STCLASS) {
3545                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3546
3547                 /* Some of the logic below assumes that switching
3548                    locale on will only add false positives. */
3549                 switch (PL_regkind[OP(scan)]) {
3550                 case SANY:
3551                 default:
3552                   do_default:
3553                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3554                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3555                         cl_anything(pRExC_state, data->start_class);
3556                     break;
3557                 case REG_ANY:
3558                     if (OP(scan) == SANY)
3559                         goto do_default;
3560                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3561                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3562                                  || (data->start_class->flags & ANYOF_CLASS));
3563                         cl_anything(pRExC_state, data->start_class);
3564                     }
3565                     if (flags & SCF_DO_STCLASS_AND || !value)
3566                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3567                     break;
3568                 case ANYOF:
3569                     if (flags & SCF_DO_STCLASS_AND)
3570                         cl_and(data->start_class,
3571                                (struct regnode_charclass_class*)scan);
3572                     else
3573                         cl_or(pRExC_state, data->start_class,
3574                               (struct regnode_charclass_class*)scan);
3575                     break;
3576                 case ALNUM:
3577                     if (flags & SCF_DO_STCLASS_AND) {
3578                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3579                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3580                             for (value = 0; value < 256; value++)
3581                                 if (!isALNUM(value))
3582                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3583                         }
3584                     }
3585                     else {
3586                         if (data->start_class->flags & ANYOF_LOCALE)
3587                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3588                         else {
3589                             for (value = 0; value < 256; value++)
3590                                 if (isALNUM(value))
3591                                     ANYOF_BITMAP_SET(data->start_class, value);
3592                         }
3593                     }
3594                     break;
3595                 case ALNUML:
3596                     if (flags & SCF_DO_STCLASS_AND) {
3597                         if (data->start_class->flags & ANYOF_LOCALE)
3598                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3599                     }
3600                     else {
3601                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3602                         data->start_class->flags |= ANYOF_LOCALE;
3603                     }
3604                     break;
3605                 case NALNUM:
3606                     if (flags & SCF_DO_STCLASS_AND) {
3607                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3608                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3609                             for (value = 0; value < 256; value++)
3610                                 if (isALNUM(value))
3611                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3612                         }
3613                     }
3614                     else {
3615                         if (data->start_class->flags & ANYOF_LOCALE)
3616                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3617                         else {
3618                             for (value = 0; value < 256; value++)
3619                                 if (!isALNUM(value))
3620                                     ANYOF_BITMAP_SET(data->start_class, value);
3621                         }
3622                     }
3623                     break;
3624                 case NALNUML:
3625                     if (flags & SCF_DO_STCLASS_AND) {
3626                         if (data->start_class->flags & ANYOF_LOCALE)
3627                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3628                     }
3629                     else {
3630                         data->start_class->flags |= ANYOF_LOCALE;
3631                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3632                     }
3633                     break;
3634                 case SPACE:
3635                     if (flags & SCF_DO_STCLASS_AND) {
3636                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3637                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3638                             for (value = 0; value < 256; value++)
3639                                 if (!isSPACE(value))
3640                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3641                         }
3642                     }
3643                     else {
3644                         if (data->start_class->flags & ANYOF_LOCALE)
3645                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3646                         else {
3647                             for (value = 0; value < 256; value++)
3648                                 if (isSPACE(value))
3649                                     ANYOF_BITMAP_SET(data->start_class, value);
3650                         }
3651                     }
3652                     break;
3653                 case SPACEL:
3654                     if (flags & SCF_DO_STCLASS_AND) {
3655                         if (data->start_class->flags & ANYOF_LOCALE)
3656                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3657                     }
3658                     else {
3659                         data->start_class->flags |= ANYOF_LOCALE;
3660                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3661                     }
3662                     break;
3663                 case NSPACE:
3664                     if (flags & SCF_DO_STCLASS_AND) {
3665                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3666                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3667                             for (value = 0; value < 256; value++)
3668                                 if (isSPACE(value))
3669                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3670                         }
3671                     }
3672                     else {
3673                         if (data->start_class->flags & ANYOF_LOCALE)
3674                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3675                         else {
3676                             for (value = 0; value < 256; value++)
3677                                 if (!isSPACE(value))
3678                                     ANYOF_BITMAP_SET(data->start_class, value);
3679                         }
3680                     }
3681                     break;
3682                 case NSPACEL:
3683                     if (flags & SCF_DO_STCLASS_AND) {
3684                         if (data->start_class->flags & ANYOF_LOCALE) {
3685                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3686                             for (value = 0; value < 256; value++)
3687                                 if (!isSPACE(value))
3688                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3689                         }
3690                     }
3691                     else {
3692                         data->start_class->flags |= ANYOF_LOCALE;
3693                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3694                     }
3695                     break;
3696                 case DIGIT:
3697                     if (flags & SCF_DO_STCLASS_AND) {
3698                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3699                         for (value = 0; value < 256; value++)
3700                             if (!isDIGIT(value))
3701                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3702                     }
3703                     else {
3704                         if (data->start_class->flags & ANYOF_LOCALE)
3705                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3706                         else {
3707                             for (value = 0; value < 256; value++)
3708                                 if (isDIGIT(value))
3709                                     ANYOF_BITMAP_SET(data->start_class, value);
3710                         }
3711                     }
3712                     break;
3713                 case NDIGIT:
3714                     if (flags & SCF_DO_STCLASS_AND) {
3715                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3716                         for (value = 0; value < 256; value++)
3717                             if (isDIGIT(value))
3718                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3719                     }
3720                     else {
3721                         if (data->start_class->flags & ANYOF_LOCALE)
3722                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3723                         else {
3724                             for (value = 0; value < 256; value++)
3725                                 if (!isDIGIT(value))
3726                                     ANYOF_BITMAP_SET(data->start_class, value);
3727                         }
3728                     }
3729                     break;
3730                 CASE_SYNST_FNC(VERTWS);
3731                 CASE_SYNST_FNC(HORIZWS);
3732
3733                 }
3734                 if (flags & SCF_DO_STCLASS_OR)
3735                     cl_and(data->start_class, and_withp);
3736                 flags &= ~SCF_DO_STCLASS;
3737             }
3738         }
3739         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3740             data->flags |= (OP(scan) == MEOL
3741                             ? SF_BEFORE_MEOL
3742                             : SF_BEFORE_SEOL);
3743         }
3744         else if (  PL_regkind[OP(scan)] == BRANCHJ
3745                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3746                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3747                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3748             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3749                 || OP(scan) == UNLESSM )
3750             {
3751                 /* Negative Lookahead/lookbehind
3752                    In this case we can't do fixed string optimisation.
3753                 */
3754
3755                 I32 deltanext, minnext, fake = 0;
3756                 regnode *nscan;
3757                 struct regnode_charclass_class intrnl;
3758                 int f = 0;
3759
3760                 data_fake.flags = 0;
3761                 if (data) {
3762                     data_fake.whilem_c = data->whilem_c;
3763                     data_fake.last_closep = data->last_closep;
3764                 }
3765                 else
3766                     data_fake.last_closep = &fake;
3767                 data_fake.pos_delta = delta;
3768                 if ( flags & SCF_DO_STCLASS && !scan->flags
3769                      && OP(scan) == IFMATCH ) { /* Lookahead */
3770                     cl_init(pRExC_state, &intrnl);
3771                     data_fake.start_class = &intrnl;
3772                     f |= SCF_DO_STCLASS_AND;
3773                 }
3774                 if (flags & SCF_WHILEM_VISITED_POS)
3775                     f |= SCF_WHILEM_VISITED_POS;
3776                 next = regnext(scan);
3777                 nscan = NEXTOPER(NEXTOPER(scan));
3778                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3779                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3780                 if (scan->flags) {
3781                     if (deltanext) {
3782                         FAIL("Variable length lookbehind not implemented");
3783                     }
3784                     else if (minnext > (I32)U8_MAX) {
3785                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3786                     }
3787                     scan->flags = (U8)minnext;
3788                 }
3789                 if (data) {
3790                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3791                         pars++;
3792                     if (data_fake.flags & SF_HAS_EVAL)
3793                         data->flags |= SF_HAS_EVAL;
3794                     data->whilem_c = data_fake.whilem_c;
3795                 }
3796                 if (f & SCF_DO_STCLASS_AND) {
3797                     if (flags & SCF_DO_STCLASS_OR) {
3798                         /* OR before, AND after: ideally we would recurse with
3799                          * data_fake to get the AND applied by study of the
3800                          * remainder of the pattern, and then derecurse;
3801                          * *** HACK *** for now just treat as "no information".
3802                          * See [perl #56690].
3803                          */
3804                         cl_init(pRExC_state, data->start_class);
3805                     }  else {
3806                         /* AND before and after: combine and continue */
3807                         const int was = (data->start_class->flags & ANYOF_EOS);
3808
3809                         cl_and(data->start_class, &intrnl);
3810                         if (was)
3811                             data->start_class->flags |= ANYOF_EOS;
3812                     }
3813                 }
3814             }
3815 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3816             else {
3817                 /* Positive Lookahead/lookbehind
3818                    In this case we can do fixed string optimisation,
3819                    but we must be careful about it. Note in the case of
3820                    lookbehind the positions will be offset by the minimum
3821                    length of the pattern, something we won't know about
3822                    until after the recurse.
3823                 */
3824                 I32 deltanext, fake = 0;
3825                 regnode *nscan;
3826                 struct regnode_charclass_class intrnl;
3827                 int f = 0;
3828                 /* We use SAVEFREEPV so that when the full compile
3829                     is finished perl will clean up the allocated
3830                     minlens when its all done. This was we don't
3831                     have to worry about freeing them when we know
3832                     they wont be used, which would be a pain.
3833                  */
3834                 I32 *minnextp;
3835                 Newx( minnextp, 1, I32 );
3836                 SAVEFREEPV(minnextp);
3837
3838                 if (data) {
3839                     StructCopy(data, &data_fake, scan_data_t);
3840                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3841                         f |= SCF_DO_SUBSTR;
3842                         if (scan->flags)
3843                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3844                         data_fake.last_found=newSVsv(data->last_found);
3845                     }
3846                 }
3847                 else
3848                     data_fake.last_closep = &fake;
3849                 data_fake.flags = 0;
3850                 data_fake.pos_delta = delta;
3851                 if (is_inf)
3852                     data_fake.flags |= SF_IS_INF;
3853                 if ( flags & SCF_DO_STCLASS && !scan->flags
3854                      && OP(scan) == IFMATCH ) { /* Lookahead */
3855                     cl_init(pRExC_state, &intrnl);
3856                     data_fake.start_class = &intrnl;
3857                     f |= SCF_DO_STCLASS_AND;
3858                 }
3859                 if (flags & SCF_WHILEM_VISITED_POS)
3860                     f |= SCF_WHILEM_VISITED_POS;
3861                 next = regnext(scan);
3862                 nscan = NEXTOPER(NEXTOPER(scan));
3863
3864                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3865                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3866                 if (scan->flags) {
3867                     if (deltanext) {
3868                         FAIL("Variable length lookbehind not implemented");
3869                     }
3870                     else if (*minnextp > (I32)U8_MAX) {
3871                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3872                     }
3873                     scan->flags = (U8)*minnextp;
3874                 }
3875
3876                 *minnextp += min;
3877
3878                 if (f & SCF_DO_STCLASS_AND) {
3879                     const int was = (data->start_class->flags & ANYOF_EOS);
3880
3881                     cl_and(data->start_class, &intrnl);
3882                     if (was)
3883                         data->start_class->flags |= ANYOF_EOS;
3884                 }
3885                 if (data) {
3886                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3887                         pars++;
3888                     if (data_fake.flags & SF_HAS_EVAL)
3889                         data->flags |= SF_HAS_EVAL;
3890                     data->whilem_c = data_fake.whilem_c;
3891                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3892                         if (RExC_rx->minlen<*minnextp)
3893                             RExC_rx->minlen=*minnextp;
3894                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3895                         SvREFCNT_dec(data_fake.last_found);
3896
3897                         if ( data_fake.minlen_fixed != minlenp )
3898                         {
3899                             data->offset_fixed= data_fake.offset_fixed;
3900                             data->minlen_fixed= data_fake.minlen_fixed;
3901                             data->lookbehind_fixed+= scan->flags;
3902                         }
3903                         if ( data_fake.minlen_float != minlenp )
3904                         {
3905                             data->minlen_float= data_fake.minlen_float;
3906                             data->offset_float_min=data_fake.offset_float_min;
3907                             data->offset_float_max=data_fake.offset_float_max;
3908                             data->lookbehind_float+= scan->flags;
3909                         }
3910                     }
3911                 }
3912
3913
3914             }
3915 #endif
3916         }
3917         else if (OP(scan) == OPEN) {
3918             if (stopparen != (I32)ARG(scan))
3919                 pars++;
3920         }
3921         else if (OP(scan) == CLOSE) {
3922             if (stopparen == (I32)ARG(scan)) {
3923                 break;
3924             }
3925             if ((I32)ARG(scan) == is_par) {
3926                 next = regnext(scan);
3927
3928                 if ( next && (OP(next) != WHILEM) && next < last)
3929                     is_par = 0;         /* Disable optimization */
3930             }
3931             if (data)
3932                 *(data->last_closep) = ARG(scan);
3933         }
3934         else if (OP(scan) == EVAL) {
3935                 if (data)
3936                     data->flags |= SF_HAS_EVAL;
3937         }
3938         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3939             if (flags & SCF_DO_SUBSTR) {
3940                 SCAN_COMMIT(pRExC_state,data,minlenp);
3941                 flags &= ~SCF_DO_SUBSTR;
3942             }
3943             if (data && OP(scan)==ACCEPT) {
3944                 data->flags |= SCF_SEEN_ACCEPT;
3945                 if (stopmin > min)
3946                     stopmin = min;
3947             }
3948         }
3949         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3950         {
3951                 if (flags & SCF_DO_SUBSTR) {
3952                     SCAN_COMMIT(pRExC_state,data,minlenp);
3953                     data->longest = &(data->longest_float);
3954                 }
3955                 is_inf = is_inf_internal = 1;
3956                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3957                     cl_anything(pRExC_state, data->start_class);
3958                 flags &= ~SCF_DO_STCLASS;
3959         }
3960         else if (OP(scan) == GPOS) {
3961             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3962                 !(delta || is_inf || (data && data->pos_delta)))
3963             {
3964                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3965                     RExC_rx->extflags |= RXf_ANCH_GPOS;
3966                 if (RExC_rx->gofs < (U32)min)
3967                     RExC_rx->gofs = min;
3968             } else {
3969                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3970                 RExC_rx->gofs = 0;
3971             }
3972         }
3973 #ifdef TRIE_STUDY_OPT
3974 #ifdef FULL_TRIE_STUDY
3975         else if (PL_regkind[OP(scan)] == TRIE) {
3976             /* NOTE - There is similar code to this block above for handling
3977                BRANCH nodes on the initial study.  If you change stuff here
3978                check there too. */
3979             regnode *trie_node= scan;
3980             regnode *tail= regnext(scan);
3981             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3982             I32 max1 = 0, min1 = I32_MAX;
3983             struct regnode_charclass_class accum;
3984
3985             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3986                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3987             if (flags & SCF_DO_STCLASS)
3988                 cl_init_zero(pRExC_state, &accum);
3989
3990             if (!trie->jump) {
3991                 min1= trie->minlen;
3992                 max1= trie->maxlen;
3993             } else {
3994                 const regnode *nextbranch= NULL;
3995                 U32 word;
3996
3997                 for ( word=1 ; word <= trie->wordcount ; word++)
3998                 {
3999                     I32 deltanext=0, minnext=0, f = 0, fake;
4000                     struct regnode_charclass_class this_class;
4001
4002                     data_fake.flags = 0;
4003                     if (data) {
4004                         data_fake.whilem_c = data->whilem_c;
4005                         data_fake.last_closep = data->last_closep;
4006                     }
4007                     else
4008                         data_fake.last_closep = &fake;
4009                     data_fake.pos_delta = delta;
4010                     if (flags & SCF_DO_STCLASS) {
4011                         cl_init(pRExC_state, &this_class);
4012                         data_fake.start_class = &this_class;
4013                         f = SCF_DO_STCLASS_AND;
4014                     }
4015                     if (flags & SCF_WHILEM_VISITED_POS)
4016                         f |= SCF_WHILEM_VISITED_POS;
4017
4018                     if (trie->jump[word]) {
4019                         if (!nextbranch)
4020                             nextbranch = trie_node + trie->jump[0];
4021                         scan= trie_node + trie->jump[word];
4022                         /* We go from the jump point to the branch that follows
4023                            it. Note this means we need the vestigal unused branches
4024                            even though they arent otherwise used.
4025                          */
4026                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4027                             &deltanext, (regnode *)nextbranch, &data_fake,
4028                             stopparen, recursed, NULL, f,depth+1);
4029                     }
4030                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4031                         nextbranch= regnext((regnode*)nextbranch);
4032
4033                     if (min1 > (I32)(minnext + trie->minlen))
4034                         min1 = minnext + trie->minlen;
4035                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4036                         max1 = minnext + deltanext + trie->maxlen;
4037                     if (deltanext == I32_MAX)
4038                         is_inf = is_inf_internal = 1;
4039
4040                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4041                         pars++;
4042                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4043                         if ( stopmin > min + min1)
4044                             stopmin = min + min1;
4045                         flags &= ~SCF_DO_SUBSTR;
4046                         if (data)
4047                             data->flags |= SCF_SEEN_ACCEPT;
4048                     }
4049                     if (data) {
4050                         if (data_fake.flags & SF_HAS_EVAL)
4051                             data->flags |= SF_HAS_EVAL;
4052                         data->whilem_c = data_fake.whilem_c;
4053                     }
4054                     if (flags & SCF_DO_STCLASS)
4055                         cl_or(pRExC_state, &accum, &this_class);
4056                 }
4057             }
4058             if (flags & SCF_DO_SUBSTR) {
4059                 data->pos_min += min1;
4060                 data->pos_delta += max1 - min1;
4061                 if (max1 != min1 || is_inf)
4062                     data->longest = &(data->longest_float);
4063             }
4064             min += min1;
4065             delta += max1 - min1;
4066             if (flags & SCF_DO_STCLASS_OR) {
4067                 cl_or(pRExC_state, data->start_class, &accum);
4068                 if (min1) {
4069                     cl_and(data->start_class, and_withp);
4070                     flags &= ~SCF_DO_STCLASS;
4071                 }
4072             }
4073             else if (flags & SCF_DO_STCLASS_AND) {
4074                 if (min1) {
4075                     cl_and(data->start_class, &accum);
4076                     flags &= ~SCF_DO_STCLASS;
4077                 }
4078                 else {
4079                     /* Switch to OR mode: cache the old value of
4080                      * data->start_class */
4081                     INIT_AND_WITHP;
4082                     StructCopy(data->start_class, and_withp,
4083                                struct regnode_charclass_class);
4084                     flags &= ~SCF_DO_STCLASS_AND;
4085                     StructCopy(&accum, data->start_class,
4086                                struct regnode_charclass_class);
4087                     flags |= SCF_DO_STCLASS_OR;
4088                     data->start_class->flags |= ANYOF_EOS;
4089                 }
4090             }
4091             scan= tail;
4092             continue;
4093         }
4094 #else
4095         else if (PL_regkind[OP(scan)] == TRIE) {
4096             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4097             U8*bang=NULL;
4098
4099             min += trie->minlen;
4100             delta += (trie->maxlen - trie->minlen);
4101             flags &= ~SCF_DO_STCLASS; /* xxx */
4102             if (flags & SCF_DO_SUBSTR) {
4103                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4104                 data->pos_min += trie->minlen;
4105                 data->pos_delta += (trie->maxlen - trie->minlen);
4106                 if (trie->maxlen != trie->minlen)
4107                     data->longest = &(data->longest_float);
4108             }
4109             if (trie->jump) /* no more substrings -- for now /grr*/
4110                 flags &= ~SCF_DO_SUBSTR;
4111         }
4112 #endif /* old or new */
4113 #endif /* TRIE_STUDY_OPT */
4114
4115         /* Else: zero-length, ignore. */
4116         scan = regnext(scan);
4117     }
4118     if (frame) {
4119         last = frame->last;
4120         scan = frame->next;
4121         stopparen = frame->stop;
4122         frame = frame->prev;
4123         goto fake_study_recurse;
4124     }
4125
4126   finish:
4127     assert(!frame);
4128     DEBUG_STUDYDATA("pre-fin:",data,depth);
4129
4130     *scanp = scan;
4131     *deltap = is_inf_internal ? I32_MAX : delta;
4132     if (flags & SCF_DO_SUBSTR && is_inf)
4133         data->pos_delta = I32_MAX - data->pos_min;
4134     if (is_par > (I32)U8_MAX)
4135         is_par = 0;
4136     if (is_par && pars==1 && data) {
4137         data->flags |= SF_IN_PAR;
4138         data->flags &= ~SF_HAS_PAR;
4139     }
4140     else if (pars && data) {
4141         data->flags |= SF_HAS_PAR;
4142         data->flags &= ~SF_IN_PAR;
4143     }
4144     if (flags & SCF_DO_STCLASS_OR)
4145         cl_and(data->start_class, and_withp);
4146     if (flags & SCF_TRIE_RESTUDY)
4147         data->flags |=  SCF_TRIE_RESTUDY;
4148
4149     DEBUG_STUDYDATA("post-fin:",data,depth);
4150
4151     return min < stopmin ? min : stopmin;
4152 }
4153
4154 STATIC U32
4155 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4156 {
4157     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4158
4159     PERL_ARGS_ASSERT_ADD_DATA;
4160
4161     Renewc(RExC_rxi->data,
4162            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4163            char, struct reg_data);
4164     if(count)
4165         Renew(RExC_rxi->data->what, count + n, U8);
4166     else
4167         Newx(RExC_rxi->data->what, n, U8);
4168     RExC_rxi->data->count = count + n;
4169     Copy(s, RExC_rxi->data->what + count, n, U8);
4170     return count;
4171 }
4172
4173 /*XXX: todo make this not included in a non debugging perl */
4174 #ifndef PERL_IN_XSUB_RE
4175 void
4176 Perl_reginitcolors(pTHX)
4177 {
4178     dVAR;
4179     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4180     if (s) {
4181         char *t = savepv(s);
4182         int i = 0;
4183         PL_colors[0] = t;
4184         while (++i < 6) {
4185             t = strchr(t, '\t');
4186             if (t) {
4187                 *t = '\0';
4188                 PL_colors[i] = ++t;
4189             }
4190             else
4191                 PL_colors[i] = t = (char *)"";
4192         }
4193     } else {
4194         int i = 0;
4195         while (i < 6)
4196             PL_colors[i++] = (char *)"";
4197     }
4198     PL_colorset = 1;
4199 }
4200 #endif
4201
4202
4203 #ifdef TRIE_STUDY_OPT
4204 #define CHECK_RESTUDY_GOTO                                  \
4205         if (                                                \
4206               (data.flags & SCF_TRIE_RESTUDY)               \
4207               && ! restudied++                              \
4208         )     goto reStudy
4209 #else
4210 #define CHECK_RESTUDY_GOTO
4211 #endif
4212
4213 /*
4214  - pregcomp - compile a regular expression into internal code
4215  *
4216  * We can't allocate space until we know how big the compiled form will be,
4217  * but we can't compile it (and thus know how big it is) until we've got a
4218  * place to put the code.  So we cheat:  we compile it twice, once with code
4219  * generation turned off and size counting turned on, and once "for real".
4220  * This also means that we don't allocate space until we are sure that the
4221  * thing really will compile successfully, and we never have to move the
4222  * code and thus invalidate pointers into it.  (Note that it has to be in
4223  * one piece because free() must be able to free it all.) [NB: not true in perl]
4224  *
4225  * Beware that the optimization-preparation code in here knows about some
4226  * of the structure of the compiled regexp.  [I'll say.]
4227  */
4228
4229
4230
4231 #ifndef PERL_IN_XSUB_RE
4232 #define RE_ENGINE_PTR &PL_core_reg_engine
4233 #else
4234 extern const struct regexp_engine my_reg_engine;
4235 #define RE_ENGINE_PTR &my_reg_engine
4236 #endif
4237
4238 #ifndef PERL_IN_XSUB_RE
4239 REGEXP *
4240 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4241 {
4242     dVAR;
4243     HV * const table = GvHV(PL_hintgv);
4244
4245     PERL_ARGS_ASSERT_PREGCOMP;
4246
4247     /* Dispatch a request to compile a regexp to correct
4248        regexp engine. */
4249     if (table) {
4250         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4251         GET_RE_DEBUG_FLAGS_DECL;
4252         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4253             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4254             DEBUG_COMPILE_r({
4255                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4256                     SvIV(*ptr));
4257             });
4258             return CALLREGCOMP_ENG(eng, pattern, flags);
4259         }
4260     }
4261     return Perl_re_compile(aTHX_ pattern, flags);
4262 }
4263 #endif
4264
4265 REGEXP *
4266 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4267 {
4268     dVAR;
4269     REGEXP *rx;
4270     struct regexp *r;
4271     register regexp_internal *ri;
4272     STRLEN plen;
4273     char  *exp = SvPV(pattern, plen);
4274     char* xend = exp + plen;
4275     regnode *scan;
4276     I32 flags;
4277     I32 minlen = 0;
4278     I32 sawplus = 0;
4279     I32 sawopen = 0;
4280     U8 jump_ret = 0;
4281     dJMPENV;
4282     scan_data_t data;
4283     RExC_state_t RExC_state;
4284     RExC_state_t * const pRExC_state = &RExC_state;
4285 #ifdef TRIE_STUDY_OPT
4286     int restudied= 0;
4287     RExC_state_t copyRExC_state;
4288 #endif
4289     GET_RE_DEBUG_FLAGS_DECL;
4290
4291     PERL_ARGS_ASSERT_RE_COMPILE;
4292
4293     DEBUG_r(if (!PL_colorset) reginitcolors());
4294
4295     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4296
4297     DEBUG_COMPILE_r({
4298         SV *dsv= sv_newmortal();
4299         RE_PV_QUOTED_DECL(s, RExC_utf8,
4300             dsv, exp, plen, 60);
4301         PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4302                        PL_colors[4],PL_colors[5],s);
4303     });
4304
4305     /* Longjmp back to here if have to switch in midstream to utf8 */
4306     if (! RExC_orig_utf8) {
4307         JMPENV_PUSH(jump_ret);
4308     }
4309
4310     if (jump_ret != 0) {
4311         STRLEN len = plen;
4312
4313         /* Here, we longjmped back.  If the cause was other than changing to
4314          * utf8, pop our own setjmp, and longjmp to the correct handler */
4315         if (jump_ret != UTF8_LONGJMP) {
4316             JMPENV_POP;
4317             JMPENV_JUMP(jump_ret);
4318         }
4319
4320         GET_RE_DEBUG_FLAGS;
4321
4322         /* It's possible to write a regexp in ascii that represents Unicode
4323         codepoints outside of the byte range, such as via \x{100}. If we
4324         detect such a sequence we have to convert the entire pattern to utf8
4325         and then recompile, as our sizing calculation will have been based
4326         on 1 byte == 1 character, but we will need to use utf8 to encode
4327         at least some part of the pattern, and therefore must convert the whole
4328         thing.
4329         -- dmq */
4330         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4331             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4332         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4333         xend = exp + len;
4334         RExC_orig_utf8 = RExC_utf8 = 1;
4335         SAVEFREEPV(exp);
4336     }
4337
4338     RExC_precomp = exp;
4339     RExC_flags = pm_flags;
4340     RExC_sawback = 0;
4341
4342     RExC_seen = 0;
4343     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4344     RExC_seen_evals = 0;
4345     RExC_extralen = 0;
4346
4347     /* First pass: determine size, legality. */
4348     RExC_parse = exp;
4349     RExC_start = exp;
4350     RExC_end = xend;
4351     RExC_naughty = 0;
4352     RExC_npar = 1;
4353     RExC_nestroot = 0;
4354     RExC_size = 0L;
4355     RExC_emit = &PL_regdummy;
4356     RExC_whilem_seen = 0;
4357     RExC_open_parens = NULL;
4358     RExC_close_parens = NULL;
4359     RExC_opend = NULL;
4360     RExC_paren_names = NULL;
4361 #ifdef DEBUGGING
4362     RExC_paren_name_list = NULL;
4363 #endif
4364     RExC_recurse = NULL;
4365     RExC_recurse_count = 0;
4366
4367 #if 0 /* REGC() is (currently) a NOP at the first pass.
4368        * Clever compilers notice this and complain. --jhi */
4369     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4370 #endif
4371     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4372     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4373         RExC_precomp = NULL;
4374         return(NULL);
4375     }
4376
4377     /* Here, finished first pass.  Get rid of our setjmp, which we added for
4378      * efficiency only if the passed-in string wasn't in utf8, as shown by
4379      * RExC_orig_utf8.  But if the first pass was redone, that variable will be
4380      * 1 here even though the original string wasn't utf8, but in this case
4381      * there will have been a long jump */
4382     if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4383         JMPENV_POP;
4384     }
4385     DEBUG_PARSE_r({
4386         PerlIO_printf(Perl_debug_log,
4387             "Required size %"IVdf" nodes\n"
4388             "Starting second pass (creation)\n",
4389             (IV)RExC_size);
4390         RExC_lastnum=0;
4391         RExC_lastparse=NULL;
4392     });
4393     /* Small enough for pointer-storage convention?
4394        If extralen==0, this means that we will not need long jumps. */
4395     if (RExC_size >= 0x10000L && RExC_extralen)
4396         RExC_size += RExC_extralen;
4397     else
4398         RExC_extralen = 0;
4399     if (RExC_whilem_seen > 15)
4400         RExC_whilem_seen = 15;
4401
4402     /* Allocate space and zero-initialize. Note, the two step process
4403        of zeroing when in debug mode, thus anything assigned has to
4404        happen after that */
4405     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4406     r = (struct regexp*)SvANY(rx);
4407     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4408          char, regexp_internal);
4409     if ( r == NULL || ri == NULL )
4410         FAIL("Regexp out of space");
4411 #ifdef DEBUGGING
4412     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4413     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4414 #else
4415     /* bulk initialize base fields with 0. */
4416     Zero(ri, sizeof(regexp_internal), char);
4417 #endif
4418
4419     /* non-zero initialization begins here */
4420     RXi_SET( r, ri );
4421     r->engine= RE_ENGINE_PTR;
4422     r->extflags = pm_flags;
4423     {
4424         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4425         bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4426         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4427         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4428                             >> RXf_PMf_STD_PMMOD_SHIFT);
4429         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4430         char *p;
4431         const STRLEN wraplen = plen + has_minus + has_p + has_runon
4432             + (sizeof(STD_PAT_MODS) - 1)
4433             + (sizeof("(?:)") - 1);
4434
4435         p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4436         SvCUR_set(rx, wraplen);
4437         SvPOK_on(rx);
4438         SvFLAGS(rx) |= SvUTF8(pattern);
4439         *p++='('; *p++='?';
4440         if (has_p)
4441             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4442         {
4443             char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4444             char *colon = r + 1;
4445             char ch;
4446
4447             while((ch = *fptr++)) {
4448                 if(reganch & 1)
4449                     *p++ = ch;
4450                 else
4451                     *r-- = ch;
4452                 reganch >>= 1;
4453             }
4454             if(has_minus) {
4455                 *r = '-';
4456                 p = colon;
4457             }
4458         }
4459
4460         *p++ = ':';
4461         Copy(RExC_precomp, p, plen, char);
4462         assert ((RX_WRAPPED(rx) - p) < 16);
4463         r->pre_prefix = p - RX_WRAPPED(rx);
4464         p += plen;
4465         if (has_runon)
4466             *p++ = '\n';
4467         *p++ = ')';
4468         *p = 0;
4469     }
4470
4471     r->intflags = 0;
4472     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4473
4474     if (RExC_seen & REG_SEEN_RECURSE) {
4475         Newxz(RExC_open_parens, RExC_npar,regnode *);
4476         SAVEFREEPV(RExC_open_parens);
4477         Newxz(RExC_close_parens,RExC_npar,regnode *);
4478         SAVEFREEPV(RExC_close_parens);
4479     }
4480
4481     /* Useful during FAIL. */
4482 #ifdef RE_TRACK_PATTERN_OFFSETS
4483     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4484     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4485                           "%s %"UVuf" bytes for offset annotations.\n",
4486                           ri->u.offsets ? "Got" : "Couldn't get",
4487                           (UV)((2*RExC_size+1) * sizeof(U32))));
4488 #endif
4489     SetProgLen(ri,RExC_size);
4490     RExC_rx_sv = rx;
4491     RExC_rx = r;
4492     RExC_rxi = ri;
4493
4494     /* Second pass: emit code. */
4495     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4496     RExC_parse = exp;
4497     RExC_end = xend;
4498     RExC_naughty = 0;
4499     RExC_npar = 1;
4500     RExC_emit_start = ri->program;
4501     RExC_emit = ri->program;
4502     RExC_emit_bound = ri->program + RExC_size + 1;
4503
4504     /* Store the count of eval-groups for security checks: */
4505     RExC_rx->seen_evals = RExC_seen_evals;
4506     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4507     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4508         ReREFCNT_dec(rx);
4509         return(NULL);
4510     }
4511     /* XXXX To minimize changes to RE engine we always allocate
4512        3-units-long substrs field. */
4513     Newx(r->substrs, 1, struct reg_substr_data);
4514     if (RExC_recurse_count) {
4515         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4516         SAVEFREEPV(RExC_recurse);
4517     }
4518
4519 reStudy:
4520     r->minlen = minlen = sawplus = sawopen = 0;
4521     Zero(r->substrs, 1, struct reg_substr_data);
4522
4523 #ifdef TRIE_STUDY_OPT
4524     if (!restudied) {
4525         StructCopy(&zero_scan_data, &data, scan_data_t);
4526         copyRExC_state = RExC_state;
4527     } else {
4528         U32 seen=RExC_seen;
4529         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4530
4531         RExC_state = copyRExC_state;
4532         if (seen & REG_TOP_LEVEL_BRANCHES)
4533             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4534         else
4535             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4536         if (data.last_found) {
4537             SvREFCNT_dec(data.longest_fixed);
4538             SvREFCNT_dec(data.longest_float);
4539             SvREFCNT_dec(data.last_found);
4540         }
4541         StructCopy(&zero_scan_data, &data, scan_data_t);
4542     }
4543 #else
4544     StructCopy(&zero_scan_data, &data, scan_data_t);
4545 #endif
4546
4547     /* Dig out information for optimizations. */
4548     r->extflags = RExC_flags; /* was pm_op */
4549     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4550
4551     if (UTF)
4552         SvUTF8_on(rx);  /* Unicode in it? */
4553     ri->regstclass = NULL;
4554     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4555         r->intflags |= PREGf_NAUGHTY;
4556     scan = ri->program + 1;             /* First BRANCH. */
4557
4558     /* testing for BRANCH here tells us whether there is "must appear"
4559        data in the pattern. If there is then we can use it for optimisations */
4560     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4561         I32 fake;
4562         STRLEN longest_float_length, longest_fixed_length;
4563         struct regnode_charclass_class ch_class; /* pointed to by data */
4564         int stclass_flag;
4565         I32 last_close = 0; /* pointed to by data */
4566         regnode *first= scan;
4567         regnode *first_next= regnext(first);
4568
4569         /*
4570          * Skip introductions and multiplicators >= 1
4571          * so that we can extract the 'meat' of the pattern that must
4572          * match in the large if() sequence following.
4573          * NOTE that EXACT is NOT covered here, as it is normally
4574          * picked up by the optimiser separately.
4575          *
4576          * This is unfortunate as the optimiser isnt handling lookahead
4577          * properly currently.
4578          *
4579          */
4580         while ((OP(first) == OPEN && (sawopen = 1)) ||
4581                /* An OR of *one* alternative - should not happen now. */
4582             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4583             /* for now we can't handle lookbehind IFMATCH*/
4584             (OP(first) == IFMATCH && !first->flags) ||
4585             (OP(first) == PLUS) ||
4586             (OP(first) == MINMOD) ||
4587                /* An {n,m} with n>0 */
4588             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4589             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4590         {
4591                 /*
4592                  * the only op that could be a regnode is PLUS, all the rest
4593                  * will be regnode_1 or regnode_2.
4594                  *
4595                  */
4596                 if (OP(first) == PLUS)
4597                     sawplus = 1;
4598                 else
4599                     first += regarglen[OP(first)];
4600
4601                 first = NEXTOPER(first);
4602                 first_next= regnext(first);
4603         }
4604
4605         /* Starting-point info. */
4606       again:
4607         DEBUG_PEEP("first:",first,0);
4608         /* Ignore EXACT as we deal with it later. */
4609         if (PL_regkind[OP(first)] == EXACT) {
4610             if (OP(first) == EXACT)
4611                 NOOP;   /* Empty, get anchored substr later. */
4612             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4613                 ri->regstclass = first;
4614         }
4615 #ifdef TRIE_STCLASS
4616         else if (PL_regkind[OP(first)] == TRIE &&
4617                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4618         {
4619             regnode *trie_op;
4620             /* this can happen only on restudy */
4621             if ( OP(first) == TRIE ) {
4622                 struct regnode_1 *trieop = (struct regnode_1 *)
4623                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4624                 StructCopy(first,trieop,struct regnode_1);
4625                 trie_op=(regnode *)trieop;
4626             } else {
4627                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4628                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4629                 StructCopy(first,trieop,struct regnode_charclass);
4630                 trie_op=(regnode *)trieop;
4631             }
4632             OP(trie_op)+=2;
4633             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4634             ri->regstclass = trie_op;
4635         }
4636 #endif
4637         else if (REGNODE_SIMPLE(OP(first)))
4638             ri->regstclass = first;
4639         else if (PL_regkind[OP(first)] == BOUND ||
4640                  PL_regkind[OP(first)] == NBOUND)
4641             ri->regstclass = first;
4642         else if (PL_regkind[OP(first)] == BOL) {
4643             r->extflags |= (OP(first) == MBOL
4644                            ? RXf_ANCH_MBOL
4645                            : (OP(first) == SBOL
4646                               ? RXf_ANCH_SBOL
4647                               : RXf_ANCH_BOL));
4648             first = NEXTOPER(first);
4649             goto again;
4650         }
4651         else if (OP(first) == GPOS) {
4652             r->extflags |= RXf_ANCH_GPOS;
4653             first = NEXTOPER(first);
4654             goto again;
4655         }
4656         else if ((!sawopen || !RExC_sawback) &&
4657             (OP(first) == STAR &&
4658             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4659             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4660         {
4661             /* turn .* into ^.* with an implied $*=1 */
4662             const int type =
4663                 (OP(NEXTOPER(first)) == REG_ANY)
4664                     ? RXf_ANCH_MBOL
4665                     : RXf_ANCH_SBOL;
4666             r->extflags |= type;
4667             r->intflags |= PREGf_IMPLICIT;
4668             first = NEXTOPER(first);
4669             goto again;
4670         }
4671         if (sawplus && (!sawopen || !RExC_sawback)
4672             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4673             /* x+ must match at the 1st pos of run of x's */
4674             r->intflags |= PREGf_SKIP;
4675
4676         /* Scan is after the zeroth branch, first is atomic matcher. */
4677 #ifdef TRIE_STUDY_OPT
4678         DEBUG_PARSE_r(
4679             if (!restudied)
4680                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4681                               (IV)(first - scan + 1))
4682         );
4683 #else
4684         DEBUG_PARSE_r(
4685             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4686                 (IV)(first - scan + 1))
4687         );
4688 #endif
4689
4690
4691         /*
4692         * If there's something expensive in the r.e., find the
4693         * longest literal string that must appear and make it the
4694         * regmust.  Resolve ties in favor of later strings, since
4695         * the regstart check works with the beginning of the r.e.
4696         * and avoiding duplication strengthens checking.  Not a
4697         * strong reason, but sufficient in the absence of others.
4698         * [Now we resolve ties in favor of the earlier string if
4699         * it happens that c_offset_min has been invalidated, since the
4700         * earlier string may buy us something the later one won't.]
4701         */
4702
4703         data.longest_fixed = newSVpvs("");
4704         data.longest_float = newSVpvs("");
4705         data.last_found = newSVpvs("");
4706         data.longest = &(data.longest_fixed);
4707         first = scan;
4708         if (!ri->regstclass) {
4709             cl_init(pRExC_state, &ch_class);
4710             data.start_class = &ch_class;
4711             stclass_flag = SCF_DO_STCLASS_AND;
4712         } else                          /* XXXX Check for BOUND? */
4713             stclass_flag = 0;
4714         data.last_closep = &last_close;
4715
4716         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4717             &data, -1, NULL, NULL,
4718             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4719
4720
4721         CHECK_RESTUDY_GOTO;
4722
4723
4724         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4725              && data.last_start_min == 0 && data.last_end > 0
4726              && !RExC_seen_zerolen
4727              && !(RExC_seen & REG_SEEN_VERBARG)
4728              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4729             r->extflags |= RXf_CHECK_ALL;
4730         scan_commit(pRExC_state, &data,&minlen,0);
4731         SvREFCNT_dec(data.last_found);
4732
4733         /* Note that code very similar to this but for anchored string
4734            follows immediately below, changes may need to be made to both.
4735            Be careful.
4736          */
4737         longest_float_length = CHR_SVLEN(data.longest_float);
4738         if (longest_float_length
4739             || (data.flags & SF_FL_BEFORE_EOL
4740                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4741                     || (RExC_flags & RXf_PMf_MULTILINE))))
4742         {
4743             I32 t,ml;
4744
4745             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4746                 && data.offset_fixed == data.offset_float_min
4747                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4748                     goto remove_float;          /* As in (a)+. */
4749
4750             /* copy the information about the longest float from the reg_scan_data
4751                over to the program. */
4752             if (SvUTF8(data.longest_float)) {
4753                 r->float_utf8 = data.longest_float;
4754                 r->float_substr = NULL;
4755             } else {
4756                 r->float_substr = data.longest_float;
4757                 r->float_utf8 = NULL;
4758             }
4759             /* float_end_shift is how many chars that must be matched that
4760                follow this item. We calculate it ahead of time as once the
4761                lookbehind offset is added in we lose the ability to correctly
4762                calculate it.*/
4763             ml = data.minlen_float ? *(data.minlen_float)
4764                                    : (I32)longest_float_length;
4765             r->float_end_shift = ml - data.offset_float_min
4766                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4767                 + data.lookbehind_float;
4768             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4769             r->float_max_offset = data.offset_float_max;
4770             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4771                 r->float_max_offset -= data.lookbehind_float;
4772
4773             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4774                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4775                            || (RExC_flags & RXf_PMf_MULTILINE)));
4776             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4777         }
4778         else {
4779           remove_float:
4780             r->float_substr = r->float_utf8 = NULL;
4781             SvREFCNT_dec(data.longest_float);
4782             longest_float_length = 0;
4783         }
4784
4785         /* Note that code very similar to this but for floating string
4786            is immediately above, changes may need to be made to both.
4787            Be careful.
4788          */
4789         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4790         if (longest_fixed_length
4791             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4792                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4793                     || (RExC_flags & RXf_PMf_MULTILINE))))
4794         {
4795             I32 t,ml;
4796
4797             /* copy the information about the longest fixed
4798                from the reg_scan_data over to the program. */
4799             if (SvUTF8(data.longest_fixed)) {
4800                 r->anchored_utf8 = data.longest_fixed;
4801                 r->anchored_substr = NULL;
4802             } else {
4803                 r->anchored_substr = data.longest_fixed;
4804                 r->anchored_utf8 = NULL;
4805             }
4806             /* fixed_end_shift is how many chars that must be matched that
4807                follow this item. We calculate it ahead of time as once the
4808                lookbehind offset is added in we lose the ability to correctly
4809                calculate it.*/
4810             ml = data.minlen_fixed ? *(data.minlen_fixed)
4811                                    : (I32)longest_fixed_length;
4812             r->anchored_end_shift = ml - data.offset_fixed
4813                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4814                 + data.lookbehind_fixed;
4815             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4816
4817             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4818                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4819                      || (RExC_flags & RXf_PMf_MULTILINE)));
4820             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4821         }
4822         else {
4823             r->anchored_substr = r->anchored_utf8 = NULL;
4824             SvREFCNT_dec(data.longest_fixed);
4825             longest_fixed_length = 0;
4826         }
4827         if (ri->regstclass
4828             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4829             ri->regstclass = NULL;
4830         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4831             && stclass_flag
4832             && !(data.start_class->flags & ANYOF_EOS)
4833             && !cl_is_anything(data.start_class))
4834         {
4835             const U32 n = add_data(pRExC_state, 1, "f");
4836
4837             Newx(RExC_rxi->data->data[n], 1,
4838                 struct regnode_charclass_class);
4839             StructCopy(data.start_class,
4840                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4841                        struct regnode_charclass_class);
4842             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4843             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4844             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4845                       regprop(r, sv, (regnode*)data.start_class);
4846                       PerlIO_printf(Perl_debug_log,
4847                                     "synthetic stclass \"%s\".\n",
4848                                     SvPVX_const(sv));});
4849         }
4850
4851         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4852         if (longest_fixed_length > longest_float_length) {
4853             r->check_end_shift = r->anchored_end_shift;
4854             r->check_substr = r->anchored_substr;
4855             r->check_utf8 = r->anchored_utf8;
4856             r->check_offset_min = r->check_offset_max = r->anchored_offset;
4857             if (r->extflags & RXf_ANCH_SINGLE)
4858                 r->extflags |= RXf_NOSCAN;
4859         }
4860         else {
4861             r->check_end_shift = r->float_end_shift;
4862             r->check_substr = r->float_substr;
4863             r->check_utf8 = r->float_utf8;
4864             r->check_offset_min = r->float_min_offset;
4865             r->check_offset_max = r->float_max_offset;
4866         }
4867         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4868            This should be changed ASAP!  */
4869         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4870             r->extflags |= RXf_USE_INTUIT;
4871             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4872                 r->extflags |= RXf_INTUIT_TAIL;
4873         }
4874         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4875         if ( (STRLEN)minlen < longest_float_length )
4876             minlen= longest_float_length;
4877         if ( (STRLEN)minlen < longest_fixed_length )
4878             minlen= longest_fixed_length;
4879         */
4880     }
4881     else {
4882         /* Several toplevels. Best we can is to set minlen. */
4883         I32 fake;
4884         struct regnode_charclass_class ch_class;
4885         I32 last_close = 0;
4886
4887         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4888
4889         scan = ri->program + 1;
4890         cl_init(pRExC_state, &ch_class);
4891         data.start_class = &ch_class;
4892         data.last_closep = &last_close;
4893
4894
4895         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4896             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4897
4898         CHECK_RESTUDY_GOTO;
4899
4900         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4901                 = r->float_substr = r->float_utf8 = NULL;
4902         if (!(data.start_class->flags & ANYOF_EOS)
4903             && !cl_is_anything(data.start_class))
4904         {
4905             const U32 n = add_data(pRExC_state, 1, "f");
4906
4907             Newx(RExC_rxi->data->data[n], 1,
4908                 struct regnode_charclass_class);
4909             StructCopy(data.start_class,
4910                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4911                        struct regnode_charclass_class);
4912             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4913             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4914             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4915                       regprop(r, sv, (regnode*)data.start_class);
4916                       PerlIO_printf(Perl_debug_log,
4917                                     "synthetic stclass \"%s\".\n",
4918                                     SvPVX_const(sv));});
4919         }
4920     }
4921
4922     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4923        the "real" pattern. */
4924     DEBUG_OPTIMISE_r({
4925         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4926                       (IV)minlen, (IV)r->minlen);
4927     });
4928     r->minlenret = minlen;
4929     if (r->minlen < minlen)
4930         r->minlen = minlen;
4931
4932     if (RExC_seen & REG_SEEN_GPOS)
4933         r->extflags |= RXf_GPOS_SEEN;
4934     if (RExC_seen & REG_SEEN_LOOKBEHIND)
4935         r->extflags |= RXf_LOOKBEHIND_SEEN;
4936     if (RExC_seen & REG_SEEN_EVAL)
4937         r->extflags |= RXf_EVAL_SEEN;
4938     if (RExC_seen & REG_SEEN_CANY)
4939         r->extflags |= RXf_CANY_SEEN;
4940     if (RExC_seen & REG_SEEN_VERBARG)
4941         r->intflags |= PREGf_VERBARG_SEEN;
4942     if (RExC_seen & REG_SEEN_CUTGROUP)
4943         r->intflags |= PREGf_CUTGROUP_SEEN;
4944     if (RExC_paren_names)
4945         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4946     else
4947         RXp_PAREN_NAMES(r) = NULL;
4948
4949 #ifdef STUPID_PATTERN_CHECKS
4950     if (RX_PRELEN(rx) == 0)
4951         r->extflags |= RXf_NULL;
4952     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4953         /* XXX: this should happen BEFORE we compile */
4954         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4955     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4956         r->extflags |= RXf_WHITE;
4957     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4958         r->extflags |= RXf_START_ONLY;
4959 #else
4960     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4961             /* XXX: this should happen BEFORE we compile */
4962             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4963     else {
4964         regnode *first = ri->program + 1;
4965         U8 fop = OP(first);
4966         U8 nop = OP(NEXTOPER(first));
4967
4968         if (PL_regkind[fop] == NOTHING && nop == END)
4969             r->extflags |= RXf_NULL;
4970         else if (PL_regkind[fop] == BOL && nop == END)
4971             r->extflags |= RXf_START_ONLY;
4972         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4973             r->extflags |= RXf_WHITE;
4974     }
4975 #endif
4976 #ifdef DEBUGGING
4977     if (RExC_paren_names) {
4978         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
4979         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4980     } else
4981 #endif
4982         ri->name_list_idx = 0;
4983
4984     if (RExC_recurse_count) {
4985         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4986             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4987             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4988         }
4989     }
4990     Newxz(r->offs, RExC_npar, regexp_paren_pair);
4991     /* assume we don't need to swap parens around before we match */
4992
4993     DEBUG_DUMP_r({
4994         PerlIO_printf(Perl_debug_log,"Final program:\n");
4995         regdump(r);
4996     });
4997 #ifdef RE_TRACK_PATTERN_OFFSETS
4998     DEBUG_OFFSETS_r(if (ri->u.offsets) {
4999         const U32 len = ri->u.offsets[0];
5000         U32 i;
5001         GET_RE_DEBUG_FLAGS_DECL;
5002         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5003         for (i = 1; i <= len; i++) {
5004             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5005                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5006                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5007             }
5008         PerlIO_printf(Perl_debug_log, "\n");
5009     });
5010 #endif
5011     return rx;
5012 }
5013
5014 #undef RE_ENGINE_PTR
5015
5016
5017 SV*
5018 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5019                     const U32 flags)
5020 {
5021     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5022
5023     PERL_UNUSED_ARG(value);
5024
5025     if (flags & RXapif_FETCH) {
5026         return reg_named_buff_fetch(rx, key, flags);
5027     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5028         Perl_croak_no_modify(aTHX);
5029         return NULL;
5030     } else if (flags & RXapif_EXISTS) {
5031         return reg_named_buff_exists(rx, key, flags)
5032             ? &PL_sv_yes
5033             : &PL_sv_no;
5034     } else if (flags & RXapif_REGNAMES) {
5035         return reg_named_buff_all(rx, flags);
5036     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5037         return reg_named_buff_scalar(rx, flags);
5038     } else {
5039         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5040         return NULL;
5041     }
5042 }
5043
5044 SV*
5045 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5046                          const U32 flags)
5047 {
5048     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5049     PERL_UNUSED_ARG(lastkey);
5050
5051     if (flags & RXapif_FIRSTKEY)
5052         return reg_named_buff_firstkey(rx, flags);
5053     else if (flags & RXapif_NEXTKEY)
5054         return reg_named_buff_nextkey(rx, flags);
5055     else {
5056         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5057         return NULL;
5058     }
5059 }
5060
5061 SV*
5062 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5063                           const U32 flags)
5064 {
5065     AV *retarray = NULL;
5066     SV *ret;
5067     struct regexp *const rx = (struct regexp *)SvANY(r);
5068
5069     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5070
5071     if (flags & RXapif_ALL)
5072         retarray=newAV();
5073
5074     if (rx && RXp_PAREN_NAMES(rx)) {
5075         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5076         if (he_str) {
5077             IV i;
5078             SV* sv_dat=HeVAL(he_str);
5079             I32 *nums=(I32*)SvPVX(sv_dat);
5080             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5081                 if ((I32)(rx->nparens) >= nums[i]
5082                     && rx->offs[nums[i]].start != -1
5083                     && rx->offs[nums[i]].end != -1)
5084                 {
5085                     ret = newSVpvs("");
5086                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5087                     if (!retarray)
5088                         return ret;
5089                 } else {
5090                     ret = newSVsv(&PL_sv_undef);
5091                 }
5092                 if (retarray)
5093                     av_push(retarray, ret);
5094             }
5095             if (retarray)
5096                 return newRV_noinc(MUTABLE_SV(retarray));
5097         }
5098     }
5099     return NULL;
5100 }
5101
5102 bool
5103 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5104                            const U32 flags)
5105 {
5106     struct regexp *const rx = (struct regexp *)SvANY(r);
5107
5108     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5109
5110     if (rx && RXp_PAREN_NAMES(rx)) {
5111         if (flags & RXapif_ALL) {
5112             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5113         } else {
5114             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5115             if (sv) {
5116                 SvREFCNT_dec(sv);
5117                 return TRUE;
5118             } else {
5119                 return FALSE;
5120             }
5121         }
5122     } else {
5123         return FALSE;
5124     }
5125 }
5126
5127 SV*
5128 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5129 {
5130     struct regexp *const rx = (struct regexp *)SvANY(r);
5131
5132     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5133
5134     if ( rx && RXp_PAREN_NAMES(rx) ) {
5135         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5136
5137         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5138     } else {
5139         return FALSE;
5140     }
5141 }
5142
5143 SV*
5144 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5145 {
5146     struct regexp *const rx = (struct regexp *)SvANY(r);
5147     GET_RE_DEBUG_FLAGS_DECL;
5148
5149     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5150
5151     if (rx && RXp_PAREN_NAMES(rx)) {
5152         HV *hv = RXp_PAREN_NAMES(rx);
5153         HE *temphe;
5154         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5155             IV i;
5156             IV parno = 0;
5157             SV* sv_dat = HeVAL(temphe);
5158             I32 *nums = (I32*)SvPVX(sv_dat);
5159             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5160                 if ((I32)(rx->lastparen) >= nums[i] &&
5161                     rx->offs[nums[i]].start != -1 &&
5162                     rx->offs[nums[i]].end != -1)
5163                 {
5164                     parno = nums[i];
5165                     break;
5166                 }
5167             }
5168             if (parno || flags & RXapif_ALL) {
5169                 return newSVhek(HeKEY_hek(temphe));
5170             }
5171         }
5172     }
5173     return NULL;
5174 }
5175
5176 SV*
5177 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5178 {
5179     SV *ret;
5180     AV *av;
5181     I32 length;
5182     struct regexp *const rx = (struct regexp *)SvANY(r);
5183
5184     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5185
5186     if (rx && RXp_PAREN_NAMES(rx)) {
5187         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5188             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5189         } else if (flags & RXapif_ONE) {
5190             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5191             av = MUTABLE_AV(SvRV(ret));
5192             length = av_len(av);
5193             SvREFCNT_dec(ret);
5194             return newSViv(length + 1);
5195         } else {
5196             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5197             return NULL;
5198         }
5199     }
5200     return &PL_sv_undef;
5201 }
5202
5203 SV*
5204 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5205 {
5206     struct regexp *const rx = (struct regexp *)SvANY(r);
5207     AV *av = newAV();
5208
5209     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5210
5211     if (rx && RXp_PAREN_NAMES(rx)) {
5212         HV *hv= RXp_PAREN_NAMES(rx);
5213         HE *temphe;
5214         (void)hv_iterinit(hv);
5215         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5216             IV i;
5217             IV parno = 0;
5218             SV* sv_dat = HeVAL(temphe);
5219             I32 *nums = (I32*)SvPVX(sv_dat);
5220             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5221                 if ((I32)(rx->lastparen) >= nums[i] &&
5222                     rx->offs[nums[i]].start != -1 &&
5223                     rx->offs[nums[i]].end != -1)
5224                 {
5225                     parno = nums[i];
5226                     break;
5227                 }
5228             }
5229             if (parno || flags & RXapif_ALL) {
5230                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5231             }
5232         }
5233     }
5234
5235     return newRV_noinc(MUTABLE_SV(av));
5236 }
5237
5238 void
5239 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5240                              SV * const sv)
5241 {
5242     struct regexp *const rx = (struct regexp *)SvANY(r);
5243     char *s = NULL;
5244     I32 i = 0;
5245     I32 s1, t1;
5246
5247     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5248
5249     if (!rx->subbeg) {
5250         sv_setsv(sv,&PL_sv_undef);
5251         return;
5252     }
5253     else
5254     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5255         /* $` */
5256         i = rx->offs[0].start;
5257         s = rx->subbeg;
5258     }
5259     else
5260     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5261         /* $' */
5262         s = rx->subbeg + rx->offs[0].end;
5263         i = rx->sublen - rx->offs[0].end;
5264     }
5265     else
5266     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5267         (s1 = rx->offs[paren].start) != -1 &&
5268         (t1 = rx->offs[paren].end) != -1)
5269     {
5270         /* $& $1 ... */
5271         i = t1 - s1;
5272         s = rx->subbeg + s1;
5273     } else {
5274         sv_setsv(sv,&PL_sv_undef);
5275         return;
5276     }
5277     assert(rx->sublen >= (s - rx->subbeg) + i );
5278     if (i >= 0) {
5279         const int oldtainted = PL_tainted;
5280         TAINT_NOT;
5281         sv_setpvn(sv, s, i);
5282         PL_tainted = oldtainted;
5283         if ( (rx->extflags & RXf_CANY_SEEN)
5284             ? (RXp_MATCH_UTF8(rx)
5285                         && (!i || is_utf8_string((U8*)s, i)))
5286             : (RXp_MATCH_UTF8(rx)) )
5287         {
5288             SvUTF8_on(sv);
5289         }
5290         else
5291             SvUTF8_off(sv);
5292         if (PL_tainting) {
5293             if (RXp_MATCH_TAINTED(rx)) {
5294                 if (SvTYPE(sv) >= SVt_PVMG) {
5295                     MAGIC* const mg = SvMAGIC(sv);
5296                     MAGIC* mgt;
5297                     PL_tainted = 1;
5298                     SvMAGIC_set(sv, mg->mg_moremagic);
5299                     SvTAINT(sv);
5300                     if ((mgt = SvMAGIC(sv))) {
5301                         mg->mg_moremagic = mgt;
5302                         SvMAGIC_set(sv, mg);
5303                     }
5304                 } else {
5305                     PL_tainted = 1;
5306                     SvTAINT(sv);
5307                 }
5308             } else
5309                 SvTAINTED_off(sv);
5310         }
5311     } else {
5312         sv_setsv(sv,&PL_sv_undef);
5313         return;
5314     }
5315 }
5316
5317 void
5318 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5319                                                          SV const * const value)
5320 {
5321     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5322
5323     PERL_UNUSED_ARG(rx);
5324     PERL_UNUSED_ARG(paren);
5325     PERL_UNUSED_ARG(value);
5326
5327     if (!PL_localizing)
5328         Perl_croak_no_modify(aTHX);
5329 }
5330
5331 I32
5332 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5333                               const I32 paren)
5334 {
5335     struct regexp *const rx = (struct regexp *)SvANY(r);
5336     I32 i;
5337     I32 s1, t1;
5338
5339     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5340
5341     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5342         switch (paren) {
5343       /* $` / ${^PREMATCH} */
5344       case RX_BUFF_IDX_PREMATCH:
5345         if (rx->offs[0].start != -1) {
5346                         i = rx->offs[0].start;
5347                         if (i > 0) {
5348                                 s1 = 0;
5349                                 t1 = i;
5350                                 goto getlen;
5351                         }
5352             }
5353         return 0;
5354       /* $' / ${^POSTMATCH} */
5355       case RX_BUFF_IDX_POSTMATCH:
5356             if (rx->offs[0].end != -1) {
5357                         i = rx->sublen - rx->offs[0].end;
5358                         if (i > 0) {
5359                                 s1 = rx->offs[0].end;
5360                                 t1 = rx->sublen;
5361                                 goto getlen;
5362                         }
5363             }
5364         return 0;
5365       /* $& / ${^MATCH}, $1, $2, ... */
5366       default:
5367             if (paren <= (I32)rx->nparens &&
5368             (s1 = rx->offs[paren].start) != -1 &&
5369             (t1 = rx->offs[paren].end) != -1)
5370             {
5371             i = t1 - s1;
5372             goto getlen;
5373         } else {
5374             if (ckWARN(WARN_UNINITIALIZED))
5375                 report_uninit((const SV *)sv);
5376             return 0;
5377         }
5378     }
5379   getlen:
5380     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5381         const char * const s = rx->subbeg + s1;
5382         const U8 *ep;
5383         STRLEN el;
5384
5385         i = t1 - s1;
5386         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5387                         i = el;
5388     }
5389     return i;
5390 }
5391
5392 SV*
5393 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5394 {
5395     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5396         PERL_UNUSED_ARG(rx);
5397         if (0)
5398             return NULL;
5399         else
5400             return newSVpvs("Regexp");
5401 }
5402
5403 /* Scans the name of a named buffer from the pattern.
5404  * If flags is REG_RSN_RETURN_NULL returns null.
5405  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5406  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5407  * to the parsed name as looked up in the RExC_paren_names hash.
5408  * If there is an error throws a vFAIL().. type exception.
5409  */
5410
5411 #define REG_RSN_RETURN_NULL    0
5412 #define REG_RSN_RETURN_NAME    1
5413 #define REG_RSN_RETURN_DATA    2
5414
5415 STATIC SV*
5416 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5417 {
5418     char *name_start = RExC_parse;
5419
5420     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5421
5422     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5423          /* skip IDFIRST by using do...while */
5424         if (UTF)
5425             do {
5426                 RExC_parse += UTF8SKIP(RExC_parse);
5427             } while (isALNUM_utf8((U8*)RExC_parse));
5428         else
5429             do {
5430                 RExC_parse++;
5431             } while (isALNUM(*RExC_parse));
5432     }
5433
5434     if ( flags ) {
5435         SV* sv_name
5436             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5437                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5438         if ( flags == REG_RSN_RETURN_NAME)
5439             return sv_name;
5440         else if (flags==REG_RSN_RETURN_DATA) {
5441             HE *he_str = NULL;
5442             SV *sv_dat = NULL;
5443             if ( ! sv_name )      /* should not happen*/
5444                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5445             if (RExC_paren_names)
5446                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5447             if ( he_str )
5448                 sv_dat = HeVAL(he_str);
5449             if ( ! sv_dat )
5450                 vFAIL("Reference to nonexistent named group");
5451             return sv_dat;
5452         }
5453         else {
5454             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5455         }
5456         /* NOT REACHED */
5457     }
5458     return NULL;
5459 }
5460
5461 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5462     int rem=(int)(RExC_end - RExC_parse);                       \
5463     int cut;                                                    \
5464     int num;                                                    \
5465     int iscut=0;                                                \
5466     if (rem>10) {                                               \
5467         rem=10;                                                 \
5468         iscut=1;                                                \
5469     }                                                           \
5470     cut=10-rem;                                                 \
5471     if (RExC_lastparse!=RExC_parse)                             \
5472         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5473             rem, RExC_parse,                                    \
5474             cut + 4,                                            \
5475             iscut ? "..." : "<"                                 \
5476         );                                                      \
5477     else                                                        \
5478         PerlIO_printf(Perl_debug_log,"%16s","");                \
5479                                                                 \
5480     if (SIZE_ONLY)                                              \
5481        num = RExC_size + 1;                                     \
5482     else                                                        \
5483        num=REG_NODE_NUM(RExC_emit);                             \
5484     if (RExC_lastnum!=num)                                      \
5485        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5486     else                                                        \
5487        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5488     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5489         (int)((depth*2)), "",                                   \
5490         (funcname)                                              \
5491     );                                                          \
5492     RExC_lastnum=num;                                           \
5493     RExC_lastparse=RExC_parse;                                  \
5494 })
5495
5496
5497
5498 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5499     DEBUG_PARSE_MSG((funcname));                            \
5500     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5501 })
5502 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5503     DEBUG_PARSE_MSG((funcname));                            \
5504     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5505 })
5506 /*
5507  - reg - regular expression, i.e. main body or parenthesized thing
5508  *
5509  * Caller must absorb opening parenthesis.
5510  *
5511  * Combining parenthesis handling with the base level of regular expression
5512  * is a trifle forced, but the need to tie the tails of the branches to what
5513  * follows makes it hard to avoid.
5514  */
5515 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5516 #ifdef DEBUGGING
5517 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5518 #else
5519 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5520 #endif
5521
5522 STATIC regnode *
5523 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5524     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5525 {
5526     dVAR;
5527     register regnode *ret;              /* Will be the head of the group. */
5528     register regnode *br;
5529     register regnode *lastbr;
5530     register regnode *ender = NULL;
5531     register I32 parno = 0;
5532     I32 flags;
5533     U32 oregflags = RExC_flags;
5534     bool have_branch = 0;
5535     bool is_open = 0;
5536     I32 freeze_paren = 0;
5537     I32 after_freeze = 0;
5538
5539     /* for (?g), (?gc), and (?o) warnings; warning
5540        about (?c) will warn about (?g) -- japhy    */
5541
5542 #define WASTED_O  0x01
5543 #define WASTED_G  0x02
5544 #define WASTED_C  0x04
5545 #define WASTED_GC (0x02|0x04)
5546     I32 wastedflags = 0x00;
5547
5548     char * parse_start = RExC_parse; /* MJD */
5549     char * const oregcomp_parse = RExC_parse;
5550
5551     GET_RE_DEBUG_FLAGS_DECL;
5552
5553     PERL_ARGS_ASSERT_REG;
5554     DEBUG_PARSE("reg ");
5555
5556     *flagp = 0;                         /* Tentatively. */
5557
5558
5559     /* Make an OPEN node, if parenthesized. */
5560     if (paren) {
5561         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5562             char *start_verb = RExC_parse;
5563             STRLEN verb_len = 0;
5564             char *start_arg = NULL;
5565             unsigned char op = 0;
5566             int argok = 1;
5567             int internal_argval = 0; /* internal_argval is only useful if !argok */
5568             while ( *RExC_parse && *RExC_parse != ')' ) {
5569                 if ( *RExC_parse == ':' ) {
5570                     start_arg = RExC_parse + 1;
5571                     break;
5572                 }
5573                 RExC_parse++;
5574             }
5575             ++start_verb;
5576             verb_len = RExC_parse - start_verb;
5577             if ( start_arg ) {
5578                 RExC_parse++;
5579                 while ( *RExC_parse && *RExC_parse != ')' )
5580                     RExC_parse++;
5581                 if ( *RExC_parse != ')' )
5582                     vFAIL("Unterminated verb pattern argument");
5583                 if ( RExC_parse == start_arg )
5584                     start_arg = NULL;
5585             } else {
5586                 if ( *RExC_parse != ')' )
5587                     vFAIL("Unterminated verb pattern");
5588             }
5589
5590             switch ( *start_verb ) {
5591             case 'A':  /* (*ACCEPT) */
5592                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5593                     op = ACCEPT;
5594                     internal_argval = RExC_nestroot;
5595                 }
5596                 break;
5597             case 'C':  /* (*COMMIT) */
5598                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5599                     op = COMMIT;
5600                 break;
5601             case 'F':  /* (*FAIL) */
5602                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5603                     op = OPFAIL;
5604                     argok = 0;
5605                 }
5606                 break;
5607             case ':':  /* (*:NAME) */
5608             case 'M':  /* (*MARK:NAME) */
5609                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5610                     op = MARKPOINT;
5611                     argok = -1;
5612                 }
5613                 break;
5614             case 'P':  /* (*PRUNE) */
5615                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5616                     op = PRUNE;
5617                 break;
5618             case 'S':   /* (*SKIP) */
5619                 if ( memEQs(start_verb,verb_len,"SKIP") )
5620                     op = SKIP;
5621                 break;
5622             case 'T':  /* (*THEN) */
5623                 /* [19:06] <TimToady> :: is then */
5624                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5625                     op = CUTGROUP;
5626                     RExC_seen |= REG_SEEN_CUTGROUP;
5627                 }
5628                 break;
5629             }
5630             if ( ! op ) {
5631                 RExC_parse++;
5632                 vFAIL3("Unknown verb pattern '%.*s'",
5633                     verb_len, start_verb);
5634             }
5635             if ( argok ) {
5636                 if ( start_arg && internal_argval ) {
5637                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5638                         verb_len, start_verb);
5639                 } else if ( argok < 0 && !start_arg ) {
5640                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5641                         verb_len, start_verb);
5642                 } else {
5643                     ret = reganode(pRExC_state, op, internal_argval);
5644                     if ( ! internal_argval && ! SIZE_ONLY ) {
5645                         if (start_arg) {
5646                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5647                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5648                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5649                             ret->flags = 0;
5650                         } else {
5651                             ret->flags = 1;
5652                         }
5653                     }
5654                 }
5655                 if (!internal_argval)
5656                     RExC_seen |= REG_SEEN_VERBARG;
5657             } else if ( start_arg ) {
5658                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5659                         verb_len, start_verb);
5660             } else {
5661                 ret = reg_node(pRExC_state, op);
5662             }
5663             nextchar(pRExC_state);
5664             return ret;
5665         } else
5666         if (*RExC_parse == '?') { /* (?...) */
5667             bool is_logical = 0;
5668             const char * const seqstart = RExC_parse;
5669
5670             RExC_parse++;
5671             paren = *RExC_parse++;
5672             ret = NULL;                 /* For look-ahead/behind. */
5673             switch (paren) {
5674
5675             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5676                 paren = *RExC_parse++;
5677                 if ( paren == '<')         /* (?P<...>) named capture */
5678                     goto named_capture;
5679                 else if (paren == '>') {   /* (?P>name) named recursion */
5680                     goto named_recursion;
5681                 }
5682                 else if (paren == '=') {   /* (?P=...)  named backref */
5683                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5684                        you change this make sure you change that */
5685                     char* name_start = RExC_parse;
5686                     U32 num = 0;
5687                     SV *sv_dat = reg_scan_name(pRExC_state,
5688                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5689                     if (RExC_parse == name_start || *RExC_parse != ')')
5690                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5691
5692                     if (!SIZE_ONLY) {
5693                         num = add_data( pRExC_state, 1, "S" );
5694                         RExC_rxi->data->data[num]=(void*)sv_dat;
5695                         SvREFCNT_inc_simple_void(sv_dat);
5696                     }
5697                     RExC_sawback = 1;
5698                     ret = reganode(pRExC_state,
5699                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5700                            num);
5701                     *flagp |= HASWIDTH;
5702
5703                     Set_Node_Offset(ret, parse_start+1);
5704                     Set_Node_Cur_Length(ret); /* MJD */
5705
5706                     nextchar(pRExC_state);
5707                     return ret;
5708                 }
5709                 RExC_parse++;
5710                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5711                 /*NOTREACHED*/
5712             case '<':           /* (?<...) */
5713                 if (*RExC_parse == '!')
5714                     paren = ',';
5715                 else if (*RExC_parse != '=')
5716               named_capture:
5717                 {               /* (?<...>) */
5718                     char *name_start;
5719                     SV *svname;
5720                     paren= '>';
5721             case '\'':          /* (?'...') */
5722                     name_start= RExC_parse;
5723                     svname = reg_scan_name(pRExC_state,
5724                         SIZE_ONLY ?  /* reverse test from the others */
5725                         REG_RSN_RETURN_NAME :
5726                         REG_RSN_RETURN_NULL);
5727                     if (RExC_parse == name_start) {
5728                         RExC_parse++;
5729                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5730                         /*NOTREACHED*/
5731                     }
5732                     if (*RExC_parse != paren)
5733                         vFAIL2("Sequence (?%c... not terminated",
5734                             paren=='>' ? '<' : paren);
5735                     if (SIZE_ONLY) {
5736                         HE *he_str;
5737                         SV *sv_dat = NULL;
5738                         if (!svname) /* shouldnt happen */
5739                             Perl_croak(aTHX_
5740                                 "panic: reg_scan_name returned NULL");
5741                         if (!RExC_paren_names) {
5742                             RExC_paren_names= newHV();
5743                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5744 #ifdef DEBUGGING
5745                             RExC_paren_name_list= newAV();
5746                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5747 #endif
5748                         }
5749                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5750                         if ( he_str )
5751                             sv_dat = HeVAL(he_str);
5752                         if ( ! sv_dat ) {
5753                             /* croak baby croak */
5754                             Perl_croak(aTHX_
5755                                 "panic: paren_name hash element allocation failed");
5756                         } else if ( SvPOK(sv_dat) ) {
5757                             /* (?|...) can mean we have dupes so scan to check
5758                                its already been stored. Maybe a flag indicating
5759                                we are inside such a construct would be useful,
5760                                but the arrays are likely to be quite small, so
5761                                for now we punt -- dmq */
5762                             IV count = SvIV(sv_dat);
5763                             I32 *pv = (I32*)SvPVX(sv_dat);
5764                             IV i;
5765                             for ( i = 0 ; i < count ; i++ ) {
5766                                 if ( pv[i] == RExC_npar ) {
5767                                     count = 0;
5768                                     break;
5769                                 }
5770                             }
5771                             if ( count ) {
5772                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5773                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5774                                 pv[count] = RExC_npar;
5775                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5776                             }
5777                         } else {
5778                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5779                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5780                             SvIOK_on(sv_dat);
5781                             SvIV_set(sv_dat, 1);
5782                         }
5783 #ifdef DEBUGGING
5784                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5785                             SvREFCNT_dec(svname);
5786 #endif
5787
5788                         /*sv_dump(sv_dat);*/
5789                     }
5790                     nextchar(pRExC_state);
5791                     paren = 1;
5792                     goto capturing_parens;
5793                 }
5794                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5795                 RExC_parse++;
5796             case '=':           /* (?=...) */
5797                 RExC_seen_zerolen++;
5798                         break;
5799             case '!':           /* (?!...) */
5800                 RExC_seen_zerolen++;
5801                 if (*RExC_parse == ')') {
5802                     ret=reg_node(pRExC_state, OPFAIL);
5803                     nextchar(pRExC_state);
5804                     return ret;
5805                 }
5806                 break;
5807             case '|':           /* (?|...) */
5808                 /* branch reset, behave like a (?:...) except that
5809                    buffers in alternations share the same numbers */
5810                 paren = ':';
5811                 after_freeze = freeze_paren = RExC_npar;
5812                 break;
5813             case ':':           /* (?:...) */
5814             case '>':           /* (?>...) */
5815                 break;
5816             case '$':           /* (?$...) */
5817             case '@':           /* (?@...) */
5818                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5819                 break;
5820             case '#':           /* (?#...) */
5821                 while (*RExC_parse && *RExC_parse != ')')
5822                     RExC_parse++;
5823                 if (*RExC_parse != ')')
5824                     FAIL("Sequence (?#... not terminated");
5825                 nextchar(pRExC_state);
5826                 *flagp = TRYAGAIN;
5827                 return NULL;
5828             case '0' :           /* (?0) */
5829             case 'R' :           /* (?R) */
5830                 if (*RExC_parse != ')')
5831                     FAIL("Sequence (?R) not terminated");
5832                 ret = reg_node(pRExC_state, GOSTART);
5833                 *flagp |= POSTPONED;
5834                 nextchar(pRExC_state);
5835                 return ret;
5836                 /*notreached*/
5837             { /* named and numeric backreferences */
5838                 I32 num;
5839             case '&':            /* (?&NAME) */
5840                 parse_start = RExC_parse - 1;
5841               named_recursion:
5842                 {
5843                     SV *sv_dat = reg_scan_name(pRExC_state,
5844                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5845                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5846                 }
5847                 goto gen_recurse_regop;
5848                 /* NOT REACHED */
5849             case '+':
5850                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5851                     RExC_parse++;
5852                     vFAIL("Illegal pattern");
5853                 }
5854                 goto parse_recursion;
5855                 /* NOT REACHED*/
5856             case '-': /* (?-1) */
5857                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5858                     RExC_parse--; /* rewind to let it be handled later */
5859                     goto parse_flags;
5860                 }
5861                 /*FALLTHROUGH */
5862             case '1': case '2': case '3': case '4': /* (?1) */
5863             case '5': case '6': case '7': case '8': case '9':
5864                 RExC_parse--;
5865               parse_recursion:
5866                 num = atoi(RExC_parse);
5867                 parse_start = RExC_parse - 1; /* MJD */
5868                 if (*RExC_parse == '-')
5869                     RExC_parse++;
5870                 while (isDIGIT(*RExC_parse))
5871                         RExC_parse++;
5872                 if (*RExC_parse!=')')
5873                     vFAIL("Expecting close bracket");
5874
5875               gen_recurse_regop:
5876                 if ( paren == '-' ) {
5877                     /*
5878                     Diagram of capture buffer numbering.
5879                     Top line is the normal capture buffer numbers
5880                     Botton line is the negative indexing as from
5881                     the X (the (?-2))
5882
5883                     +   1 2    3 4 5 X          6 7
5884                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5885                     -   5 4    3 2 1 X          x x
5886
5887                     */
5888                     num = RExC_npar + num;
5889                     if (num < 1)  {
5890                         RExC_parse++;
5891                         vFAIL("Reference to nonexistent group");
5892                     }
5893                 } else if ( paren == '+' ) {
5894                     num = RExC_npar + num - 1;
5895                 }
5896
5897                 ret = reganode(pRExC_state, GOSUB, num);
5898                 if (!SIZE_ONLY) {
5899                     if (num > (I32)RExC_rx->nparens) {
5900                         RExC_parse++;
5901                         vFAIL("Reference to nonexistent group");
5902                     }
5903                     ARG2L_SET( ret, RExC_recurse_count++);
5904                     RExC_emit++;
5905                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5906                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5907                 } else {
5908                     RExC_size++;
5909                 }
5910                 RExC_seen |= REG_SEEN_RECURSE;
5911                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5912                 Set_Node_Offset(ret, parse_start); /* MJD */
5913
5914                 *flagp |= POSTPONED;
5915                 nextchar(pRExC_state);
5916                 return ret;
5917             } /* named and numeric backreferences */
5918             /* NOT REACHED */
5919
5920             case '?':           /* (??...) */
5921                 is_logical = 1;
5922                 if (*RExC_parse != '{') {
5923                     RExC_parse++;
5924                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5925                     /*NOTREACHED*/
5926                 }
5927                 *flagp |= POSTPONED;
5928                 paren = *RExC_parse++;
5929                 /* FALL THROUGH */
5930             case '{':           /* (?{...}) */
5931             {
5932                 I32 count = 1;
5933                 U32 n = 0;
5934                 char c;
5935                 char *s = RExC_parse;
5936
5937                 RExC_seen_zerolen++;
5938                 RExC_seen |= REG_SEEN_EVAL;
5939                 while (count && (c = *RExC_parse)) {
5940                     if (c == '\\') {
5941                         if (RExC_parse[1])
5942                             RExC_parse++;
5943                     }
5944                     else if (c == '{')
5945                         count++;
5946                     else if (c == '}')
5947                         count--;
5948                     RExC_parse++;
5949                 }
5950                 if (*RExC_parse != ')') {
5951                     RExC_parse = s;
5952                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5953                 }
5954                 if (!SIZE_ONLY) {
5955                     PAD *pad;
5956                     OP_4tree *sop, *rop;
5957                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5958
5959                     ENTER;
5960                     Perl_save_re_context(aTHX);
5961                     rop = sv_compile_2op(sv, &sop, "re", &pad);
5962                     sop->op_private |= OPpREFCOUNTED;
5963                     /* re_dup will OpREFCNT_inc */
5964                     OpREFCNT_set(sop, 1);
5965                     LEAVE;
5966
5967                     n = add_data(pRExC_state, 3, "nop");
5968                     RExC_rxi->data->data[n] = (void*)rop;
5969                     RExC_rxi->data->data[n+1] = (void*)sop;
5970                     RExC_rxi->data->data[n+2] = (void*)pad;
5971                     SvREFCNT_dec(sv);
5972                 }
5973                 else {                                          /* First pass */
5974                     if (PL_reginterp_cnt < ++RExC_seen_evals
5975                         && IN_PERL_RUNTIME)
5976                         /* No compiled RE interpolated, has runtime
5977                            components ===> unsafe.  */
5978                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
5979                     if (PL_tainting && PL_tainted)
5980                         FAIL("Eval-group in insecure regular expression");
5981 #if PERL_VERSION > 8
5982                     if (IN_PERL_COMPILETIME)
5983                         PL_cv_has_eval = 1;
5984 #endif
5985                 }
5986
5987                 nextchar(pRExC_state);
5988                 if (is_logical) {
5989                     ret = reg_node(pRExC_state, LOGICAL);
5990                     if (!SIZE_ONLY)
5991                         ret->flags = 2;
5992                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5993                     /* deal with the length of this later - MJD */
5994                     return ret;
5995                 }
5996                 ret = reganode(pRExC_state, EVAL, n);
5997                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5998                 Set_Node_Offset(ret, parse_start);
5999                 return ret;
6000             }
6001             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6002             {
6003                 int is_define= 0;
6004                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6005                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6006                         || RExC_parse[1] == '<'
6007                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6008                         I32 flag;
6009
6010                         ret = reg_node(pRExC_state, LOGICAL);
6011                         if (!SIZE_ONLY)
6012                             ret->flags = 1;
6013                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6014                         goto insert_if;
6015                     }
6016                 }
6017                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6018                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6019                 {
6020                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6021                     char *name_start= RExC_parse++;
6022                     U32 num = 0;
6023                     SV *sv_dat=reg_scan_name(pRExC_state,
6024                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6025                     if (RExC_parse == name_start || *RExC_parse != ch)
6026                         vFAIL2("Sequence (?(%c... not terminated",
6027                             (ch == '>' ? '<' : ch));
6028                     RExC_parse++;
6029                     if (!SIZE_ONLY) {
6030                         num = add_data( pRExC_state, 1, "S" );
6031                         RExC_rxi->data->data[num]=(void*)sv_dat;
6032                         SvREFCNT_inc_simple_void(sv_dat);
6033                     }
6034                     ret = reganode(pRExC_state,NGROUPP,num);
6035                     goto insert_if_check_paren;
6036                 }
6037                 else if (RExC_parse[0] == 'D' &&
6038                          RExC_parse[1] == 'E' &&
6039                          RExC_parse[2] == 'F' &&
6040                          RExC_parse[3] == 'I' &&
6041                          RExC_parse[4] == 'N' &&
6042                          RExC_parse[5] == 'E')
6043                 {
6044                     ret = reganode(pRExC_state,DEFINEP,0);
6045                     RExC_parse +=6 ;
6046                     is_define = 1;
6047                     goto insert_if_check_paren;
6048                 }
6049                 else if (RExC_parse[0] == 'R') {
6050                     RExC_parse++;
6051                     parno = 0;
6052                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6053                         parno = atoi(RExC_parse++);
6054                         while (isDIGIT(*RExC_parse))
6055                             RExC_parse++;
6056                     } else if (RExC_parse[0] == '&') {
6057                         SV *sv_dat;
6058                         RExC_parse++;
6059                         sv_dat = reg_scan_name(pRExC_state,
6060                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6061                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6062                     }
6063                     ret = reganode(pRExC_state,INSUBP,parno);
6064                     goto insert_if_check_paren;
6065                 }
6066                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6067                     /* (?(1)...) */
6068                     char c;
6069                     parno = atoi(RExC_parse++);
6070
6071                     while (isDIGIT(*RExC_parse))
6072                         RExC_parse++;
6073                     ret = reganode(pRExC_state, GROUPP, parno);
6074
6075                  insert_if_check_paren:
6076                     if ((c = *nextchar(pRExC_state)) != ')')
6077                         vFAIL("Switch condition not recognized");
6078                   insert_if:
6079                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6080                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6081                     if (br == NULL)
6082                         br = reganode(pRExC_state, LONGJMP, 0);
6083                     else
6084                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6085                     c = *nextchar(pRExC_state);
6086                     if (flags&HASWIDTH)
6087                         *flagp |= HASWIDTH;
6088                     if (c == '|') {
6089                         if (is_define)
6090                             vFAIL("(?(DEFINE)....) does not allow branches");
6091                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6092                         regbranch(pRExC_state, &flags, 1,depth+1);
6093                         REGTAIL(pRExC_state, ret, lastbr);
6094                         if (flags&HASWIDTH)
6095                             *flagp |= HASWIDTH;
6096                         c = *nextchar(pRExC_state);
6097                     }
6098                     else
6099                         lastbr = NULL;
6100                     if (c != ')')
6101                         vFAIL("Switch (?(condition)... contains too many branches");
6102                     ender = reg_node(pRExC_state, TAIL);
6103                     REGTAIL(pRExC_state, br, ender);
6104                     if (lastbr) {
6105                         REGTAIL(pRExC_state, lastbr, ender);
6106                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6107                     }
6108                     else
6109                         REGTAIL(pRExC_state, ret, ender);
6110                     RExC_size++; /* XXX WHY do we need this?!!
6111                                     For large programs it seems to be required
6112                                     but I can't figure out why. -- dmq*/
6113                     return ret;
6114                 }
6115                 else {
6116                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6117                 }
6118             }
6119             case 0:
6120                 RExC_parse--; /* for vFAIL to print correctly */
6121                 vFAIL("Sequence (? incomplete");
6122                 break;
6123             default:
6124                 --RExC_parse;
6125                 parse_flags:      /* (?i) */
6126             {
6127                 U32 posflags = 0, negflags = 0;
6128                 U32 *flagsp = &posflags;
6129
6130                 while (*RExC_parse) {
6131                     /* && strchr("iogcmsx", *RExC_parse) */
6132                     /* (?g), (?gc) and (?o) are useless here
6133                        and must be globally applied -- japhy */
6134                     switch (*RExC_parse) {
6135                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6136                     case ONCE_PAT_MOD: /* 'o' */
6137                     case GLOBAL_PAT_MOD: /* 'g' */
6138                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6139                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6140                             if (! (wastedflags & wflagbit) ) {
6141                                 wastedflags |= wflagbit;
6142                                 vWARN5(
6143                                     RExC_parse + 1,
6144                                     "Useless (%s%c) - %suse /%c modifier",
6145                                     flagsp == &negflags ? "?-" : "?",
6146                                     *RExC_parse,
6147                                     flagsp == &negflags ? "don't " : "",
6148                                     *RExC_parse
6149                                 );
6150                             }
6151                         }
6152                         break;
6153
6154                     case CONTINUE_PAT_MOD: /* 'c' */
6155                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6156                             if (! (wastedflags & WASTED_C) ) {
6157                                 wastedflags |= WASTED_GC;
6158                                 vWARN3(
6159                                     RExC_parse + 1,
6160                                     "Useless (%sc) - %suse /gc modifier",
6161                                     flagsp == &negflags ? "?-" : "?",
6162                                     flagsp == &negflags ? "don't " : ""
6163                                 );
6164                             }
6165                         }
6166                         break;
6167                     case KEEPCOPY_PAT_MOD: /* 'p' */
6168                         if (flagsp == &negflags) {
6169                             if (SIZE_ONLY)
6170                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6171                         } else {
6172                             *flagsp |= RXf_PMf_KEEPCOPY;
6173                         }
6174                         break;
6175                     case '-':
6176                         if (flagsp == &negflags) {
6177                             RExC_parse++;
6178                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6179                             /*NOTREACHED*/
6180                         }
6181                         flagsp = &negflags;
6182                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6183                         break;
6184                     case ':':
6185                         paren = ':';
6186                         /*FALLTHROUGH*/
6187                     case ')':
6188                         RExC_flags |= posflags;
6189                         RExC_flags &= ~negflags;
6190                         if (paren != ':') {
6191                             oregflags |= posflags;
6192                             oregflags &= ~negflags;
6193                         }
6194                         nextchar(pRExC_state);
6195                         if (paren != ':') {
6196                             *flagp = TRYAGAIN;
6197                             return NULL;
6198                         } else {
6199                             ret = NULL;
6200                             goto parse_rest;
6201                         }
6202                         /*NOTREACHED*/
6203                     default:
6204                         RExC_parse++;
6205                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6206                         /*NOTREACHED*/
6207                     }
6208                     ++RExC_parse;
6209                 }
6210             }} /* one for the default block, one for the switch */
6211         }
6212         else {                  /* (...) */
6213           capturing_parens:
6214             parno = RExC_npar;
6215             RExC_npar++;
6216
6217             ret = reganode(pRExC_state, OPEN, parno);
6218             if (!SIZE_ONLY ){
6219                 if (!RExC_nestroot)
6220                     RExC_nestroot = parno;
6221                 if (RExC_seen & REG_SEEN_RECURSE
6222                     && !RExC_open_parens[parno-1])
6223                 {
6224                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6225                         "Setting open paren #%"IVdf" to %d\n",
6226                         (IV)parno, REG_NODE_NUM(ret)));
6227                     RExC_open_parens[parno-1]= ret;
6228                 }
6229             }
6230             Set_Node_Length(ret, 1); /* MJD */
6231             Set_Node_Offset(ret, RExC_parse); /* MJD */
6232             is_open = 1;
6233         }
6234     }
6235     else                        /* ! paren */
6236         ret = NULL;
6237
6238    parse_rest:
6239     /* Pick up the branches, linking them together. */
6240     parse_start = RExC_parse;   /* MJD */
6241     br = regbranch(pRExC_state, &flags, 1,depth+1);
6242
6243     if (freeze_paren) {
6244         if (RExC_npar > after_freeze)
6245             after_freeze = RExC_npar;
6246         RExC_npar = freeze_paren;
6247     }
6248
6249     /*     branch_len = (paren != 0); */
6250
6251     if (br == NULL)
6252         return(NULL);
6253     if (*RExC_parse == '|') {
6254         if (!SIZE_ONLY && RExC_extralen) {
6255             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6256         }
6257         else {                  /* MJD */
6258             reginsert(pRExC_state, BRANCH, br, depth+1);
6259             Set_Node_Length(br, paren != 0);
6260             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6261         }
6262         have_branch = 1;
6263         if (SIZE_ONLY)
6264             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6265     }
6266     else if (paren == ':') {
6267         *flagp |= flags&SIMPLE;
6268     }
6269     if (is_open) {                              /* Starts with OPEN. */
6270         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6271     }
6272     else if (paren != '?')              /* Not Conditional */
6273         ret = br;
6274     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6275     lastbr = br;
6276     while (*RExC_parse == '|') {
6277         if (!SIZE_ONLY && RExC_extralen) {
6278             ender = reganode(pRExC_state, LONGJMP,0);
6279             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6280         }
6281         if (SIZE_ONLY)
6282             RExC_extralen += 2;         /* Account for LONGJMP. */
6283         nextchar(pRExC_state);
6284         if (freeze_paren) {
6285             if (RExC_npar > after_freeze)
6286                 after_freeze = RExC_npar;
6287             RExC_npar = freeze_paren;
6288         }
6289         br = regbranch(pRExC_state, &flags, 0, depth+1);
6290
6291         if (br == NULL)
6292             return(NULL);
6293         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6294         lastbr = br;
6295         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6296     }
6297
6298     if (have_branch || paren != ':') {
6299         /* Make a closing node, and hook it on the end. */
6300         switch (paren) {
6301         case ':':
6302             ender = reg_node(pRExC_state, TAIL);
6303             break;
6304         case 1:
6305             ender = reganode(pRExC_state, CLOSE, parno);
6306             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6307                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6308                         "Setting close paren #%"IVdf" to %d\n",
6309                         (IV)parno, REG_NODE_NUM(ender)));
6310                 RExC_close_parens[parno-1]= ender;
6311                 if (RExC_nestroot == parno)
6312                     RExC_nestroot = 0;
6313             }
6314             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6315             Set_Node_Length(ender,1); /* MJD */
6316             break;
6317         case '<':
6318         case ',':
6319         case '=':
6320         case '!':
6321             *flagp &= ~HASWIDTH;
6322             /* FALL THROUGH */
6323         case '>':
6324             ender = reg_node(pRExC_state, SUCCEED);
6325             break;
6326         case 0:
6327             ender = reg_node(pRExC_state, END);
6328             if (!SIZE_ONLY) {
6329                 assert(!RExC_opend); /* there can only be one! */
6330                 RExC_opend = ender;
6331             }
6332             break;
6333         }
6334         REGTAIL(pRExC_state, lastbr, ender);
6335
6336         if (have_branch && !SIZE_ONLY) {
6337             if (depth==1)
6338                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6339
6340             /* Hook the tails of the branches to the closing node. */
6341             for (br = ret; br; br = regnext(br)) {
6342                 const U8 op = PL_regkind[OP(br)];
6343                 if (op == BRANCH) {
6344                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6345                 }
6346                 else if (op == BRANCHJ) {
6347                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6348                 }
6349             }
6350         }
6351     }
6352
6353     {
6354         const char *p;
6355         static const char parens[] = "=!<,>";
6356
6357         if (paren && (p = strchr(parens, paren))) {
6358             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6359             int flag = (p - parens) > 1;
6360
6361             if (paren == '>')
6362                 node = SUSPEND, flag = 0;
6363             reginsert(pRExC_state, node,ret, depth+1);
6364             Set_Node_Cur_Length(ret);
6365             Set_Node_Offset(ret, parse_start + 1);
6366             ret->flags = flag;
6367             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6368         }
6369     }
6370
6371     /* Check for proper termination. */
6372     if (paren) {
6373         RExC_flags = oregflags;
6374         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6375             RExC_parse = oregcomp_parse;
6376             vFAIL("Unmatched (");
6377         }
6378     }
6379     else if (!paren && RExC_parse < RExC_end) {
6380         if (*RExC_parse == ')') {
6381             RExC_parse++;
6382             vFAIL("Unmatched )");
6383         }
6384         else
6385             FAIL("Junk on end of regexp");      /* "Can't happen". */
6386         /* NOTREACHED */
6387     }
6388     if (after_freeze)
6389         RExC_npar = after_freeze;
6390     return(ret);
6391 }
6392
6393 /*
6394  - regbranch - one alternative of an | operator
6395  *
6396  * Implements the concatenation operator.
6397  */
6398 STATIC regnode *
6399 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6400 {
6401     dVAR;
6402     register regnode *ret;
6403     register regnode *chain = NULL;
6404     register regnode *latest;
6405     I32 flags = 0, c = 0;
6406     GET_RE_DEBUG_FLAGS_DECL;
6407
6408     PERL_ARGS_ASSERT_REGBRANCH;
6409
6410     DEBUG_PARSE("brnc");
6411
6412     if (first)
6413         ret = NULL;
6414     else {
6415         if (!SIZE_ONLY && RExC_extralen)
6416             ret = reganode(pRExC_state, BRANCHJ,0);
6417         else {
6418             ret = reg_node(pRExC_state, BRANCH);
6419             Set_Node_Length(ret, 1);
6420         }
6421     }
6422
6423     if (!first && SIZE_ONLY)
6424         RExC_extralen += 1;                     /* BRANCHJ */
6425
6426     *flagp = WORST;                     /* Tentatively. */
6427
6428     RExC_parse--;
6429     nextchar(pRExC_state);
6430     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6431         flags &= ~TRYAGAIN;
6432         latest = regpiece(pRExC_state, &flags,depth+1);
6433         if (latest == NULL) {
6434             if (flags & TRYAGAIN)
6435                 continue;
6436             return(NULL);
6437         }
6438         else if (ret == NULL)
6439             ret = latest;
6440         *flagp |= flags&(HASWIDTH|POSTPONED);
6441         if (chain == NULL)      /* First piece. */
6442             *flagp |= flags&SPSTART;
6443         else {
6444             RExC_naughty++;
6445             REGTAIL(pRExC_state, chain, latest);
6446         }
6447         chain = latest;
6448         c++;
6449     }
6450     if (chain == NULL) {        /* Loop ran zero times. */
6451         chain = reg_node(pRExC_state, NOTHING);
6452         if (ret == NULL)
6453             ret = chain;
6454     }
6455     if (c == 1) {
6456         *flagp |= flags&SIMPLE;
6457     }
6458
6459     return ret;
6460 }
6461
6462 /*
6463  - regpiece - something followed by possible [*+?]
6464  *
6465  * Note that the branching code sequences used for ? and the general cases
6466  * of * and + are somewhat optimized:  they use the same NOTHING node as
6467  * both the endmarker for their branch list and the body of the last branch.
6468  * It might seem that this node could be dispensed with entirely, but the
6469  * endmarker role is not redundant.
6470  */
6471 STATIC regnode *
6472 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6473 {
6474     dVAR;
6475     register regnode *ret;
6476     register char op;
6477     register char *next;
6478     I32 flags;
6479     const char * const origparse = RExC_parse;
6480     I32 min;
6481     I32 max = REG_INFTY;
6482     char *parse_start;
6483     const char *maxpos = NULL;
6484     GET_RE_DEBUG_FLAGS_DECL;
6485
6486     PERL_ARGS_ASSERT_REGPIECE;
6487
6488     DEBUG_PARSE("piec");
6489
6490     ret = regatom(pRExC_state, &flags,depth+1);
6491     if (ret == NULL) {
6492         if (flags & TRYAGAIN)
6493             *flagp |= TRYAGAIN;
6494         return(NULL);
6495     }
6496
6497     op = *RExC_parse;
6498
6499     if (op == '{' && regcurly(RExC_parse)) {
6500         maxpos = NULL;
6501         parse_start = RExC_parse; /* MJD */
6502         next = RExC_parse + 1;
6503         while (isDIGIT(*next) || *next == ',') {
6504             if (*next == ',') {
6505                 if (maxpos)
6506                     break;
6507                 else
6508                     maxpos = next;
6509             }
6510             next++;
6511         }
6512         if (*next == '}') {             /* got one */
6513             if (!maxpos)
6514                 maxpos = next;
6515             RExC_parse++;
6516             min = atoi(RExC_parse);
6517             if (*maxpos == ',')
6518                 maxpos++;
6519             else
6520                 maxpos = RExC_parse;
6521             max = atoi(maxpos);
6522             if (!max && *maxpos != '0')
6523                 max = REG_INFTY;                /* meaning "infinity" */
6524             else if (max >= REG_INFTY)
6525                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6526             RExC_parse = next;
6527             nextchar(pRExC_state);
6528
6529         do_curly:
6530             if ((flags&SIMPLE)) {
6531                 RExC_naughty += 2 + RExC_naughty / 2;
6532                 reginsert(pRExC_state, CURLY, ret, depth+1);
6533                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6534                 Set_Node_Cur_Length(ret);
6535             }
6536             else {
6537                 regnode * const w = reg_node(pRExC_state, WHILEM);
6538
6539                 w->flags = 0;
6540                 REGTAIL(pRExC_state, ret, w);
6541                 if (!SIZE_ONLY && RExC_extralen) {
6542                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6543                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6544                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6545                 }
6546                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6547                                 /* MJD hk */
6548                 Set_Node_Offset(ret, parse_start+1);
6549                 Set_Node_Length(ret,
6550                                 op == '{' ? (RExC_parse - parse_start) : 1);
6551
6552                 if (!SIZE_ONLY && RExC_extralen)
6553                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6554                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6555                 if (SIZE_ONLY)
6556                     RExC_whilem_seen++, RExC_extralen += 3;
6557                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6558             }
6559             ret->flags = 0;
6560
6561             if (min > 0)
6562                 *flagp = WORST;
6563             if (max > 0)
6564                 *flagp |= HASWIDTH;
6565             if (max < min)
6566                 vFAIL("Can't do {n,m} with n > m");
6567             if (!SIZE_ONLY) {
6568                 ARG1_SET(ret, (U16)min);
6569                 ARG2_SET(ret, (U16)max);
6570             }
6571
6572             goto nest_check;
6573         }
6574     }
6575
6576     if (!ISMULT1(op)) {
6577         *flagp = flags;
6578         return(ret);
6579     }
6580
6581 #if 0                           /* Now runtime fix should be reliable. */
6582
6583     /* if this is reinstated, don't forget to put this back into perldiag:
6584
6585             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6586
6587            (F) The part of the regexp subject to either the * or + quantifier
6588            could match an empty string. The {#} shows in the regular
6589            expression about where the problem was discovered.
6590
6591     */
6592
6593     if (!(flags&HASWIDTH) && op != '?')
6594       vFAIL("Regexp *+ operand could be empty");
6595 #endif
6596
6597     parse_start = RExC_parse;
6598     nextchar(pRExC_state);
6599
6600     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6601
6602     if (op == '*' && (flags&SIMPLE)) {
6603         reginsert(pRExC_state, STAR, ret, depth+1);
6604         ret->flags = 0;
6605         RExC_naughty += 4;
6606     }
6607     else if (op == '*') {
6608         min = 0;
6609         goto do_curly;
6610     }
6611     else if (op == '+' && (flags&SIMPLE)) {
6612         reginsert(pRExC_state, PLUS, ret, depth+1);
6613         ret->flags = 0;
6614         RExC_naughty += 3;
6615     }
6616     else if (op == '+') {
6617         min = 1;
6618         goto do_curly;
6619     }
6620     else if (op == '?') {
6621         min = 0; max = 1;
6622         goto do_curly;
6623     }
6624   nest_check:
6625     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6626         ckWARN3reg(RExC_parse,
6627                    "%.*s matches null string many times",
6628                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6629                    origparse);
6630     }
6631
6632     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6633         nextchar(pRExC_state);
6634         reginsert(pRExC_state, MINMOD, ret, depth+1);
6635         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6636     }
6637 #ifndef REG_ALLOW_MINMOD_SUSPEND
6638     else
6639 #endif
6640     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6641         regnode *ender;
6642         nextchar(pRExC_state);
6643         ender = reg_node(pRExC_state, SUCCEED);
6644         REGTAIL(pRExC_state, ret, ender);
6645         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6646         ret->flags = 0;
6647         ender = reg_node(pRExC_state, TAIL);
6648         REGTAIL(pRExC_state, ret, ender);
6649         /*ret= ender;*/
6650     }
6651
6652     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6653         RExC_parse++;
6654         vFAIL("Nested quantifiers");
6655     }
6656
6657     return(ret);
6658 }
6659
6660
6661 /* reg_namedseq(pRExC_state,UVp)
6662
6663    This is expected to be called by a parser routine that has
6664    recognized '\N' and needs to handle the rest. RExC_parse is
6665    expected to point at the first char following the N at the time
6666    of the call.
6667
6668    The \N may be inside (indicated by valuep not being NULL) or outside a
6669    character class.
6670
6671    \N may begin either a named sequence, or if outside a character class, mean
6672    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6673    attempted to decide which, and in the case of a named sequence converted it
6674    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6675    where c1... are the characters in the sequence.  For single-quoted regexes,
6676    the tokenizer passes the \N sequence through unchanged; this code will not
6677    attempt to determine this nor expand those.  The net effect is that if the
6678    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6679    signals that this \N occurrence means to match a non-newline.
6680
6681    Only the \N{U+...} form should occur in a character class, for the same
6682    reason that '.' inside a character class means to just match a period: it
6683    just doesn't make sense.
6684
6685    If valuep is non-null then it is assumed that we are parsing inside
6686    of a charclass definition and the first codepoint in the resolved
6687    string is returned via *valuep and the routine will return NULL.
6688    In this mode if a multichar string is returned from the charnames
6689    handler, a warning will be issued, and only the first char in the
6690    sequence will be examined. If the string returned is zero length
6691    then the value of *valuep is undefined and NON-NULL will
6692    be returned to indicate failure. (This will NOT be a valid pointer
6693    to a regnode.)
6694
6695    If valuep is null then it is assumed that we are parsing normal text and a
6696    new EXACT node is inserted into the program containing the resolved string,
6697    and a pointer to the new node is returned.  But if the string is zero length
6698    a NOTHING node is emitted instead.
6699
6700    On success RExC_parse is set to the char following the endbrace.
6701    Parsing failures will generate a fatal error via vFAIL(...)
6702  */
6703 STATIC regnode *
6704 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6705 {
6706     char * endbrace;    /* '}' following the name */
6707     regnode *ret = NULL;
6708 #ifdef DEBUGGING
6709     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6710 #endif
6711     char* p;
6712
6713     GET_RE_DEBUG_FLAGS_DECL;
6714
6715     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6716
6717     GET_RE_DEBUG_FLAGS;
6718
6719     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6720      * modifier.  The other meaning does not */
6721     p = (RExC_flags & RXf_PMf_EXTENDED)
6722         ? regwhite( pRExC_state, RExC_parse )
6723         : RExC_parse;
6724
6725     /* Disambiguate between \N meaning a named character versus \N meaning
6726      * [^\n].  The former is assumed when it can't be the latter. */
6727     if (*p != '{' || regcurly(p)) {
6728         RExC_parse = p;
6729         if (valuep) {
6730             /* no bare \N in a charclass */
6731             vFAIL("\\N in a character class must be a named character: \\N{...}");
6732         }
6733         nextchar(pRExC_state);
6734         ret = reg_node(pRExC_state, REG_ANY);
6735         *flagp |= HASWIDTH|SIMPLE;
6736         RExC_naughty++;
6737         RExC_parse--;
6738         Set_Node_Length(ret, 1); /* MJD */
6739         return ret;
6740     }
6741
6742     /* Here, we have decided it should be a named sequence */
6743
6744     /* The test above made sure that the next real character is a '{', but
6745      * under the /x modifier, it could be separated by space (or a comment and
6746      * \n) and this is not allowed (for consistency with \x{...} and the
6747      * tokenizer handling of \N{NAME}). */
6748     if (*RExC_parse != '{') {
6749         vFAIL("Missing braces on \\N{}");
6750     }
6751
6752     RExC_parse++;       /* Skip past the '{' */
6753
6754     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6755         || ! (endbrace == RExC_parse            /* nothing between the {} */
6756               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
6757                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6758     {
6759         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
6760         vFAIL("\\N{NAME} must be resolved by the lexer");
6761     }
6762
6763     if (endbrace == RExC_parse) {   /* empty: \N{} */
6764         if (! valuep) {
6765             RExC_parse = endbrace + 1;
6766             return reg_node(pRExC_state,NOTHING);
6767         }
6768
6769         if (SIZE_ONLY) {
6770             ckWARNreg(RExC_parse,
6771                     "Ignoring zero length \\N{} in character class"
6772             );
6773             RExC_parse = endbrace + 1;
6774         }
6775         *valuep = 0;
6776         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6777     }
6778
6779     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
6780     RExC_parse += 2;    /* Skip past the 'U+' */
6781
6782     if (valuep) {   /* In a bracketed char class */
6783         /* We only pay attention to the first char of
6784         multichar strings being returned. I kinda wonder
6785         if this makes sense as it does change the behaviour
6786         from earlier versions, OTOH that behaviour was broken
6787         as well. XXX Solution is to recharacterize as
6788         [rest-of-class]|multi1|multi2... */
6789
6790         STRLEN length_of_hex;
6791         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6792             | PERL_SCAN_DISALLOW_PREFIX
6793             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6794
6795         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6796         if (endchar < endbrace) {
6797             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6798         }
6799
6800         length_of_hex = (STRLEN)(endchar - RExC_parse);
6801         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6802
6803         /* The tokenizer should have guaranteed validity, but it's possible to
6804          * bypass it by using single quoting, so check */
6805         if (length_of_hex == 0
6806             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6807         {
6808             RExC_parse += length_of_hex;        /* Includes all the valid */
6809             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
6810                             ? UTF8SKIP(RExC_parse)
6811                             : 1;
6812             /* Guard against malformed utf8 */
6813             if (RExC_parse >= endchar) RExC_parse = endchar;
6814             vFAIL("Invalid hexadecimal number in \\N{U+...}");
6815         }
6816
6817         RExC_parse = endbrace + 1;
6818         if (endchar == endbrace) return NULL;
6819
6820         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
6821     }
6822     else {      /* Not a char class */
6823         char *s;            /* String to put in generated EXACT node */
6824         STRLEN len = 0;     /* Its current byte length */
6825         char *endchar;      /* Points to '.' or '}' ending cur char in the input
6826                                stream */
6827
6828         ret = reg_node(pRExC_state,
6829                         (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6830         s= STRING(ret);
6831
6832         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
6833          * the input which is of the form now 'c1.c2.c3...}' until find the
6834          * ending brace or exceed length 255.  The characters that exceed this
6835          * limit are dropped.  The limit could be relaxed should it become
6836          * desirable by reparsing this as (?:\N{NAME}), so could generate
6837          * multiple EXACT nodes, as is done for just regular input.  But this
6838          * is primarily a named character, and not intended to be a huge long
6839          * string, so 255 bytes should be good enough */
6840         while (1) {
6841             STRLEN length_of_hex;
6842             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6843                             | PERL_SCAN_DISALLOW_PREFIX
6844                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6845             UV cp;  /* Ord of current character */
6846
6847             /* Code points are separated by dots.  If none, there is only one
6848              * code point, and is terminated by the brace */
6849             endchar = RExC_parse + strcspn(RExC_parse, ".}");
6850
6851             /* The values are Unicode even on EBCDIC machines */
6852             length_of_hex = (STRLEN)(endchar - RExC_parse);
6853             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6854             if ( length_of_hex == 0
6855                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6856             {
6857                 RExC_parse += length_of_hex;        /* Includes all the valid */
6858                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
6859                                 ? UTF8SKIP(RExC_parse)
6860                                 : 1;
6861                 /* Guard against malformed utf8 */
6862                 if (RExC_parse >= endchar) RExC_parse = endchar;
6863                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6864             }
6865
6866             if (! FOLD) {       /* Not folding, just append to the string */
6867                 STRLEN unilen;
6868
6869                 /* Quit before adding this character if would exceed limit */
6870                 if (len + UNISKIP(cp) > U8_MAX) break;
6871
6872                 unilen = reguni(pRExC_state, cp, s);
6873                 if (unilen > 0) {
6874                     s   += unilen;
6875                     len += unilen;
6876                 }
6877             } else {    /* Folding, output the folded equivalent */
6878                 STRLEN foldlen,numlen;
6879                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6880                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6881
6882                 /* Quit before exceeding size limit */
6883                 if (len + foldlen > U8_MAX) break;
6884
6885                 for (foldbuf = tmpbuf;
6886                     foldlen;
6887                     foldlen -= numlen)
6888                 {
6889                     cp = utf8_to_uvchr(foldbuf, &numlen);
6890                     if (numlen > 0) {
6891                         const STRLEN unilen = reguni(pRExC_state, cp, s);
6892                         s       += unilen;
6893                         len     += unilen;
6894                         /* In EBCDIC the numlen and unilen can differ. */
6895                         foldbuf += numlen;
6896                         if (numlen >= foldlen)
6897                             break;
6898                     }
6899                     else
6900                         break; /* "Can't happen." */
6901                 }
6902             }
6903
6904             /* Point to the beginning of the next character in the sequence. */
6905             RExC_parse = endchar + 1;
6906
6907             /* Quit if no more characters */
6908             if (RExC_parse >= endbrace) break;
6909         }
6910
6911
6912         if (SIZE_ONLY) {
6913             if (RExC_parse < endbrace) {
6914                 ckWARNreg(RExC_parse - 1,
6915                           "Using just the first characters returned by \\N{}");
6916             }
6917
6918             RExC_size += STR_SZ(len);
6919         } else {
6920             STR_LEN(ret) = len;
6921             RExC_emit += STR_SZ(len);
6922         }
6923
6924         RExC_parse = endbrace + 1;
6925
6926         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6927                                with malformed in t/re/pat_advanced.t */
6928         RExC_parse --;
6929         Set_Node_Cur_Length(ret); /* MJD */
6930         nextchar(pRExC_state);
6931     }
6932
6933     return ret;
6934 }
6935
6936
6937 /*
6938  * reg_recode
6939  *
6940  * It returns the code point in utf8 for the value in *encp.
6941  *    value: a code value in the source encoding
6942  *    encp:  a pointer to an Encode object
6943  *
6944  * If the result from Encode is not a single character,
6945  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6946  */
6947 STATIC UV
6948 S_reg_recode(pTHX_ const char value, SV **encp)
6949 {
6950     STRLEN numlen = 1;
6951     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6952     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6953     const STRLEN newlen = SvCUR(sv);
6954     UV uv = UNICODE_REPLACEMENT;
6955
6956     PERL_ARGS_ASSERT_REG_RECODE;
6957
6958     if (newlen)
6959         uv = SvUTF8(sv)
6960              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6961              : *(U8*)s;
6962
6963     if (!newlen || numlen != newlen) {
6964         uv = UNICODE_REPLACEMENT;
6965         *encp = NULL;
6966     }
6967     return uv;
6968 }
6969
6970
6971 /*
6972  - regatom - the lowest level
6973
6974    Try to identify anything special at the start of the pattern. If there
6975    is, then handle it as required. This may involve generating a single regop,
6976    such as for an assertion; or it may involve recursing, such as to
6977    handle a () structure.
6978
6979    If the string doesn't start with something special then we gobble up
6980    as much literal text as we can.
6981
6982    Once we have been able to handle whatever type of thing started the
6983    sequence, we return.
6984
6985    Note: we have to be careful with escapes, as they can be both literal
6986    and special, and in the case of \10 and friends can either, depending
6987    on context. Specifically there are two seperate switches for handling
6988    escape sequences, with the one for handling literal escapes requiring
6989    a dummy entry for all of the special escapes that are actually handled
6990    by the other.
6991 */
6992
6993 STATIC regnode *
6994 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6995 {
6996     dVAR;
6997     register regnode *ret = NULL;
6998     I32 flags;
6999     char *parse_start = RExC_parse;
7000     GET_RE_DEBUG_FLAGS_DECL;
7001     DEBUG_PARSE("atom");
7002     *flagp = WORST;             /* Tentatively. */
7003
7004     PERL_ARGS_ASSERT_REGATOM;
7005
7006 tryagain:
7007     switch ((U8)*RExC_parse) {
7008     case '^':
7009         RExC_seen_zerolen++;
7010         nextchar(pRExC_state);
7011         if (RExC_flags & RXf_PMf_MULTILINE)
7012             ret = reg_node(pRExC_state, MBOL);
7013         else if (RExC_flags & RXf_PMf_SINGLELINE)
7014             ret = reg_node(pRExC_state, SBOL);
7015         else
7016             ret = reg_node(pRExC_state, BOL);
7017         Set_Node_Length(ret, 1); /* MJD */
7018         break;
7019     case '$':
7020         nextchar(pRExC_state);
7021         if (*RExC_parse)
7022             RExC_seen_zerolen++;
7023         if (RExC_flags & RXf_PMf_MULTILINE)
7024             ret = reg_node(pRExC_state, MEOL);
7025         else if (RExC_flags & RXf_PMf_SINGLELINE)
7026             ret = reg_node(pRExC_state, SEOL);
7027         else
7028             ret = reg_node(pRExC_state, EOL);
7029         Set_Node_Length(ret, 1); /* MJD */
7030         break;
7031     case '.':
7032         nextchar(pRExC_state);
7033         if (RExC_flags & RXf_PMf_SINGLELINE)
7034             ret = reg_node(pRExC_state, SANY);
7035         else
7036             ret = reg_node(pRExC_state, REG_ANY);
7037         *flagp |= HASWIDTH|SIMPLE;
7038         RExC_naughty++;
7039         Set_Node_Length(ret, 1); /* MJD */
7040         break;
7041     case '[':
7042     {
7043         char * const oregcomp_parse = ++RExC_parse;
7044         ret = regclass(pRExC_state,depth+1);
7045         if (*RExC_parse != ']') {
7046             RExC_parse = oregcomp_parse;
7047             vFAIL("Unmatched [");
7048         }
7049         nextchar(pRExC_state);
7050         *flagp |= HASWIDTH|SIMPLE;
7051         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7052         break;
7053     }
7054     case '(':
7055         nextchar(pRExC_state);
7056         ret = reg(pRExC_state, 1, &flags,depth+1);
7057         if (ret == NULL) {
7058                 if (flags & TRYAGAIN) {
7059                     if (RExC_parse == RExC_end) {
7060                          /* Make parent create an empty node if needed. */
7061                         *flagp |= TRYAGAIN;
7062                         return(NULL);
7063                     }
7064                     goto tryagain;
7065                 }
7066                 return(NULL);
7067         }
7068         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7069         break;
7070     case '|':
7071     case ')':
7072         if (flags & TRYAGAIN) {
7073             *flagp |= TRYAGAIN;
7074             return NULL;
7075         }
7076         vFAIL("Internal urp");
7077                                 /* Supposed to be caught earlier. */
7078         break;
7079     case '{':
7080         if (!regcurly(RExC_parse)) {
7081             RExC_parse++;
7082             goto defchar;
7083         }
7084         /* FALL THROUGH */
7085     case '?':
7086     case '+':
7087     case '*':
7088         RExC_parse++;
7089         vFAIL("Quantifier follows nothing");
7090         break;
7091     case 0xDF:
7092     case 0xC3:
7093     case 0xCE:
7094         do_foldchar:
7095         if (!LOC && FOLD) {
7096             U32 len,cp;
7097             len=0; /* silence a spurious compiler warning */
7098             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7099                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7100                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7101                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7102                 Set_Node_Length(ret, 1); /* MJD */
7103                 nextchar(pRExC_state); /* kill whitespace under /x */
7104                 return ret;
7105             }
7106         }
7107         goto outer_default;
7108     case '\\':
7109         /* Special Escapes
7110
7111            This switch handles escape sequences that resolve to some kind
7112            of special regop and not to literal text. Escape sequnces that
7113            resolve to literal text are handled below in the switch marked
7114            "Literal Escapes".
7115
7116            Every entry in this switch *must* have a corresponding entry
7117            in the literal escape switch. However, the opposite is not
7118            required, as the default for this switch is to jump to the
7119            literal text handling code.
7120         */
7121         switch ((U8)*++RExC_parse) {
7122         case 0xDF:
7123         case 0xC3:
7124         case 0xCE:
7125                    goto do_foldchar;
7126         /* Special Escapes */
7127         case 'A':
7128             RExC_seen_zerolen++;
7129             ret = reg_node(pRExC_state, SBOL);
7130             *flagp |= SIMPLE;
7131             goto finish_meta_pat;
7132         case 'G':
7133             ret = reg_node(pRExC_state, GPOS);
7134             RExC_seen |= REG_SEEN_GPOS;
7135             *flagp |= SIMPLE;
7136             goto finish_meta_pat;
7137         case 'K':
7138             RExC_seen_zerolen++;
7139             ret = reg_node(pRExC_state, KEEPS);
7140             *flagp |= SIMPLE;
7141             /* XXX:dmq : disabling in-place substitution seems to
7142              * be necessary here to avoid cases of memory corruption, as
7143              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7144              */
7145             RExC_seen |= REG_SEEN_LOOKBEHIND;
7146             goto finish_meta_pat;
7147         case 'Z':
7148             ret = reg_node(pRExC_state, SEOL);
7149             *flagp |= SIMPLE;
7150             RExC_seen_zerolen++;                /* Do not optimize RE away */
7151             goto finish_meta_pat;
7152         case 'z':
7153             ret = reg_node(pRExC_state, EOS);
7154             *flagp |= SIMPLE;
7155             RExC_seen_zerolen++;                /* Do not optimize RE away */
7156             goto finish_meta_pat;
7157         case 'C':
7158             ret = reg_node(pRExC_state, CANY);
7159             RExC_seen |= REG_SEEN_CANY;
7160             *flagp |= HASWIDTH|SIMPLE;
7161             goto finish_meta_pat;
7162         case 'X':
7163             ret = reg_node(pRExC_state, CLUMP);
7164             *flagp |= HASWIDTH;
7165             goto finish_meta_pat;
7166         case 'w':
7167             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
7168             *flagp |= HASWIDTH|SIMPLE;
7169             goto finish_meta_pat;
7170         case 'W':
7171             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
7172             *flagp |= HASWIDTH|SIMPLE;
7173             goto finish_meta_pat;
7174         case 'b':
7175             RExC_seen_zerolen++;
7176             RExC_seen |= REG_SEEN_LOOKBEHIND;
7177             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
7178             *flagp |= SIMPLE;
7179             goto finish_meta_pat;
7180         case 'B':
7181             RExC_seen_zerolen++;
7182             RExC_seen |= REG_SEEN_LOOKBEHIND;
7183             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
7184             *flagp |= SIMPLE;
7185             goto finish_meta_pat;
7186         case 's':
7187             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
7188             *flagp |= HASWIDTH|SIMPLE;
7189             goto finish_meta_pat;
7190         case 'S':
7191             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
7192             *flagp |= HASWIDTH|SIMPLE;
7193             goto finish_meta_pat;
7194         case 'd':
7195             ret = reg_node(pRExC_state, DIGIT);
7196             *flagp |= HASWIDTH|SIMPLE;
7197             goto finish_meta_pat;
7198         case 'D':
7199             ret = reg_node(pRExC_state, NDIGIT);
7200             *flagp |= HASWIDTH|SIMPLE;
7201             goto finish_meta_pat;
7202         case 'R':
7203             ret = reg_node(pRExC_state, LNBREAK);
7204             *flagp |= HASWIDTH|SIMPLE;
7205             goto finish_meta_pat;
7206         case 'h':
7207             ret = reg_node(pRExC_state, HORIZWS);
7208             *flagp |= HASWIDTH|SIMPLE;
7209             goto finish_meta_pat;
7210         case 'H':
7211             ret = reg_node(pRExC_state, NHORIZWS);
7212             *flagp |= HASWIDTH|SIMPLE;
7213             goto finish_meta_pat;
7214         case 'v':
7215             ret = reg_node(pRExC_state, VERTWS);
7216             *flagp |= HASWIDTH|SIMPLE;
7217             goto finish_meta_pat;
7218         case 'V':
7219             ret = reg_node(pRExC_state, NVERTWS);
7220             *flagp |= HASWIDTH|SIMPLE;
7221          finish_meta_pat:
7222             nextchar(pRExC_state);
7223             Set_Node_Length(ret, 2); /* MJD */
7224             break;
7225         case 'p':
7226         case 'P':
7227             {
7228                 char* const oldregxend = RExC_end;
7229 #ifdef DEBUGGING
7230                 char* parse_start = RExC_parse - 2;
7231 #endif
7232
7233                 if (RExC_parse[1] == '{') {
7234                   /* a lovely hack--pretend we saw [\pX] instead */
7235                     RExC_end = strchr(RExC_parse, '}');
7236                     if (!RExC_end) {
7237                         const U8 c = (U8)*RExC_parse;
7238                         RExC_parse += 2;
7239                         RExC_end = oldregxend;
7240                         vFAIL2("Missing right brace on \\%c{}", c);
7241                     }
7242                     RExC_end++;
7243                 }
7244                 else {
7245                     RExC_end = RExC_parse + 2;
7246                     if (RExC_end > oldregxend)
7247                         RExC_end = oldregxend;
7248                 }
7249                 RExC_parse--;
7250
7251                 ret = regclass(pRExC_state,depth+1);
7252
7253                 RExC_end = oldregxend;
7254                 RExC_parse--;
7255
7256                 Set_Node_Offset(ret, parse_start + 2);
7257                 Set_Node_Cur_Length(ret);
7258                 nextchar(pRExC_state);
7259                 *flagp |= HASWIDTH|SIMPLE;
7260             }
7261             break;
7262         case 'N':
7263             /* Handle \N and \N{NAME} here and not below because it can be
7264             multicharacter. join_exact() will join them up later on.
7265             Also this makes sure that things like /\N{BLAH}+/ and
7266             \N{BLAH} being multi char Just Happen. dmq*/
7267             ++RExC_parse;
7268             ret= reg_namedseq(pRExC_state, NULL, flagp);
7269             break;
7270         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7271         parse_named_seq:
7272         {
7273             char ch= RExC_parse[1];
7274             if (ch != '<' && ch != '\'' && ch != '{') {
7275                 RExC_parse++;
7276                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7277             } else {
7278                 /* this pretty much dupes the code for (?P=...) in reg(), if
7279                    you change this make sure you change that */
7280                 char* name_start = (RExC_parse += 2);
7281                 U32 num = 0;
7282                 SV *sv_dat = reg_scan_name(pRExC_state,
7283                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7284                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7285                 if (RExC_parse == name_start || *RExC_parse != ch)
7286                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7287
7288                 if (!SIZE_ONLY) {
7289                     num = add_data( pRExC_state, 1, "S" );
7290                     RExC_rxi->data->data[num]=(void*)sv_dat;
7291                     SvREFCNT_inc_simple_void(sv_dat);
7292                 }
7293
7294                 RExC_sawback = 1;
7295                 ret = reganode(pRExC_state,
7296                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7297                            num);
7298                 *flagp |= HASWIDTH;
7299
7300                 /* override incorrect value set in reganode MJD */
7301                 Set_Node_Offset(ret, parse_start+1);
7302                 Set_Node_Cur_Length(ret); /* MJD */
7303                 nextchar(pRExC_state);
7304
7305             }
7306             break;
7307         }
7308         case 'g':
7309         case '1': case '2': case '3': case '4':
7310         case '5': case '6': case '7': case '8': case '9':
7311             {
7312                 I32 num;
7313                 bool isg = *RExC_parse == 'g';
7314                 bool isrel = 0;
7315                 bool hasbrace = 0;
7316                 if (isg) {
7317                     RExC_parse++;
7318                     if (*RExC_parse == '{') {
7319                         RExC_parse++;
7320                         hasbrace = 1;
7321                     }
7322                     if (*RExC_parse == '-') {
7323                         RExC_parse++;
7324                         isrel = 1;
7325                     }
7326                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7327                         if (isrel) RExC_parse--;
7328                         RExC_parse -= 2;
7329                         goto parse_named_seq;
7330                 }   }
7331                 num = atoi(RExC_parse);
7332                 if (isg && num == 0)
7333                     vFAIL("Reference to invalid group 0");
7334                 if (isrel) {
7335                     num = RExC_npar - num;
7336                     if (num < 1)
7337                         vFAIL("Reference to nonexistent or unclosed group");
7338                 }
7339                 if (!isg && num > 9 && num >= RExC_npar)
7340                     goto defchar;
7341                 else {
7342                     char * const parse_start = RExC_parse - 1; /* MJD */
7343                     while (isDIGIT(*RExC_parse))
7344                         RExC_parse++;
7345                     if (parse_start == RExC_parse - 1)
7346                         vFAIL("Unterminated \\g... pattern");
7347                     if (hasbrace) {
7348                         if (*RExC_parse != '}')
7349                             vFAIL("Unterminated \\g{...} pattern");
7350                         RExC_parse++;
7351                     }
7352                     if (!SIZE_ONLY) {
7353                         if (num > (I32)RExC_rx->nparens)
7354                             vFAIL("Reference to nonexistent group");
7355                     }
7356                     RExC_sawback = 1;
7357                     ret = reganode(pRExC_state,
7358                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7359                                    num);
7360                     *flagp |= HASWIDTH;
7361
7362                     /* override incorrect value set in reganode MJD */
7363                     Set_Node_Offset(ret, parse_start+1);
7364                     Set_Node_Cur_Length(ret); /* MJD */
7365                     RExC_parse--;
7366                     nextchar(pRExC_state);
7367                 }
7368             }
7369             break;
7370         case '\0':
7371             if (RExC_parse >= RExC_end)
7372                 FAIL("Trailing \\");
7373             /* FALL THROUGH */
7374         default:
7375             /* Do not generate "unrecognized" warnings here, we fall
7376                back into the quick-grab loop below */
7377             parse_start--;
7378             goto defchar;
7379         }
7380         break;
7381
7382     case '#':
7383         if (RExC_flags & RXf_PMf_EXTENDED) {
7384             if ( reg_skipcomment( pRExC_state ) )
7385                 goto tryagain;
7386         }
7387         /* FALL THROUGH */
7388
7389     default:
7390         outer_default:{
7391             register STRLEN len;
7392             register UV ender;
7393             register char *p;
7394             char *s;
7395             STRLEN foldlen;
7396             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7397
7398             parse_start = RExC_parse - 1;
7399
7400             RExC_parse++;
7401
7402         defchar:
7403             ender = 0;
7404             ret = reg_node(pRExC_state,
7405                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7406             s = STRING(ret);
7407             for (len = 0, p = RExC_parse - 1;
7408               len < 127 && p < RExC_end;
7409               len++)
7410             {
7411                 char * const oldp = p;
7412
7413                 if (RExC_flags & RXf_PMf_EXTENDED)
7414                     p = regwhite( pRExC_state, p );
7415                 switch ((U8)*p) {
7416                 case 0xDF:
7417                 case 0xC3:
7418                 case 0xCE:
7419                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7420                                 goto normal_default;
7421                 case '^':
7422                 case '$':
7423                 case '.':
7424                 case '[':
7425                 case '(':
7426                 case ')':
7427                 case '|':
7428                     goto loopdone;
7429                 case '\\':
7430                     /* Literal Escapes Switch
7431
7432                        This switch is meant to handle escape sequences that
7433                        resolve to a literal character.
7434
7435                        Every escape sequence that represents something
7436                        else, like an assertion or a char class, is handled
7437                        in the switch marked 'Special Escapes' above in this
7438                        routine, but also has an entry here as anything that
7439                        isn't explicitly mentioned here will be treated as
7440                        an unescaped equivalent literal.
7441                     */
7442
7443                     switch ((U8)*++p) {
7444                     /* These are all the special escapes. */
7445                     case 0xDF:
7446                     case 0xC3:
7447                     case 0xCE:
7448                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7449                                 goto normal_default;
7450                     case 'A':             /* Start assertion */
7451                     case 'b': case 'B':   /* Word-boundary assertion*/
7452                     case 'C':             /* Single char !DANGEROUS! */
7453                     case 'd': case 'D':   /* digit class */
7454                     case 'g': case 'G':   /* generic-backref, pos assertion */
7455                     case 'h': case 'H':   /* HORIZWS */
7456                     case 'k': case 'K':   /* named backref, keep marker */
7457                     case 'N':             /* named char sequence */
7458                     case 'p': case 'P':   /* Unicode property */
7459                               case 'R':   /* LNBREAK */
7460                     case 's': case 'S':   /* space class */
7461                     case 'v': case 'V':   /* VERTWS */
7462                     case 'w': case 'W':   /* word class */
7463                     case 'X':             /* eXtended Unicode "combining character sequence" */
7464                     case 'z': case 'Z':   /* End of line/string assertion */
7465                         --p;
7466                         goto loopdone;
7467
7468                     /* Anything after here is an escape that resolves to a
7469                        literal. (Except digits, which may or may not)
7470                      */
7471                     case 'n':
7472                         ender = '\n';
7473                         p++;
7474                         break;
7475                     case 'r':
7476                         ender = '\r';
7477                         p++;
7478                         break;
7479                     case 't':
7480                         ender = '\t';
7481                         p++;
7482                         break;
7483                     case 'f':
7484                         ender = '\f';
7485                         p++;
7486                         break;
7487                     case 'e':
7488                           ender = ASCII_TO_NATIVE('\033');
7489                         p++;
7490                         break;
7491                     case 'a':
7492                           ender = ASCII_TO_NATIVE('\007');
7493                         p++;
7494                         break;
7495                     case 'o':
7496                         {
7497                             STRLEN brace_len = len;
7498                             UV result;
7499                             const char* error_msg;
7500
7501                             bool valid = grok_bslash_o(p,
7502                                                        &result,
7503                                                        &brace_len,
7504                                                        &error_msg,
7505                                                        1);
7506                             p += brace_len;
7507                             if (! valid) {
7508                                 RExC_parse = p; /* going to die anyway; point
7509                                                    to exact spot of failure */
7510                                 vFAIL(error_msg);
7511                             }
7512                             else
7513                             {
7514                                 ender = result;
7515                             }
7516                             if (PL_encoding && ender < 0x100) {
7517                                 goto recode_encoding;
7518                             }
7519                             if (ender > 0xff) {
7520                                 REQUIRE_UTF8;
7521                             }
7522                             break;
7523                         }
7524                     case 'x':
7525                         if (*++p == '{') {
7526                             char* const e = strchr(p, '}');
7527
7528                             if (!e) {
7529                                 RExC_parse = p + 1;
7530                                 vFAIL("Missing right brace on \\x{}");
7531                             }
7532                             else {
7533                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7534                                     | PERL_SCAN_DISALLOW_PREFIX;
7535                                 STRLEN numlen = e - p - 1;
7536                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7537                                 if (ender > 0xff)
7538                                     REQUIRE_UTF8;
7539                                 p = e + 1;
7540                             }
7541                         }
7542                         else {
7543                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7544                             STRLEN numlen = 2;
7545                             ender = grok_hex(p, &numlen, &flags, NULL);
7546                             p += numlen;
7547                         }
7548                         if (PL_encoding && ender < 0x100)
7549                             goto recode_encoding;
7550                         break;
7551                     case 'c':
7552                         p++;
7553                         ender = grok_bslash_c(*p++, SIZE_ONLY);
7554                         break;
7555                     case '0': case '1': case '2': case '3':case '4':
7556                     case '5': case '6': case '7': case '8':case '9':
7557                         if (*p == '0' ||
7558                             (isOCTAL(p[1]) && atoi(p) >= RExC_npar))
7559                         {
7560                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7561                             STRLEN numlen = 3;
7562                             ender = grok_oct(p, &numlen, &flags, NULL);
7563                             if (ender > 0xff) {
7564                                 REQUIRE_UTF8;
7565                             }
7566                             p += numlen;
7567                         }
7568                         else {
7569                             --p;
7570                             goto loopdone;
7571                         }
7572                         if (PL_encoding && ender < 0x100)
7573                             goto recode_encoding;
7574                         break;
7575                     recode_encoding:
7576                         {
7577                             SV* enc = PL_encoding;
7578                             ender = reg_recode((const char)(U8)ender, &enc);
7579                             if (!enc && SIZE_ONLY)
7580                                 ckWARNreg(p, "Invalid escape in the specified encoding");
7581                             REQUIRE_UTF8;
7582                         }
7583                         break;
7584                     case '\0':
7585                         if (p >= RExC_end)
7586                             FAIL("Trailing \\");
7587                         /* FALL THROUGH */
7588                     default:
7589                         if (!SIZE_ONLY&& isALPHA(*p))
7590                             ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7591                         goto normal_default;
7592                     }
7593                     break;
7594                 default:
7595                   normal_default:
7596                     if (UTF8_IS_START(*p) && UTF) {
7597                         STRLEN numlen;
7598                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7599                                                &numlen, UTF8_ALLOW_DEFAULT);
7600                         p += numlen;
7601                     }
7602                     else
7603                         ender = *p++;
7604                     break;
7605                 }
7606                 if ( RExC_flags & RXf_PMf_EXTENDED)
7607                     p = regwhite( pRExC_state, p );
7608                 if (UTF && FOLD) {
7609                     /* Prime the casefolded buffer. */
7610                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7611                 }
7612                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7613                     if (len)
7614                         p = oldp;
7615                     else if (UTF) {
7616                          if (FOLD) {
7617                               /* Emit all the Unicode characters. */
7618                               STRLEN numlen;
7619                               for (foldbuf = tmpbuf;
7620                                    foldlen;
7621                                    foldlen -= numlen) {
7622                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7623                                    if (numlen > 0) {
7624                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7625                                         s       += unilen;
7626                                         len     += unilen;
7627                                         /* In EBCDIC the numlen
7628                                          * and unilen can differ. */
7629                                         foldbuf += numlen;
7630                                         if (numlen >= foldlen)
7631                                              break;
7632                                    }
7633                                    else
7634                                         break; /* "Can't happen." */
7635                               }
7636                          }
7637                          else {
7638                               const STRLEN unilen = reguni(pRExC_state, ender, s);
7639                               if (unilen > 0) {
7640                                    s   += unilen;
7641                                    len += unilen;
7642                               }
7643                          }
7644                     }
7645                     else {
7646                         len++;
7647                         REGC((char)ender, s++);
7648                     }
7649                     break;
7650                 }
7651                 if (UTF) {
7652                      if (FOLD) {
7653                           /* Emit all the Unicode characters. */
7654                           STRLEN numlen;
7655                           for (foldbuf = tmpbuf;
7656                                foldlen;
7657                                foldlen -= numlen) {
7658                                ender = utf8_to_uvchr(foldbuf, &numlen);
7659                                if (numlen > 0) {
7660                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
7661                                     len     += unilen;
7662                                     s       += unilen;
7663                                     /* In EBCDIC the numlen
7664                                      * and unilen can differ. */
7665                                     foldbuf += numlen;
7666                                     if (numlen >= foldlen)
7667                                          break;
7668                                }
7669                                else
7670                                     break;
7671                           }
7672                      }
7673                      else {
7674                           const STRLEN unilen = reguni(pRExC_state, ender, s);
7675                           if (unilen > 0) {
7676                                s   += unilen;
7677                                len += unilen;
7678                           }
7679                      }
7680                      len--;
7681                 }
7682                 else
7683                     REGC((char)ender, s++);
7684             }
7685         loopdone:
7686             RExC_parse = p - 1;
7687             Set_Node_Cur_Length(ret); /* MJD */
7688             nextchar(pRExC_state);
7689             {
7690                 /* len is STRLEN which is unsigned, need to copy to signed */
7691                 IV iv = len;
7692                 if (iv < 0)
7693                     vFAIL("Internal disaster");
7694             }
7695             if (len > 0)
7696                 *flagp |= HASWIDTH;
7697             if (len == 1 && UNI_IS_INVARIANT(ender))
7698                 *flagp |= SIMPLE;
7699
7700             if (SIZE_ONLY)
7701                 RExC_size += STR_SZ(len);
7702             else {
7703                 STR_LEN(ret) = len;
7704                 RExC_emit += STR_SZ(len);
7705             }
7706         }
7707         break;
7708     }
7709
7710     return(ret);
7711 }
7712
7713 STATIC char *
7714 S_regwhite( RExC_state_t *pRExC_state, char *p )
7715 {
7716     const char *e = RExC_end;
7717
7718     PERL_ARGS_ASSERT_REGWHITE;
7719
7720     while (p < e) {
7721         if (isSPACE(*p))
7722             ++p;
7723         else if (*p == '#') {
7724             bool ended = 0;
7725             do {
7726                 if (*p++ == '\n') {
7727                     ended = 1;
7728                     break;
7729                 }
7730             } while (p < e);
7731             if (!ended)
7732                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7733         }
7734         else
7735             break;
7736     }
7737     return p;
7738 }
7739
7740 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7741    Character classes ([:foo:]) can also be negated ([:^foo:]).
7742    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7743    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7744    but trigger failures because they are currently unimplemented. */
7745
7746 #define POSIXCC_DONE(c)   ((c) == ':')
7747 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7748 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7749
7750 STATIC I32
7751 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7752 {
7753     dVAR;
7754     I32 namedclass = OOB_NAMEDCLASS;
7755
7756     PERL_ARGS_ASSERT_REGPPOSIXCC;
7757
7758     if (value == '[' && RExC_parse + 1 < RExC_end &&
7759         /* I smell either [: or [= or [. -- POSIX has been here, right? */
7760         POSIXCC(UCHARAT(RExC_parse))) {
7761         const char c = UCHARAT(RExC_parse);
7762         char* const s = RExC_parse++;
7763
7764         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7765             RExC_parse++;
7766         if (RExC_parse == RExC_end)
7767             /* Grandfather lone [:, [=, [. */
7768             RExC_parse = s;
7769         else {
7770             const char* const t = RExC_parse++; /* skip over the c */
7771             assert(*t == c);
7772
7773             if (UCHARAT(RExC_parse) == ']') {
7774                 const char *posixcc = s + 1;
7775                 RExC_parse++; /* skip over the ending ] */
7776
7777                 if (*s == ':') {
7778                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7779                     const I32 skip = t - posixcc;
7780
7781                     /* Initially switch on the length of the name.  */
7782                     switch (skip) {
7783                     case 4:
7784                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7785                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7786                         break;
7787                     case 5:
7788                         /* Names all of length 5.  */
7789                         /* alnum alpha ascii blank cntrl digit graph lower
7790                            print punct space upper  */
7791                         /* Offset 4 gives the best switch position.  */
7792                         switch (posixcc[4]) {
7793                         case 'a':
7794                             if (memEQ(posixcc, "alph", 4)) /* alpha */
7795                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7796                             break;
7797                         case 'e':
7798                             if (memEQ(posixcc, "spac", 4)) /* space */
7799                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7800                             break;
7801                         case 'h':
7802                             if (memEQ(posixcc, "grap", 4)) /* graph */
7803                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7804                             break;
7805                         case 'i':
7806                             if (memEQ(posixcc, "asci", 4)) /* ascii */
7807                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7808                             break;
7809                         case 'k':
7810                             if (memEQ(posixcc, "blan", 4)) /* blank */
7811                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7812                             break;
7813                         case 'l':
7814                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7815                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7816                             break;
7817                         case 'm':
7818                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
7819                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7820                             break;
7821                         case 'r':
7822                             if (memEQ(posixcc, "lowe", 4)) /* lower */
7823                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7824                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
7825                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7826                             break;
7827                         case 't':
7828                             if (memEQ(posixcc, "digi", 4)) /* digit */
7829                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7830                             else if (memEQ(posixcc, "prin", 4)) /* print */
7831                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7832                             else if (memEQ(posixcc, "punc", 4)) /* punct */
7833                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7834                             break;
7835                         }
7836                         break;
7837                     case 6:
7838                         if (memEQ(posixcc, "xdigit", 6))
7839                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7840                         break;
7841                     }
7842
7843                     if (namedclass == OOB_NAMEDCLASS)
7844                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7845                                       t - s - 1, s + 1);
7846                     assert (posixcc[skip] == ':');
7847                     assert (posixcc[skip+1] == ']');
7848                 } else if (!SIZE_ONLY) {
7849                     /* [[=foo=]] and [[.foo.]] are still future. */
7850
7851                     /* adjust RExC_parse so the warning shows after
7852                        the class closes */
7853                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7854                         RExC_parse++;
7855                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7856                 }
7857             } else {
7858                 /* Maternal grandfather:
7859                  * "[:" ending in ":" but not in ":]" */
7860                 RExC_parse = s;
7861             }
7862         }
7863     }
7864
7865     return namedclass;
7866 }
7867
7868 STATIC void
7869 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7870 {
7871     dVAR;
7872
7873     PERL_ARGS_ASSERT_CHECKPOSIXCC;
7874
7875     if (POSIXCC(UCHARAT(RExC_parse))) {
7876         const char *s = RExC_parse;
7877         const char  c = *s++;
7878
7879         while (isALNUM(*s))
7880             s++;
7881         if (*s && c == *s && s[1] == ']') {
7882             ckWARN3reg(s+2,
7883                        "POSIX syntax [%c %c] belongs inside character classes",
7884                        c, c);
7885
7886             /* [[=foo=]] and [[.foo.]] are still future. */
7887             if (POSIXCC_NOTYET(c)) {
7888                 /* adjust RExC_parse so the error shows after
7889                    the class closes */
7890                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7891                     NOOP;
7892                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7893             }
7894         }
7895     }
7896 }
7897
7898
7899 #define _C_C_T_(NAME,TEST,WORD)                         \
7900 ANYOF_##NAME:                                           \
7901     if (LOC)                                            \
7902         ANYOF_CLASS_SET(ret, ANYOF_##NAME);             \
7903     else {                                              \
7904         for (value = 0; value < 256; value++)           \
7905             if (TEST)                                   \
7906                 ANYOF_BITMAP_SET(ret, value);           \
7907     }                                                   \
7908     yesno = '+';                                        \
7909     what = WORD;                                        \
7910     break;                                              \
7911 case ANYOF_N##NAME:                                     \
7912     if (LOC)                                            \
7913         ANYOF_CLASS_SET(ret, ANYOF_N##NAME);            \
7914     else {                                              \
7915         for (value = 0; value < 256; value++)           \
7916             if (!TEST)                                  \
7917                 ANYOF_BITMAP_SET(ret, value);           \
7918     }                                                   \
7919     yesno = '!';                                        \
7920     what = WORD;                                        \
7921     break
7922
7923 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
7924 ANYOF_##NAME:                                           \
7925         for (value = 0; value < 256; value++)           \
7926             if (TEST)                                   \
7927                 ANYOF_BITMAP_SET(ret, value);           \
7928     yesno = '+';                                        \
7929     what = WORD;                                        \
7930     break;                                              \
7931 case ANYOF_N##NAME:                                     \
7932         for (value = 0; value < 256; value++)           \
7933             if (!TEST)                                  \
7934                 ANYOF_BITMAP_SET(ret, value);           \
7935     yesno = '!';                                        \
7936     what = WORD;                                        \
7937     break
7938
7939 /*
7940    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7941    so that it is possible to override the option here without having to
7942    rebuild the entire core. as we are required to do if we change regcomp.h
7943    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7944 */
7945 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7946 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7947 #endif
7948
7949 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7950 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7951 #else
7952 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7953 #endif
7954
7955 /*
7956    parse a class specification and produce either an ANYOF node that
7957    matches the pattern or if the pattern matches a single char only and
7958    that char is < 256 and we are case insensitive then we produce an
7959    EXACT node instead.
7960 */
7961
7962 STATIC regnode *
7963 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7964 {
7965     dVAR;
7966     register UV nextvalue;
7967     register IV prevvalue = OOB_UNICODE;
7968     register IV range = 0;
7969     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7970     register regnode *ret;
7971     STRLEN numlen;
7972     IV namedclass;
7973     char *rangebegin = NULL;
7974     bool need_class = 0;
7975     SV *listsv = NULL;
7976     UV n;
7977     bool optimize_invert   = TRUE;
7978     AV* unicode_alternate  = NULL;
7979 #ifdef EBCDIC
7980     UV literal_endpoint = 0;
7981 #endif
7982     UV stored = 0;  /* number of chars stored in the class */
7983
7984     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7985         case we need to change the emitted regop to an EXACT. */
7986     const char * orig_parse = RExC_parse;
7987     GET_RE_DEBUG_FLAGS_DECL;
7988
7989     PERL_ARGS_ASSERT_REGCLASS;
7990 #ifndef DEBUGGING
7991     PERL_UNUSED_ARG(depth);
7992 #endif
7993
7994     DEBUG_PARSE("clas");
7995
7996     /* Assume we are going to generate an ANYOF node. */
7997     ret = reganode(pRExC_state, ANYOF, 0);
7998
7999     if (!SIZE_ONLY)
8000         ANYOF_FLAGS(ret) = 0;
8001
8002     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
8003         RExC_naughty++;
8004         RExC_parse++;
8005         if (!SIZE_ONLY)
8006             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8007     }
8008
8009     if (SIZE_ONLY) {
8010         RExC_size += ANYOF_SKIP;
8011         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8012     }
8013     else {
8014         RExC_emit += ANYOF_SKIP;
8015         if (FOLD)
8016             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8017         if (LOC)
8018             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8019         ANYOF_BITMAP_ZERO(ret);
8020         listsv = newSVpvs("# comment\n");
8021     }
8022
8023     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8024
8025     if (!SIZE_ONLY && POSIXCC(nextvalue))
8026         checkposixcc(pRExC_state);
8027
8028     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8029     if (UCHARAT(RExC_parse) == ']')
8030         goto charclassloop;
8031
8032 parseit:
8033     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8034
8035     charclassloop:
8036
8037         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8038
8039         if (!range)
8040             rangebegin = RExC_parse;
8041         if (UTF) {
8042             value = utf8n_to_uvchr((U8*)RExC_parse,
8043                                    RExC_end - RExC_parse,
8044                                    &numlen, UTF8_ALLOW_DEFAULT);
8045             RExC_parse += numlen;
8046         }
8047         else
8048             value = UCHARAT(RExC_parse++);
8049
8050         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8051         if (value == '[' && POSIXCC(nextvalue))
8052             namedclass = regpposixcc(pRExC_state, value);
8053         else if (value == '\\') {
8054             if (UTF) {
8055                 value = utf8n_to_uvchr((U8*)RExC_parse,
8056                                    RExC_end - RExC_parse,
8057                                    &numlen, UTF8_ALLOW_DEFAULT);
8058                 RExC_parse += numlen;
8059             }
8060             else
8061                 value = UCHARAT(RExC_parse++);
8062             /* Some compilers cannot handle switching on 64-bit integer
8063              * values, therefore value cannot be an UV.  Yes, this will
8064              * be a problem later if we want switch on Unicode.
8065              * A similar issue a little bit later when switching on
8066              * namedclass. --jhi */
8067             switch ((I32)value) {
8068             case 'w':   namedclass = ANYOF_ALNUM;       break;
8069             case 'W':   namedclass = ANYOF_NALNUM;      break;
8070             case 's':   namedclass = ANYOF_SPACE;       break;
8071             case 'S':   namedclass = ANYOF_NSPACE;      break;
8072             case 'd':   namedclass = ANYOF_DIGIT;       break;
8073             case 'D':   namedclass = ANYOF_NDIGIT;      break;
8074             case 'v':   namedclass = ANYOF_VERTWS;      break;
8075             case 'V':   namedclass = ANYOF_NVERTWS;     break;
8076             case 'h':   namedclass = ANYOF_HORIZWS;     break;
8077             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
8078             case 'N':  /* Handle \N{NAME} in class */
8079                 {
8080                     /* We only pay attention to the first char of
8081                     multichar strings being returned. I kinda wonder
8082                     if this makes sense as it does change the behaviour
8083                     from earlier versions, OTOH that behaviour was broken
8084                     as well. */
8085                     UV v; /* value is register so we cant & it /grrr */
8086                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8087                         goto parseit;
8088                     }
8089                     value= v;
8090                 }
8091                 break;
8092             case 'p':
8093             case 'P':
8094                 {
8095                 char *e;
8096                 if (RExC_parse >= RExC_end)
8097                     vFAIL2("Empty \\%c{}", (U8)value);
8098                 if (*RExC_parse == '{') {
8099                     const U8 c = (U8)value;
8100                     e = strchr(RExC_parse++, '}');
8101                     if (!e)
8102                         vFAIL2("Missing right brace on \\%c{}", c);
8103                     while (isSPACE(UCHARAT(RExC_parse)))
8104                         RExC_parse++;
8105                     if (e == RExC_parse)
8106                         vFAIL2("Empty \\%c{}", c);
8107                     n = e - RExC_parse;
8108                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8109                         n--;
8110                 }
8111                 else {
8112                     e = RExC_parse;
8113                     n = 1;
8114                 }
8115                 if (!SIZE_ONLY) {
8116                     if (UCHARAT(RExC_parse) == '^') {
8117                          RExC_parse++;
8118                          n--;
8119                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8120                          while (isSPACE(UCHARAT(RExC_parse))) {
8121                               RExC_parse++;
8122                               n--;
8123                          }
8124                     }
8125                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8126                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8127                 }
8128                 RExC_parse = e + 1;
8129                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8130                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8131                 }
8132                 break;
8133             case 'n':   value = '\n';                   break;
8134             case 'r':   value = '\r';                   break;
8135             case 't':   value = '\t';                   break;
8136             case 'f':   value = '\f';                   break;
8137             case 'b':   value = '\b';                   break;
8138             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8139             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8140             case 'o':
8141                 RExC_parse--;   /* function expects to be pointed at the 'o' */
8142                 {
8143                     const char* error_msg;
8144                     bool valid = grok_bslash_o(RExC_parse,
8145                                                &value,
8146                                                &numlen,
8147                                                &error_msg,
8148                                                SIZE_ONLY);
8149                     RExC_parse += numlen;
8150                     if (! valid) {
8151                         vFAIL(error_msg);
8152                     }
8153                 }
8154                 if (PL_encoding && value < 0x100) {
8155                     goto recode_encoding;
8156                 }
8157                 break;
8158             case 'x':
8159                 if (*RExC_parse == '{') {
8160                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8161                         | PERL_SCAN_DISALLOW_PREFIX;
8162                     char * const e = strchr(RExC_parse++, '}');
8163                     if (!e)
8164                         vFAIL("Missing right brace on \\x{}");
8165
8166                     numlen = e - RExC_parse;
8167                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8168                     RExC_parse = e + 1;
8169                 }
8170                 else {
8171                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8172                     numlen = 2;
8173                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8174                     RExC_parse += numlen;
8175                 }
8176                 if (PL_encoding && value < 0x100)
8177                     goto recode_encoding;
8178                 break;
8179             case 'c':
8180                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8181                 break;
8182             case '0': case '1': case '2': case '3': case '4':
8183             case '5': case '6': case '7':
8184                 {
8185                     /* Take 1-3 octal digits */
8186                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8187                     numlen = 3;
8188                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8189                     RExC_parse += numlen;
8190                     if (PL_encoding && value < 0x100)
8191                         goto recode_encoding;
8192                     break;
8193                 }
8194             recode_encoding:
8195                 {
8196                     SV* enc = PL_encoding;
8197                     value = reg_recode((const char)(U8)value, &enc);
8198                     if (!enc && SIZE_ONLY)
8199                         ckWARNreg(RExC_parse,
8200                                   "Invalid escape in the specified encoding");
8201                     break;
8202                 }
8203             default:
8204                 /* Allow \_ to not give an error */
8205                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8206                     ckWARN2reg(RExC_parse,
8207                                "Unrecognized escape \\%c in character class passed through",
8208                                (int)value);
8209                 }
8210                 break;
8211             }
8212         } /* end of \blah */
8213 #ifdef EBCDIC
8214         else
8215             literal_endpoint++;
8216 #endif
8217
8218         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8219
8220             if (!SIZE_ONLY && !need_class)
8221                 ANYOF_CLASS_ZERO(ret);
8222
8223             need_class = 1;
8224
8225             /* a bad range like a-\d, a-[:digit:] ? */
8226             if (range) {
8227                 if (!SIZE_ONLY) {
8228                     const int w =
8229                         RExC_parse >= rangebegin ?
8230                         RExC_parse - rangebegin : 0;
8231                     ckWARN4reg(RExC_parse,
8232                                "False [] range \"%*.*s\"",
8233                                w, w, rangebegin);
8234
8235                     if (prevvalue < 256) {
8236                         ANYOF_BITMAP_SET(ret, prevvalue);
8237                         ANYOF_BITMAP_SET(ret, '-');
8238                     }
8239                     else {
8240                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8241                         Perl_sv_catpvf(aTHX_ listsv,
8242                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8243                     }
8244                 }
8245
8246                 range = 0; /* this was not a true range */
8247             }
8248
8249
8250
8251             if (!SIZE_ONLY) {
8252                 const char *what = NULL;
8253                 char yesno = 0;
8254
8255                 if (namedclass > OOB_NAMEDCLASS)
8256                     optimize_invert = FALSE;
8257                 /* Possible truncation here but in some 64-bit environments
8258                  * the compiler gets heartburn about switch on 64-bit values.
8259                  * A similar issue a little earlier when switching on value.
8260                  * --jhi */
8261                 switch ((I32)namedclass) {
8262
8263                 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8264                 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8265                 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8266                 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8267                 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8268                 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8269                 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8270                 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8271                 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8272                 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8273 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8274                 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8275                 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8276 #else
8277                 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8278                 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8279 #endif
8280                 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8281                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8282                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8283                 case ANYOF_ASCII:
8284                     if (LOC)
8285                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8286                     else {
8287 #ifndef EBCDIC
8288                         for (value = 0; value < 128; value++)
8289                             ANYOF_BITMAP_SET(ret, value);
8290 #else  /* EBCDIC */
8291                         for (value = 0; value < 256; value++) {
8292                             if (isASCII(value))
8293                                 ANYOF_BITMAP_SET(ret, value);
8294                         }
8295 #endif /* EBCDIC */
8296                     }
8297                     yesno = '+';
8298                     what = "ASCII";
8299                     break;
8300                 case ANYOF_NASCII:
8301                     if (LOC)
8302                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8303                     else {
8304 #ifndef EBCDIC
8305                         for (value = 128; value < 256; value++)
8306                             ANYOF_BITMAP_SET(ret, value);
8307 #else  /* EBCDIC */
8308                         for (value = 0; value < 256; value++) {
8309                             if (!isASCII(value))
8310                                 ANYOF_BITMAP_SET(ret, value);
8311                         }
8312 #endif /* EBCDIC */
8313                     }
8314                     yesno = '!';
8315                     what = "ASCII";
8316                     break;
8317                 case ANYOF_DIGIT:
8318                     if (LOC)
8319                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8320                     else {
8321                         /* consecutive digits assumed */
8322                         for (value = '0'; value <= '9'; value++)
8323                             ANYOF_BITMAP_SET(ret, value);
8324                     }
8325                     yesno = '+';
8326                     what = POSIX_CC_UNI_NAME("Digit");
8327                     break;
8328                 case ANYOF_NDIGIT:
8329                     if (LOC)
8330                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8331                     else {
8332                         /* consecutive digits assumed */
8333                         for (value = 0; value < '0'; value++)
8334                             ANYOF_BITMAP_SET(ret, value);
8335                         for (value = '9' + 1; value < 256; value++)
8336                             ANYOF_BITMAP_SET(ret, value);
8337                     }
8338                     yesno = '!';
8339                     what = POSIX_CC_UNI_NAME("Digit");
8340                     break;
8341                 case ANYOF_MAX:
8342                     /* this is to handle \p and \P */
8343                     break;
8344                 default:
8345                     vFAIL("Invalid [::] class");
8346                     break;
8347                 }
8348                 if (what) {
8349                     /* Strings such as "+utf8::isWord\n" */
8350                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8351                 }
8352                 if (LOC)
8353                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8354                 continue;
8355             }
8356         } /* end of namedclass \blah */
8357
8358         if (range) {
8359             if (prevvalue > (IV)value) /* b-a */ {
8360                 const int w = RExC_parse - rangebegin;
8361                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8362                 range = 0; /* not a valid range */
8363             }
8364         }
8365         else {
8366             prevvalue = value; /* save the beginning of the range */
8367             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8368                 RExC_parse[1] != ']') {
8369                 RExC_parse++;
8370
8371                 /* a bad range like \w-, [:word:]- ? */
8372                 if (namedclass > OOB_NAMEDCLASS) {
8373                     if (ckWARN(WARN_REGEXP)) {
8374                         const int w =
8375                             RExC_parse >= rangebegin ?
8376                             RExC_parse - rangebegin : 0;
8377                         vWARN4(RExC_parse,
8378                                "False [] range \"%*.*s\"",
8379                                w, w, rangebegin);
8380                     }
8381                     if (!SIZE_ONLY)
8382                         ANYOF_BITMAP_SET(ret, '-');
8383                 } else
8384                     range = 1;  /* yeah, it's a range! */
8385                 continue;       /* but do it the next time */
8386             }
8387         }
8388
8389         /* now is the next time */
8390         /*stored += (value - prevvalue + 1);*/
8391         if (!SIZE_ONLY) {
8392             if (prevvalue < 256) {
8393                 const IV ceilvalue = value < 256 ? value : 255;
8394                 IV i;
8395 #ifdef EBCDIC
8396                 /* In EBCDIC [\x89-\x91] should include
8397                  * the \x8e but [i-j] should not. */
8398                 if (literal_endpoint == 2 &&
8399                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8400                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8401                 {
8402                     if (isLOWER(prevvalue)) {
8403                         for (i = prevvalue; i <= ceilvalue; i++)
8404                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8405                                 stored++;
8406                                 ANYOF_BITMAP_SET(ret, i);
8407                             }
8408                     } else {
8409                         for (i = prevvalue; i <= ceilvalue; i++)
8410                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8411                                 stored++;
8412                                 ANYOF_BITMAP_SET(ret, i);
8413                             }
8414                     }
8415                 }
8416                 else
8417 #endif
8418                       for (i = prevvalue; i <= ceilvalue; i++) {
8419                         if (!ANYOF_BITMAP_TEST(ret,i)) {
8420                             stored++;
8421                             ANYOF_BITMAP_SET(ret, i);
8422                         }
8423                       }
8424           }
8425           if (value > 255 || UTF) {
8426                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8427                 const UV natvalue      = NATIVE_TO_UNI(value);
8428                 stored+=2; /* can't optimize this class */
8429                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8430                 if (prevnatvalue < natvalue) { /* what about > ? */
8431                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8432                                    prevnatvalue, natvalue);
8433                 }
8434                 else if (prevnatvalue == natvalue) {
8435                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8436                     if (FOLD) {
8437                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8438                          STRLEN foldlen;
8439                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8440
8441 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8442                          if (RExC_precomp[0] == ':' &&
8443                              RExC_precomp[1] == '[' &&
8444                              (f == 0xDF || f == 0x92)) {
8445                              f = NATIVE_TO_UNI(f);
8446                         }
8447 #endif
8448                          /* If folding and foldable and a single
8449                           * character, insert also the folded version
8450                           * to the charclass. */
8451                          if (f != value) {
8452 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8453                              if ((RExC_precomp[0] == ':' &&
8454                                   RExC_precomp[1] == '[' &&
8455                                   (f == 0xA2 &&
8456                                    (value == 0xFB05 || value == 0xFB06))) ?
8457                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8458                                  foldlen == (STRLEN)UNISKIP(f) )
8459 #else
8460                               if (foldlen == (STRLEN)UNISKIP(f))
8461 #endif
8462                                   Perl_sv_catpvf(aTHX_ listsv,
8463                                                  "%04"UVxf"\n", f);
8464                               else {
8465                                   /* Any multicharacter foldings
8466                                    * require the following transform:
8467                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8468                                    * where E folds into "pq" and F folds
8469                                    * into "rst", all other characters
8470                                    * fold to single characters.  We save
8471                                    * away these multicharacter foldings,
8472                                    * to be later saved as part of the
8473                                    * additional "s" data. */
8474                                   SV *sv;
8475
8476                                   if (!unicode_alternate)
8477                                       unicode_alternate = newAV();
8478                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8479                                                      TRUE);
8480                                   av_push(unicode_alternate, sv);
8481                               }
8482                          }
8483
8484                          /* If folding and the value is one of the Greek
8485                           * sigmas insert a few more sigmas to make the
8486                           * folding rules of the sigmas to work right.
8487                           * Note that not all the possible combinations
8488                           * are handled here: some of them are handled
8489                           * by the standard folding rules, and some of
8490                           * them (literal or EXACTF cases) are handled
8491                           * during runtime in regexec.c:S_find_byclass(). */
8492                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8493                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8494                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8495                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8496                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8497                          }
8498                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8499                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8500                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8501                     }
8502                 }
8503             }
8504 #ifdef EBCDIC
8505             literal_endpoint = 0;
8506 #endif
8507         }
8508
8509         range = 0; /* this range (if it was one) is done now */
8510     }
8511
8512     if (need_class) {
8513         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8514         if (SIZE_ONLY)
8515             RExC_size += ANYOF_CLASS_ADD_SKIP;
8516         else
8517             RExC_emit += ANYOF_CLASS_ADD_SKIP;
8518     }
8519
8520
8521     if (SIZE_ONLY)
8522         return ret;
8523     /****** !SIZE_ONLY AFTER HERE *********/
8524
8525     if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8526         && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8527     ) {
8528         /* optimize single char class to an EXACT node
8529            but *only* when its not a UTF/high char  */
8530         const char * cur_parse= RExC_parse;
8531         RExC_emit = (regnode *)orig_emit;
8532         RExC_parse = (char *)orig_parse;
8533         ret = reg_node(pRExC_state,
8534                        (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8535         RExC_parse = (char *)cur_parse;
8536         *STRING(ret)= (char)value;
8537         STR_LEN(ret)= 1;
8538         RExC_emit += STR_SZ(1);
8539         SvREFCNT_dec(listsv);
8540         return ret;
8541     }
8542     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8543     if ( /* If the only flag is folding (plus possibly inversion). */
8544         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8545        ) {
8546         for (value = 0; value < 256; ++value) {
8547             if (ANYOF_BITMAP_TEST(ret, value)) {
8548                 UV fold = PL_fold[value];
8549
8550                 if (fold != value)
8551                     ANYOF_BITMAP_SET(ret, fold);
8552             }
8553         }
8554         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8555     }
8556
8557     /* optimize inverted simple patterns (e.g. [^a-z]) */
8558     if (optimize_invert &&
8559         /* If the only flag is inversion. */
8560         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8561         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8562             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8563         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8564     }
8565     {
8566         AV * const av = newAV();
8567         SV *rv;
8568         /* The 0th element stores the character class description
8569          * in its textual form: used later (regexec.c:Perl_regclass_swash())
8570          * to initialize the appropriate swash (which gets stored in
8571          * the 1st element), and also useful for dumping the regnode.
8572          * The 2nd element stores the multicharacter foldings,
8573          * used later (regexec.c:S_reginclass()). */
8574         av_store(av, 0, listsv);
8575         av_store(av, 1, NULL);
8576         av_store(av, 2, MUTABLE_SV(unicode_alternate));
8577         rv = newRV_noinc(MUTABLE_SV(av));
8578         n = add_data(pRExC_state, 1, "s");
8579         RExC_rxi->data->data[n] = (void*)rv;
8580         ARG_SET(ret, n);
8581     }
8582     return ret;
8583 }
8584 #undef _C_C_T_
8585
8586
8587 /* reg_skipcomment()
8588
8589    Absorbs an /x style # comments from the input stream.
8590    Returns true if there is more text remaining in the stream.
8591    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8592    terminates the pattern without including a newline.
8593
8594    Note its the callers responsibility to ensure that we are
8595    actually in /x mode
8596
8597 */
8598
8599 STATIC bool
8600 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8601 {
8602     bool ended = 0;
8603
8604     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8605
8606     while (RExC_parse < RExC_end)
8607         if (*RExC_parse++ == '\n') {
8608             ended = 1;
8609             break;
8610         }
8611     if (!ended) {
8612         /* we ran off the end of the pattern without ending
8613            the comment, so we have to add an \n when wrapping */
8614         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8615         return 0;
8616     } else
8617         return 1;
8618 }
8619
8620 /* nextchar()
8621
8622    Advance that parse position, and optionall absorbs
8623    "whitespace" from the inputstream.
8624
8625    Without /x "whitespace" means (?#...) style comments only,
8626    with /x this means (?#...) and # comments and whitespace proper.
8627
8628    Returns the RExC_parse point from BEFORE the scan occurs.
8629
8630    This is the /x friendly way of saying RExC_parse++.
8631 */
8632
8633 STATIC char*
8634 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8635 {
8636     char* const retval = RExC_parse++;
8637
8638     PERL_ARGS_ASSERT_NEXTCHAR;
8639
8640     for (;;) {
8641         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8642                 RExC_parse[2] == '#') {
8643             while (*RExC_parse != ')') {
8644                 if (RExC_parse == RExC_end)
8645                     FAIL("Sequence (?#... not terminated");
8646                 RExC_parse++;
8647             }
8648             RExC_parse++;
8649             continue;
8650         }
8651         if (RExC_flags & RXf_PMf_EXTENDED) {
8652             if (isSPACE(*RExC_parse)) {
8653                 RExC_parse++;
8654                 continue;
8655             }
8656             else if (*RExC_parse == '#') {
8657                 if ( reg_skipcomment( pRExC_state ) )
8658                     continue;
8659             }
8660         }
8661         return retval;
8662     }
8663 }
8664
8665 /*
8666 - reg_node - emit a node
8667 */
8668 STATIC regnode *                        /* Location. */
8669 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8670 {
8671     dVAR;
8672     register regnode *ptr;
8673     regnode * const ret = RExC_emit;
8674     GET_RE_DEBUG_FLAGS_DECL;
8675
8676     PERL_ARGS_ASSERT_REG_NODE;
8677
8678     if (SIZE_ONLY) {
8679         SIZE_ALIGN(RExC_size);
8680         RExC_size += 1;
8681         return(ret);
8682     }
8683     if (RExC_emit >= RExC_emit_bound)
8684         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8685
8686     NODE_ALIGN_FILL(ret);
8687     ptr = ret;
8688     FILL_ADVANCE_NODE(ptr, op);
8689 #ifdef RE_TRACK_PATTERN_OFFSETS
8690     if (RExC_offsets) {         /* MJD */
8691         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8692               "reg_node", __LINE__,
8693               PL_reg_name[op],
8694               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8695                 ? "Overwriting end of array!\n" : "OK",
8696               (UV)(RExC_emit - RExC_emit_start),
8697               (UV)(RExC_parse - RExC_start),
8698               (UV)RExC_offsets[0]));
8699         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8700     }
8701 #endif
8702     RExC_emit = ptr;
8703     return(ret);
8704 }
8705
8706 /*
8707 - reganode - emit a node with an argument
8708 */
8709 STATIC regnode *                        /* Location. */
8710 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8711 {
8712     dVAR;
8713     register regnode *ptr;
8714     regnode * const ret = RExC_emit;
8715     GET_RE_DEBUG_FLAGS_DECL;
8716
8717     PERL_ARGS_ASSERT_REGANODE;
8718
8719     if (SIZE_ONLY) {
8720         SIZE_ALIGN(RExC_size);
8721         RExC_size += 2;
8722         /*
8723            We can't do this:
8724
8725            assert(2==regarglen[op]+1);
8726
8727            Anything larger than this has to allocate the extra amount.
8728            If we changed this to be:
8729
8730            RExC_size += (1 + regarglen[op]);
8731
8732            then it wouldn't matter. Its not clear what side effect
8733            might come from that so its not done so far.
8734            -- dmq
8735         */
8736         return(ret);
8737     }
8738     if (RExC_emit >= RExC_emit_bound)
8739         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8740
8741     NODE_ALIGN_FILL(ret);
8742     ptr = ret;
8743     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8744 #ifdef RE_TRACK_PATTERN_OFFSETS
8745     if (RExC_offsets) {         /* MJD */
8746         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8747               "reganode",
8748               __LINE__,
8749               PL_reg_name[op],
8750               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8751               "Overwriting end of array!\n" : "OK",
8752               (UV)(RExC_emit - RExC_emit_start),
8753               (UV)(RExC_parse - RExC_start),
8754               (UV)RExC_offsets[0]));
8755         Set_Cur_Node_Offset;
8756     }
8757 #endif
8758     RExC_emit = ptr;
8759     return(ret);
8760 }
8761
8762 /*
8763 - reguni - emit (if appropriate) a Unicode character
8764 */
8765 STATIC STRLEN
8766 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8767 {
8768     dVAR;
8769
8770     PERL_ARGS_ASSERT_REGUNI;
8771
8772     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8773 }
8774
8775 /*
8776 - reginsert - insert an operator in front of already-emitted operand
8777 *
8778 * Means relocating the operand.
8779 */
8780 STATIC void
8781 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8782 {
8783     dVAR;
8784     register regnode *src;
8785     register regnode *dst;
8786     register regnode *place;
8787     const int offset = regarglen[(U8)op];
8788     const int size = NODE_STEP_REGNODE + offset;
8789     GET_RE_DEBUG_FLAGS_DECL;
8790
8791     PERL_ARGS_ASSERT_REGINSERT;
8792     PERL_UNUSED_ARG(depth);
8793 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8794     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8795     if (SIZE_ONLY) {
8796         RExC_size += size;
8797         return;
8798     }
8799
8800     src = RExC_emit;
8801     RExC_emit += size;
8802     dst = RExC_emit;
8803     if (RExC_open_parens) {
8804         int paren;
8805         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8806         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8807             if ( RExC_open_parens[paren] >= opnd ) {
8808                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8809                 RExC_open_parens[paren] += size;
8810             } else {
8811                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8812             }
8813             if ( RExC_close_parens[paren] >= opnd ) {
8814                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8815                 RExC_close_parens[paren] += size;
8816             } else {
8817                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8818             }
8819         }
8820     }
8821
8822     while (src > opnd) {
8823         StructCopy(--src, --dst, regnode);
8824 #ifdef RE_TRACK_PATTERN_OFFSETS
8825         if (RExC_offsets) {     /* MJD 20010112 */
8826             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8827                   "reg_insert",
8828                   __LINE__,
8829                   PL_reg_name[op],
8830                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8831                     ? "Overwriting end of array!\n" : "OK",
8832                   (UV)(src - RExC_emit_start),
8833                   (UV)(dst - RExC_emit_start),
8834                   (UV)RExC_offsets[0]));
8835             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8836             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8837         }
8838 #endif
8839     }
8840
8841
8842     place = opnd;               /* Op node, where operand used to be. */
8843 #ifdef RE_TRACK_PATTERN_OFFSETS
8844     if (RExC_offsets) {         /* MJD */
8845         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8846               "reginsert",
8847               __LINE__,
8848               PL_reg_name[op],
8849               (UV)(place - RExC_emit_start) > RExC_offsets[0]
8850               ? "Overwriting end of array!\n" : "OK",
8851               (UV)(place - RExC_emit_start),
8852               (UV)(RExC_parse - RExC_start),
8853               (UV)RExC_offsets[0]));
8854         Set_Node_Offset(place, RExC_parse);
8855         Set_Node_Length(place, 1);
8856     }
8857 #endif
8858     src = NEXTOPER(place);
8859     FILL_ADVANCE_NODE(place, op);
8860     Zero(src, offset, regnode);
8861 }
8862
8863 /*
8864 - regtail - set the next-pointer at the end of a node chain of p to val.
8865 - SEE ALSO: regtail_study
8866 */
8867 /* TODO: All three parms should be const */
8868 STATIC void
8869 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8870 {
8871     dVAR;
8872     register regnode *scan;
8873     GET_RE_DEBUG_FLAGS_DECL;
8874
8875     PERL_ARGS_ASSERT_REGTAIL;
8876 #ifndef DEBUGGING
8877     PERL_UNUSED_ARG(depth);
8878 #endif
8879
8880     if (SIZE_ONLY)
8881         return;
8882
8883     /* Find last node. */
8884     scan = p;
8885     for (;;) {
8886         regnode * const temp = regnext(scan);
8887         DEBUG_PARSE_r({
8888             SV * const mysv=sv_newmortal();
8889             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8890             regprop(RExC_rx, mysv, scan);
8891             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8892                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8893                     (temp == NULL ? "->" : ""),
8894                     (temp == NULL ? PL_reg_name[OP(val)] : "")
8895             );
8896         });
8897         if (temp == NULL)
8898             break;
8899         scan = temp;
8900     }
8901
8902     if (reg_off_by_arg[OP(scan)]) {
8903         ARG_SET(scan, val - scan);
8904     }
8905     else {
8906         NEXT_OFF(scan) = val - scan;
8907     }
8908 }
8909
8910 #ifdef DEBUGGING
8911 /*
8912 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8913 - Look for optimizable sequences at the same time.
8914 - currently only looks for EXACT chains.
8915
8916 This is expermental code. The idea is to use this routine to perform
8917 in place optimizations on branches and groups as they are constructed,
8918 with the long term intention of removing optimization from study_chunk so
8919 that it is purely analytical.
8920
8921 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8922 to control which is which.
8923
8924 */
8925 /* TODO: All four parms should be const */
8926
8927 STATIC U8
8928 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8929 {
8930     dVAR;
8931     register regnode *scan;
8932     U8 exact = PSEUDO;
8933 #ifdef EXPERIMENTAL_INPLACESCAN
8934     I32 min = 0;
8935 #endif
8936     GET_RE_DEBUG_FLAGS_DECL;
8937
8938     PERL_ARGS_ASSERT_REGTAIL_STUDY;
8939
8940
8941     if (SIZE_ONLY)
8942         return exact;
8943
8944     /* Find last node. */
8945
8946     scan = p;
8947     for (;;) {
8948         regnode * const temp = regnext(scan);
8949 #ifdef EXPERIMENTAL_INPLACESCAN
8950         if (PL_regkind[OP(scan)] == EXACT)
8951             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8952                 return EXACT;
8953 #endif
8954         if ( exact ) {
8955             switch (OP(scan)) {
8956                 case EXACT:
8957                 case EXACTF:
8958                 case EXACTFL:
8959                         if( exact == PSEUDO )
8960                             exact= OP(scan);
8961                         else if ( exact != OP(scan) )
8962                             exact= 0;
8963                 case NOTHING:
8964                     break;
8965                 default:
8966                     exact= 0;
8967             }
8968         }
8969         DEBUG_PARSE_r({
8970             SV * const mysv=sv_newmortal();
8971             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8972             regprop(RExC_rx, mysv, scan);
8973             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8974                 SvPV_nolen_const(mysv),
8975                 REG_NODE_NUM(scan),
8976                 PL_reg_name[exact]);
8977         });
8978         if (temp == NULL)
8979             break;
8980         scan = temp;
8981     }
8982     DEBUG_PARSE_r({
8983         SV * const mysv_val=sv_newmortal();
8984         DEBUG_PARSE_MSG("");
8985         regprop(RExC_rx, mysv_val, val);
8986         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8987                       SvPV_nolen_const(mysv_val),
8988                       (IV)REG_NODE_NUM(val),
8989                       (IV)(val - scan)
8990         );
8991     });
8992     if (reg_off_by_arg[OP(scan)]) {
8993         ARG_SET(scan, val - scan);
8994     }
8995     else {
8996         NEXT_OFF(scan) = val - scan;
8997     }
8998
8999     return exact;
9000 }
9001 #endif
9002
9003 /*
9004  - regcurly - a little FSA that accepts {\d+,?\d*}
9005  */
9006 #ifndef PERL_IN_XSUB_RE
9007 I32
9008 Perl_regcurly(register const char *s)
9009 {
9010     PERL_ARGS_ASSERT_REGCURLY;
9011
9012     if (*s++ != '{')
9013         return FALSE;
9014     if (!isDIGIT(*s))
9015         return FALSE;
9016     while (isDIGIT(*s))
9017         s++;
9018     if (*s == ',')
9019         s++;
9020     while (isDIGIT(*s))
9021         s++;
9022     if (*s != '}')
9023         return FALSE;
9024     return TRUE;
9025 }
9026 #endif
9027
9028 /*
9029  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9030  */
9031 #ifdef DEBUGGING
9032 static void
9033 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9034 {
9035     int bit;
9036     int set=0;
9037
9038     for (bit=0; bit<32; bit++) {
9039         if (flags & (1<<bit)) {
9040             if (!set++ && lead)
9041                 PerlIO_printf(Perl_debug_log, "%s",lead);
9042             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9043         }
9044     }
9045     if (lead)  {
9046         if (set)
9047             PerlIO_printf(Perl_debug_log, "\n");
9048         else
9049             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9050     }
9051 }
9052 #endif
9053
9054 void
9055 Perl_regdump(pTHX_ const regexp *r)
9056 {
9057 #ifdef DEBUGGING
9058     dVAR;
9059     SV * const sv = sv_newmortal();
9060     SV *dsv= sv_newmortal();
9061     RXi_GET_DECL(r,ri);
9062     GET_RE_DEBUG_FLAGS_DECL;
9063
9064     PERL_ARGS_ASSERT_REGDUMP;
9065
9066     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9067
9068     /* Header fields of interest. */
9069     if (r->anchored_substr) {
9070         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9071             RE_SV_DUMPLEN(r->anchored_substr), 30);
9072         PerlIO_printf(Perl_debug_log,
9073                       "anchored %s%s at %"IVdf" ",
9074                       s, RE_SV_TAIL(r->anchored_substr),
9075                       (IV)r->anchored_offset);
9076     } else if (r->anchored_utf8) {
9077         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9078             RE_SV_DUMPLEN(r->anchored_utf8), 30);
9079         PerlIO_printf(Perl_debug_log,
9080                       "anchored utf8 %s%s at %"IVdf" ",
9081                       s, RE_SV_TAIL(r->anchored_utf8),
9082                       (IV)r->anchored_offset);
9083     }
9084     if (r->float_substr) {
9085         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9086             RE_SV_DUMPLEN(r->float_substr), 30);
9087         PerlIO_printf(Perl_debug_log,
9088                       "floating %s%s at %"IVdf"..%"UVuf" ",
9089                       s, RE_SV_TAIL(r->float_substr),
9090                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9091     } else if (r->float_utf8) {
9092         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9093             RE_SV_DUMPLEN(r->float_utf8), 30);
9094         PerlIO_printf(Perl_debug_log,
9095                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9096                       s, RE_SV_TAIL(r->float_utf8),
9097                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9098     }
9099     if (r->check_substr || r->check_utf8)
9100         PerlIO_printf(Perl_debug_log,
9101                       (const char *)
9102                       (r->check_substr == r->float_substr
9103                        && r->check_utf8 == r->float_utf8
9104                        ? "(checking floating" : "(checking anchored"));
9105     if (r->extflags & RXf_NOSCAN)
9106         PerlIO_printf(Perl_debug_log, " noscan");
9107     if (r->extflags & RXf_CHECK_ALL)
9108         PerlIO_printf(Perl_debug_log, " isall");
9109     if (r->check_substr || r->check_utf8)
9110         PerlIO_printf(Perl_debug_log, ") ");
9111
9112     if (ri->regstclass) {
9113         regprop(r, sv, ri->regstclass);
9114         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9115     }
9116     if (r->extflags & RXf_ANCH) {
9117         PerlIO_printf(Perl_debug_log, "anchored");
9118         if (r->extflags & RXf_ANCH_BOL)
9119             PerlIO_printf(Perl_debug_log, "(BOL)");
9120         if (r->extflags & RXf_ANCH_MBOL)
9121             PerlIO_printf(Perl_debug_log, "(MBOL)");
9122         if (r->extflags & RXf_ANCH_SBOL)
9123             PerlIO_printf(Perl_debug_log, "(SBOL)");
9124         if (r->extflags & RXf_ANCH_GPOS)
9125             PerlIO_printf(Perl_debug_log, "(GPOS)");
9126         PerlIO_putc(Perl_debug_log, ' ');
9127     }
9128     if (r->extflags & RXf_GPOS_SEEN)
9129         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9130     if (r->intflags & PREGf_SKIP)
9131         PerlIO_printf(Perl_debug_log, "plus ");
9132     if (r->intflags & PREGf_IMPLICIT)
9133         PerlIO_printf(Perl_debug_log, "implicit ");
9134     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9135     if (r->extflags & RXf_EVAL_SEEN)
9136         PerlIO_printf(Perl_debug_log, "with eval ");
9137     PerlIO_printf(Perl_debug_log, "\n");
9138     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9139 #else
9140     PERL_ARGS_ASSERT_REGDUMP;
9141     PERL_UNUSED_CONTEXT;
9142     PERL_UNUSED_ARG(r);
9143 #endif  /* DEBUGGING */
9144 }
9145
9146 /*
9147 - regprop - printable representation of opcode
9148 */
9149 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9150 STMT_START { \
9151         if (do_sep) {                           \
9152             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9153             if (flags & ANYOF_INVERT)           \
9154                 /*make sure the invert info is in each */ \
9155                 sv_catpvs(sv, "^");             \
9156             do_sep = 0;                         \
9157         }                                       \
9158 } STMT_END
9159
9160 void
9161 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9162 {
9163 #ifdef DEBUGGING
9164     dVAR;
9165     register int k;
9166     RXi_GET_DECL(prog,progi);
9167     GET_RE_DEBUG_FLAGS_DECL;
9168
9169     PERL_ARGS_ASSERT_REGPROP;
9170
9171     sv_setpvs(sv, "");
9172
9173     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9174         /* It would be nice to FAIL() here, but this may be called from
9175            regexec.c, and it would be hard to supply pRExC_state. */
9176         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9177     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9178
9179     k = PL_regkind[OP(o)];
9180
9181     if (k == EXACT) {
9182         sv_catpvs(sv, " ");
9183         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9184          * is a crude hack but it may be the best for now since
9185          * we have no flag "this EXACTish node was UTF-8"
9186          * --jhi */
9187         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9188                   PERL_PV_ESCAPE_UNI_DETECT |
9189                   PERL_PV_PRETTY_ELLIPSES   |
9190                   PERL_PV_PRETTY_LTGT       |
9191                   PERL_PV_PRETTY_NOCLEAR
9192                   );
9193     } else if (k == TRIE) {
9194         /* print the details of the trie in dumpuntil instead, as
9195          * progi->data isn't available here */
9196         const char op = OP(o);
9197         const U32 n = ARG(o);
9198         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9199                (reg_ac_data *)progi->data->data[n] :
9200                NULL;
9201         const reg_trie_data * const trie
9202             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9203
9204         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9205         DEBUG_TRIE_COMPILE_r(
9206             Perl_sv_catpvf(aTHX_ sv,
9207                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9208                 (UV)trie->startstate,
9209                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9210                 (UV)trie->wordcount,
9211                 (UV)trie->minlen,
9212                 (UV)trie->maxlen,
9213                 (UV)TRIE_CHARCOUNT(trie),
9214                 (UV)trie->uniquecharcount
9215             )
9216         );
9217         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9218             int i;
9219             int rangestart = -1;
9220             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9221             sv_catpvs(sv, "[");
9222             for (i = 0; i <= 256; i++) {
9223                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9224                     if (rangestart == -1)
9225                         rangestart = i;
9226                 } else if (rangestart != -1) {
9227                     if (i <= rangestart + 3)
9228                         for (; rangestart < i; rangestart++)
9229                             put_byte(sv, rangestart);
9230                     else {
9231                         put_byte(sv, rangestart);
9232                         sv_catpvs(sv, "-");
9233                         put_byte(sv, i - 1);
9234                     }
9235                     rangestart = -1;
9236                 }
9237             }
9238             sv_catpvs(sv, "]");
9239         }
9240
9241     } else if (k == CURLY) {
9242         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9243             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9244         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9245     }
9246     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9247         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9248     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9249         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9250         if ( RXp_PAREN_NAMES(prog) ) {
9251             if ( k != REF || OP(o) < NREF) {
9252                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9253                 SV **name= av_fetch(list, ARG(o), 0 );
9254                 if (name)
9255                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9256             }
9257             else {
9258                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9259                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9260                 I32 *nums=(I32*)SvPVX(sv_dat);
9261                 SV **name= av_fetch(list, nums[0], 0 );
9262                 I32 n;
9263                 if (name) {
9264                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9265                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9266                                     (n ? "," : ""), (IV)nums[n]);
9267                     }
9268                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9269                 }
9270             }
9271         }
9272     } else if (k == GOSUB)
9273         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9274     else if (k == VERB) {
9275         if (!o->flags)
9276             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9277                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9278     } else if (k == LOGICAL)
9279         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9280     else if (k == FOLDCHAR)
9281         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9282     else if (k == ANYOF) {
9283         int i, rangestart = -1;
9284         const U8 flags = ANYOF_FLAGS(o);
9285         int do_sep = 0;
9286
9287         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9288         static const char * const anyofs[] = {
9289             "\\w",
9290             "\\W",
9291             "\\s",
9292             "\\S",
9293             "\\d",
9294             "\\D",
9295             "[:alnum:]",
9296             "[:^alnum:]",
9297             "[:alpha:]",
9298             "[:^alpha:]",
9299             "[:ascii:]",
9300             "[:^ascii:]",
9301             "[:cntrl:]",
9302             "[:^cntrl:]",
9303             "[:graph:]",
9304             "[:^graph:]",
9305             "[:lower:]",
9306             "[:^lower:]",
9307             "[:print:]",
9308             "[:^print:]",
9309             "[:punct:]",
9310             "[:^punct:]",
9311             "[:upper:]",
9312             "[:^upper:]",
9313             "[:xdigit:]",
9314             "[:^xdigit:]",
9315             "[:space:]",
9316             "[:^space:]",
9317             "[:blank:]",
9318             "[:^blank:]"
9319         };
9320
9321         if (flags & ANYOF_LOCALE)
9322             sv_catpvs(sv, "{loc}");
9323         if (flags & ANYOF_FOLD)
9324             sv_catpvs(sv, "{i}");
9325         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9326         if (flags & ANYOF_INVERT)
9327             sv_catpvs(sv, "^");
9328
9329         /* output what the standard cp 0-255 bitmap matches */
9330         for (i = 0; i <= 256; i++) {
9331             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9332                 if (rangestart == -1)
9333                     rangestart = i;
9334             } else if (rangestart != -1) {
9335                 if (i <= rangestart + 3)
9336                     for (; rangestart < i; rangestart++)
9337                         put_byte(sv, rangestart);
9338                 else {
9339                     put_byte(sv, rangestart);
9340                     sv_catpvs(sv, "-");
9341                     put_byte(sv, i - 1);
9342                 }
9343                 do_sep = 1;
9344                 rangestart = -1;
9345             }
9346         }
9347
9348         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9349         /* output any special charclass tests (used mostly under use locale) */
9350         if (o->flags & ANYOF_CLASS)
9351             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9352                 if (ANYOF_CLASS_TEST(o,i)) {
9353                     sv_catpv(sv, anyofs[i]);
9354                     do_sep = 1;
9355                 }
9356
9357         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9358
9359         /* output information about the unicode matching */
9360         if (flags & ANYOF_UNICODE)
9361             sv_catpvs(sv, "{unicode}");
9362         else if (flags & ANYOF_UNICODE_ALL)
9363             sv_catpvs(sv, "{unicode_all}");
9364
9365         {
9366             SV *lv;
9367             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9368
9369             if (lv) {
9370                 if (sw) {
9371                     U8 s[UTF8_MAXBYTES_CASE+1];
9372
9373                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9374                         uvchr_to_utf8(s, i);
9375
9376                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9377                             if (rangestart == -1)
9378                                 rangestart = i;
9379                         } else if (rangestart != -1) {
9380                             if (i <= rangestart + 3)
9381                                 for (; rangestart < i; rangestart++) {
9382                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9383                                     U8 *p;
9384                                     for(p = s; p < e; p++)
9385                                         put_byte(sv, *p);
9386                                 }
9387                             else {
9388                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9389                                 U8 *p;
9390                                 for (p = s; p < e; p++)
9391                                     put_byte(sv, *p);
9392                                 sv_catpvs(sv, "-");
9393                                 e = uvchr_to_utf8(s, i-1);
9394                                 for (p = s; p < e; p++)
9395                                     put_byte(sv, *p);
9396                                 }
9397                                 rangestart = -1;
9398                             }
9399                         }
9400
9401                     sv_catpvs(sv, "..."); /* et cetera */
9402                 }
9403
9404                 {
9405                     char *s = savesvpv(lv);
9406                     char * const origs = s;
9407
9408                     while (*s && *s != '\n')
9409                         s++;
9410
9411                     if (*s == '\n') {
9412                         const char * const t = ++s;
9413
9414                         while (*s) {
9415                             if (*s == '\n')
9416                                 *s = ' ';
9417                             s++;
9418                         }
9419                         if (s[-1] == ' ')
9420                             s[-1] = 0;
9421
9422                         sv_catpv(sv, t);
9423                     }
9424
9425                     Safefree(origs);
9426                 }
9427             }
9428         }
9429
9430         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9431     }
9432     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9433         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9434 #else
9435     PERL_UNUSED_CONTEXT;
9436     PERL_UNUSED_ARG(sv);
9437     PERL_UNUSED_ARG(o);
9438     PERL_UNUSED_ARG(prog);
9439 #endif  /* DEBUGGING */
9440 }
9441
9442 SV *
9443 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9444 {                               /* Assume that RE_INTUIT is set */
9445     dVAR;
9446     struct regexp *const prog = (struct regexp *)SvANY(r);
9447     GET_RE_DEBUG_FLAGS_DECL;
9448
9449     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9450     PERL_UNUSED_CONTEXT;
9451
9452     DEBUG_COMPILE_r(
9453         {
9454             const char * const s = SvPV_nolen_const(prog->check_substr
9455                       ? prog->check_substr : prog->check_utf8);
9456
9457             if (!PL_colorset) reginitcolors();
9458             PerlIO_printf(Perl_debug_log,
9459                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9460                       PL_colors[4],
9461                       prog->check_substr ? "" : "utf8 ",
9462                       PL_colors[5],PL_colors[0],
9463                       s,
9464                       PL_colors[1],
9465                       (strlen(s) > 60 ? "..." : ""));
9466         } );
9467
9468     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9469 }
9470
9471 /*
9472    pregfree()
9473
9474    handles refcounting and freeing the perl core regexp structure. When
9475    it is necessary to actually free the structure the first thing it
9476    does is call the 'free' method of the regexp_engine associated to to
9477    the regexp, allowing the handling of the void *pprivate; member
9478    first. (This routine is not overridable by extensions, which is why
9479    the extensions free is called first.)
9480
9481    See regdupe and regdupe_internal if you change anything here.
9482 */
9483 #ifndef PERL_IN_XSUB_RE
9484 void
9485 Perl_pregfree(pTHX_ REGEXP *r)
9486 {
9487     SvREFCNT_dec(r);
9488 }
9489
9490 void
9491 Perl_pregfree2(pTHX_ REGEXP *rx)
9492 {
9493     dVAR;
9494     struct regexp *const r = (struct regexp *)SvANY(rx);
9495     GET_RE_DEBUG_FLAGS_DECL;
9496
9497     PERL_ARGS_ASSERT_PREGFREE2;
9498
9499     if (r->mother_re) {
9500         ReREFCNT_dec(r->mother_re);
9501     } else {
9502         CALLREGFREE_PVT(rx); /* free the private data */
9503         SvREFCNT_dec(RXp_PAREN_NAMES(r));
9504     }
9505     if (r->substrs) {
9506         SvREFCNT_dec(r->anchored_substr);
9507         SvREFCNT_dec(r->anchored_utf8);
9508         SvREFCNT_dec(r->float_substr);
9509         SvREFCNT_dec(r->float_utf8);
9510         Safefree(r->substrs);
9511     }
9512     RX_MATCH_COPY_FREE(rx);
9513 #ifdef PERL_OLD_COPY_ON_WRITE
9514     SvREFCNT_dec(r->saved_copy);
9515 #endif
9516     Safefree(r->offs);
9517 }
9518
9519 /*  reg_temp_copy()
9520
9521     This is a hacky workaround to the structural issue of match results
9522     being stored in the regexp structure which is in turn stored in
9523     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9524     could be PL_curpm in multiple contexts, and could require multiple
9525     result sets being associated with the pattern simultaneously, such
9526     as when doing a recursive match with (??{$qr})
9527
9528     The solution is to make a lightweight copy of the regexp structure
9529     when a qr// is returned from the code executed by (??{$qr}) this
9530     lightweight copy doesnt actually own any of its data except for
9531     the starp/end and the actual regexp structure itself.
9532
9533 */
9534
9535
9536 REGEXP *
9537 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9538 {
9539     struct regexp *ret;
9540     struct regexp *const r = (struct regexp *)SvANY(rx);
9541     register const I32 npar = r->nparens+1;
9542
9543     PERL_ARGS_ASSERT_REG_TEMP_COPY;
9544
9545     if (!ret_x)
9546         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9547     ret = (struct regexp *)SvANY(ret_x);
9548
9549     (void)ReREFCNT_inc(rx);
9550     /* We can take advantage of the existing "copied buffer" mechanism in SVs
9551        by pointing directly at the buffer, but flagging that the allocated
9552        space in the copy is zero. As we've just done a struct copy, it's now
9553        a case of zero-ing that, rather than copying the current length.  */
9554     SvPV_set(ret_x, RX_WRAPPED(rx));
9555     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9556     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9557            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9558     SvLEN_set(ret_x, 0);
9559     SvSTASH_set(ret_x, NULL);
9560     SvMAGIC_set(ret_x, NULL);
9561     Newx(ret->offs, npar, regexp_paren_pair);
9562     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9563     if (r->substrs) {
9564         Newx(ret->substrs, 1, struct reg_substr_data);
9565         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9566
9567         SvREFCNT_inc_void(ret->anchored_substr);
9568         SvREFCNT_inc_void(ret->anchored_utf8);
9569         SvREFCNT_inc_void(ret->float_substr);
9570         SvREFCNT_inc_void(ret->float_utf8);
9571
9572         /* check_substr and check_utf8, if non-NULL, point to either their
9573            anchored or float namesakes, and don't hold a second reference.  */
9574     }
9575     RX_MATCH_COPIED_off(ret_x);
9576 #ifdef PERL_OLD_COPY_ON_WRITE
9577     ret->saved_copy = NULL;
9578 #endif
9579     ret->mother_re = rx;
9580
9581     return ret_x;
9582 }
9583 #endif
9584
9585 /* regfree_internal()
9586
9587    Free the private data in a regexp. This is overloadable by
9588    extensions. Perl takes care of the regexp structure in pregfree(),
9589    this covers the *pprivate pointer which technically perldoesnt
9590    know about, however of course we have to handle the
9591    regexp_internal structure when no extension is in use.
9592
9593    Note this is called before freeing anything in the regexp
9594    structure.
9595  */
9596
9597 void
9598 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9599 {
9600     dVAR;
9601     struct regexp *const r = (struct regexp *)SvANY(rx);
9602     RXi_GET_DECL(r,ri);
9603     GET_RE_DEBUG_FLAGS_DECL;
9604
9605     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9606
9607     DEBUG_COMPILE_r({
9608         if (!PL_colorset)
9609             reginitcolors();
9610         {
9611             SV *dsv= sv_newmortal();
9612             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9613                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9614             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9615                 PL_colors[4],PL_colors[5],s);
9616         }
9617     });
9618 #ifdef RE_TRACK_PATTERN_OFFSETS
9619     if (ri->u.offsets)
9620         Safefree(ri->u.offsets);             /* 20010421 MJD */
9621 #endif
9622     if (ri->data) {
9623         int n = ri->data->count;
9624         PAD* new_comppad = NULL;
9625         PAD* old_comppad;
9626         PADOFFSET refcnt;
9627
9628         while (--n >= 0) {
9629           /* If you add a ->what type here, update the comment in regcomp.h */
9630             switch (ri->data->what[n]) {
9631             case 'a':
9632             case 's':
9633             case 'S':
9634             case 'u':
9635                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9636                 break;
9637             case 'f':
9638                 Safefree(ri->data->data[n]);
9639                 break;
9640             case 'p':
9641                 new_comppad = MUTABLE_AV(ri->data->data[n]);
9642                 break;
9643             case 'o':
9644                 if (new_comppad == NULL)
9645                     Perl_croak(aTHX_ "panic: pregfree comppad");
9646                 PAD_SAVE_LOCAL(old_comppad,
9647                     /* Watch out for global destruction's random ordering. */
9648                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9649                 );
9650                 OP_REFCNT_LOCK;
9651                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9652                 OP_REFCNT_UNLOCK;
9653                 if (!refcnt)
9654                     op_free((OP_4tree*)ri->data->data[n]);
9655
9656                 PAD_RESTORE_LOCAL(old_comppad);
9657                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9658                 new_comppad = NULL;
9659                 break;
9660             case 'n':
9661                 break;
9662             case 'T':
9663                 { /* Aho Corasick add-on structure for a trie node.
9664                      Used in stclass optimization only */
9665                     U32 refcount;
9666                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9667                     OP_REFCNT_LOCK;
9668                     refcount = --aho->refcount;
9669                     OP_REFCNT_UNLOCK;
9670                     if ( !refcount ) {
9671                         PerlMemShared_free(aho->states);
9672                         PerlMemShared_free(aho->fail);
9673                          /* do this last!!!! */
9674                         PerlMemShared_free(ri->data->data[n]);
9675                         PerlMemShared_free(ri->regstclass);
9676                     }
9677                 }
9678                 break;
9679             case 't':
9680                 {
9681                     /* trie structure. */
9682                     U32 refcount;
9683                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9684                     OP_REFCNT_LOCK;
9685                     refcount = --trie->refcount;
9686                     OP_REFCNT_UNLOCK;
9687                     if ( !refcount ) {
9688                         PerlMemShared_free(trie->charmap);
9689                         PerlMemShared_free(trie->states);
9690                         PerlMemShared_free(trie->trans);
9691                         if (trie->bitmap)
9692                             PerlMemShared_free(trie->bitmap);
9693                         if (trie->jump)
9694                             PerlMemShared_free(trie->jump);
9695                         PerlMemShared_free(trie->wordinfo);
9696                         /* do this last!!!! */
9697                         PerlMemShared_free(ri->data->data[n]);
9698                     }
9699                 }
9700                 break;
9701             default:
9702                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9703             }
9704         }
9705         Safefree(ri->data->what);
9706         Safefree(ri->data);
9707     }
9708
9709     Safefree(ri);
9710 }
9711
9712 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
9713 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
9714 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
9715
9716 /*
9717    re_dup - duplicate a regexp.
9718
9719    This routine is expected to clone a given regexp structure. It is only
9720    compiled under USE_ITHREADS.
9721
9722    After all of the core data stored in struct regexp is duplicated
9723    the regexp_engine.dupe method is used to copy any private data
9724    stored in the *pprivate pointer. This allows extensions to handle
9725    any duplication it needs to do.
9726
9727    See pregfree() and regfree_internal() if you change anything here.
9728 */
9729 #if defined(USE_ITHREADS)
9730 #ifndef PERL_IN_XSUB_RE
9731 void
9732 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9733 {
9734     dVAR;
9735     I32 npar;
9736     const struct regexp *r = (const struct regexp *)SvANY(sstr);
9737     struct regexp *ret = (struct regexp *)SvANY(dstr);
9738
9739     PERL_ARGS_ASSERT_RE_DUP_GUTS;
9740
9741     npar = r->nparens+1;
9742     Newx(ret->offs, npar, regexp_paren_pair);
9743     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9744     if(ret->swap) {
9745         /* no need to copy these */
9746         Newx(ret->swap, npar, regexp_paren_pair);
9747     }
9748
9749     if (ret->substrs) {
9750         /* Do it this way to avoid reading from *r after the StructCopy().
9751            That way, if any of the sv_dup_inc()s dislodge *r from the L1
9752            cache, it doesn't matter.  */
9753         const bool anchored = r->check_substr
9754             ? r->check_substr == r->anchored_substr
9755             : r->check_utf8 == r->anchored_utf8;
9756         Newx(ret->substrs, 1, struct reg_substr_data);
9757         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9758
9759         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9760         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9761         ret->float_substr = sv_dup_inc(ret->float_substr, param);
9762         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9763
9764         /* check_substr and check_utf8, if non-NULL, point to either their
9765            anchored or float namesakes, and don't hold a second reference.  */
9766
9767         if (ret->check_substr) {
9768             if (anchored) {
9769                 assert(r->check_utf8 == r->anchored_utf8);
9770                 ret->check_substr = ret->anchored_substr;
9771                 ret->check_utf8 = ret->anchored_utf8;
9772             } else {
9773                 assert(r->check_substr == r->float_substr);
9774                 assert(r->check_utf8 == r->float_utf8);
9775                 ret->check_substr = ret->float_substr;
9776                 ret->check_utf8 = ret->float_utf8;
9777             }
9778         } else if (ret->check_utf8) {
9779             if (anchored) {
9780                 ret->check_utf8 = ret->anchored_utf8;
9781             } else {
9782                 ret->check_utf8 = ret->float_utf8;
9783             }
9784         }
9785     }
9786
9787     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9788
9789     if (ret->pprivate)
9790         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9791
9792     if (RX_MATCH_COPIED(dstr))
9793         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
9794     else
9795         ret->subbeg = NULL;
9796 #ifdef PERL_OLD_COPY_ON_WRITE
9797     ret->saved_copy = NULL;
9798 #endif
9799
9800     if (ret->mother_re) {
9801         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9802             /* Our storage points directly to our mother regexp, but that's
9803                1: a buffer in a different thread
9804                2: something we no longer hold a reference on
9805                so we need to copy it locally.  */
9806             /* Note we need to sue SvCUR() on our mother_re, because it, in
9807                turn, may well be pointing to its own mother_re.  */
9808             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9809                                    SvCUR(ret->mother_re)+1));
9810             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9811         }
9812         ret->mother_re      = NULL;
9813     }
9814     ret->gofs = 0;
9815 }
9816 #endif /* PERL_IN_XSUB_RE */
9817
9818 /*
9819    regdupe_internal()
9820
9821    This is the internal complement to regdupe() which is used to copy
9822    the structure pointed to by the *pprivate pointer in the regexp.
9823    This is the core version of the extension overridable cloning hook.
9824    The regexp structure being duplicated will be copied by perl prior
9825    to this and will be provided as the regexp *r argument, however
9826    with the /old/ structures pprivate pointer value. Thus this routine
9827    may override any copying normally done by perl.
9828
9829    It returns a pointer to the new regexp_internal structure.
9830 */
9831
9832 void *
9833 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9834 {
9835     dVAR;
9836     struct regexp *const r = (struct regexp *)SvANY(rx);
9837     regexp_internal *reti;
9838     int len, npar;
9839     RXi_GET_DECL(r,ri);
9840
9841     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9842
9843     npar = r->nparens+1;
9844     len = ProgLen(ri);
9845
9846     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9847     Copy(ri->program, reti->program, len+1, regnode);
9848
9849
9850     reti->regstclass = NULL;
9851
9852     if (ri->data) {
9853         struct reg_data *d;
9854         const int count = ri->data->count;
9855         int i;
9856
9857         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9858                 char, struct reg_data);
9859         Newx(d->what, count, U8);
9860
9861         d->count = count;
9862         for (i = 0; i < count; i++) {
9863             d->what[i] = ri->data->what[i];
9864             switch (d->what[i]) {
9865                 /* legal options are one of: sSfpontTua
9866                    see also regcomp.h and pregfree() */
9867             case 'a': /* actually an AV, but the dup function is identical.  */
9868             case 's':
9869             case 'S':
9870             case 'p': /* actually an AV, but the dup function is identical.  */
9871             case 'u': /* actually an HV, but the dup function is identical.  */
9872                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9873                 break;
9874             case 'f':
9875                 /* This is cheating. */
9876                 Newx(d->data[i], 1, struct regnode_charclass_class);
9877                 StructCopy(ri->data->data[i], d->data[i],
9878                             struct regnode_charclass_class);
9879                 reti->regstclass = (regnode*)d->data[i];
9880                 break;
9881             case 'o':
9882                 /* Compiled op trees are readonly and in shared memory,
9883                    and can thus be shared without duplication. */
9884                 OP_REFCNT_LOCK;
9885                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9886                 OP_REFCNT_UNLOCK;
9887                 break;
9888             case 'T':
9889                 /* Trie stclasses are readonly and can thus be shared
9890                  * without duplication. We free the stclass in pregfree
9891                  * when the corresponding reg_ac_data struct is freed.
9892                  */
9893                 reti->regstclass= ri->regstclass;
9894                 /* Fall through */
9895             case 't':
9896                 OP_REFCNT_LOCK;
9897                 ((reg_trie_data*)ri->data->data[i])->refcount++;
9898                 OP_REFCNT_UNLOCK;
9899                 /* Fall through */
9900             case 'n':
9901                 d->data[i] = ri->data->data[i];
9902                 break;
9903             default:
9904                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9905             }
9906         }
9907
9908         reti->data = d;
9909     }
9910     else
9911         reti->data = NULL;
9912
9913     reti->name_list_idx = ri->name_list_idx;
9914
9915 #ifdef RE_TRACK_PATTERN_OFFSETS
9916     if (ri->u.offsets) {
9917         Newx(reti->u.offsets, 2*len+1, U32);
9918         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9919     }
9920 #else
9921     SetProgLen(reti,len);
9922 #endif
9923
9924     return (void*)reti;
9925 }
9926
9927 #endif    /* USE_ITHREADS */
9928
9929 #ifndef PERL_IN_XSUB_RE
9930
9931 /*
9932  - regnext - dig the "next" pointer out of a node
9933  */
9934 regnode *
9935 Perl_regnext(pTHX_ register regnode *p)
9936 {
9937     dVAR;
9938     register I32 offset;
9939
9940     if (!p)
9941         return(NULL);
9942
9943     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
9944         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
9945     }
9946
9947     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9948     if (offset == 0)
9949         return(NULL);
9950
9951     return(p+offset);
9952 }
9953 #endif
9954
9955 STATIC void
9956 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9957 {
9958     va_list args;
9959     STRLEN l1 = strlen(pat1);
9960     STRLEN l2 = strlen(pat2);
9961     char buf[512];
9962     SV *msv;
9963     const char *message;
9964
9965     PERL_ARGS_ASSERT_RE_CROAK2;
9966
9967     if (l1 > 510)
9968         l1 = 510;
9969     if (l1 + l2 > 510)
9970         l2 = 510 - l1;
9971     Copy(pat1, buf, l1 , char);
9972     Copy(pat2, buf + l1, l2 , char);
9973     buf[l1 + l2] = '\n';
9974     buf[l1 + l2 + 1] = '\0';
9975 #ifdef I_STDARG
9976     /* ANSI variant takes additional second argument */
9977     va_start(args, pat2);
9978 #else
9979     va_start(args);
9980 #endif
9981     msv = vmess(buf, &args);
9982     va_end(args);
9983     message = SvPV_const(msv,l1);
9984     if (l1 > 512)
9985         l1 = 512;
9986     Copy(message, buf, l1 , char);
9987     buf[l1-1] = '\0';                   /* Overwrite \n */
9988     Perl_croak(aTHX_ "%s", buf);
9989 }
9990
9991 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
9992
9993 #ifndef PERL_IN_XSUB_RE
9994 void
9995 Perl_save_re_context(pTHX)
9996 {
9997     dVAR;
9998
9999     struct re_save_state *state;
10000
10001     SAVEVPTR(PL_curcop);
10002     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10003
10004     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10005     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10006     SSPUSHUV(SAVEt_RE_STATE);
10007
10008     Copy(&PL_reg_state, state, 1, struct re_save_state);
10009
10010     PL_reg_start_tmp = 0;
10011     PL_reg_start_tmpl = 0;
10012     PL_reg_oldsaved = NULL;
10013     PL_reg_oldsavedlen = 0;
10014     PL_reg_maxiter = 0;
10015     PL_reg_leftiter = 0;
10016     PL_reg_poscache = NULL;
10017     PL_reg_poscache_size = 0;
10018 #ifdef PERL_OLD_COPY_ON_WRITE
10019     PL_nrs = NULL;
10020 #endif
10021
10022     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10023     if (PL_curpm) {
10024         const REGEXP * const rx = PM_GETRE(PL_curpm);
10025         if (rx) {
10026             U32 i;
10027             for (i = 1; i <= RX_NPARENS(rx); i++) {
10028                 char digits[TYPE_CHARS(long)];
10029                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10030                 GV *const *const gvp
10031                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10032
10033                 if (gvp) {
10034                     GV * const gv = *gvp;
10035                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10036                         save_scalar(gv);
10037                 }
10038             }
10039         }
10040     }
10041 }
10042 #endif
10043
10044 static void
10045 clear_re(pTHX_ void *r)
10046 {
10047     dVAR;
10048     ReREFCNT_dec((REGEXP *)r);
10049 }
10050
10051 #ifdef DEBUGGING
10052
10053 STATIC void
10054 S_put_byte(pTHX_ SV *sv, int c)
10055 {
10056     PERL_ARGS_ASSERT_PUT_BYTE;
10057
10058     /* Our definition of isPRINT() ignores locales, so only bytes that are
10059        not part of UTF-8 are considered printable. I assume that the same
10060        holds for UTF-EBCDIC.
10061        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10062        which Wikipedia says:
10063
10064        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10065        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10066        identical, to the ASCII delete (DEL) or rubout control character.
10067        ) So the old condition can be simplified to !isPRINT(c)  */
10068     if (!isPRINT(c))
10069         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
10070     else {
10071         const char string = c;
10072         if (c == '-' || c == ']' || c == '\\' || c == '^')
10073             sv_catpvs(sv, "\\");
10074         sv_catpvn(sv, &string, 1);
10075     }
10076 }
10077
10078
10079 #define CLEAR_OPTSTART \
10080     if (optstart) STMT_START { \
10081             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10082             optstart=NULL; \
10083     } STMT_END
10084
10085 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10086
10087 STATIC const regnode *
10088 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10089             const regnode *last, const regnode *plast,
10090             SV* sv, I32 indent, U32 depth)
10091 {
10092     dVAR;
10093     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
10094     register const regnode *next;
10095     const regnode *optstart= NULL;
10096
10097     RXi_GET_DECL(r,ri);
10098     GET_RE_DEBUG_FLAGS_DECL;
10099
10100     PERL_ARGS_ASSERT_DUMPUNTIL;
10101
10102 #ifdef DEBUG_DUMPUNTIL
10103     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10104         last ? last-start : 0,plast ? plast-start : 0);
10105 #endif
10106
10107     if (plast && plast < last)
10108         last= plast;
10109
10110     while (PL_regkind[op] != END && (!last || node < last)) {
10111         /* While that wasn't END last time... */
10112         NODE_ALIGN(node);
10113         op = OP(node);
10114         if (op == CLOSE || op == WHILEM)
10115             indent--;
10116         next = regnext((regnode *)node);
10117
10118         /* Where, what. */
10119         if (OP(node) == OPTIMIZED) {
10120             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10121                 optstart = node;
10122             else
10123                 goto after_print;
10124         } else
10125             CLEAR_OPTSTART;
10126
10127         regprop(r, sv, node);
10128         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10129                       (int)(2*indent + 1), "", SvPVX_const(sv));
10130
10131         if (OP(node) != OPTIMIZED) {
10132             if (next == NULL)           /* Next ptr. */
10133                 PerlIO_printf(Perl_debug_log, " (0)");
10134             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10135                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10136             else
10137                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10138             (void)PerlIO_putc(Perl_debug_log, '\n');
10139         }
10140
10141       after_print:
10142         if (PL_regkind[(U8)op] == BRANCHJ) {
10143             assert(next);
10144             {
10145                 register const regnode *nnode = (OP(next) == LONGJMP
10146                                              ? regnext((regnode *)next)
10147                                              : next);
10148                 if (last && nnode > last)
10149                     nnode = last;
10150                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10151             }
10152         }
10153         else if (PL_regkind[(U8)op] == BRANCH) {
10154             assert(next);
10155             DUMPUNTIL(NEXTOPER(node), next);
10156         }
10157         else if ( PL_regkind[(U8)op]  == TRIE ) {
10158             const regnode *this_trie = node;
10159             const char op = OP(node);
10160             const U32 n = ARG(node);
10161             const reg_ac_data * const ac = op>=AHOCORASICK ?
10162                (reg_ac_data *)ri->data->data[n] :
10163                NULL;
10164             const reg_trie_data * const trie =
10165                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10166 #ifdef DEBUGGING
10167             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10168 #endif
10169             const regnode *nextbranch= NULL;
10170             I32 word_idx;
10171             sv_setpvs(sv, "");
10172             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10173                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10174
10175                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10176                    (int)(2*(indent+3)), "",
10177                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10178                             PL_colors[0], PL_colors[1],
10179                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10180                             PERL_PV_PRETTY_ELLIPSES    |
10181                             PERL_PV_PRETTY_LTGT
10182                             )
10183                             : "???"
10184                 );
10185                 if (trie->jump) {
10186                     U16 dist= trie->jump[word_idx+1];
10187                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10188                                   (UV)((dist ? this_trie + dist : next) - start));
10189                     if (dist) {
10190                         if (!nextbranch)
10191                             nextbranch= this_trie + trie->jump[0];
10192                         DUMPUNTIL(this_trie + dist, nextbranch);
10193                     }
10194                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10195                         nextbranch= regnext((regnode *)nextbranch);
10196                 } else {
10197                     PerlIO_printf(Perl_debug_log, "\n");
10198                 }
10199             }
10200             if (last && next > last)
10201                 node= last;
10202             else
10203                 node= next;
10204         }
10205         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10206             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10207                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10208         }
10209         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10210             assert(next);
10211             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10212         }
10213         else if ( op == PLUS || op == STAR) {
10214             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10215         }
10216         else if (op == ANYOF) {
10217             /* arglen 1 + class block */
10218             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10219                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10220             node = NEXTOPER(node);
10221         }
10222         else if (PL_regkind[(U8)op] == EXACT) {
10223             /* Literal string, where present. */
10224             node += NODE_SZ_STR(node) - 1;
10225             node = NEXTOPER(node);
10226         }
10227         else {
10228             node = NEXTOPER(node);
10229             node += regarglen[(U8)op];
10230         }
10231         if (op == CURLYX || op == OPEN)
10232             indent++;
10233     }
10234     CLEAR_OPTSTART;
10235 #ifdef DEBUG_DUMPUNTIL
10236     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10237 #endif
10238     return node;
10239 }
10240
10241 #endif  /* DEBUGGING */
10242
10243 /*
10244  * Local variables:
10245  * c-indentation-style: bsd
10246  * c-basic-offset: 4
10247  * indent-tabs-mode: t
10248  * End:
10249  *
10250  * ex: set ts=8 sts=4 sw=4 noet:
10251  */