src/5013005/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to  pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #ifdef op
  89 #undef op
  90 #endif /* op */
  91
  92 #ifdef MSDOS
  93 #  if defined(BUGGY_MSC6)
  94  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  95 #    pragma optimize("a",off)
  96  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  97 #    pragma optimize("w",on )
  98 #  endif /* BUGGY_MSC6 */
  99 #endif /* MSDOS */
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105 typedef struct RExC_state_t {
 106     U32         flags;                  /* are we folding, multilining? */
 107     char        *precomp;               /* uncompiled string. */
 108     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 109     regexp      *rx;                    /* perl core regexp structure */
 110     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 111     char        *start;                 /* Start of input for compile */
 112     char        *end;                   /* End of input for compile */
 113     char        *parse;                 /* Input-scan pointer. */
 114     I32         whilem_seen;            /* number of WHILEM in this expr */
 115     regnode     *emit_start;            /* Start of emitted-code area */
 116     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 117     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 118     I32         naughty;                /* How bad is this pattern? */
 119     I32         sawback;                /* Did we see \1, ...? */
 120     U32         seen;
 121     I32         size;                   /* Code size. */
 122     I32         npar;                   /* Capture buffer count, (OPEN). */
 123     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 124     I32         nestroot;               /* root parens we are in - used by accept */
 125     I32         extralen;
 126     I32         seen_zerolen;
 127     I32         seen_evals;
 128     regnode     **open_parens;          /* pointers to open parens */
 129     regnode     **close_parens;         /* pointers to close parens */
 130     regnode     *opend;                 /* END node in program */
 131     I32         utf8;           /* whether the pattern is utf8 or not */
 132     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 133                                 /* XXX use this for future optimisation of case
 134                                  * where pattern must be upgraded to utf8. */
 135     HV          *paren_names;           /* Paren names */
 136
 137     regnode     **recurse;              /* Recurse regops */
 138     I32         recurse_count;          /* Number of recurse regops */
 139 #if ADD_TO_REGEXEC
 140     char        *starttry;              /* -Dr: where regtry was called. */
 141 #define RExC_starttry   (pRExC_state->starttry)
 142 #endif
 143 #ifdef DEBUGGING
 144     const char  *lastparse;
 145     I32         lastnum;
 146     AV          *paren_name_list;       /* idx -> name */
 147 #define RExC_lastparse  (pRExC_state->lastparse)
 148 #define RExC_lastnum    (pRExC_state->lastnum)
 149 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 150 #endif
 151 } RExC_state_t;
 152
 153 #define RExC_flags      (pRExC_state->flags)
 154 #define RExC_precomp    (pRExC_state->precomp)
 155 #define RExC_rx_sv      (pRExC_state->rx_sv)
 156 #define RExC_rx         (pRExC_state->rx)
 157 #define RExC_rxi        (pRExC_state->rxi)
 158 #define RExC_start      (pRExC_state->start)
 159 #define RExC_end        (pRExC_state->end)
 160 #define RExC_parse      (pRExC_state->parse)
 161 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 162 #ifdef RE_TRACK_PATTERN_OFFSETS
 163 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 164 #endif
 165 #define RExC_emit       (pRExC_state->emit)
 166 #define RExC_emit_start (pRExC_state->emit_start)
 167 #define RExC_emit_bound (pRExC_state->emit_bound)
 168 #define RExC_naughty    (pRExC_state->naughty)
 169 #define RExC_sawback    (pRExC_state->sawback)
 170 #define RExC_seen       (pRExC_state->seen)
 171 #define RExC_size       (pRExC_state->size)
 172 #define RExC_npar       (pRExC_state->npar)
 173 #define RExC_nestroot   (pRExC_state->nestroot)
 174 #define RExC_extralen   (pRExC_state->extralen)
 175 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 176 #define RExC_seen_evals (pRExC_state->seen_evals)
 177 #define RExC_utf8       (pRExC_state->utf8)
 178 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 179 #define RExC_open_parens        (pRExC_state->open_parens)
 180 #define RExC_close_parens       (pRExC_state->close_parens)
 181 #define RExC_opend      (pRExC_state->opend)
 182 #define RExC_paren_names        (pRExC_state->paren_names)
 183 #define RExC_recurse    (pRExC_state->recurse)
 184 #define RExC_recurse_count      (pRExC_state->recurse_count)
 185
 186
 187 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 188 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 189         ((*s) == '{' && regcurly(s)))
 190
 191 #ifdef SPSTART
 192 #undef SPSTART          /* dratted cpp namespace... */
 193 #endif
 194 /*
 195  * Flags to be passed up and down.
 196  */
 197 #define WORST           0       /* Worst case. */
 198 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 199
 200 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 201  * character, and if utf8, must be invariant. */
 202 #define SIMPLE          0x02
 203 #define SPSTART         0x04    /* Starts with * or +. */
 204 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 205 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 206
 207 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 208
 209 /* whether trie related optimizations are enabled */
 210 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 211 #define TRIE_STUDY_OPT
 212 #define FULL_TRIE_STUDY
 213 #define TRIE_STCLASS
 214 #endif
 215
 216
 217
 218 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 219 #define PBITVAL(paren) (1 << ((paren) & 7))
 220 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 221 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 222 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 223
 224 /* If not already in utf8, do a longjmp back to the beginning */
 225 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 226 #define REQUIRE_UTF8    STMT_START {                                       \
 227                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 228                         } STMT_END
 229
 230 /* About scan_data_t.
 231
 232   During optimisation we recurse through the regexp program performing
 233   various inplace (keyhole style) optimisations. In addition study_chunk
 234   and scan_commit populate this data structure with information about
 235   what strings MUST appear in the pattern. We look for the longest
 236   string that must appear for at a fixed location, and we look for the
 237   longest string that may appear at a floating location. So for instance
 238   in the pattern:
 239
 240     /FOO[xX]A.*B[xX]BAR/
 241
 242   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 243   strings (because they follow a .* construct). study_chunk will identify
 244   both FOO and BAR as being the longest fixed and floating strings respectively.
 245
 246   The strings can be composites, for instance
 247
 248      /(f)(o)(o)/
 249
 250   will result in a composite fixed substring 'foo'.
 251
 252   For each string some basic information is maintained:
 253
 254   - offset or min_offset
 255     This is the position the string must appear at, or not before.
 256     It also implicitly (when combined with minlenp) tells us how many
 257     character must match before the string we are searching.
 258     Likewise when combined with minlenp and the length of the string
 259     tells us how many characters must appear after the string we have
 260     found.
 261
 262   - max_offset
 263     Only used for floating strings. This is the rightmost point that
 264     the string can appear at. Ifset to I32 max it indicates that the
 265     string can occur infinitely far to the right.
 266
 267   - minlenp
 268     A pointer to the minimum length of the pattern that the string
 269     was found inside. This is important as in the case of positive
 270     lookahead or positive lookbehind we can have multiple patterns
 271     involved. Consider
 272
 273     /(?=FOO).*F/
 274
 275     The minimum length of the pattern overall is 3, the minimum length
 276     of the lookahead part is 3, but the minimum length of the part that
 277     will actually match is 1. So 'FOO's minimum length is 3, but the
 278     minimum length for the F is 1. This is important as the minimum length
 279     is used to determine offsets in front of and behind the string being
 280     looked for.  Since strings can be composites this is the length of the
 281     pattern at the time it was commited with a scan_commit. Note that
 282     the length is calculated by study_chunk, so that the minimum lengths
 283     are not known until the full pattern has been compiled, thus the
 284     pointer to the value.
 285
 286   - lookbehind
 287
 288     In the case of lookbehind the string being searched for can be
 289     offset past the start point of the final matching string.
 290     If this value was just blithely removed from the min_offset it would
 291     invalidate some of the calculations for how many chars must match
 292     before or after (as they are derived from min_offset and minlen and
 293     the length of the string being searched for).
 294     When the final pattern is compiled and the data is moved from the
 295     scan_data_t structure into the regexp structure the information
 296     about lookbehind is factored in, with the information that would
 297     have been lost precalculated in the end_shift field for the
 298     associated string.
 299
 300   The fields pos_min and pos_delta are used to store the minimum offset
 301   and the delta to the maximum offset at the current point in the pattern.
 302
 303 */
 304
 305 typedef struct scan_data_t {
 306     /*I32 len_min;      unused */
 307     /*I32 len_delta;    unused */
 308     I32 pos_min;
 309     I32 pos_delta;
 310     SV *last_found;
 311     I32 last_end;           /* min value, <0 unless valid. */
 312     I32 last_start_min;
 313     I32 last_start_max;
 314     SV **longest;           /* Either &l_fixed, or &l_float. */
 315     SV *longest_fixed;      /* longest fixed string found in pattern */
 316     I32 offset_fixed;       /* offset where it starts */
 317     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 318     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 319     SV *longest_float;      /* longest floating string found in pattern */
 320     I32 offset_float_min;   /* earliest point in string it can appear */
 321     I32 offset_float_max;   /* latest point in string it can appear */
 322     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 323     I32 lookbehind_float;   /* is the position of the string modified by LB */
 324     I32 flags;
 325     I32 whilem_c;
 326     I32 *last_closep;
 327     struct regnode_charclass_class *start_class;
 328 } scan_data_t;
 329
 330 /*
 331  * Forward declarations for pregcomp()'s friends.
 332  */
 333
 334 static const scan_data_t zero_scan_data =
 335   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 336
 337 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 338 #define SF_BEFORE_SEOL          0x0001
 339 #define SF_BEFORE_MEOL          0x0002
 340 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 341 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 342
 343 #ifdef NO_UNARY_PLUS
 344 #  define SF_FIX_SHIFT_EOL      (0+2)
 345 #  define SF_FL_SHIFT_EOL               (0+4)
 346 #else
 347 #  define SF_FIX_SHIFT_EOL      (+2)
 348 #  define SF_FL_SHIFT_EOL               (+4)
 349 #endif
 350
 351 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 352 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 353
 354 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 355 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 356 #define SF_IS_INF               0x0040
 357 #define SF_HAS_PAR              0x0080
 358 #define SF_IN_PAR               0x0100
 359 #define SF_HAS_EVAL             0x0200
 360 #define SCF_DO_SUBSTR           0x0400
 361 #define SCF_DO_STCLASS_AND      0x0800
 362 #define SCF_DO_STCLASS_OR       0x1000
 363 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 364 #define SCF_WHILEM_VISITED_POS  0x2000
 365
 366 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 367 #define SCF_SEEN_ACCEPT         0x8000
 368
 369 #define UTF (RExC_utf8 != 0)
 370 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
 371 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
 372
 373 #define OOB_UNICODE             12345678
 374 #define OOB_NAMEDCLASS          -1
 375
 376 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 377 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 378
 379
 380 /* length of regex to show in messages that don't mark a position within */
 381 #define RegexLengthToShowInErrorMessages 127
 382
 383 /*
 384  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 385  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 386  * op/pragma/warn/regcomp.
 387  */
 388 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 389 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 390
 391 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 392
 393 /*
 394  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 395  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 396  * "...".
 397  */
 398 #define _FAIL(code) STMT_START {                                        \
 399     const char *ellipses = "";                                          \
 400     IV len = RExC_end - RExC_precomp;                                   \
 401                                                                         \
 402     if (!SIZE_ONLY)                                                     \
 403         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 404     if (len > RegexLengthToShowInErrorMessages) {                       \
 405         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 406         len = RegexLengthToShowInErrorMessages - 10;                    \
 407         ellipses = "...";                                               \
 408     }                                                                   \
 409     code;                                                               \
 410 } STMT_END
 411
 412 #define FAIL(msg) _FAIL(                            \
 413     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 414             msg, (int)len, RExC_precomp, ellipses))
 415
 416 #define FAIL2(msg,arg) _FAIL(                       \
 417     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 418             arg, (int)len, RExC_precomp, ellipses))
 419
 420 /*
 421  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 422  */
 423 #define Simple_vFAIL(m) STMT_START {                                    \
 424     const IV offset = RExC_parse - RExC_precomp;                        \
 425     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 426             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 427 } STMT_END
 428
 429 /*
 430  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 431  */
 432 #define vFAIL(m) STMT_START {                           \
 433     if (!SIZE_ONLY)                                     \
 434         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 435     Simple_vFAIL(m);                                    \
 436 } STMT_END
 437
 438 /*
 439  * Like Simple_vFAIL(), but accepts two arguments.
 440  */
 441 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 442     const IV offset = RExC_parse - RExC_precomp;                        \
 443     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 444             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 445 } STMT_END
 446
 447 /*
 448  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 449  */
 450 #define vFAIL2(m,a1) STMT_START {                       \
 451     if (!SIZE_ONLY)                                     \
 452         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 453     Simple_vFAIL2(m, a1);                               \
 454 } STMT_END
 455
 456
 457 /*
 458  * Like Simple_vFAIL(), but accepts three arguments.
 459  */
 460 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 461     const IV offset = RExC_parse - RExC_precomp;                \
 462     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 463             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 464 } STMT_END
 465
 466 /*
 467  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 468  */
 469 #define vFAIL3(m,a1,a2) STMT_START {                    \
 470     if (!SIZE_ONLY)                                     \
 471         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 472     Simple_vFAIL3(m, a1, a2);                           \
 473 } STMT_END
 474
 475 /*
 476  * Like Simple_vFAIL(), but accepts four arguments.
 477  */
 478 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 479     const IV offset = RExC_parse - RExC_precomp;                \
 480     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 481             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 482 } STMT_END
 483
 484 #define ckWARNreg(loc,m) STMT_START {                                   \
 485     const IV offset = loc - RExC_precomp;                               \
 486     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 487             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 488 } STMT_END
 489
 490 #define ckWARNregdep(loc,m) STMT_START {                                \
 491     const IV offset = loc - RExC_precomp;                               \
 492     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 493             m REPORT_LOCATION,                                          \
 494             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 495 } STMT_END
 496
 497 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 498     const IV offset = loc - RExC_precomp;                               \
 499     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 500             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 501 } STMT_END
 502
 503 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 504     const IV offset = loc - RExC_precomp;                               \
 505     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 506             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 507 } STMT_END
 508
 509 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 510     const IV offset = loc - RExC_precomp;                               \
 511     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 512             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 513 } STMT_END
 514
 515 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 516     const IV offset = loc - RExC_precomp;                               \
 517     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 518             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 519 } STMT_END
 520
 521 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 522     const IV offset = loc - RExC_precomp;                               \
 523     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 524             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 525 } STMT_END
 526
 527 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 528     const IV offset = loc - RExC_precomp;                               \
 529     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 530             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 531 } STMT_END
 532
 533
 534 /* Allow for side effects in s */
 535 #define REGC(c,s) STMT_START {                  \
 536     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 537 } STMT_END
 538
 539 /* Macros for recording node offsets.   20001227 mjd@plover.com
 540  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 541  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 542  * Element 0 holds the number n.
 543  * Position is 1 indexed.
 544  */
 545 #ifndef RE_TRACK_PATTERN_OFFSETS
 546 #define Set_Node_Offset_To_R(node,byte)
 547 #define Set_Node_Offset(node,byte)
 548 #define Set_Cur_Node_Offset
 549 #define Set_Node_Length_To_R(node,len)
 550 #define Set_Node_Length(node,len)
 551 #define Set_Node_Cur_Length(node)
 552 #define Node_Offset(n)
 553 #define Node_Length(n)
 554 #define Set_Node_Offset_Length(node,offset,len)
 555 #define ProgLen(ri) ri->u.proglen
 556 #define SetProgLen(ri,x) ri->u.proglen = x
 557 #else
 558 #define ProgLen(ri) ri->u.offsets[0]
 559 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 560 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 561     if (! SIZE_ONLY) {                                                  \
 562         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 563                     __LINE__, (int)(node), (int)(byte)));               \
 564         if((node) < 0) {                                                \
 565             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 566         } else {                                                        \
 567             RExC_offsets[2*(node)-1] = (byte);                          \
 568         }                                                               \
 569     }                                                                   \
 570 } STMT_END
 571
 572 #define Set_Node_Offset(node,byte) \
 573     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 574 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 575
 576 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 577     if (! SIZE_ONLY) {                                                  \
 578         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 579                 __LINE__, (int)(node), (int)(len)));                    \
 580         if((node) < 0) {                                                \
 581             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 582         } else {                                                        \
 583             RExC_offsets[2*(node)] = (len);                             \
 584         }                                                               \
 585     }                                                                   \
 586 } STMT_END
 587
 588 #define Set_Node_Length(node,len) \
 589     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 590 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 591 #define Set_Node_Cur_Length(node) \
 592     Set_Node_Length(node, RExC_parse - parse_start)
 593
 594 /* Get offsets and lengths */
 595 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 596 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 597
 598 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 599     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 600     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 601 } STMT_END
 602 #endif
 603
 604 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 605 #define EXPERIMENTAL_INPLACESCAN
 606 #endif /*RE_TRACK_PATTERN_OFFSETS*/
 607
 608 #define DEBUG_STUDYDATA(str,data,depth)                              \
 609 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 610     PerlIO_printf(Perl_debug_log,                                    \
 611         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 612         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 613         (int)(depth)*2, "",                                          \
 614         (IV)((data)->pos_min),                                       \
 615         (IV)((data)->pos_delta),                                     \
 616         (UV)((data)->flags),                                         \
 617         (IV)((data)->whilem_c),                                      \
 618         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 619         is_inf ? "INF " : ""                                         \
 620     );                                                               \
 621     if ((data)->last_found)                                          \
 622         PerlIO_printf(Perl_debug_log,                                \
 623             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 624             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 625             SvPVX_const((data)->last_found),                         \
 626             (IV)((data)->last_end),                                  \
 627             (IV)((data)->last_start_min),                            \
 628             (IV)((data)->last_start_max),                            \
 629             ((data)->longest &&                                      \
 630              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 631             SvPVX_const((data)->longest_fixed),                      \
 632             (IV)((data)->offset_fixed),                              \
 633             ((data)->longest &&                                      \
 634              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 635             SvPVX_const((data)->longest_float),                      \
 636             (IV)((data)->offset_float_min),                          \
 637             (IV)((data)->offset_float_max)                           \
 638         );                                                           \
 639     PerlIO_printf(Perl_debug_log,"\n");                              \
 640 });
 641
 642 static void clear_re(pTHX_ void *r);
 643
 644 /* Mark that we cannot extend a found fixed substring at this point.
 645    Update the longest found anchored substring and the longest found
 646    floating substrings if needed. */
 647
 648 STATIC void
 649 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 650 {
 651     const STRLEN l = CHR_SVLEN(data->last_found);
 652     const STRLEN old_l = CHR_SVLEN(*data->longest);
 653     GET_RE_DEBUG_FLAGS_DECL;
 654
 655     PERL_ARGS_ASSERT_SCAN_COMMIT;
 656
 657     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 658         SvSetMagicSV(*data->longest, data->last_found);
 659         if (*data->longest == data->longest_fixed) {
 660             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 661             if (data->flags & SF_BEFORE_EOL)
 662                 data->flags
 663                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 664             else
 665                 data->flags &= ~SF_FIX_BEFORE_EOL;
 666             data->minlen_fixed=minlenp;
 667             data->lookbehind_fixed=0;
 668         }
 669         else { /* *data->longest == data->longest_float */
 670             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 671             data->offset_float_max = (l
 672                                       ? data->last_start_max
 673                                       : data->pos_min + data->pos_delta);
 674             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 675                 data->offset_float_max = I32_MAX;
 676             if (data->flags & SF_BEFORE_EOL)
 677                 data->flags
 678                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 679             else
 680                 data->flags &= ~SF_FL_BEFORE_EOL;
 681             data->minlen_float=minlenp;
 682             data->lookbehind_float=0;
 683         }
 684     }
 685     SvCUR_set(data->last_found, 0);
 686     {
 687         SV * const sv = data->last_found;
 688         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 689             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 690             if (mg)
 691                 mg->mg_len = 0;
 692         }
 693     }
 694     data->last_end = -1;
 695     data->flags &= ~SF_BEFORE_EOL;
 696     DEBUG_STUDYDATA("commit: ",data,0);
 697 }
 698
 699 /* Can match anything (initialization) */
 700 STATIC void
 701 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 702 {
 703     PERL_ARGS_ASSERT_CL_ANYTHING;
 704
 705     ANYOF_CLASS_ZERO(cl);
 706     ANYOF_BITMAP_SETALL(cl);
 707     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 708     if (LOC)
 709         cl->flags |= ANYOF_LOCALE;
 710 }
 711
 712 /* Can match anything (initialization) */
 713 STATIC int
 714 S_cl_is_anything(const struct regnode_charclass_class *cl)
 715 {
 716     int value;
 717
 718     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 719
 720     for (value = 0; value <= ANYOF_MAX; value += 2)
 721         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 722             return 1;
 723     if (!(cl->flags & ANYOF_UNICODE_ALL))
 724         return 0;
 725     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 726         return 0;
 727     return 1;
 728 }
 729
 730 /* Can match anything (initialization) */
 731 STATIC void
 732 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 733 {
 734     PERL_ARGS_ASSERT_CL_INIT;
 735
 736     Zero(cl, 1, struct regnode_charclass_class);
 737     cl->type = ANYOF;
 738     cl_anything(pRExC_state, cl);
 739 }
 740
 741 STATIC void
 742 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 743 {
 744     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 745
 746     Zero(cl, 1, struct regnode_charclass_class);
 747     cl->type = ANYOF;
 748     cl_anything(pRExC_state, cl);
 749     if (LOC)
 750         cl->flags |= ANYOF_LOCALE;
 751 }
 752
 753 /* 'And' a given class with another one.  Can create false positives */
 754 /* We assume that cl is not inverted */
 755 STATIC void
 756 S_cl_and(struct regnode_charclass_class *cl,
 757         const struct regnode_charclass_class *and_with)
 758 {
 759     PERL_ARGS_ASSERT_CL_AND;
 760
 761     assert(and_with->type == ANYOF);
 762     if (!(and_with->flags & ANYOF_CLASS)
 763         && !(cl->flags & ANYOF_CLASS)
 764         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 765         && !(and_with->flags & ANYOF_FOLD)
 766         && !(cl->flags & ANYOF_FOLD)) {
 767         int i;
 768
 769         if (and_with->flags & ANYOF_INVERT)
 770             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 771                 cl->bitmap[i] &= ~and_with->bitmap[i];
 772         else
 773             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 774                 cl->bitmap[i] &= and_with->bitmap[i];
 775     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 776     if (!(and_with->flags & ANYOF_EOS))
 777         cl->flags &= ~ANYOF_EOS;
 778
 779     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
 780         !(and_with->flags & ANYOF_INVERT)) {
 781         cl->flags &= ~ANYOF_UNICODE_ALL;
 782         cl->flags |= ANYOF_UNICODE;
 783         ARG_SET(cl, ARG(and_with));
 784     }
 785     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 786         !(and_with->flags & ANYOF_INVERT))
 787         cl->flags &= ~ANYOF_UNICODE_ALL;
 788     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
 789         !(and_with->flags & ANYOF_INVERT))
 790         cl->flags &= ~ANYOF_UNICODE;
 791 }
 792
 793 /* 'OR' a given class with another one.  Can create false positives */
 794 /* We assume that cl is not inverted */
 795 STATIC void
 796 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 797 {
 798     PERL_ARGS_ASSERT_CL_OR;
 799
 800     if (or_with->flags & ANYOF_INVERT) {
 801         /* We do not use
 802          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 803          *   <= (B1 | !B2) | (CL1 | !CL2)
 804          * which is wasteful if CL2 is small, but we ignore CL2:
 805          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 806          * XXXX Can we handle case-fold?  Unclear:
 807          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 808          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 809          */
 810         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 811              && !(or_with->flags & ANYOF_FOLD)
 812              && !(cl->flags & ANYOF_FOLD) ) {
 813             int i;
 814
 815             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 816                 cl->bitmap[i] |= ~or_with->bitmap[i];
 817         } /* XXXX: logic is complicated otherwise */
 818         else {
 819             cl_anything(pRExC_state, cl);
 820         }
 821     } else {
 822         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 823         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 824              && (!(or_with->flags & ANYOF_FOLD)
 825                  || (cl->flags & ANYOF_FOLD)) ) {
 826             int i;
 827
 828             /* OR char bitmap and class bitmap separately */
 829             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 830                 cl->bitmap[i] |= or_with->bitmap[i];
 831             if (or_with->flags & ANYOF_CLASS) {
 832                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 833                     cl->classflags[i] |= or_with->classflags[i];
 834                 cl->flags |= ANYOF_CLASS;
 835             }
 836         }
 837         else { /* XXXX: logic is complicated, leave it along for a moment. */
 838             cl_anything(pRExC_state, cl);
 839         }
 840     }
 841     if (or_with->flags & ANYOF_EOS)
 842         cl->flags |= ANYOF_EOS;
 843
 844     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
 845         ARG(cl) != ARG(or_with)) {
 846         cl->flags |= ANYOF_UNICODE_ALL;
 847         cl->flags &= ~ANYOF_UNICODE;
 848     }
 849     if (or_with->flags & ANYOF_UNICODE_ALL) {
 850         cl->flags |= ANYOF_UNICODE_ALL;
 851         cl->flags &= ~ANYOF_UNICODE;
 852     }
 853 }
 854
 855 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 856 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 857 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 858 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 859
 860
 861 #ifdef DEBUGGING
 862 /*
 863    dump_trie(trie,widecharmap,revcharmap)
 864    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 865    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 866
 867    These routines dump out a trie in a somewhat readable format.
 868    The _interim_ variants are used for debugging the interim
 869    tables that are used to generate the final compressed
 870    representation which is what dump_trie expects.
 871
 872    Part of the reason for their existance is to provide a form
 873    of documentation as to how the different representations function.
 874
 875 */
 876
 877 /*
 878   Dumps the final compressed table form of the trie to Perl_debug_log.
 879   Used for debugging make_trie().
 880 */
 881
 882 STATIC void
 883 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 884             AV *revcharmap, U32 depth)
 885 {
 886     U32 state;
 887     SV *sv=sv_newmortal();
 888     int colwidth= widecharmap ? 6 : 4;
 889     U16 word;
 890     GET_RE_DEBUG_FLAGS_DECL;
 891
 892     PERL_ARGS_ASSERT_DUMP_TRIE;
 893
 894     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 895         (int)depth * 2 + 2,"",
 896         "Match","Base","Ofs" );
 897
 898     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 899         SV ** const tmp = av_fetch( revcharmap, state, 0);
 900         if ( tmp ) {
 901             PerlIO_printf( Perl_debug_log, "%*s",
 902                 colwidth,
 903                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 904                             PL_colors[0], PL_colors[1],
 905                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 906                             PERL_PV_ESCAPE_FIRSTCHAR
 907                 )
 908             );
 909         }
 910     }
 911     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 912         (int)depth * 2 + 2,"");
 913
 914     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 915         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 916     PerlIO_printf( Perl_debug_log, "\n");
 917
 918     for( state = 1 ; state < trie->statecount ; state++ ) {
 919         const U32 base = trie->states[ state ].trans.base;
 920
 921         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 922
 923         if ( trie->states[ state ].wordnum ) {
 924             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 925         } else {
 926             PerlIO_printf( Perl_debug_log, "%6s", "" );
 927         }
 928
 929         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 930
 931         if ( base ) {
 932             U32 ofs = 0;
 933
 934             while( ( base + ofs  < trie->uniquecharcount ) ||
 935                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 936                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 937                     ofs++;
 938
 939             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 940
 941             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 942                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 943                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 944                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 945                 {
 946                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 947                     colwidth,
 948                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 949                 } else {
 950                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 951                 }
 952             }
 953
 954             PerlIO_printf( Perl_debug_log, "]");
 955
 956         }
 957         PerlIO_printf( Perl_debug_log, "\n" );
 958     }
 959     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 960     for (word=1; word <= trie->wordcount; word++) {
 961         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 962             (int)word, (int)(trie->wordinfo[word].prev),
 963             (int)(trie->wordinfo[word].len));
 964     }
 965     PerlIO_printf(Perl_debug_log, "\n" );
 966 }
 967 /*
 968   Dumps a fully constructed but uncompressed trie in list form.
 969   List tries normally only are used for construction when the number of
 970   possible chars (trie->uniquecharcount) is very high.
 971   Used for debugging make_trie().
 972 */
 973 STATIC void
 974 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 975                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
 976                          U32 depth)
 977 {
 978     U32 state;
 979     SV *sv=sv_newmortal();
 980     int colwidth= widecharmap ? 6 : 4;
 981     GET_RE_DEBUG_FLAGS_DECL;
 982
 983     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 984
 985     /* print out the table precompression.  */
 986     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
 987         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
 988         "------:-----+-----------------\n" );
 989
 990     for( state=1 ; state < next_alloc ; state ++ ) {
 991         U16 charid;
 992
 993         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
 994             (int)depth * 2 + 2,"", (UV)state  );
 995         if ( ! trie->states[ state ].wordnum ) {
 996             PerlIO_printf( Perl_debug_log, "%5s| ","");
 997         } else {
 998             PerlIO_printf( Perl_debug_log, "W%4x| ",
 999                 trie->states[ state ].wordnum
1000             );
1001         }
1002         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1003             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1004             if ( tmp ) {
1005                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1006                     colwidth,
1007                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1008                             PL_colors[0], PL_colors[1],
1009                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1010                             PERL_PV_ESCAPE_FIRSTCHAR
1011                     ) ,
1012                     TRIE_LIST_ITEM(state,charid).forid,
1013                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1014                 );
1015                 if (!(charid % 10))
1016                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1017                         (int)((depth * 2) + 14), "");
1018             }
1019         }
1020         PerlIO_printf( Perl_debug_log, "\n");
1021     }
1022 }
1023
1024 /*
1025   Dumps a fully constructed but uncompressed trie in table form.
1026   This is the normal DFA style state transition table, with a few
1027   twists to facilitate compression later.
1028   Used for debugging make_trie().
1029 */
1030 STATIC void
1031 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1032                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1033                           U32 depth)
1034 {
1035     U32 state;
1036     U16 charid;
1037     SV *sv=sv_newmortal();
1038     int colwidth= widecharmap ? 6 : 4;
1039     GET_RE_DEBUG_FLAGS_DECL;
1040
1041     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1042
1043     /*
1044        print out the table precompression so that we can do a visual check
1045        that they are identical.
1046      */
1047
1048     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1049
1050     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1051         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1052         if ( tmp ) {
1053             PerlIO_printf( Perl_debug_log, "%*s",
1054                 colwidth,
1055                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1056                             PL_colors[0], PL_colors[1],
1057                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1058                             PERL_PV_ESCAPE_FIRSTCHAR
1059                 )
1060             );
1061         }
1062     }
1063
1064     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1065
1066     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1067         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1068     }
1069
1070     PerlIO_printf( Perl_debug_log, "\n" );
1071
1072     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1073
1074         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1075             (int)depth * 2 + 2,"",
1076             (UV)TRIE_NODENUM( state ) );
1077
1078         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1079             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1080             if (v)
1081                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1082             else
1083                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1084         }
1085         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1086             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1087         } else {
1088             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1089             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1090         }
1091     }
1092 }
1093
1094 #endif
1095
1096
1097 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1098   startbranch: the first branch in the whole branch sequence
1099   first      : start branch of sequence of branch-exact nodes.
1100                May be the same as startbranch
1101   last       : Thing following the last branch.
1102                May be the same as tail.
1103   tail       : item following the branch sequence
1104   count      : words in the sequence
1105   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1106   depth      : indent depth
1107
1108 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1109
1110 A trie is an N'ary tree where the branches are determined by digital
1111 decomposition of the key. IE, at the root node you look up the 1st character and
1112 follow that branch repeat until you find the end of the branches. Nodes can be
1113 marked as "accepting" meaning they represent a complete word. Eg:
1114
1115   /he|she|his|hers/
1116
1117 would convert into the following structure. Numbers represent states, letters
1118 following numbers represent valid transitions on the letter from that state, if
1119 the number is in square brackets it represents an accepting state, otherwise it
1120 will be in parenthesis.
1121
1122       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1123       |    |
1124       |   (2)
1125       |    |
1126      (1)   +-i->(6)-+-s->[7]
1127       |
1128       +-s->(3)-+-h->(4)-+-e->[5]
1129
1130       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1131
1132 This shows that when matching against the string 'hers' we will begin at state 1
1133 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1134 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1135 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1136 single traverse. We store a mapping from accepting to state to which word was
1137 matched, and then when we have multiple possibilities we try to complete the
1138 rest of the regex in the order in which they occured in the alternation.
1139
1140 The only prior NFA like behaviour that would be changed by the TRIE support is
1141 the silent ignoring of duplicate alternations which are of the form:
1142
1143  / (DUPE|DUPE) X? (?{ ... }) Y /x
1144
1145 Thus EVAL blocks follwing a trie may be called a different number of times with
1146 and without the optimisation. With the optimisations dupes will be silently
1147 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1148 the following demonstrates:
1149
1150  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1151
1152 which prints out 'word' three times, but
1153
1154  'words'=~/(word|word|word)(?{ print $1 })S/
1155
1156 which doesnt print it out at all. This is due to other optimisations kicking in.
1157
1158 Example of what happens on a structural level:
1159
1160 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1161
1162    1: CURLYM[1] {1,32767}(18)
1163    5:   BRANCH(8)
1164    6:     EXACT <ac>(16)
1165    8:   BRANCH(11)
1166    9:     EXACT <ad>(16)
1167   11:   BRANCH(14)
1168   12:     EXACT <ab>(16)
1169   16:   SUCCEED(0)
1170   17:   NOTHING(18)
1171   18: END(0)
1172
1173 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1174 and should turn into:
1175
1176    1: CURLYM[1] {1,32767}(18)
1177    5:   TRIE(16)
1178         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1179           <ac>
1180           <ad>
1181           <ab>
1182   16:   SUCCEED(0)
1183   17:   NOTHING(18)
1184   18: END(0)
1185
1186 Cases where tail != last would be like /(?foo|bar)baz/:
1187
1188    1: BRANCH(4)
1189    2:   EXACT <foo>(8)
1190    4: BRANCH(7)
1191    5:   EXACT <bar>(8)
1192    7: TAIL(8)
1193    8: EXACT <baz>(10)
1194   10: END(0)
1195
1196 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1197 and would end up looking like:
1198
1199     1: TRIE(8)
1200       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1201         <foo>
1202         <bar>
1203    7: TAIL(8)
1204    8: EXACT <baz>(10)
1205   10: END(0)
1206
1207     d = uvuni_to_utf8_flags(d, uv, 0);
1208
1209 is the recommended Unicode-aware way of saying
1210
1211     *(d++) = uv;
1212 */
1213
1214 #define TRIE_STORE_REVCHAR                                                 \
1215     STMT_START {                                                           \
1216         if (UTF) {                                                         \
1217             SV *zlopp = newSV(2);                                          \
1218             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1219             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1220             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1221             SvPOK_on(zlopp);                                               \
1222             SvUTF8_on(zlopp);                                              \
1223             av_push(revcharmap, zlopp);                                    \
1224         } else {                                                           \
1225             char ooooff = (char)uvc;                                               \
1226             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1227         }                                                                  \
1228         } STMT_END
1229
1230 #define TRIE_READ_CHAR STMT_START {                                           \
1231     wordlen++;                                                                \
1232     if ( UTF ) {                                                              \
1233         if ( folder ) {                                                       \
1234             if ( foldlen > 0 ) {                                              \
1235                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1236                foldlen -= len;                                                \
1237                scan += len;                                                   \
1238                len = 0;                                                       \
1239             } else {                                                          \
1240                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1241                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1242                 foldlen -= UNISKIP( uvc );                                    \
1243                 scan = foldbuf + UNISKIP( uvc );                              \
1244             }                                                                 \
1245         } else {                                                              \
1246             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1247         }                                                                     \
1248     } else {                                                                  \
1249         uvc = (U32)*uc;                                                       \
1250         len = 1;                                                              \
1251     }                                                                         \
1252 } STMT_END
1253
1254
1255
1256 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1257     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1258         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1259         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1260     }                                                           \
1261     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1262     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1263     TRIE_LIST_CUR( state )++;                                   \
1264 } STMT_END
1265
1266 #define TRIE_LIST_NEW(state) STMT_START {                       \
1267     Newxz( trie->states[ state ].trans.list,               \
1268         4, reg_trie_trans_le );                                 \
1269      TRIE_LIST_CUR( state ) = 1;                                \
1270      TRIE_LIST_LEN( state ) = 4;                                \
1271 } STMT_END
1272
1273 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1274     U16 dupe= trie->states[ state ].wordnum;                    \
1275     regnode * const noper_next = regnext( noper );              \
1276                                                                 \
1277     DEBUG_r({                                                   \
1278         /* store the word for dumping */                        \
1279         SV* tmp;                                                \
1280         if (OP(noper) != NOTHING)                               \
1281             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1282         else                                                    \
1283             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1284         av_push( trie_words, tmp );                             \
1285     });                                                         \
1286                                                                 \
1287     curword++;                                                  \
1288     trie->wordinfo[curword].prev   = 0;                         \
1289     trie->wordinfo[curword].len    = wordlen;                   \
1290     trie->wordinfo[curword].accept = state;                     \
1291                                                                 \
1292     if ( noper_next < tail ) {                                  \
1293         if (!trie->jump)                                        \
1294             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1295         trie->jump[curword] = (U16)(noper_next - convert);      \
1296         if (!jumper)                                            \
1297             jumper = noper_next;                                \
1298         if (!nextbranch)                                        \
1299             nextbranch= regnext(cur);                           \
1300     }                                                           \
1301                                                                 \
1302     if ( dupe ) {                                               \
1303         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1304         /* chain, so that when the bits of chain are later    */\
1305         /* linked together, the dups appear in the chain      */\
1306         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1307         trie->wordinfo[dupe].prev = curword;                    \
1308     } else {                                                    \
1309         /* we haven't inserted this word yet.                */ \
1310         trie->states[ state ].wordnum = curword;                \
1311     }                                                           \
1312 } STMT_END
1313
1314
1315 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1316      ( ( base + charid >=  ucharcount                                   \
1317          && base + charid < ubound                                      \
1318          && state == trie->trans[ base - ucharcount + charid ].check    \
1319          && trie->trans[ base - ucharcount + charid ].next )            \
1320            ? trie->trans[ base - ucharcount + charid ].next             \
1321            : ( state==1 ? special : 0 )                                 \
1322       )
1323
1324 #define MADE_TRIE       1
1325 #define MADE_JUMP_TRIE  2
1326 #define MADE_EXACT_TRIE 4
1327
1328 STATIC I32
1329 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1330 {
1331     dVAR;
1332     /* first pass, loop through and scan words */
1333     reg_trie_data *trie;
1334     HV *widecharmap = NULL;
1335     AV *revcharmap = newAV();
1336     regnode *cur;
1337     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1338     STRLEN len = 0;
1339     UV uvc = 0;
1340     U16 curword = 0;
1341     U32 next_alloc = 0;
1342     regnode *jumper = NULL;
1343     regnode *nextbranch = NULL;
1344     regnode *convert = NULL;
1345     U32 *prev_states; /* temp array mapping each state to previous one */
1346     /* we just use folder as a flag in utf8 */
1347     const U8 * const folder = ( flags == EXACTF
1348                        ? PL_fold
1349                        : ( flags == EXACTFL
1350                            ? PL_fold_locale
1351                            : NULL
1352                          )
1353                      );
1354
1355 #ifdef DEBUGGING
1356     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1357     AV *trie_words = NULL;
1358     /* along with revcharmap, this only used during construction but both are
1359      * useful during debugging so we store them in the struct when debugging.
1360      */
1361 #else
1362     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1363     STRLEN trie_charcount=0;
1364 #endif
1365     SV *re_trie_maxbuff;
1366     GET_RE_DEBUG_FLAGS_DECL;
1367
1368     PERL_ARGS_ASSERT_MAKE_TRIE;
1369 #ifndef DEBUGGING
1370     PERL_UNUSED_ARG(depth);
1371 #endif
1372
1373     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1374     trie->refcount = 1;
1375     trie->startstate = 1;
1376     trie->wordcount = word_count;
1377     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1378     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1379     if (!(UTF && folder))
1380         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1381     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1382                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1383
1384     DEBUG_r({
1385         trie_words = newAV();
1386     });
1387
1388     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1389     if (!SvIOK(re_trie_maxbuff)) {
1390         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1391     }
1392     DEBUG_OPTIMISE_r({
1393                 PerlIO_printf( Perl_debug_log,
1394                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1395                   (int)depth * 2 + 2, "",
1396                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1397                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1398                   (int)depth);
1399     });
1400
1401    /* Find the node we are going to overwrite */
1402     if ( first == startbranch && OP( last ) != BRANCH ) {
1403         /* whole branch chain */
1404         convert = first;
1405     } else {
1406         /* branch sub-chain */
1407         convert = NEXTOPER( first );
1408     }
1409
1410     /*  -- First loop and Setup --
1411
1412        We first traverse the branches and scan each word to determine if it
1413        contains widechars, and how many unique chars there are, this is
1414        important as we have to build a table with at least as many columns as we
1415        have unique chars.
1416
1417        We use an array of integers to represent the character codes 0..255
1418        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1419        native representation of the character value as the key and IV's for the
1420        coded index.
1421
1422        *TODO* If we keep track of how many times each character is used we can
1423        remap the columns so that the table compression later on is more
1424        efficient in terms of memory by ensuring most common value is in the
1425        middle and the least common are on the outside.  IMO this would be better
1426        than a most to least common mapping as theres a decent chance the most
1427        common letter will share a node with the least common, meaning the node
1428        will not be compressable. With a middle is most common approach the worst
1429        case is when we have the least common nodes twice.
1430
1431      */
1432
1433     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1434         regnode * const noper = NEXTOPER( cur );
1435         const U8 *uc = (U8*)STRING( noper );
1436         const U8 * const e  = uc + STR_LEN( noper );
1437         STRLEN foldlen = 0;
1438         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1439         const U8 *scan = (U8*)NULL;
1440         U32 wordlen      = 0;         /* required init */
1441         STRLEN chars = 0;
1442         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1443
1444         if (OP(noper) == NOTHING) {
1445             trie->minlen= 0;
1446             continue;
1447         }
1448         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1449             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1450                                           regardless of encoding */
1451
1452         for ( ; uc < e ; uc += len ) {
1453             TRIE_CHARCOUNT(trie)++;
1454             TRIE_READ_CHAR;
1455             chars++;
1456             if ( uvc < 256 ) {
1457                 if ( !trie->charmap[ uvc ] ) {
1458                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1459                     if ( folder )
1460                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1461                     TRIE_STORE_REVCHAR;
1462                 }
1463                 if ( set_bit ) {
1464                     /* store the codepoint in the bitmap, and if its ascii
1465                        also store its folded equivelent. */
1466                     TRIE_BITMAP_SET(trie,uvc);
1467
1468                     /* store the folded codepoint */
1469                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1470
1471                     if ( !UTF ) {
1472                         /* store first byte of utf8 representation of
1473                            codepoints in the 127 < uvc < 256 range */
1474                         if (127 < uvc && uvc < 192) {
1475                             TRIE_BITMAP_SET(trie,194);
1476                         } else if (191 < uvc ) {
1477                             TRIE_BITMAP_SET(trie,195);
1478                         /* && uvc < 256 -- we know uvc is < 256 already */
1479                         }
1480                     }
1481                     set_bit = 0; /* We've done our bit :-) */
1482                 }
1483             } else {
1484                 SV** svpp;
1485                 if ( !widecharmap )
1486                     widecharmap = newHV();
1487
1488                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1489
1490                 if ( !svpp )
1491                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1492
1493                 if ( !SvTRUE( *svpp ) ) {
1494                     sv_setiv( *svpp, ++trie->uniquecharcount );
1495                     TRIE_STORE_REVCHAR;
1496                 }
1497             }
1498         }
1499         if( cur == first ) {
1500             trie->minlen=chars;
1501             trie->maxlen=chars;
1502         } else if (chars < trie->minlen) {
1503             trie->minlen=chars;
1504         } else if (chars > trie->maxlen) {
1505             trie->maxlen=chars;
1506         }
1507
1508     } /* end first pass */
1509     DEBUG_TRIE_COMPILE_r(
1510         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1511                 (int)depth * 2 + 2,"",
1512                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1513                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1514                 (int)trie->minlen, (int)trie->maxlen )
1515     );
1516
1517     /*
1518         We now know what we are dealing with in terms of unique chars and
1519         string sizes so we can calculate how much memory a naive
1520         representation using a flat table  will take. If it's over a reasonable
1521         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1522         conservative but potentially much slower representation using an array
1523         of lists.
1524
1525         At the end we convert both representations into the same compressed
1526         form that will be used in regexec.c for matching with. The latter
1527         is a form that cannot be used to construct with but has memory
1528         properties similar to the list form and access properties similar
1529         to the table form making it both suitable for fast searches and
1530         small enough that its feasable to store for the duration of a program.
1531
1532         See the comment in the code where the compressed table is produced
1533         inplace from the flat tabe representation for an explanation of how
1534         the compression works.
1535
1536     */
1537
1538
1539     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1540     prev_states[1] = 0;
1541
1542     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1543         /*
1544             Second Pass -- Array Of Lists Representation
1545
1546             Each state will be represented by a list of charid:state records
1547             (reg_trie_trans_le) the first such element holds the CUR and LEN
1548             points of the allocated array. (See defines above).
1549
1550             We build the initial structure using the lists, and then convert
1551             it into the compressed table form which allows faster lookups
1552             (but cant be modified once converted).
1553         */
1554
1555         STRLEN transcount = 1;
1556
1557         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1558             "%*sCompiling trie using list compiler\n",
1559             (int)depth * 2 + 2, ""));
1560
1561         trie->states = (reg_trie_state *)
1562             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1563                                   sizeof(reg_trie_state) );
1564         TRIE_LIST_NEW(1);
1565         next_alloc = 2;
1566
1567         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1568
1569             regnode * const noper = NEXTOPER( cur );
1570             U8 *uc           = (U8*)STRING( noper );
1571             const U8 * const e = uc + STR_LEN( noper );
1572             U32 state        = 1;         /* required init */
1573             U16 charid       = 0;         /* sanity init */
1574             U8 *scan         = (U8*)NULL; /* sanity init */
1575             STRLEN foldlen   = 0;         /* required init */
1576             U32 wordlen      = 0;         /* required init */
1577             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1578
1579             if (OP(noper) != NOTHING) {
1580                 for ( ; uc < e ; uc += len ) {
1581
1582                     TRIE_READ_CHAR;
1583
1584                     if ( uvc < 256 ) {
1585                         charid = trie->charmap[ uvc ];
1586                     } else {
1587                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1588                         if ( !svpp ) {
1589                             charid = 0;
1590                         } else {
1591                             charid=(U16)SvIV( *svpp );
1592                         }
1593                     }
1594                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1595                     if ( charid ) {
1596
1597                         U16 check;
1598                         U32 newstate = 0;
1599
1600                         charid--;
1601                         if ( !trie->states[ state ].trans.list ) {
1602                             TRIE_LIST_NEW( state );
1603                         }
1604                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1605                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1606                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1607                                 break;
1608                             }
1609                         }
1610                         if ( ! newstate ) {
1611                             newstate = next_alloc++;
1612                             prev_states[newstate] = state;
1613                             TRIE_LIST_PUSH( state, charid, newstate );
1614                             transcount++;
1615                         }
1616                         state = newstate;
1617                     } else {
1618                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1619                     }
1620                 }
1621             }
1622             TRIE_HANDLE_WORD(state);
1623
1624         } /* end second pass */
1625
1626         /* next alloc is the NEXT state to be allocated */
1627         trie->statecount = next_alloc;
1628         trie->states = (reg_trie_state *)
1629             PerlMemShared_realloc( trie->states,
1630                                    next_alloc
1631                                    * sizeof(reg_trie_state) );
1632
1633         /* and now dump it out before we compress it */
1634         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1635                                                          revcharmap, next_alloc,
1636                                                          depth+1)
1637         );
1638
1639         trie->trans = (reg_trie_trans *)
1640             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1641         {
1642             U32 state;
1643             U32 tp = 0;
1644             U32 zp = 0;
1645
1646
1647             for( state=1 ; state < next_alloc ; state ++ ) {
1648                 U32 base=0;
1649
1650                 /*
1651                 DEBUG_TRIE_COMPILE_MORE_r(
1652                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1653                 );
1654                 */
1655
1656                 if (trie->states[state].trans.list) {
1657                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1658                     U16 maxid=minid;
1659                     U16 idx;
1660
1661                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1662                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1663                         if ( forid < minid ) {
1664                             minid=forid;
1665                         } else if ( forid > maxid ) {
1666                             maxid=forid;
1667                         }
1668                     }
1669                     if ( transcount < tp + maxid - minid + 1) {
1670                         transcount *= 2;
1671                         trie->trans = (reg_trie_trans *)
1672                             PerlMemShared_realloc( trie->trans,
1673                                                      transcount
1674                                                      * sizeof(reg_trie_trans) );
1675                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1676                     }
1677                     base = trie->uniquecharcount + tp - minid;
1678                     if ( maxid == minid ) {
1679                         U32 set = 0;
1680                         for ( ; zp < tp ; zp++ ) {
1681                             if ( ! trie->trans[ zp ].next ) {
1682                                 base = trie->uniquecharcount + zp - minid;
1683                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1684                                 trie->trans[ zp ].check = state;
1685                                 set = 1;
1686                                 break;
1687                             }
1688                         }
1689                         if ( !set ) {
1690                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1691                             trie->trans[ tp ].check = state;
1692                             tp++;
1693                             zp = tp;
1694                         }
1695                     } else {
1696                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1697                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1698                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1699                             trie->trans[ tid ].check = state;
1700                         }
1701                         tp += ( maxid - minid + 1 );
1702                     }
1703                     Safefree(trie->states[ state ].trans.list);
1704                 }
1705                 /*
1706                 DEBUG_TRIE_COMPILE_MORE_r(
1707                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1708                 );
1709                 */
1710                 trie->states[ state ].trans.base=base;
1711             }
1712             trie->lasttrans = tp + 1;
1713         }
1714     } else {
1715         /*
1716            Second Pass -- Flat Table Representation.
1717
1718            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1719            We know that we will need Charcount+1 trans at most to store the data
1720            (one row per char at worst case) So we preallocate both structures
1721            assuming worst case.
1722
1723            We then construct the trie using only the .next slots of the entry
1724            structs.
1725
1726            We use the .check field of the first entry of the node  temporarily to
1727            make compression both faster and easier by keeping track of how many non
1728            zero fields are in the node.
1729
1730            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1731            transition.
1732
1733            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1734            number representing the first entry of the node, and state as a
1735            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1736            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1737            are 2 entrys per node. eg:
1738
1739              A B       A B
1740           1. 2 4    1. 3 7
1741           2. 0 3    3. 0 5
1742           3. 0 0    5. 0 0
1743           4. 0 0    7. 0 0
1744
1745            The table is internally in the right hand, idx form. However as we also
1746            have to deal with the states array which is indexed by nodenum we have to
1747            use TRIE_NODENUM() to convert.
1748
1749         */
1750         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1751             "%*sCompiling trie using table compiler\n",
1752             (int)depth * 2 + 2, ""));
1753
1754         trie->trans = (reg_trie_trans *)
1755             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1756                                   * trie->uniquecharcount + 1,
1757                                   sizeof(reg_trie_trans) );
1758         trie->states = (reg_trie_state *)
1759             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1760                                   sizeof(reg_trie_state) );
1761         next_alloc = trie->uniquecharcount + 1;
1762
1763
1764         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1765
1766             regnode * const noper   = NEXTOPER( cur );
1767             const U8 *uc     = (U8*)STRING( noper );
1768             const U8 * const e = uc + STR_LEN( noper );
1769
1770             U32 state        = 1;         /* required init */
1771
1772             U16 charid       = 0;         /* sanity init */
1773             U32 accept_state = 0;         /* sanity init */
1774             U8 *scan         = (U8*)NULL; /* sanity init */
1775
1776             STRLEN foldlen   = 0;         /* required init */
1777             U32 wordlen      = 0;         /* required init */
1778             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1779
1780             if ( OP(noper) != NOTHING ) {
1781                 for ( ; uc < e ; uc += len ) {
1782
1783                     TRIE_READ_CHAR;
1784
1785                     if ( uvc < 256 ) {
1786                         charid = trie->charmap[ uvc ];
1787                     } else {
1788                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1789                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1790                     }
1791                     if ( charid ) {
1792                         charid--;
1793                         if ( !trie->trans[ state + charid ].next ) {
1794                             trie->trans[ state + charid ].next = next_alloc;
1795                             trie->trans[ state ].check++;
1796                             prev_states[TRIE_NODENUM(next_alloc)]
1797                                     = TRIE_NODENUM(state);
1798                             next_alloc += trie->uniquecharcount;
1799                         }
1800                         state = trie->trans[ state + charid ].next;
1801                     } else {
1802                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1803                     }
1804                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1805                 }
1806             }
1807             accept_state = TRIE_NODENUM( state );
1808             TRIE_HANDLE_WORD(accept_state);
1809
1810         } /* end second pass */
1811
1812         /* and now dump it out before we compress it */
1813         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1814                                                           revcharmap,
1815                                                           next_alloc, depth+1));
1816
1817         {
1818         /*
1819            * Inplace compress the table.*
1820
1821            For sparse data sets the table constructed by the trie algorithm will
1822            be mostly 0/FAIL transitions or to put it another way mostly empty.
1823            (Note that leaf nodes will not contain any transitions.)
1824
1825            This algorithm compresses the tables by eliminating most such
1826            transitions, at the cost of a modest bit of extra work during lookup:
1827
1828            - Each states[] entry contains a .base field which indicates the
1829            index in the state[] array wheres its transition data is stored.
1830
1831            - If .base is 0 there are no  valid transitions from that node.
1832
1833            - If .base is nonzero then charid is added to it to find an entry in
1834            the trans array.
1835
1836            -If trans[states[state].base+charid].check!=state then the
1837            transition is taken to be a 0/Fail transition. Thus if there are fail
1838            transitions at the front of the node then the .base offset will point
1839            somewhere inside the previous nodes data (or maybe even into a node
1840            even earlier), but the .check field determines if the transition is
1841            valid.
1842
1843            XXX - wrong maybe?
1844            The following process inplace converts the table to the compressed
1845            table: We first do not compress the root node 1,and mark its all its
1846            .check pointers as 1 and set its .base pointer as 1 as well. This
1847            allows to do a DFA construction from the compressed table later, and
1848            ensures that any .base pointers we calculate later are greater than
1849            0.
1850
1851            - We set 'pos' to indicate the first entry of the second node.
1852
1853            - We then iterate over the columns of the node, finding the first and
1854            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1855            and set the .check pointers accordingly, and advance pos
1856            appropriately and repreat for the next node. Note that when we copy
1857            the next pointers we have to convert them from the original
1858            NODEIDX form to NODENUM form as the former is not valid post
1859            compression.
1860
1861            - If a node has no transitions used we mark its base as 0 and do not
1862            advance the pos pointer.
1863
1864            - If a node only has one transition we use a second pointer into the
1865            structure to fill in allocated fail transitions from other states.
1866            This pointer is independent of the main pointer and scans forward
1867            looking for null transitions that are allocated to a state. When it
1868            finds one it writes the single transition into the "hole".  If the
1869            pointer doesnt find one the single transition is appended as normal.
1870
1871            - Once compressed we can Renew/realloc the structures to release the
1872            excess space.
1873
1874            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1875            specifically Fig 3.47 and the associated pseudocode.
1876
1877            demq
1878         */
1879         const U32 laststate = TRIE_NODENUM( next_alloc );
1880         U32 state, charid;
1881         U32 pos = 0, zp=0;
1882         trie->statecount = laststate;
1883
1884         for ( state = 1 ; state < laststate ; state++ ) {
1885             U8 flag = 0;
1886             const U32 stateidx = TRIE_NODEIDX( state );
1887             const U32 o_used = trie->trans[ stateidx ].check;
1888             U32 used = trie->trans[ stateidx ].check;
1889             trie->trans[ stateidx ].check = 0;
1890
1891             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1892                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1893                     if ( trie->trans[ stateidx + charid ].next ) {
1894                         if (o_used == 1) {
1895                             for ( ; zp < pos ; zp++ ) {
1896                                 if ( ! trie->trans[ zp ].next ) {
1897                                     break;
1898                                 }
1899                             }
1900                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1901                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1902                             trie->trans[ zp ].check = state;
1903                             if ( ++zp > pos ) pos = zp;
1904                             break;
1905                         }
1906                         used--;
1907                     }
1908                     if ( !flag ) {
1909                         flag = 1;
1910                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1911                     }
1912                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1913                     trie->trans[ pos ].check = state;
1914                     pos++;
1915                 }
1916             }
1917         }
1918         trie->lasttrans = pos + 1;
1919         trie->states = (reg_trie_state *)
1920             PerlMemShared_realloc( trie->states, laststate
1921                                    * sizeof(reg_trie_state) );
1922         DEBUG_TRIE_COMPILE_MORE_r(
1923                 PerlIO_printf( Perl_debug_log,
1924                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1925                     (int)depth * 2 + 2,"",
1926                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1927                     (IV)next_alloc,
1928                     (IV)pos,
1929                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1930             );
1931
1932         } /* end table compress */
1933     }
1934     DEBUG_TRIE_COMPILE_MORE_r(
1935             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1936                 (int)depth * 2 + 2, "",
1937                 (UV)trie->statecount,
1938                 (UV)trie->lasttrans)
1939     );
1940     /* resize the trans array to remove unused space */
1941     trie->trans = (reg_trie_trans *)
1942         PerlMemShared_realloc( trie->trans, trie->lasttrans
1943                                * sizeof(reg_trie_trans) );
1944
1945     {   /* Modify the program and insert the new TRIE node*/
1946         U8 nodetype =(U8)(flags & 0xFF);
1947         char *str=NULL;
1948
1949 #ifdef DEBUGGING
1950         regnode *optimize = NULL;
1951 #ifdef RE_TRACK_PATTERN_OFFSETS
1952
1953         U32 mjd_offset = 0;
1954         U32 mjd_nodelen = 0;
1955 #endif /* RE_TRACK_PATTERN_OFFSETS */
1956 #endif /* DEBUGGING */
1957         /*
1958            This means we convert either the first branch or the first Exact,
1959            depending on whether the thing following (in 'last') is a branch
1960            or not and whther first is the startbranch (ie is it a sub part of
1961            the alternation or is it the whole thing.)
1962            Assuming its a sub part we conver the EXACT otherwise we convert
1963            the whole branch sequence, including the first.
1964          */
1965         /* Find the node we are going to overwrite */
1966         if ( first != startbranch || OP( last ) == BRANCH ) {
1967             /* branch sub-chain */
1968             NEXT_OFF( first ) = (U16)(last - first);
1969 #ifdef RE_TRACK_PATTERN_OFFSETS
1970             DEBUG_r({
1971                 mjd_offset= Node_Offset((convert));
1972                 mjd_nodelen= Node_Length((convert));
1973             });
1974 #endif
1975             /* whole branch chain */
1976         }
1977 #ifdef RE_TRACK_PATTERN_OFFSETS
1978         else {
1979             DEBUG_r({
1980                 const  regnode *nop = NEXTOPER( convert );
1981                 mjd_offset= Node_Offset((nop));
1982                 mjd_nodelen= Node_Length((nop));
1983             });
1984         }
1985         DEBUG_OPTIMISE_r(
1986             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1987                 (int)depth * 2 + 2, "",
1988                 (UV)mjd_offset, (UV)mjd_nodelen)
1989         );
1990 #endif
1991         /* But first we check to see if there is a common prefix we can
1992            split out as an EXACT and put in front of the TRIE node.  */
1993         trie->startstate= 1;
1994         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
1995             U32 state;
1996             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1997                 U32 ofs = 0;
1998                 I32 idx = -1;
1999                 U32 count = 0;
2000                 const U32 base = trie->states[ state ].trans.base;
2001
2002                 if ( trie->states[state].wordnum )
2003                         count = 1;
2004
2005                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2006                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2007                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2008                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2009                     {
2010                         if ( ++count > 1 ) {
2011                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2012                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2013                             if ( state == 1 ) break;
2014                             if ( count == 2 ) {
2015                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2016                                 DEBUG_OPTIMISE_r(
2017                                     PerlIO_printf(Perl_debug_log,
2018                                         "%*sNew Start State=%"UVuf" Class: [",
2019                                         (int)depth * 2 + 2, "",
2020                                         (UV)state));
2021                                 if (idx >= 0) {
2022                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2023                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2024
2025                                     TRIE_BITMAP_SET(trie,*ch);
2026                                     if ( folder )
2027                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2028                                     DEBUG_OPTIMISE_r(
2029                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2030                                     );
2031                                 }
2032                             }
2033                             TRIE_BITMAP_SET(trie,*ch);
2034                             if ( folder )
2035                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2036                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2037                         }
2038                         idx = ofs;
2039                     }
2040                 }
2041                 if ( count == 1 ) {
2042                     SV **tmp = av_fetch( revcharmap, idx, 0);
2043                     STRLEN len;
2044                     char *ch = SvPV( *tmp, len );
2045                     DEBUG_OPTIMISE_r({
2046                         SV *sv=sv_newmortal();
2047                         PerlIO_printf( Perl_debug_log,
2048                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2049                             (int)depth * 2 + 2, "",
2050                             (UV)state, (UV)idx,
2051                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2052                                 PL_colors[0], PL_colors[1],
2053                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2054                                 PERL_PV_ESCAPE_FIRSTCHAR
2055                             )
2056                         );
2057                     });
2058                     if ( state==1 ) {
2059                         OP( convert ) = nodetype;
2060                         str=STRING(convert);
2061                         STR_LEN(convert)=0;
2062                     }
2063                     STR_LEN(convert) += len;
2064                     while (len--)
2065                         *str++ = *ch++;
2066                 } else {
2067 #ifdef DEBUGGING
2068                     if (state>1)
2069                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2070 #endif
2071                     break;
2072                 }
2073             }
2074             trie->prefixlen = (state-1);
2075             if (str) {
2076                 regnode *n = convert+NODE_SZ_STR(convert);
2077                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2078                 trie->startstate = state;
2079                 trie->minlen -= (state - 1);
2080                 trie->maxlen -= (state - 1);
2081 #ifdef DEBUGGING
2082                /* At least the UNICOS C compiler choked on this
2083                 * being argument to DEBUG_r(), so let's just have
2084                 * it right here. */
2085                if (
2086 #ifdef PERL_EXT_RE_BUILD
2087                    1
2088 #else
2089                    DEBUG_r_TEST
2090 #endif
2091                    ) {
2092                    regnode *fix = convert;
2093                    U32 word = trie->wordcount;
2094                    mjd_nodelen++;
2095                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2096                    while( ++fix < n ) {
2097                        Set_Node_Offset_Length(fix, 0, 0);
2098                    }
2099                    while (word--) {
2100                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2101                        if (tmp) {
2102                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2103                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2104                            else
2105                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2106                        }
2107                    }
2108                }
2109 #endif
2110                 if (trie->maxlen) {
2111                     convert = n;
2112                 } else {
2113                     NEXT_OFF(convert) = (U16)(tail - convert);
2114                     DEBUG_r(optimize= n);
2115                 }
2116             }
2117         }
2118         if (!jumper)
2119             jumper = last;
2120         if ( trie->maxlen ) {
2121             NEXT_OFF( convert ) = (U16)(tail - convert);
2122             ARG_SET( convert, data_slot );
2123             /* Store the offset to the first unabsorbed branch in
2124                jump[0], which is otherwise unused by the jump logic.
2125                We use this when dumping a trie and during optimisation. */
2126             if (trie->jump)
2127                 trie->jump[0] = (U16)(nextbranch - convert);
2128
2129             /* XXXX */
2130             if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2131                  ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2132             {
2133                 OP( convert ) = TRIEC;
2134                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2135                 PerlMemShared_free(trie->bitmap);
2136                 trie->bitmap= NULL;
2137             } else
2138                 OP( convert ) = TRIE;
2139
2140             /* store the type in the flags */
2141             convert->flags = nodetype;
2142             DEBUG_r({
2143             optimize = convert
2144                       + NODE_STEP_REGNODE
2145                       + regarglen[ OP( convert ) ];
2146             });
2147             /* XXX We really should free up the resource in trie now,
2148                    as we won't use them - (which resources?) dmq */
2149         }
2150         /* needed for dumping*/
2151         DEBUG_r(if (optimize) {
2152             regnode *opt = convert;
2153
2154             while ( ++opt < optimize) {
2155                 Set_Node_Offset_Length(opt,0,0);
2156             }
2157             /*
2158                 Try to clean up some of the debris left after the
2159                 optimisation.
2160              */
2161             while( optimize < jumper ) {
2162                 mjd_nodelen += Node_Length((optimize));
2163                 OP( optimize ) = OPTIMIZED;
2164                 Set_Node_Offset_Length(optimize,0,0);
2165                 optimize++;
2166             }
2167             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2168         });
2169     } /* end node insert */
2170     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2171
2172     /*  Finish populating the prev field of the wordinfo array.  Walk back
2173      *  from each accept state until we find another accept state, and if
2174      *  so, point the first word's .prev field at the second word. If the
2175      *  second already has a .prev field set, stop now. This will be the
2176      *  case either if we've already processed that word's accept state,
2177      *  or that that state had multiple words, and the overspill words
2178      *  were already linked up earlier.
2179      */
2180     {
2181         U16 word;
2182         U32 state;
2183         U16 prev;
2184
2185         for (word=1; word <= trie->wordcount; word++) {
2186             prev = 0;
2187             if (trie->wordinfo[word].prev)
2188                 continue;
2189             state = trie->wordinfo[word].accept;
2190             while (state) {
2191                 state = prev_states[state];
2192                 if (!state)
2193                     break;
2194                 prev = trie->states[state].wordnum;
2195                 if (prev)
2196                     break;
2197             }
2198             trie->wordinfo[word].prev = prev;
2199         }
2200         Safefree(prev_states);
2201     }
2202
2203
2204     /* and now dump out the compressed format */
2205     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2206
2207     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2208 #ifdef DEBUGGING
2209     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2210     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2211 #else
2212     SvREFCNT_dec(revcharmap);
2213 #endif
2214     return trie->jump
2215            ? MADE_JUMP_TRIE
2216            : trie->startstate>1
2217              ? MADE_EXACT_TRIE
2218              : MADE_TRIE;
2219 }
2220
2221 STATIC void
2222 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2223 {
2224 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2225
2226    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2227    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2228    ISBN 0-201-10088-6
2229
2230    We find the fail state for each state in the trie, this state is the longest proper
2231    suffix of the current states 'word' that is also a proper prefix of another word in our
2232    trie. State 1 represents the word '' and is the thus the default fail state. This allows
2233    the DFA not to have to restart after its tried and failed a word at a given point, it
2234    simply continues as though it had been matching the other word in the first place.
2235    Consider
2236       'abcdgu'=~/abcdefg|cdgu/
2237    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2238    fail, which would bring use to the state representing 'd' in the second word where we would
2239    try 'g' and succeed, prodceding to match 'cdgu'.
2240  */
2241  /* add a fail transition */
2242     const U32 trie_offset = ARG(source);
2243     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2244     U32 *q;
2245     const U32 ucharcount = trie->uniquecharcount;
2246     const U32 numstates = trie->statecount;
2247     const U32 ubound = trie->lasttrans + ucharcount;
2248     U32 q_read = 0;
2249     U32 q_write = 0;
2250     U32 charid;
2251     U32 base = trie->states[ 1 ].trans.base;
2252     U32 *fail;
2253     reg_ac_data *aho;
2254     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2255     GET_RE_DEBUG_FLAGS_DECL;
2256
2257     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2258 #ifndef DEBUGGING
2259     PERL_UNUSED_ARG(depth);
2260 #endif
2261
2262
2263     ARG_SET( stclass, data_slot );
2264     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2265     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2266     aho->trie=trie_offset;
2267     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2268     Copy( trie->states, aho->states, numstates, reg_trie_state );
2269     Newxz( q, numstates, U32);
2270     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2271     aho->refcount = 1;
2272     fail = aho->fail;
2273     /* initialize fail[0..1] to be 1 so that we always have
2274        a valid final fail state */
2275     fail[ 0 ] = fail[ 1 ] = 1;
2276
2277     for ( charid = 0; charid < ucharcount ; charid++ ) {
2278         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2279         if ( newstate ) {
2280             q[ q_write ] = newstate;
2281             /* set to point at the root */
2282             fail[ q[ q_write++ ] ]=1;
2283         }
2284     }
2285     while ( q_read < q_write) {
2286         const U32 cur = q[ q_read++ % numstates ];
2287         base = trie->states[ cur ].trans.base;
2288
2289         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2290             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2291             if (ch_state) {
2292                 U32 fail_state = cur;
2293                 U32 fail_base;
2294                 do {
2295                     fail_state = fail[ fail_state ];
2296                     fail_base = aho->states[ fail_state ].trans.base;
2297                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2298
2299                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2300                 fail[ ch_state ] = fail_state;
2301                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2302                 {
2303                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2304                 }
2305                 q[ q_write++ % numstates] = ch_state;
2306             }
2307         }
2308     }
2309     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2310        when we fail in state 1, this allows us to use the
2311        charclass scan to find a valid start char. This is based on the principle
2312        that theres a good chance the string being searched contains lots of stuff
2313        that cant be a start char.
2314      */
2315     fail[ 0 ] = fail[ 1 ] = 0;
2316     DEBUG_TRIE_COMPILE_r({
2317         PerlIO_printf(Perl_debug_log,
2318                       "%*sStclass Failtable (%"UVuf" states): 0",
2319                       (int)(depth * 2), "", (UV)numstates
2320         );
2321         for( q_read=1; q_read<numstates; q_read++ ) {
2322             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2323         }
2324         PerlIO_printf(Perl_debug_log, "\n");
2325     });
2326     Safefree(q);
2327     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2328 }
2329
2330
2331 /*
2332  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2333  * These need to be revisited when a newer toolchain becomes available.
2334  */
2335 #if defined(__sparc64__) && defined(__GNUC__)
2336 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2337 #       undef  SPARC64_GCC_WORKAROUND
2338 #       define SPARC64_GCC_WORKAROUND 1
2339 #   endif
2340 #endif
2341
2342 #define DEBUG_PEEP(str,scan,depth) \
2343     DEBUG_OPTIMISE_r({if (scan){ \
2344        SV * const mysv=sv_newmortal(); \
2345        regnode *Next = regnext(scan); \
2346        regprop(RExC_rx, mysv, scan); \
2347        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2348        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2349        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2350    }});
2351
2352
2353
2354
2355
2356 #define JOIN_EXACT(scan,min,flags) \
2357     if (PL_regkind[OP(scan)] == EXACT) \
2358         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2359
2360 STATIC U32
2361 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2362     /* Merge several consecutive EXACTish nodes into one. */
2363     regnode *n = regnext(scan);
2364     U32 stringok = 1;
2365     regnode *next = scan + NODE_SZ_STR(scan);
2366     U32 merged = 0;
2367     U32 stopnow = 0;
2368 #ifdef DEBUGGING
2369     regnode *stop = scan;
2370     GET_RE_DEBUG_FLAGS_DECL;
2371 #else
2372     PERL_UNUSED_ARG(depth);
2373 #endif
2374
2375     PERL_ARGS_ASSERT_JOIN_EXACT;
2376 #ifndef EXPERIMENTAL_INPLACESCAN
2377     PERL_UNUSED_ARG(flags);
2378     PERL_UNUSED_ARG(val);
2379 #endif
2380     DEBUG_PEEP("join",scan,depth);
2381
2382     /* Skip NOTHING, merge EXACT*. */
2383     while (n &&
2384            ( PL_regkind[OP(n)] == NOTHING ||
2385              (stringok && (OP(n) == OP(scan))))
2386            && NEXT_OFF(n)
2387            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2388
2389         if (OP(n) == TAIL || n > next)
2390             stringok = 0;
2391         if (PL_regkind[OP(n)] == NOTHING) {
2392             DEBUG_PEEP("skip:",n,depth);
2393             NEXT_OFF(scan) += NEXT_OFF(n);
2394             next = n + NODE_STEP_REGNODE;
2395 #ifdef DEBUGGING
2396             if (stringok)
2397                 stop = n;
2398 #endif
2399             n = regnext(n);
2400         }
2401         else if (stringok) {
2402             const unsigned int oldl = STR_LEN(scan);
2403             regnode * const nnext = regnext(n);
2404
2405             DEBUG_PEEP("merg",n,depth);
2406
2407             merged++;
2408             if (oldl + STR_LEN(n) > U8_MAX)
2409                 break;
2410             NEXT_OFF(scan) += NEXT_OFF(n);
2411             STR_LEN(scan) += STR_LEN(n);
2412             next = n + NODE_SZ_STR(n);
2413             /* Now we can overwrite *n : */
2414             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2415 #ifdef DEBUGGING
2416             stop = next - 1;
2417 #endif
2418             n = nnext;
2419             if (stopnow) break;
2420         }
2421
2422 #ifdef EXPERIMENTAL_INPLACESCAN
2423         if (flags && !NEXT_OFF(n)) {
2424             DEBUG_PEEP("atch", val, depth);
2425             if (reg_off_by_arg[OP(n)]) {
2426                 ARG_SET(n, val - n);
2427             }
2428             else {
2429                 NEXT_OFF(n) = val - n;
2430             }
2431             stopnow = 1;
2432         }
2433 #endif
2434     }
2435
2436     if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2437     /*
2438     Two problematic code points in Unicode casefolding of EXACT nodes:
2439
2440     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2441     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2442
2443     which casefold to
2444
2445     Unicode                      UTF-8
2446
2447     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2448     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2449
2450     This means that in case-insensitive matching (or "loose matching",
2451     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2452     length of the above casefolded versions) can match a target string
2453     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2454     This would rather mess up the minimum length computation.
2455
2456     What we'll do is to look for the tail four bytes, and then peek
2457     at the preceding two bytes to see whether we need to decrease
2458     the minimum length by four (six minus two).
2459
2460     Thanks to the design of UTF-8, there cannot be false matches:
2461     A sequence of valid UTF-8 bytes cannot be a subsequence of
2462     another valid sequence of UTF-8 bytes.
2463
2464     */
2465          char * const s0 = STRING(scan), *s, *t;
2466          char * const s1 = s0 + STR_LEN(scan) - 1;
2467          char * const s2 = s1 - 4;
2468 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2469          const char t0[] = "\xaf\x49\xaf\x42";
2470 #else
2471          const char t0[] = "\xcc\x88\xcc\x81";
2472 #endif
2473          const char * const t1 = t0 + 3;
2474
2475          for (s = s0 + 2;
2476               s < s2 && (t = ninstr(s, s1, t0, t1));
2477               s = t + 4) {
2478 #ifdef EBCDIC
2479               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2480                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2481 #else
2482               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2483                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2484 #endif
2485                    *min -= 4;
2486          }
2487     }
2488
2489 #ifdef DEBUGGING
2490     /* Allow dumping */
2491     n = scan + NODE_SZ_STR(scan);
2492     while (n <= stop) {
2493         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2494             OP(n) = OPTIMIZED;
2495             NEXT_OFF(n) = 0;
2496         }
2497         n++;
2498     }
2499 #endif
2500     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2501     return stopnow;
2502 }
2503
2504 /* REx optimizer.  Converts nodes into quickier variants "in place".
2505    Finds fixed substrings.  */
2506
2507 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2508    to the position after last scanned or to NULL. */
2509
2510 #define INIT_AND_WITHP \
2511     assert(!and_withp); \
2512     Newx(and_withp,1,struct regnode_charclass_class); \
2513     SAVEFREEPV(and_withp)
2514
2515 /* this is a chain of data about sub patterns we are processing that
2516    need to be handled seperately/specially in study_chunk. Its so
2517    we can simulate recursion without losing state.  */
2518 struct scan_frame;
2519 typedef struct scan_frame {
2520     regnode *last;  /* last node to process in this frame */
2521     regnode *next;  /* next node to process when last is reached */
2522     struct scan_frame *prev; /*previous frame*/
2523     I32 stop; /* what stopparen do we use */
2524 } scan_frame;
2525
2526
2527 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2528
2529 #define CASE_SYNST_FNC(nAmE)                                       \
2530 case nAmE:                                                         \
2531     if (flags & SCF_DO_STCLASS_AND) {                              \
2532             for (value = 0; value < 256; value++)                  \
2533                 if (!is_ ## nAmE ## _cp(value))                       \
2534                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2535     }                                                              \
2536     else {                                                         \
2537             for (value = 0; value < 256; value++)                  \
2538                 if (is_ ## nAmE ## _cp(value))                        \
2539                     ANYOF_BITMAP_SET(data->start_class, value);    \
2540     }                                                              \
2541     break;                                                         \
2542 case N ## nAmE:                                                    \
2543     if (flags & SCF_DO_STCLASS_AND) {                              \
2544             for (value = 0; value < 256; value++)                   \
2545                 if (is_ ## nAmE ## _cp(value))                         \
2546                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2547     }                                                               \
2548     else {                                                          \
2549             for (value = 0; value < 256; value++)                   \
2550                 if (!is_ ## nAmE ## _cp(value))                        \
2551                     ANYOF_BITMAP_SET(data->start_class, value);     \
2552     }                                                               \
2553     break
2554
2555
2556
2557 STATIC I32
2558 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2559                         I32 *minlenp, I32 *deltap,
2560                         regnode *last,
2561                         scan_data_t *data,
2562                         I32 stopparen,
2563                         U8* recursed,
2564                         struct regnode_charclass_class *and_withp,
2565                         U32 flags, U32 depth)
2566                         /* scanp: Start here (read-write). */
2567                         /* deltap: Write maxlen-minlen here. */
2568                         /* last: Stop before this one. */
2569                         /* data: string data about the pattern */
2570                         /* stopparen: treat close N as END */
2571                         /* recursed: which subroutines have we recursed into */
2572                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2573 {
2574     dVAR;
2575     I32 min = 0, pars = 0, code;
2576     regnode *scan = *scanp, *next;
2577     I32 delta = 0;
2578     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2579     int is_inf_internal = 0;            /* The studied chunk is infinite */
2580     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2581     scan_data_t data_fake;
2582     SV *re_trie_maxbuff = NULL;
2583     regnode *first_non_open = scan;
2584     I32 stopmin = I32_MAX;
2585     scan_frame *frame = NULL;
2586     GET_RE_DEBUG_FLAGS_DECL;
2587
2588     PERL_ARGS_ASSERT_STUDY_CHUNK;
2589
2590 #ifdef DEBUGGING
2591     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2592 #endif
2593
2594     if ( depth == 0 ) {
2595         while (first_non_open && OP(first_non_open) == OPEN)
2596             first_non_open=regnext(first_non_open);
2597     }
2598
2599
2600   fake_study_recurse:
2601     while ( scan && OP(scan) != END && scan < last ){
2602         /* Peephole optimizer: */
2603         DEBUG_STUDYDATA("Peep:", data,depth);
2604         DEBUG_PEEP("Peep",scan,depth);
2605         JOIN_EXACT(scan,&min,0);
2606
2607         /* Follow the next-chain of the current node and optimize
2608            away all the NOTHINGs from it.  */
2609         if (OP(scan) != CURLYX) {
2610             const int max = (reg_off_by_arg[OP(scan)]
2611                        ? I32_MAX
2612                        /* I32 may be smaller than U16 on CRAYs! */
2613                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2614             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2615             int noff;
2616             regnode *n = scan;
2617
2618             /* Skip NOTHING and LONGJMP. */
2619             while ((n = regnext(n))
2620                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2621                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2622                    && off + noff < max)
2623                 off += noff;
2624             if (reg_off_by_arg[OP(scan)])
2625                 ARG(scan) = off;
2626             else
2627                 NEXT_OFF(scan) = off;
2628         }
2629
2630
2631
2632         /* The principal pseudo-switch.  Cannot be a switch, since we
2633            look into several different things.  */
2634         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2635                    || OP(scan) == IFTHEN) {
2636             next = regnext(scan);
2637             code = OP(scan);
2638             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2639
2640             if (OP(next) == code || code == IFTHEN) {
2641                 /* NOTE - There is similar code to this block below for handling
2642                    TRIE nodes on a re-study.  If you change stuff here check there
2643                    too. */
2644                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2645                 struct regnode_charclass_class accum;
2646                 regnode * const startbranch=scan;
2647
2648                 if (flags & SCF_DO_SUBSTR)
2649                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2650                 if (flags & SCF_DO_STCLASS)
2651                     cl_init_zero(pRExC_state, &accum);
2652
2653                 while (OP(scan) == code) {
2654                     I32 deltanext, minnext, f = 0, fake;
2655                     struct regnode_charclass_class this_class;
2656
2657                     num++;
2658                     data_fake.flags = 0;
2659                     if (data) {
2660                         data_fake.whilem_c = data->whilem_c;
2661                         data_fake.last_closep = data->last_closep;
2662                     }
2663                     else
2664                         data_fake.last_closep = &fake;
2665
2666                     data_fake.pos_delta = delta;
2667                     next = regnext(scan);
2668                     scan = NEXTOPER(scan);
2669                     if (code != BRANCH)
2670                         scan = NEXTOPER(scan);
2671                     if (flags & SCF_DO_STCLASS) {
2672                         cl_init(pRExC_state, &this_class);
2673                         data_fake.start_class = &this_class;
2674                         f = SCF_DO_STCLASS_AND;
2675                     }
2676                     if (flags & SCF_WHILEM_VISITED_POS)
2677                         f |= SCF_WHILEM_VISITED_POS;
2678
2679                     /* we suppose the run is continuous, last=next...*/
2680                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2681                                           next, &data_fake,
2682                                           stopparen, recursed, NULL, f,depth+1);
2683                     if (min1 > minnext)
2684                         min1 = minnext;
2685                     if (max1 < minnext + deltanext)
2686                         max1 = minnext + deltanext;
2687                     if (deltanext == I32_MAX)
2688                         is_inf = is_inf_internal = 1;
2689                     scan = next;
2690                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2691                         pars++;
2692                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2693                         if ( stopmin > minnext)
2694                             stopmin = min + min1;
2695                         flags &= ~SCF_DO_SUBSTR;
2696                         if (data)
2697                             data->flags |= SCF_SEEN_ACCEPT;
2698                     }
2699                     if (data) {
2700                         if (data_fake.flags & SF_HAS_EVAL)
2701                             data->flags |= SF_HAS_EVAL;
2702                         data->whilem_c = data_fake.whilem_c;
2703                     }
2704                     if (flags & SCF_DO_STCLASS)
2705                         cl_or(pRExC_state, &accum, &this_class);
2706                 }
2707                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2708                     min1 = 0;
2709                 if (flags & SCF_DO_SUBSTR) {
2710                     data->pos_min += min1;
2711                     data->pos_delta += max1 - min1;
2712                     if (max1 != min1 || is_inf)
2713                         data->longest = &(data->longest_float);
2714                 }
2715                 min += min1;
2716                 delta += max1 - min1;
2717                 if (flags & SCF_DO_STCLASS_OR) {
2718                     cl_or(pRExC_state, data->start_class, &accum);
2719                     if (min1) {
2720                         cl_and(data->start_class, and_withp);
2721                         flags &= ~SCF_DO_STCLASS;
2722                     }
2723                 }
2724                 else if (flags & SCF_DO_STCLASS_AND) {
2725                     if (min1) {
2726                         cl_and(data->start_class, &accum);
2727                         flags &= ~SCF_DO_STCLASS;
2728                     }
2729                     else {
2730                         /* Switch to OR mode: cache the old value of
2731                          * data->start_class */
2732                         INIT_AND_WITHP;
2733                         StructCopy(data->start_class, and_withp,
2734                                    struct regnode_charclass_class);
2735                         flags &= ~SCF_DO_STCLASS_AND;
2736                         StructCopy(&accum, data->start_class,
2737                                    struct regnode_charclass_class);
2738                         flags |= SCF_DO_STCLASS_OR;
2739                         data->start_class->flags |= ANYOF_EOS;
2740                     }
2741                 }
2742
2743                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2744                 /* demq.
2745
2746                    Assuming this was/is a branch we are dealing with: 'scan' now
2747                    points at the item that follows the branch sequence, whatever
2748                    it is. We now start at the beginning of the sequence and look
2749                    for subsequences of
2750
2751                    BRANCH->EXACT=>x1
2752                    BRANCH->EXACT=>x2
2753                    tail
2754
2755                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2756
2757                    If we can find such a subseqence we need to turn the first
2758                    element into a trie and then add the subsequent branch exact
2759                    strings to the trie.
2760
2761                    We have two cases
2762
2763                      1. patterns where the whole set of branch can be converted.
2764
2765                      2. patterns where only a subset can be converted.
2766
2767                    In case 1 we can replace the whole set with a single regop
2768                    for the trie. In case 2 we need to keep the start and end
2769                    branchs so
2770
2771                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2772                      becomes BRANCH TRIE; BRANCH X;
2773
2774                   There is an additional case, that being where there is a
2775                   common prefix, which gets split out into an EXACT like node
2776                   preceding the TRIE node.
2777
2778                   If x(1..n)==tail then we can do a simple trie, if not we make
2779                   a "jump" trie, such that when we match the appropriate word
2780                   we "jump" to the appopriate tail node. Essentailly we turn
2781                   a nested if into a case structure of sorts.
2782
2783                 */
2784
2785                     int made=0;
2786                     if (!re_trie_maxbuff) {
2787                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2788                         if (!SvIOK(re_trie_maxbuff))
2789                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2790                     }
2791                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2792                         regnode *cur;
2793                         regnode *first = (regnode *)NULL;
2794                         regnode *last = (regnode *)NULL;
2795                         regnode *tail = scan;
2796                         U8 optype = 0;
2797                         U32 count=0;
2798
2799 #ifdef DEBUGGING
2800                         SV * const mysv = sv_newmortal();       /* for dumping */
2801 #endif
2802                         /* var tail is used because there may be a TAIL
2803                            regop in the way. Ie, the exacts will point to the
2804                            thing following the TAIL, but the last branch will
2805                            point at the TAIL. So we advance tail. If we
2806                            have nested (?:) we may have to move through several
2807                            tails.
2808                          */
2809
2810                         while ( OP( tail ) == TAIL ) {
2811                             /* this is the TAIL generated by (?:) */
2812                             tail = regnext( tail );
2813                         }
2814
2815
2816                         DEBUG_OPTIMISE_r({
2817                             regprop(RExC_rx, mysv, tail );
2818                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2819                                 (int)depth * 2 + 2, "",
2820                                 "Looking for TRIE'able sequences. Tail node is: ",
2821                                 SvPV_nolen_const( mysv )
2822                             );
2823                         });
2824
2825                         /*
2826
2827                            step through the branches, cur represents each
2828                            branch, noper is the first thing to be matched
2829                            as part of that branch and noper_next is the
2830                            regnext() of that node. if noper is an EXACT
2831                            and noper_next is the same as scan (our current
2832                            position in the regex) then the EXACT branch is
2833                            a possible optimization target. Once we have
2834                            two or more consequetive such branches we can
2835                            create a trie of the EXACT's contents and stich
2836                            it in place. If the sequence represents all of
2837                            the branches we eliminate the whole thing and
2838                            replace it with a single TRIE. If it is a
2839                            subsequence then we need to stitch it in. This
2840                            means the first branch has to remain, and needs
2841                            to be repointed at the item on the branch chain
2842                            following the last branch optimized. This could
2843                            be either a BRANCH, in which case the
2844                            subsequence is internal, or it could be the
2845                            item following the branch sequence in which
2846                            case the subsequence is at the end.
2847
2848                         */
2849
2850                         /* dont use tail as the end marker for this traverse */
2851                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2852                             regnode * const noper = NEXTOPER( cur );
2853 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2854                             regnode * const noper_next = regnext( noper );
2855 #endif
2856
2857                             DEBUG_OPTIMISE_r({
2858                                 regprop(RExC_rx, mysv, cur);
2859                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2860                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2861
2862                                 regprop(RExC_rx, mysv, noper);
2863                                 PerlIO_printf( Perl_debug_log, " -> %s",
2864                                     SvPV_nolen_const(mysv));
2865
2866                                 if ( noper_next ) {
2867                                   regprop(RExC_rx, mysv, noper_next );
2868                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2869                                     SvPV_nolen_const(mysv));
2870                                 }
2871                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2872                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2873                             });
2874                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2875                                          : PL_regkind[ OP( noper ) ] == EXACT )
2876                                   || OP(noper) == NOTHING )
2877 #ifdef NOJUMPTRIE
2878                                   && noper_next == tail
2879 #endif
2880                                   && count < U16_MAX)
2881                             {
2882                                 count++;
2883                                 if ( !first || optype == NOTHING ) {
2884                                     if (!first) first = cur;
2885                                     optype = OP( noper );
2886                                 } else {
2887                                     last = cur;
2888                                 }
2889                             } else {
2890 /*
2891     Currently we do not believe that the trie logic can
2892     handle case insensitive matching properly when the
2893     pattern is not unicode (thus forcing unicode semantics).
2894
2895     If/when this is fixed the following define can be swapped
2896     in below to fully enable trie logic.
2897
2898 #define TRIE_TYPE_IS_SAFE 1
2899
2900 */
2901 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2902
2903                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2904                                     make_trie( pRExC_state,
2905                                             startbranch, first, cur, tail, count,
2906                                             optype, depth+1 );
2907                                 }
2908                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2909 #ifdef NOJUMPTRIE
2910                                      && noper_next == tail
2911 #endif
2912                                 ){
2913                                     count = 1;
2914                                     first = cur;
2915                                     optype = OP( noper );
2916                                 } else {
2917                                     count = 0;
2918                                     first = NULL;
2919                                     optype = 0;
2920                                 }
2921                                 last = NULL;
2922                             }
2923                         }
2924                         DEBUG_OPTIMISE_r({
2925                             regprop(RExC_rx, mysv, cur);
2926                             PerlIO_printf( Perl_debug_log,
2927                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2928                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2929
2930                         });
2931
2932                         if ( last && TRIE_TYPE_IS_SAFE ) {
2933                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2934 #ifdef TRIE_STUDY_OPT
2935                             if ( ((made == MADE_EXACT_TRIE &&
2936                                  startbranch == first)
2937                                  || ( first_non_open == first )) &&
2938                                  depth==0 ) {
2939                                 flags |= SCF_TRIE_RESTUDY;
2940                                 if ( startbranch == first
2941                                      && scan == tail )
2942                                 {
2943                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2944                                 }
2945                             }
2946 #endif
2947                         }
2948                     }
2949
2950                 } /* do trie */
2951
2952             }
2953             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2954                 scan = NEXTOPER(NEXTOPER(scan));
2955             } else                      /* single branch is optimized. */
2956                 scan = NEXTOPER(scan);
2957             continue;
2958         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2959             scan_frame *newframe = NULL;
2960             I32 paren;
2961             regnode *start;
2962             regnode *end;
2963
2964             if (OP(scan) != SUSPEND) {
2965             /* set the pointer */
2966                 if (OP(scan) == GOSUB) {
2967                     paren = ARG(scan);
2968                     RExC_recurse[ARG2L(scan)] = scan;
2969                     start = RExC_open_parens[paren-1];
2970                     end   = RExC_close_parens[paren-1];
2971                 } else {
2972                     paren = 0;
2973                     start = RExC_rxi->program + 1;
2974                     end   = RExC_opend;
2975                 }
2976                 if (!recursed) {
2977                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2978                     SAVEFREEPV(recursed);
2979                 }
2980                 if (!PAREN_TEST(recursed,paren+1)) {
2981                     PAREN_SET(recursed,paren+1);
2982                     Newx(newframe,1,scan_frame);
2983                 } else {
2984                     if (flags & SCF_DO_SUBSTR) {
2985                         SCAN_COMMIT(pRExC_state,data,minlenp);
2986                         data->longest = &(data->longest_float);
2987                     }
2988                     is_inf = is_inf_internal = 1;
2989                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2990                         cl_anything(pRExC_state, data->start_class);
2991                     flags &= ~SCF_DO_STCLASS;
2992                 }
2993             } else {
2994                 Newx(newframe,1,scan_frame);
2995                 paren = stopparen;
2996                 start = scan+2;
2997                 end = regnext(scan);
2998             }
2999             if (newframe) {
3000                 assert(start);
3001                 assert(end);
3002                 SAVEFREEPV(newframe);
3003                 newframe->next = regnext(scan);
3004                 newframe->last = last;
3005                 newframe->stop = stopparen;
3006                 newframe->prev = frame;
3007
3008                 frame = newframe;
3009                 scan =  start;
3010                 stopparen = paren;
3011                 last = end;
3012
3013                 continue;
3014             }
3015         }
3016         else if (OP(scan) == EXACT) {
3017             I32 l = STR_LEN(scan);
3018             UV uc;
3019             if (UTF) {
3020                 const U8 * const s = (U8*)STRING(scan);
3021                 l = utf8_length(s, s + l);
3022                 uc = utf8_to_uvchr(s, NULL);
3023             } else {
3024                 uc = *((U8*)STRING(scan));
3025             }
3026             min += l;
3027             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3028                 /* The code below prefers earlier match for fixed
3029                    offset, later match for variable offset.  */
3030                 if (data->last_end == -1) { /* Update the start info. */
3031                     data->last_start_min = data->pos_min;
3032                     data->last_start_max = is_inf
3033                         ? I32_MAX : data->pos_min + data->pos_delta;
3034                 }
3035                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3036                 if (UTF)
3037                     SvUTF8_on(data->last_found);
3038                 {
3039                     SV * const sv = data->last_found;
3040                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3041                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3042                     if (mg && mg->mg_len >= 0)
3043                         mg->mg_len += utf8_length((U8*)STRING(scan),
3044                                                   (U8*)STRING(scan)+STR_LEN(scan));
3045                 }
3046                 data->last_end = data->pos_min + l;
3047                 data->pos_min += l; /* As in the first entry. */
3048                 data->flags &= ~SF_BEFORE_EOL;
3049             }
3050             if (flags & SCF_DO_STCLASS_AND) {
3051                 /* Check whether it is compatible with what we know already! */
3052                 int compat = 1;
3053
3054                 if (uc >= 0x100 ||
3055                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3056                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3057                     && (!(data->start_class->flags & ANYOF_FOLD)
3058                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3059                     )
3060                     compat = 0;
3061                 ANYOF_CLASS_ZERO(data->start_class);
3062                 ANYOF_BITMAP_ZERO(data->start_class);
3063                 if (compat)
3064                     ANYOF_BITMAP_SET(data->start_class, uc);
3065                 data->start_class->flags &= ~ANYOF_EOS;
3066                 if (uc < 0x100)
3067                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3068             }
3069             else if (flags & SCF_DO_STCLASS_OR) {
3070                 /* false positive possible if the class is case-folded */
3071                 if (uc < 0x100)
3072                     ANYOF_BITMAP_SET(data->start_class, uc);
3073                 else
3074                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3075                 data->start_class->flags &= ~ANYOF_EOS;
3076                 cl_and(data->start_class, and_withp);
3077             }
3078             flags &= ~SCF_DO_STCLASS;
3079         }
3080         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3081             I32 l = STR_LEN(scan);
3082             UV uc = *((U8*)STRING(scan));
3083
3084             /* Search for fixed substrings supports EXACT only. */
3085             if (flags & SCF_DO_SUBSTR) {
3086                 assert(data);
3087                 SCAN_COMMIT(pRExC_state, data, minlenp);
3088             }
3089             if (UTF) {
3090                 const U8 * const s = (U8 *)STRING(scan);
3091                 l = utf8_length(s, s + l);
3092                 uc = utf8_to_uvchr(s, NULL);
3093             }
3094             min += l;
3095             if (flags & SCF_DO_SUBSTR)
3096                 data->pos_min += l;
3097             if (flags & SCF_DO_STCLASS_AND) {
3098                 /* Check whether it is compatible with what we know already! */
3099                 int compat = 1;
3100
3101                 if (uc >= 0x100 ||
3102                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3103                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3104                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3105                     compat = 0;
3106                 ANYOF_CLASS_ZERO(data->start_class);
3107                 ANYOF_BITMAP_ZERO(data->start_class);
3108                 if (compat) {
3109                     ANYOF_BITMAP_SET(data->start_class, uc);
3110                     data->start_class->flags &= ~ANYOF_EOS;
3111                     data->start_class->flags |= ANYOF_FOLD;
3112                     if (OP(scan) == EXACTFL)
3113                         data->start_class->flags |= ANYOF_LOCALE;
3114                 }
3115             }
3116             else if (flags & SCF_DO_STCLASS_OR) {
3117                 if (data->start_class->flags & ANYOF_FOLD) {
3118                     /* false positive possible if the class is case-folded.
3119                        Assume that the locale settings are the same... */
3120                     if (uc < 0x100)
3121                         ANYOF_BITMAP_SET(data->start_class, uc);
3122                     data->start_class->flags &= ~ANYOF_EOS;
3123                 }
3124                 cl_and(data->start_class, and_withp);
3125             }
3126             flags &= ~SCF_DO_STCLASS;
3127         }
3128         else if (REGNODE_VARIES(OP(scan))) {
3129             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3130             I32 f = flags, pos_before = 0;
3131             regnode * const oscan = scan;
3132             struct regnode_charclass_class this_class;
3133             struct regnode_charclass_class *oclass = NULL;
3134             I32 next_is_eval = 0;
3135
3136             switch (PL_regkind[OP(scan)]) {
3137             case WHILEM:                /* End of (?:...)* . */
3138                 scan = NEXTOPER(scan);
3139                 goto finish;
3140             case PLUS:
3141                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3142                     next = NEXTOPER(scan);
3143                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3144                         mincount = 1;
3145                         maxcount = REG_INFTY;
3146                         next = regnext(scan);
3147                         scan = NEXTOPER(scan);
3148                         goto do_curly;
3149                     }
3150                 }
3151                 if (flags & SCF_DO_SUBSTR)
3152                     data->pos_min++;
3153                 min++;
3154                 /* Fall through. */
3155             case STAR:
3156                 if (flags & SCF_DO_STCLASS) {
3157                     mincount = 0;
3158                     maxcount = REG_INFTY;
3159                     next = regnext(scan);
3160                     scan = NEXTOPER(scan);
3161                     goto do_curly;
3162                 }
3163                 is_inf = is_inf_internal = 1;
3164                 scan = regnext(scan);
3165                 if (flags & SCF_DO_SUBSTR) {
3166                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3167                     data->longest = &(data->longest_float);
3168                 }
3169                 goto optimize_curly_tail;
3170             case CURLY:
3171                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3172                     && (scan->flags == stopparen))
3173                 {
3174                     mincount = 1;
3175                     maxcount = 1;
3176                 } else {
3177                     mincount = ARG1(scan);
3178                     maxcount = ARG2(scan);
3179                 }
3180                 next = regnext(scan);
3181                 if (OP(scan) == CURLYX) {
3182                     I32 lp = (data ? *(data->last_closep) : 0);
3183                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3184                 }
3185                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3186                 next_is_eval = (OP(scan) == EVAL);
3187               do_curly:
3188                 if (flags & SCF_DO_SUBSTR) {
3189                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3190                     pos_before = data->pos_min;
3191                 }
3192                 if (data) {
3193                     fl = data->flags;
3194                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3195                     if (is_inf)
3196                         data->flags |= SF_IS_INF;
3197                 }
3198                 if (flags & SCF_DO_STCLASS) {
3199                     cl_init(pRExC_state, &this_class);
3200                     oclass = data->start_class;
3201                     data->start_class = &this_class;
3202                     f |= SCF_DO_STCLASS_AND;
3203                     f &= ~SCF_DO_STCLASS_OR;
3204                 }
3205                 /* These are the cases when once a subexpression
3206                    fails at a particular position, it cannot succeed
3207                    even after backtracking at the enclosing scope.
3208
3209                    XXXX what if minimal match and we are at the
3210                         initial run of {n,m}? */
3211                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3212                     f &= ~SCF_WHILEM_VISITED_POS;
3213
3214                 /* This will finish on WHILEM, setting scan, or on NULL: */
3215                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3216                                       last, data, stopparen, recursed, NULL,
3217                                       (mincount == 0
3218                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3219
3220                 if (flags & SCF_DO_STCLASS)
3221                     data->start_class = oclass;
3222                 if (mincount == 0 || minnext == 0) {
3223                     if (flags & SCF_DO_STCLASS_OR) {
3224                         cl_or(pRExC_state, data->start_class, &this_class);
3225                     }
3226                     else if (flags & SCF_DO_STCLASS_AND) {
3227                         /* Switch to OR mode: cache the old value of
3228                          * data->start_class */
3229                         INIT_AND_WITHP;
3230                         StructCopy(data->start_class, and_withp,
3231                                    struct regnode_charclass_class);
3232                         flags &= ~SCF_DO_STCLASS_AND;
3233                         StructCopy(&this_class, data->start_class,
3234                                    struct regnode_charclass_class);
3235                         flags |= SCF_DO_STCLASS_OR;
3236                         data->start_class->flags |= ANYOF_EOS;
3237                     }
3238                 } else {                /* Non-zero len */
3239                     if (flags & SCF_DO_STCLASS_OR) {
3240                         cl_or(pRExC_state, data->start_class, &this_class);
3241                         cl_and(data->start_class, and_withp);
3242                     }
3243                     else if (flags & SCF_DO_STCLASS_AND)
3244                         cl_and(data->start_class, &this_class);
3245                     flags &= ~SCF_DO_STCLASS;
3246                 }
3247                 if (!scan)              /* It was not CURLYX, but CURLY. */
3248                     scan = next;
3249                 if ( /* ? quantifier ok, except for (?{ ... }) */
3250                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3251                     && (minnext == 0) && (deltanext == 0)
3252                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3253                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3254                 {
3255                     ckWARNreg(RExC_parse,
3256                               "Quantifier unexpected on zero-length expression");
3257                 }
3258
3259                 min += minnext * mincount;
3260                 is_inf_internal |= ((maxcount == REG_INFTY
3261                                      && (minnext + deltanext) > 0)
3262                                     || deltanext == I32_MAX);
3263                 is_inf |= is_inf_internal;
3264                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3265
3266                 /* Try powerful optimization CURLYX => CURLYN. */
3267                 if (  OP(oscan) == CURLYX && data
3268                       && data->flags & SF_IN_PAR
3269                       && !(data->flags & SF_HAS_EVAL)
3270                       && !deltanext && minnext == 1 ) {
3271                     /* Try to optimize to CURLYN.  */
3272                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3273                     regnode * const nxt1 = nxt;
3274 #ifdef DEBUGGING
3275                     regnode *nxt2;
3276 #endif
3277
3278                     /* Skip open. */
3279                     nxt = regnext(nxt);
3280                     if (!REGNODE_SIMPLE(OP(nxt))
3281                         && !(PL_regkind[OP(nxt)] == EXACT
3282                              && STR_LEN(nxt) == 1))
3283                         goto nogo;
3284 #ifdef DEBUGGING
3285                     nxt2 = nxt;
3286 #endif
3287                     nxt = regnext(nxt);
3288                     if (OP(nxt) != CLOSE)
3289                         goto nogo;
3290                     if (RExC_open_parens) {
3291                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3292                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3293                     }
3294                     /* Now we know that nxt2 is the only contents: */
3295                     oscan->flags = (U8)ARG(nxt);
3296                     OP(oscan) = CURLYN;
3297                     OP(nxt1) = NOTHING; /* was OPEN. */
3298
3299 #ifdef DEBUGGING
3300                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3301                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3302                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3303                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3304                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3305                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3306 #endif
3307                 }
3308               nogo:
3309
3310                 /* Try optimization CURLYX => CURLYM. */
3311                 if (  OP(oscan) == CURLYX && data
3312                       && !(data->flags & SF_HAS_PAR)
3313                       && !(data->flags & SF_HAS_EVAL)
3314                       && !deltanext     /* atom is fixed width */
3315                       && minnext != 0   /* CURLYM can't handle zero width */
3316                 ) {
3317                     /* XXXX How to optimize if data == 0? */
3318                     /* Optimize to a simpler form.  */
3319                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3320                     regnode *nxt2;
3321
3322                     OP(oscan) = CURLYM;
3323                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3324                             && (OP(nxt2) != WHILEM))
3325                         nxt = nxt2;
3326                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3327                     /* Need to optimize away parenths. */
3328                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3329                         /* Set the parenth number.  */
3330                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3331
3332                         oscan->flags = (U8)ARG(nxt);
3333                         if (RExC_open_parens) {
3334                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3335                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3336                         }
3337                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3338                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3339
3340 #ifdef DEBUGGING
3341                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3342                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3343                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3344                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3345 #endif
3346 #if 0
3347                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3348                             regnode *nnxt = regnext(nxt1);
3349
3350                             if (nnxt == nxt) {
3351                                 if (reg_off_by_arg[OP(nxt1)])
3352                                     ARG_SET(nxt1, nxt2 - nxt1);
3353                                 else if (nxt2 - nxt1 < U16_MAX)
3354                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3355                                 else
3356                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3357                             }
3358                             nxt1 = nnxt;
3359                         }
3360 #endif
3361                         /* Optimize again: */
3362                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3363                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3364                     }
3365                     else
3366                         oscan->flags = 0;
3367                 }
3368                 else if ((OP(oscan) == CURLYX)
3369                          && (flags & SCF_WHILEM_VISITED_POS)
3370                          /* See the comment on a similar expression above.
3371                             However, this time it not a subexpression
3372                             we care about, but the expression itself. */
3373                          && (maxcount == REG_INFTY)
3374                          && data && ++data->whilem_c < 16) {
3375                     /* This stays as CURLYX, we can put the count/of pair. */
3376                     /* Find WHILEM (as in regexec.c) */
3377                     regnode *nxt = oscan + NEXT_OFF(oscan);
3378
3379                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3380                         nxt += ARG(nxt);
3381                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3382                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3383                 }
3384                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3385                     pars++;
3386                 if (flags & SCF_DO_SUBSTR) {
3387                     SV *last_str = NULL;
3388                     int counted = mincount != 0;
3389
3390                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3391 #if defined(SPARC64_GCC_WORKAROUND)
3392                         I32 b = 0;
3393                         STRLEN l = 0;
3394                         const char *s = NULL;
3395                         I32 old = 0;
3396
3397                         if (pos_before >= data->last_start_min)
3398                             b = pos_before;
3399                         else
3400                             b = data->last_start_min;
3401
3402                         l = 0;
3403                         s = SvPV_const(data->last_found, l);
3404                         old = b - data->last_start_min;
3405
3406 #else
3407                         I32 b = pos_before >= data->last_start_min
3408                             ? pos_before : data->last_start_min;
3409                         STRLEN l;
3410                         const char * const s = SvPV_const(data->last_found, l);
3411                         I32 old = b - data->last_start_min;
3412 #endif
3413
3414                         if (UTF)
3415                             old = utf8_hop((U8*)s, old) - (U8*)s;
3416
3417                         l -= old;
3418                         /* Get the added string: */
3419                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3420                         if (deltanext == 0 && pos_before == b) {
3421                             /* What was added is a constant string */
3422                             if (mincount > 1) {
3423                                 SvGROW(last_str, (mincount * l) + 1);
3424                                 repeatcpy(SvPVX(last_str) + l,
3425                                           SvPVX_const(last_str), l, mincount - 1);
3426                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3427                                 /* Add additional parts. */
3428                                 SvCUR_set(data->last_found,
3429                                           SvCUR(data->last_found) - l);
3430                                 sv_catsv(data->last_found, last_str);
3431                                 {
3432                                     SV * sv = data->last_found;
3433                                     MAGIC *mg =
3434                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3435                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3436                                     if (mg && mg->mg_len >= 0)
3437                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3438                                 }
3439                                 data->last_end += l * (mincount - 1);
3440                             }
3441                         } else {
3442                             /* start offset must point into the last copy */
3443                             data->last_start_min += minnext * (mincount - 1);
3444                             data->last_start_max += is_inf ? I32_MAX
3445                                 : (maxcount - 1) * (minnext + data->pos_delta);
3446                         }
3447                     }
3448                     /* It is counted once already... */
3449                     data->pos_min += minnext * (mincount - counted);
3450                     data->pos_delta += - counted * deltanext +
3451                         (minnext + deltanext) * maxcount - minnext * mincount;
3452                     if (mincount != maxcount) {
3453                          /* Cannot extend fixed substrings found inside
3454                             the group.  */
3455                         SCAN_COMMIT(pRExC_state,data,minlenp);
3456                         if (mincount && last_str) {
3457                             SV * const sv = data->last_found;
3458                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3459                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3460
3461                             if (mg)
3462                                 mg->mg_len = -1;
3463                             sv_setsv(sv, last_str);
3464                             data->last_end = data->pos_min;
3465                             data->last_start_min =
3466                                 data->pos_min - CHR_SVLEN(last_str);
3467                             data->last_start_max = is_inf
3468                                 ? I32_MAX
3469                                 : data->pos_min + data->pos_delta
3470                                 - CHR_SVLEN(last_str);
3471                         }
3472                         data->longest = &(data->longest_float);
3473                     }
3474                     SvREFCNT_dec(last_str);
3475                 }
3476                 if (data && (fl & SF_HAS_EVAL))
3477                     data->flags |= SF_HAS_EVAL;
3478               optimize_curly_tail:
3479                 if (OP(oscan) != CURLYX) {
3480                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3481                            && NEXT_OFF(next))
3482                         NEXT_OFF(oscan) += NEXT_OFF(next);
3483                 }
3484                 continue;
3485             default:                    /* REF and CLUMP only? */
3486                 if (flags & SCF_DO_SUBSTR) {
3487                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3488                     data->longest = &(data->longest_float);
3489                 }
3490                 is_inf = is_inf_internal = 1;
3491                 if (flags & SCF_DO_STCLASS_OR)
3492                     cl_anything(pRExC_state, data->start_class);
3493                 flags &= ~SCF_DO_STCLASS;
3494                 break;
3495             }
3496         }
3497         else if (OP(scan) == LNBREAK) {
3498             if (flags & SCF_DO_STCLASS) {
3499                 int value = 0;
3500                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3501                 if (flags & SCF_DO_STCLASS_AND) {
3502                     for (value = 0; value < 256; value++)
3503                         if (!is_VERTWS_cp(value))
3504                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3505                 }
3506                 else {
3507                     for (value = 0; value < 256; value++)
3508                         if (is_VERTWS_cp(value))
3509                             ANYOF_BITMAP_SET(data->start_class, value);
3510                 }
3511                 if (flags & SCF_DO_STCLASS_OR)
3512                     cl_and(data->start_class, and_withp);
3513                 flags &= ~SCF_DO_STCLASS;
3514             }
3515             min += 1;
3516             delta += 1;
3517             if (flags & SCF_DO_SUBSTR) {
3518                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3519                 data->pos_min += 1;
3520                 data->pos_delta += 1;
3521                 data->longest = &(data->longest_float);
3522             }
3523
3524         }
3525         else if (OP(scan) == FOLDCHAR) {
3526             int d = ARG(scan)==0xDF ? 1 : 2;
3527             flags &= ~SCF_DO_STCLASS;
3528             min += 1;
3529             delta += d;
3530             if (flags & SCF_DO_SUBSTR) {
3531                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3532                 data->pos_min += 1;
3533                 data->pos_delta += d;
3534                 data->longest = &(data->longest_float);
3535             }
3536         }
3537         else if (REGNODE_SIMPLE(OP(scan))) {
3538             int value = 0;
3539
3540             if (flags & SCF_DO_SUBSTR) {
3541                 SCAN_COMMIT(pRExC_state,data,minlenp);
3542                 data->pos_min++;
3543             }
3544             min++;
3545             if (flags & SCF_DO_STCLASS) {
3546                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3547
3548                 /* Some of the logic below assumes that switching
3549                    locale on will only add false positives. */
3550                 switch (PL_regkind[OP(scan)]) {
3551                 case SANY:
3552                 default:
3553                   do_default:
3554                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3555                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3556                         cl_anything(pRExC_state, data->start_class);
3557                     break;
3558                 case REG_ANY:
3559                     if (OP(scan) == SANY)
3560                         goto do_default;
3561                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3562                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3563                                  || (data->start_class->flags & ANYOF_CLASS));
3564                         cl_anything(pRExC_state, data->start_class);
3565                     }
3566                     if (flags & SCF_DO_STCLASS_AND || !value)
3567                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3568                     break;
3569                 case ANYOF:
3570                     if (flags & SCF_DO_STCLASS_AND)
3571                         cl_and(data->start_class,
3572                                (struct regnode_charclass_class*)scan);
3573                     else
3574                         cl_or(pRExC_state, data->start_class,
3575                               (struct regnode_charclass_class*)scan);
3576                     break;
3577                 case ALNUM:
3578                     if (flags & SCF_DO_STCLASS_AND) {
3579                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3580                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3581                             for (value = 0; value < 256; value++)
3582                                 if (!isALNUM(value))
3583                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3584                         }
3585                     }
3586                     else {
3587                         if (data->start_class->flags & ANYOF_LOCALE)
3588                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3589                         else {
3590                             for (value = 0; value < 256; value++)
3591                                 if (isALNUM(value))
3592                                     ANYOF_BITMAP_SET(data->start_class, value);
3593                         }
3594                     }
3595                     break;
3596                 case ALNUML:
3597                     if (flags & SCF_DO_STCLASS_AND) {
3598                         if (data->start_class->flags & ANYOF_LOCALE)
3599                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3600                     }
3601                     else {
3602                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3603                         data->start_class->flags |= ANYOF_LOCALE;
3604                     }
3605                     break;
3606                 case NALNUM:
3607                     if (flags & SCF_DO_STCLASS_AND) {
3608                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3609                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3610                             for (value = 0; value < 256; value++)
3611                                 if (isALNUM(value))
3612                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3613                         }
3614                     }
3615                     else {
3616                         if (data->start_class->flags & ANYOF_LOCALE)
3617                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3618                         else {
3619                             for (value = 0; value < 256; value++)
3620                                 if (!isALNUM(value))
3621                                     ANYOF_BITMAP_SET(data->start_class, value);
3622                         }
3623                     }
3624                     break;
3625                 case NALNUML:
3626                     if (flags & SCF_DO_STCLASS_AND) {
3627                         if (data->start_class->flags & ANYOF_LOCALE)
3628                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3629                     }
3630                     else {
3631                         data->start_class->flags |= ANYOF_LOCALE;
3632                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3633                     }
3634                     break;
3635                 case SPACE:
3636                     if (flags & SCF_DO_STCLASS_AND) {
3637                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3638                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3639                             for (value = 0; value < 256; value++)
3640                                 if (!isSPACE(value))
3641                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3642                         }
3643                     }
3644                     else {
3645                         if (data->start_class->flags & ANYOF_LOCALE)
3646                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3647                         else {
3648                             for (value = 0; value < 256; value++)
3649                                 if (isSPACE(value))
3650                                     ANYOF_BITMAP_SET(data->start_class, value);
3651                         }
3652                     }
3653                     break;
3654                 case SPACEL:
3655                     if (flags & SCF_DO_STCLASS_AND) {
3656                         if (data->start_class->flags & ANYOF_LOCALE)
3657                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3658                     }
3659                     else {
3660                         data->start_class->flags |= ANYOF_LOCALE;
3661                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3662                     }
3663                     break;
3664                 case NSPACE:
3665                     if (flags & SCF_DO_STCLASS_AND) {
3666                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3667                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3668                             for (value = 0; value < 256; value++)
3669                                 if (isSPACE(value))
3670                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3671                         }
3672                     }
3673                     else {
3674                         if (data->start_class->flags & ANYOF_LOCALE)
3675                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3676                         else {
3677                             for (value = 0; value < 256; value++)
3678                                 if (!isSPACE(value))
3679                                     ANYOF_BITMAP_SET(data->start_class, value);
3680                         }
3681                     }
3682                     break;
3683                 case NSPACEL:
3684                     if (flags & SCF_DO_STCLASS_AND) {
3685                         if (data->start_class->flags & ANYOF_LOCALE) {
3686                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3687                             for (value = 0; value < 256; value++)
3688                                 if (!isSPACE(value))
3689                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3690                         }
3691                     }
3692                     else {
3693                         data->start_class->flags |= ANYOF_LOCALE;
3694                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3695                     }
3696                     break;
3697                 case DIGIT:
3698                     if (flags & SCF_DO_STCLASS_AND) {
3699                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3700                         for (value = 0; value < 256; value++)
3701                             if (!isDIGIT(value))
3702                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3703                     }
3704                     else {
3705                         if (data->start_class->flags & ANYOF_LOCALE)
3706                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3707                         else {
3708                             for (value = 0; value < 256; value++)
3709                                 if (isDIGIT(value))
3710                                     ANYOF_BITMAP_SET(data->start_class, value);
3711                         }
3712                     }
3713                     break;
3714                 case NDIGIT:
3715                     if (flags & SCF_DO_STCLASS_AND) {
3716                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3717                         for (value = 0; value < 256; value++)
3718                             if (isDIGIT(value))
3719                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3720                     }
3721                     else {
3722                         if (data->start_class->flags & ANYOF_LOCALE)
3723                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3724                         else {
3725                             for (value = 0; value < 256; value++)
3726                                 if (!isDIGIT(value))
3727                                     ANYOF_BITMAP_SET(data->start_class, value);
3728                         }
3729                     }
3730                     break;
3731                 CASE_SYNST_FNC(VERTWS);
3732                 CASE_SYNST_FNC(HORIZWS);
3733
3734                 }
3735                 if (flags & SCF_DO_STCLASS_OR)
3736                     cl_and(data->start_class, and_withp);
3737                 flags &= ~SCF_DO_STCLASS;
3738             }
3739         }
3740         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3741             data->flags |= (OP(scan) == MEOL
3742                             ? SF_BEFORE_MEOL
3743                             : SF_BEFORE_SEOL);
3744         }
3745         else if (  PL_regkind[OP(scan)] == BRANCHJ
3746                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3747                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3748                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3749             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3750                 || OP(scan) == UNLESSM )
3751             {
3752                 /* Negative Lookahead/lookbehind
3753                    In this case we can't do fixed string optimisation.
3754                 */
3755
3756                 I32 deltanext, minnext, fake = 0;
3757                 regnode *nscan;
3758                 struct regnode_charclass_class intrnl;
3759                 int f = 0;
3760
3761                 data_fake.flags = 0;
3762                 if (data) {
3763                     data_fake.whilem_c = data->whilem_c;
3764                     data_fake.last_closep = data->last_closep;
3765                 }
3766                 else
3767                     data_fake.last_closep = &fake;
3768                 data_fake.pos_delta = delta;
3769                 if ( flags & SCF_DO_STCLASS && !scan->flags
3770                      && OP(scan) == IFMATCH ) { /* Lookahead */
3771                     cl_init(pRExC_state, &intrnl);
3772                     data_fake.start_class = &intrnl;
3773                     f |= SCF_DO_STCLASS_AND;
3774                 }
3775                 if (flags & SCF_WHILEM_VISITED_POS)
3776                     f |= SCF_WHILEM_VISITED_POS;
3777                 next = regnext(scan);
3778                 nscan = NEXTOPER(NEXTOPER(scan));
3779                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3780                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3781                 if (scan->flags) {
3782                     if (deltanext) {
3783                         FAIL("Variable length lookbehind not implemented");
3784                     }
3785                     else if (minnext > (I32)U8_MAX) {
3786                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3787                     }
3788                     scan->flags = (U8)minnext;
3789                 }
3790                 if (data) {
3791                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3792                         pars++;
3793                     if (data_fake.flags & SF_HAS_EVAL)
3794                         data->flags |= SF_HAS_EVAL;
3795                     data->whilem_c = data_fake.whilem_c;
3796                 }
3797                 if (f & SCF_DO_STCLASS_AND) {
3798                     if (flags & SCF_DO_STCLASS_OR) {
3799                         /* OR before, AND after: ideally we would recurse with
3800                          * data_fake to get the AND applied by study of the
3801                          * remainder of the pattern, and then derecurse;
3802                          * *** HACK *** for now just treat as "no information".
3803                          * See [perl #56690].
3804                          */
3805                         cl_init(pRExC_state, data->start_class);
3806                     }  else {
3807                         /* AND before and after: combine and continue */
3808                         const int was = (data->start_class->flags & ANYOF_EOS);
3809
3810                         cl_and(data->start_class, &intrnl);
3811                         if (was)
3812                             data->start_class->flags |= ANYOF_EOS;
3813                     }
3814                 }
3815             }
3816 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3817             else {
3818                 /* Positive Lookahead/lookbehind
3819                    In this case we can do fixed string optimisation,
3820                    but we must be careful about it. Note in the case of
3821                    lookbehind the positions will be offset by the minimum
3822                    length of the pattern, something we won't know about
3823                    until after the recurse.
3824                 */
3825                 I32 deltanext, fake = 0;
3826                 regnode *nscan;
3827                 struct regnode_charclass_class intrnl;
3828                 int f = 0;
3829                 /* We use SAVEFREEPV so that when the full compile
3830                     is finished perl will clean up the allocated
3831                     minlens when its all done. This was we don't
3832                     have to worry about freeing them when we know
3833                     they wont be used, which would be a pain.
3834                  */
3835                 I32 *minnextp;
3836                 Newx( minnextp, 1, I32 );
3837                 SAVEFREEPV(minnextp);
3838
3839                 if (data) {
3840                     StructCopy(data, &data_fake, scan_data_t);
3841                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3842                         f |= SCF_DO_SUBSTR;
3843                         if (scan->flags)
3844                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3845                         data_fake.last_found=newSVsv(data->last_found);
3846                     }
3847                 }
3848                 else
3849                     data_fake.last_closep = &fake;
3850                 data_fake.flags = 0;
3851                 data_fake.pos_delta = delta;
3852                 if (is_inf)
3853                     data_fake.flags |= SF_IS_INF;
3854                 if ( flags & SCF_DO_STCLASS && !scan->flags
3855                      && OP(scan) == IFMATCH ) { /* Lookahead */
3856                     cl_init(pRExC_state, &intrnl);
3857                     data_fake.start_class = &intrnl;
3858                     f |= SCF_DO_STCLASS_AND;
3859                 }
3860                 if (flags & SCF_WHILEM_VISITED_POS)
3861                     f |= SCF_WHILEM_VISITED_POS;
3862                 next = regnext(scan);
3863                 nscan = NEXTOPER(NEXTOPER(scan));
3864
3865                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3866                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3867                 if (scan->flags) {
3868                     if (deltanext) {
3869                         FAIL("Variable length lookbehind not implemented");
3870                     }
3871                     else if (*minnextp > (I32)U8_MAX) {
3872                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3873                     }
3874                     scan->flags = (U8)*minnextp;
3875                 }
3876
3877                 *minnextp += min;
3878
3879                 if (f & SCF_DO_STCLASS_AND) {
3880                     const int was = (data->start_class->flags & ANYOF_EOS);
3881
3882                     cl_and(data->start_class, &intrnl);
3883                     if (was)
3884                         data->start_class->flags |= ANYOF_EOS;
3885                 }
3886                 if (data) {
3887                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3888                         pars++;
3889                     if (data_fake.flags & SF_HAS_EVAL)
3890                         data->flags |= SF_HAS_EVAL;
3891                     data->whilem_c = data_fake.whilem_c;
3892                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3893                         if (RExC_rx->minlen<*minnextp)
3894                             RExC_rx->minlen=*minnextp;
3895                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3896                         SvREFCNT_dec(data_fake.last_found);
3897
3898                         if ( data_fake.minlen_fixed != minlenp )
3899                         {
3900                             data->offset_fixed= data_fake.offset_fixed;
3901                             data->minlen_fixed= data_fake.minlen_fixed;
3902                             data->lookbehind_fixed+= scan->flags;
3903                         }
3904                         if ( data_fake.minlen_float != minlenp )
3905                         {
3906                             data->minlen_float= data_fake.minlen_float;
3907                             data->offset_float_min=data_fake.offset_float_min;
3908                             data->offset_float_max=data_fake.offset_float_max;
3909                             data->lookbehind_float+= scan->flags;
3910                         }
3911                     }
3912                 }
3913
3914
3915             }
3916 #endif
3917         }
3918         else if (OP(scan) == OPEN) {
3919             if (stopparen != (I32)ARG(scan))
3920                 pars++;
3921         }
3922         else if (OP(scan) == CLOSE) {
3923             if (stopparen == (I32)ARG(scan)) {
3924                 break;
3925             }
3926             if ((I32)ARG(scan) == is_par) {
3927                 next = regnext(scan);
3928
3929                 if ( next && (OP(next) != WHILEM) && next < last)
3930                     is_par = 0;         /* Disable optimization */
3931             }
3932             if (data)
3933                 *(data->last_closep) = ARG(scan);
3934         }
3935         else if (OP(scan) == EVAL) {
3936                 if (data)
3937                     data->flags |= SF_HAS_EVAL;
3938         }
3939         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3940             if (flags & SCF_DO_SUBSTR) {
3941                 SCAN_COMMIT(pRExC_state,data,minlenp);
3942                 flags &= ~SCF_DO_SUBSTR;
3943             }
3944             if (data && OP(scan)==ACCEPT) {
3945                 data->flags |= SCF_SEEN_ACCEPT;
3946                 if (stopmin > min)
3947                     stopmin = min;
3948             }
3949         }
3950         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3951         {
3952                 if (flags & SCF_DO_SUBSTR) {
3953                     SCAN_COMMIT(pRExC_state,data,minlenp);
3954                     data->longest = &(data->longest_float);
3955                 }
3956                 is_inf = is_inf_internal = 1;
3957                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3958                     cl_anything(pRExC_state, data->start_class);
3959                 flags &= ~SCF_DO_STCLASS;
3960         }
3961         else if (OP(scan) == GPOS) {
3962             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3963                 !(delta || is_inf || (data && data->pos_delta)))
3964             {
3965                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3966                     RExC_rx->extflags |= RXf_ANCH_GPOS;
3967                 if (RExC_rx->gofs < (U32)min)
3968                     RExC_rx->gofs = min;
3969             } else {
3970                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3971                 RExC_rx->gofs = 0;
3972             }
3973         }
3974 #ifdef TRIE_STUDY_OPT
3975 #ifdef FULL_TRIE_STUDY
3976         else if (PL_regkind[OP(scan)] == TRIE) {
3977             /* NOTE - There is similar code to this block above for handling
3978                BRANCH nodes on the initial study.  If you change stuff here
3979                check there too. */
3980             regnode *trie_node= scan;
3981             regnode *tail= regnext(scan);
3982             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3983             I32 max1 = 0, min1 = I32_MAX;
3984             struct regnode_charclass_class accum;
3985
3986             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3987                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3988             if (flags & SCF_DO_STCLASS)
3989                 cl_init_zero(pRExC_state, &accum);
3990
3991             if (!trie->jump) {
3992                 min1= trie->minlen;
3993                 max1= trie->maxlen;
3994             } else {
3995                 const regnode *nextbranch= NULL;
3996                 U32 word;
3997
3998                 for ( word=1 ; word <= trie->wordcount ; word++)
3999                 {
4000                     I32 deltanext=0, minnext=0, f = 0, fake;
4001                     struct regnode_charclass_class this_class;
4002
4003                     data_fake.flags = 0;
4004                     if (data) {
4005                         data_fake.whilem_c = data->whilem_c;
4006                         data_fake.last_closep = data->last_closep;
4007                     }
4008                     else
4009                         data_fake.last_closep = &fake;
4010                     data_fake.pos_delta = delta;
4011                     if (flags & SCF_DO_STCLASS) {
4012                         cl_init(pRExC_state, &this_class);
4013                         data_fake.start_class = &this_class;
4014                         f = SCF_DO_STCLASS_AND;
4015                     }
4016                     if (flags & SCF_WHILEM_VISITED_POS)
4017                         f |= SCF_WHILEM_VISITED_POS;
4018
4019                     if (trie->jump[word]) {
4020                         if (!nextbranch)
4021                             nextbranch = trie_node + trie->jump[0];
4022                         scan= trie_node + trie->jump[word];
4023                         /* We go from the jump point to the branch that follows
4024                            it. Note this means we need the vestigal unused branches
4025                            even though they arent otherwise used.
4026                          */
4027                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4028                             &deltanext, (regnode *)nextbranch, &data_fake,
4029                             stopparen, recursed, NULL, f,depth+1);
4030                     }
4031                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4032                         nextbranch= regnext((regnode*)nextbranch);
4033
4034                     if (min1 > (I32)(minnext + trie->minlen))
4035                         min1 = minnext + trie->minlen;
4036                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4037                         max1 = minnext + deltanext + trie->maxlen;
4038                     if (deltanext == I32_MAX)
4039                         is_inf = is_inf_internal = 1;
4040
4041                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4042                         pars++;
4043                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4044                         if ( stopmin > min + min1)
4045                             stopmin = min + min1;
4046                         flags &= ~SCF_DO_SUBSTR;
4047                         if (data)
4048                             data->flags |= SCF_SEEN_ACCEPT;
4049                     }
4050                     if (data) {
4051                         if (data_fake.flags & SF_HAS_EVAL)
4052                             data->flags |= SF_HAS_EVAL;
4053                         data->whilem_c = data_fake.whilem_c;
4054                     }
4055                     if (flags & SCF_DO_STCLASS)
4056                         cl_or(pRExC_state, &accum, &this_class);
4057                 }
4058             }
4059             if (flags & SCF_DO_SUBSTR) {
4060                 data->pos_min += min1;
4061                 data->pos_delta += max1 - min1;
4062                 if (max1 != min1 || is_inf)
4063                     data->longest = &(data->longest_float);
4064             }
4065             min += min1;
4066             delta += max1 - min1;
4067             if (flags & SCF_DO_STCLASS_OR) {
4068                 cl_or(pRExC_state, data->start_class, &accum);
4069                 if (min1) {
4070                     cl_and(data->start_class, and_withp);
4071                     flags &= ~SCF_DO_STCLASS;
4072                 }
4073             }
4074             else if (flags & SCF_DO_STCLASS_AND) {
4075                 if (min1) {
4076                     cl_and(data->start_class, &accum);
4077                     flags &= ~SCF_DO_STCLASS;
4078                 }
4079                 else {
4080                     /* Switch to OR mode: cache the old value of
4081                      * data->start_class */
4082                     INIT_AND_WITHP;
4083                     StructCopy(data->start_class, and_withp,
4084                                struct regnode_charclass_class);
4085                     flags &= ~SCF_DO_STCLASS_AND;
4086                     StructCopy(&accum, data->start_class,
4087                                struct regnode_charclass_class);
4088                     flags |= SCF_DO_STCLASS_OR;
4089                     data->start_class->flags |= ANYOF_EOS;
4090                 }
4091             }
4092             scan= tail;
4093             continue;
4094         }
4095 #else
4096         else if (PL_regkind[OP(scan)] == TRIE) {
4097             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4098             U8*bang=NULL;
4099
4100             min += trie->minlen;
4101             delta += (trie->maxlen - trie->minlen);
4102             flags &= ~SCF_DO_STCLASS; /* xxx */
4103             if (flags & SCF_DO_SUBSTR) {
4104                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4105                 data->pos_min += trie->minlen;
4106                 data->pos_delta += (trie->maxlen - trie->minlen);
4107                 if (trie->maxlen != trie->minlen)
4108                     data->longest = &(data->longest_float);
4109             }
4110             if (trie->jump) /* no more substrings -- for now /grr*/
4111                 flags &= ~SCF_DO_SUBSTR;
4112         }
4113 #endif /* old or new */
4114 #endif /* TRIE_STUDY_OPT */
4115
4116         /* Else: zero-length, ignore. */
4117         scan = regnext(scan);
4118     }
4119     if (frame) {
4120         last = frame->last;
4121         scan = frame->next;
4122         stopparen = frame->stop;
4123         frame = frame->prev;
4124         goto fake_study_recurse;
4125     }
4126
4127   finish:
4128     assert(!frame);
4129     DEBUG_STUDYDATA("pre-fin:",data,depth);
4130
4131     *scanp = scan;
4132     *deltap = is_inf_internal ? I32_MAX : delta;
4133     if (flags & SCF_DO_SUBSTR && is_inf)
4134         data->pos_delta = I32_MAX - data->pos_min;
4135     if (is_par > (I32)U8_MAX)
4136         is_par = 0;
4137     if (is_par && pars==1 && data) {
4138         data->flags |= SF_IN_PAR;
4139         data->flags &= ~SF_HAS_PAR;
4140     }
4141     else if (pars && data) {
4142         data->flags |= SF_HAS_PAR;
4143         data->flags &= ~SF_IN_PAR;
4144     }
4145     if (flags & SCF_DO_STCLASS_OR)
4146         cl_and(data->start_class, and_withp);
4147     if (flags & SCF_TRIE_RESTUDY)
4148         data->flags |=  SCF_TRIE_RESTUDY;
4149
4150     DEBUG_STUDYDATA("post-fin:",data,depth);
4151
4152     return min < stopmin ? min : stopmin;
4153 }
4154
4155 STATIC U32
4156 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4157 {
4158     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4159
4160     PERL_ARGS_ASSERT_ADD_DATA;
4161
4162     Renewc(RExC_rxi->data,
4163            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4164            char, struct reg_data);
4165     if(count)
4166         Renew(RExC_rxi->data->what, count + n, U8);
4167     else
4168         Newx(RExC_rxi->data->what, n, U8);
4169     RExC_rxi->data->count = count + n;
4170     Copy(s, RExC_rxi->data->what + count, n, U8);
4171     return count;
4172 }
4173
4174 /*XXX: todo make this not included in a non debugging perl */
4175 #ifndef PERL_IN_XSUB_RE
4176 void
4177 Perl_reginitcolors(pTHX)
4178 {
4179     dVAR;
4180     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4181     if (s) {
4182         char *t = savepv(s);
4183         int i = 0;
4184         PL_colors[0] = t;
4185         while (++i < 6) {
4186             t = strchr(t, '\t');
4187             if (t) {
4188                 *t = '\0';
4189                 PL_colors[i] = ++t;
4190             }
4191             else
4192                 PL_colors[i] = t = (char *)"";
4193         }
4194     } else {
4195         int i = 0;
4196         while (i < 6)
4197             PL_colors[i++] = (char *)"";
4198     }
4199     PL_colorset = 1;
4200 }
4201 #endif
4202
4203
4204 #ifdef TRIE_STUDY_OPT
4205 #define CHECK_RESTUDY_GOTO                                  \
4206         if (                                                \
4207               (data.flags & SCF_TRIE_RESTUDY)               \
4208               && ! restudied++                              \
4209         )     goto reStudy
4210 #else
4211 #define CHECK_RESTUDY_GOTO
4212 #endif
4213
4214 /*
4215  - pregcomp - compile a regular expression into internal code
4216  *
4217  * We can't allocate space until we know how big the compiled form will be,
4218  * but we can't compile it (and thus know how big it is) until we've got a
4219  * place to put the code.  So we cheat:  we compile it twice, once with code
4220  * generation turned off and size counting turned on, and once "for real".
4221  * This also means that we don't allocate space until we are sure that the
4222  * thing really will compile successfully, and we never have to move the
4223  * code and thus invalidate pointers into it.  (Note that it has to be in
4224  * one piece because free() must be able to free it all.) [NB: not true in perl]
4225  *
4226  * Beware that the optimization-preparation code in here knows about some
4227  * of the structure of the compiled regexp.  [I'll say.]
4228  */
4229
4230
4231
4232 #ifndef PERL_IN_XSUB_RE
4233 #define RE_ENGINE_PTR &reh_regexp_engine
4234 #else
4235 extern const struct regexp_engine my_reg_engine;
4236 #define RE_ENGINE_PTR &my_reg_engine
4237 #endif
4238
4239 #ifndef PERL_IN_XSUB_RE
4240 REGEXP *
4241 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4242 {
4243     dVAR;
4244     HV * const table = GvHV(PL_hintgv);
4245
4246     PERL_ARGS_ASSERT_PREGCOMP;
4247
4248     /* Dispatch a request to compile a regexp to correct
4249        regexp engine. */
4250     if (table) {
4251         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4252         GET_RE_DEBUG_FLAGS_DECL;
4253         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4254             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4255             DEBUG_COMPILE_r({
4256                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4257                     SvIV(*ptr));
4258             });
4259             return CALLREGCOMP_ENG(eng, pattern, flags);
4260         }
4261     }
4262     return Perl_re_compile(aTHX_ pattern, flags);
4263 }
4264 #endif
4265
4266 REGEXP *
4267 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4268 {
4269     dVAR;
4270     REGEXP *rx;
4271     struct regexp *r;
4272     register regexp_internal *ri;
4273     STRLEN plen;
4274     char  *exp = SvPV(pattern, plen);
4275     char* xend = exp + plen;
4276     regnode *scan;
4277     I32 flags;
4278     I32 minlen = 0;
4279     I32 sawplus = 0;
4280     I32 sawopen = 0;
4281     U8 jump_ret = 0;
4282     dJMPENV;
4283     scan_data_t data;
4284     RExC_state_t RExC_state;
4285     RExC_state_t * const pRExC_state = &RExC_state;
4286 #ifdef TRIE_STUDY_OPT
4287     int restudied= 0;
4288     RExC_state_t copyRExC_state;
4289 #endif
4290     GET_RE_DEBUG_FLAGS_DECL;
4291
4292     PERL_ARGS_ASSERT_RE_COMPILE;
4293
4294     DEBUG_r(if (!PL_colorset) reginitcolors());
4295
4296     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4297
4298     DEBUG_COMPILE_r({
4299         SV *dsv= sv_newmortal();
4300         RE_PV_QUOTED_DECL(s, RExC_utf8,
4301             dsv, exp, plen, 60);
4302         PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4303                        PL_colors[4],PL_colors[5],s);
4304     });
4305
4306     /* Longjmp back to here if have to switch in midstream to utf8 */
4307     if (! RExC_orig_utf8) {
4308         JMPENV_PUSH(jump_ret);
4309     }
4310
4311     if (jump_ret != 0) {
4312         STRLEN len = plen;
4313
4314         /* Here, we longjmped back.  If the cause was other than changing to
4315          * utf8, pop our own setjmp, and longjmp to the correct handler */
4316         if (jump_ret != UTF8_LONGJMP) {
4317             JMPENV_POP;
4318             JMPENV_JUMP(jump_ret);
4319         }
4320
4321         GET_RE_DEBUG_FLAGS;
4322
4323         /* It's possible to write a regexp in ascii that represents Unicode
4324         codepoints outside of the byte range, such as via \x{100}. If we
4325         detect such a sequence we have to convert the entire pattern to utf8
4326         and then recompile, as our sizing calculation will have been based
4327         on 1 byte == 1 character, but we will need to use utf8 to encode
4328         at least some part of the pattern, and therefore must convert the whole
4329         thing.
4330         -- dmq */
4331         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4332             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4333         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4334         xend = exp + len;
4335         RExC_orig_utf8 = RExC_utf8 = 1;
4336         SAVEFREEPV(exp);
4337     }
4338
4339     RExC_precomp = exp;
4340     RExC_flags = pm_flags;
4341     RExC_sawback = 0;
4342
4343     RExC_seen = 0;
4344     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4345     RExC_seen_evals = 0;
4346     RExC_extralen = 0;
4347
4348     /* First pass: determine size, legality. */
4349     RExC_parse = exp;
4350     RExC_start = exp;
4351     RExC_end = xend;
4352     RExC_naughty = 0;
4353     RExC_npar = 1;
4354     RExC_nestroot = 0;
4355     RExC_size = 0L;
4356     RExC_emit = &PL_regdummy;
4357     RExC_whilem_seen = 0;
4358     RExC_open_parens = NULL;
4359     RExC_close_parens = NULL;
4360     RExC_opend = NULL;
4361     RExC_paren_names = NULL;
4362 #ifdef DEBUGGING
4363     RExC_paren_name_list = NULL;
4364 #endif
4365     RExC_recurse = NULL;
4366     RExC_recurse_count = 0;
4367
4368 #if 0 /* REGC() is (currently) a NOP at the first pass.
4369        * Clever compilers notice this and complain. --jhi */
4370     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4371 #endif
4372     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4373     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4374         RExC_precomp = NULL;
4375         return(NULL);
4376     }
4377
4378     /* Here, finished first pass.  Get rid of our setjmp, which we added for
4379      * efficiency only if the passed-in string wasn't in utf8, as shown by
4380      * RExC_orig_utf8.  But if the first pass was redone, that variable will be
4381      * 1 here even though the original string wasn't utf8, but in this case
4382      * there will have been a long jump */
4383     if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4384         JMPENV_POP;
4385     }
4386     DEBUG_PARSE_r({
4387         PerlIO_printf(Perl_debug_log,
4388             "Required size %"IVdf" nodes\n"
4389             "Starting second pass (creation)\n",
4390             (IV)RExC_size);
4391         RExC_lastnum=0;
4392         RExC_lastparse=NULL;
4393     });
4394     /* Small enough for pointer-storage convention?
4395        If extralen==0, this means that we will not need long jumps. */
4396     if (RExC_size >= 0x10000L && RExC_extralen)
4397         RExC_size += RExC_extralen;
4398     else
4399         RExC_extralen = 0;
4400     if (RExC_whilem_seen > 15)
4401         RExC_whilem_seen = 15;
4402
4403     /* Allocate space and zero-initialize. Note, the two step process
4404        of zeroing when in debug mode, thus anything assigned has to
4405        happen after that */
4406     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4407     r = (struct regexp*)SvANY(rx);
4408     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4409          char, regexp_internal);
4410     if ( r == NULL || ri == NULL )
4411         FAIL("Regexp out of space");
4412 #ifdef DEBUGGING
4413     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4414     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4415 #else
4416     /* bulk initialize base fields with 0. */
4417     Zero(ri, sizeof(regexp_internal), char);
4418 #endif
4419
4420     /* non-zero initialization begins here */
4421     RXi_SET( r, ri );
4422     r->engine= RE_ENGINE_PTR;
4423     r->extflags = pm_flags;
4424     {
4425         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4426         bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4427         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4428         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4429                             >> RXf_PMf_STD_PMMOD_SHIFT);
4430         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4431         char *p;
4432         const STRLEN wraplen = plen + has_minus + has_p + has_runon
4433             + (sizeof(STD_PAT_MODS) - 1)
4434             + (sizeof("(?:)") - 1);
4435
4436         p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4437         SvCUR_set(rx, wraplen);
4438         SvPOK_on(rx);
4439         SvFLAGS(rx) |= SvUTF8(pattern);
4440         *p++='('; *p++='?';
4441         if (has_p)
4442             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4443         {
4444             char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4445             char *colon = r + 1;
4446             char ch;
4447
4448             while((ch = *fptr++)) {
4449                 if(reganch & 1)
4450                     *p++ = ch;
4451                 else
4452                     *r-- = ch;
4453                 reganch >>= 1;
4454             }
4455             if(has_minus) {
4456                 *r = '-';
4457                 p = colon;
4458             }
4459         }
4460
4461         *p++ = ':';
4462         Copy(RExC_precomp, p, plen, char);
4463         assert ((RX_WRAPPED(rx) - p) < 16);
4464         r->pre_prefix = p - RX_WRAPPED(rx);
4465         p += plen;
4466         if (has_runon)
4467             *p++ = '\n';
4468         *p++ = ')';
4469         *p = 0;
4470     }
4471
4472     r->intflags = 0;
4473     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4474
4475     if (RExC_seen & REG_SEEN_RECURSE) {
4476         Newxz(RExC_open_parens, RExC_npar,regnode *);
4477         SAVEFREEPV(RExC_open_parens);
4478         Newxz(RExC_close_parens,RExC_npar,regnode *);
4479         SAVEFREEPV(RExC_close_parens);
4480     }
4481
4482     /* Useful during FAIL. */
4483 #ifdef RE_TRACK_PATTERN_OFFSETS
4484     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4485     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4486                           "%s %"UVuf" bytes for offset annotations.\n",
4487                           ri->u.offsets ? "Got" : "Couldn't get",
4488                           (UV)((2*RExC_size+1) * sizeof(U32))));
4489 #endif
4490     SetProgLen(ri,RExC_size);
4491     RExC_rx_sv = rx;
4492     RExC_rx = r;
4493     RExC_rxi = ri;
4494     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4495
4496     /* Second pass: emit code. */
4497     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4498     RExC_parse = exp;
4499     RExC_end = xend;
4500     RExC_naughty = 0;
4501     RExC_npar = 1;
4502     RExC_emit_start = ri->program;
4503     RExC_emit = ri->program;
4504     RExC_emit_bound = ri->program + RExC_size + 1;
4505
4506     /* Store the count of eval-groups for security checks: */
4507     RExC_rx->seen_evals = RExC_seen_evals;
4508     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4509     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4510         ReREFCNT_dec(rx);
4511         return(NULL);
4512     }
4513     /* XXXX To minimize changes to RE engine we always allocate
4514        3-units-long substrs field. */
4515     Newx(r->substrs, 1, struct reg_substr_data);
4516     if (RExC_recurse_count) {
4517         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4518         SAVEFREEPV(RExC_recurse);
4519     }
4520
4521 reStudy:
4522     r->minlen = minlen = sawplus = sawopen = 0;
4523     Zero(r->substrs, 1, struct reg_substr_data);
4524
4525 #ifdef TRIE_STUDY_OPT
4526     if (!restudied) {
4527         StructCopy(&zero_scan_data, &data, scan_data_t);
4528         copyRExC_state = RExC_state;
4529     } else {
4530         U32 seen=RExC_seen;
4531         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4532
4533         RExC_state = copyRExC_state;
4534         if (seen & REG_TOP_LEVEL_BRANCHES)
4535             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4536         else
4537             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4538         if (data.last_found) {
4539             SvREFCNT_dec(data.longest_fixed);
4540             SvREFCNT_dec(data.longest_float);
4541             SvREFCNT_dec(data.last_found);
4542         }
4543         StructCopy(&zero_scan_data, &data, scan_data_t);
4544     }
4545 #else
4546     StructCopy(&zero_scan_data, &data, scan_data_t);
4547 #endif
4548
4549     /* Dig out information for optimizations. */
4550     r->extflags = RExC_flags; /* was pm_op */
4551     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4552
4553     if (UTF)
4554         SvUTF8_on(rx);  /* Unicode in it? */
4555     ri->regstclass = NULL;
4556     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4557         r->intflags |= PREGf_NAUGHTY;
4558     scan = ri->program + 1;             /* First BRANCH. */
4559
4560     /* testing for BRANCH here tells us whether there is "must appear"
4561        data in the pattern. If there is then we can use it for optimisations */
4562     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4563         I32 fake;
4564         STRLEN longest_float_length, longest_fixed_length;
4565         struct regnode_charclass_class ch_class; /* pointed to by data */
4566         int stclass_flag;
4567         I32 last_close = 0; /* pointed to by data */
4568         regnode *first= scan;
4569         regnode *first_next= regnext(first);
4570
4571         /*
4572          * Skip introductions and multiplicators >= 1
4573          * so that we can extract the 'meat' of the pattern that must
4574          * match in the large if() sequence following.
4575          * NOTE that EXACT is NOT covered here, as it is normally
4576          * picked up by the optimiser separately.
4577          *
4578          * This is unfortunate as the optimiser isnt handling lookahead
4579          * properly currently.
4580          *
4581          */
4582         while ((OP(first) == OPEN && (sawopen = 1)) ||
4583                /* An OR of *one* alternative - should not happen now. */
4584             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4585             /* for now we can't handle lookbehind IFMATCH*/
4586             (OP(first) == IFMATCH && !first->flags) ||
4587             (OP(first) == PLUS) ||
4588             (OP(first) == MINMOD) ||
4589                /* An {n,m} with n>0 */
4590             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4591             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4592         {
4593                 /*
4594                  * the only op that could be a regnode is PLUS, all the rest
4595                  * will be regnode_1 or regnode_2.
4596                  *
4597                  */
4598                 if (OP(first) == PLUS)
4599                     sawplus = 1;
4600                 else
4601                     first += regarglen[OP(first)];
4602
4603                 first = NEXTOPER(first);
4604                 first_next= regnext(first);
4605         }
4606
4607         /* Starting-point info. */
4608       again:
4609         DEBUG_PEEP("first:",first,0);
4610         /* Ignore EXACT as we deal with it later. */
4611         if (PL_regkind[OP(first)] == EXACT) {
4612             if (OP(first) == EXACT)
4613                 NOOP;   /* Empty, get anchored substr later. */
4614             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4615                 ri->regstclass = first;
4616         }
4617 #ifdef TRIE_STCLASS
4618         else if (PL_regkind[OP(first)] == TRIE &&
4619                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4620         {
4621             regnode *trie_op;
4622             /* this can happen only on restudy */
4623             if ( OP(first) == TRIE ) {
4624                 struct regnode_1 *trieop = (struct regnode_1 *)
4625                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4626                 StructCopy(first,trieop,struct regnode_1);
4627                 trie_op=(regnode *)trieop;
4628             } else {
4629                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4630                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4631                 StructCopy(first,trieop,struct regnode_charclass);
4632                 trie_op=(regnode *)trieop;
4633             }
4634             OP(trie_op)+=2;
4635             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4636             ri->regstclass = trie_op;
4637         }
4638 #endif
4639         else if (REGNODE_SIMPLE(OP(first)))
4640             ri->regstclass = first;
4641         else if (PL_regkind[OP(first)] == BOUND ||
4642                  PL_regkind[OP(first)] == NBOUND)
4643             ri->regstclass = first;
4644         else if (PL_regkind[OP(first)] == BOL) {
4645             r->extflags |= (OP(first) == MBOL
4646                            ? RXf_ANCH_MBOL
4647                            : (OP(first) == SBOL
4648                               ? RXf_ANCH_SBOL
4649                               : RXf_ANCH_BOL));
4650             first = NEXTOPER(first);
4651             goto again;
4652         }
4653         else if (OP(first) == GPOS) {
4654             r->extflags |= RXf_ANCH_GPOS;
4655             first = NEXTOPER(first);
4656             goto again;
4657         }
4658         else if ((!sawopen || !RExC_sawback) &&
4659             (OP(first) == STAR &&
4660             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4661             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4662         {
4663             /* turn .* into ^.* with an implied $*=1 */
4664             const int type =
4665                 (OP(NEXTOPER(first)) == REG_ANY)
4666                     ? RXf_ANCH_MBOL
4667                     : RXf_ANCH_SBOL;
4668             r->extflags |= type;
4669             r->intflags |= PREGf_IMPLICIT;
4670             first = NEXTOPER(first);
4671             goto again;
4672         }
4673         if (sawplus && (!sawopen || !RExC_sawback)
4674             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4675             /* x+ must match at the 1st pos of run of x's */
4676             r->intflags |= PREGf_SKIP;
4677
4678         /* Scan is after the zeroth branch, first is atomic matcher. */
4679 #ifdef TRIE_STUDY_OPT
4680         DEBUG_PARSE_r(
4681             if (!restudied)
4682                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4683                               (IV)(first - scan + 1))
4684         );
4685 #else
4686         DEBUG_PARSE_r(
4687             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4688                 (IV)(first - scan + 1))
4689         );
4690 #endif
4691
4692
4693         /*
4694         * If there's something expensive in the r.e., find the
4695         * longest literal string that must appear and make it the
4696         * regmust.  Resolve ties in favor of later strings, since
4697         * the regstart check works with the beginning of the r.e.
4698         * and avoiding duplication strengthens checking.  Not a
4699         * strong reason, but sufficient in the absence of others.
4700         * [Now we resolve ties in favor of the earlier string if
4701         * it happens that c_offset_min has been invalidated, since the
4702         * earlier string may buy us something the later one won't.]
4703         */
4704
4705         data.longest_fixed = newSVpvs("");
4706         data.longest_float = newSVpvs("");
4707         data.last_found = newSVpvs("");
4708         data.longest = &(data.longest_fixed);
4709         first = scan;
4710         if (!ri->regstclass) {
4711             cl_init(pRExC_state, &ch_class);
4712             data.start_class = &ch_class;
4713             stclass_flag = SCF_DO_STCLASS_AND;
4714         } else                          /* XXXX Check for BOUND? */
4715             stclass_flag = 0;
4716         data.last_closep = &last_close;
4717
4718         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4719             &data, -1, NULL, NULL,
4720             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4721
4722
4723         CHECK_RESTUDY_GOTO;
4724
4725
4726         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4727              && data.last_start_min == 0 && data.last_end > 0
4728              && !RExC_seen_zerolen
4729              && !(RExC_seen & REG_SEEN_VERBARG)
4730              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4731             r->extflags |= RXf_CHECK_ALL;
4732         scan_commit(pRExC_state, &data,&minlen,0);
4733         SvREFCNT_dec(data.last_found);
4734
4735         /* Note that code very similar to this but for anchored string
4736            follows immediately below, changes may need to be made to both.
4737            Be careful.
4738          */
4739         longest_float_length = CHR_SVLEN(data.longest_float);
4740         if (longest_float_length
4741             || (data.flags & SF_FL_BEFORE_EOL
4742                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4743                     || (RExC_flags & RXf_PMf_MULTILINE))))
4744         {
4745             I32 t,ml;
4746
4747             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4748                 && data.offset_fixed == data.offset_float_min
4749                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4750                     goto remove_float;          /* As in (a)+. */
4751
4752             /* copy the information about the longest float from the reg_scan_data
4753                over to the program. */
4754             if (SvUTF8(data.longest_float)) {
4755                 r->float_utf8 = data.longest_float;
4756                 r->float_substr = NULL;
4757             } else {
4758                 r->float_substr = data.longest_float;
4759                 r->float_utf8 = NULL;
4760             }
4761             /* float_end_shift is how many chars that must be matched that
4762                follow this item. We calculate it ahead of time as once the
4763                lookbehind offset is added in we lose the ability to correctly
4764                calculate it.*/
4765             ml = data.minlen_float ? *(data.minlen_float)
4766                                    : (I32)longest_float_length;
4767             r->float_end_shift = ml - data.offset_float_min
4768                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4769                 + data.lookbehind_float;
4770             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4771             r->float_max_offset = data.offset_float_max;
4772             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4773                 r->float_max_offset -= data.lookbehind_float;
4774
4775             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4776                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4777                            || (RExC_flags & RXf_PMf_MULTILINE)));
4778             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4779         }
4780         else {
4781           remove_float:
4782             r->float_substr = r->float_utf8 = NULL;
4783             SvREFCNT_dec(data.longest_float);
4784             longest_float_length = 0;
4785         }
4786
4787         /* Note that code very similar to this but for floating string
4788            is immediately above, changes may need to be made to both.
4789            Be careful.
4790          */
4791         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4792         if (longest_fixed_length
4793             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4794                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4795                     || (RExC_flags & RXf_PMf_MULTILINE))))
4796         {
4797             I32 t,ml;
4798
4799             /* copy the information about the longest fixed
4800                from the reg_scan_data over to the program. */
4801             if (SvUTF8(data.longest_fixed)) {
4802                 r->anchored_utf8 = data.longest_fixed;
4803                 r->anchored_substr = NULL;
4804             } else {
4805                 r->anchored_substr = data.longest_fixed;
4806                 r->anchored_utf8 = NULL;
4807             }
4808             /* fixed_end_shift is how many chars that must be matched that
4809                follow this item. We calculate it ahead of time as once the
4810                lookbehind offset is added in we lose the ability to correctly
4811                calculate it.*/
4812             ml = data.minlen_fixed ? *(data.minlen_fixed)
4813                                    : (I32)longest_fixed_length;
4814             r->anchored_end_shift = ml - data.offset_fixed
4815                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4816                 + data.lookbehind_fixed;
4817             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4818
4819             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4820                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4821                      || (RExC_flags & RXf_PMf_MULTILINE)));
4822             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4823         }
4824         else {
4825             r->anchored_substr = r->anchored_utf8 = NULL;
4826             SvREFCNT_dec(data.longest_fixed);
4827             longest_fixed_length = 0;
4828         }
4829         if (ri->regstclass
4830             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4831             ri->regstclass = NULL;
4832         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4833             && stclass_flag
4834             && !(data.start_class->flags & ANYOF_EOS)
4835             && !cl_is_anything(data.start_class))
4836         {
4837             const U32 n = add_data(pRExC_state, 1, "f");
4838
4839             Newx(RExC_rxi->data->data[n], 1,
4840                 struct regnode_charclass_class);
4841             StructCopy(data.start_class,
4842                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4843                        struct regnode_charclass_class);
4844             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4845             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4846             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4847                       regprop(r, sv, (regnode*)data.start_class);
4848                       PerlIO_printf(Perl_debug_log,
4849                                     "synthetic stclass \"%s\".\n",
4850                                     SvPVX_const(sv));});
4851         }
4852
4853         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4854         if (longest_fixed_length > longest_float_length) {
4855             r->check_end_shift = r->anchored_end_shift;
4856             r->check_substr = r->anchored_substr;
4857             r->check_utf8 = r->anchored_utf8;
4858             r->check_offset_min = r->check_offset_max = r->anchored_offset;
4859             if (r->extflags & RXf_ANCH_SINGLE)
4860                 r->extflags |= RXf_NOSCAN;
4861         }
4862         else {
4863             r->check_end_shift = r->float_end_shift;
4864             r->check_substr = r->float_substr;
4865             r->check_utf8 = r->float_utf8;
4866             r->check_offset_min = r->float_min_offset;
4867             r->check_offset_max = r->float_max_offset;
4868         }
4869         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4870            This should be changed ASAP!  */
4871         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4872             r->extflags |= RXf_USE_INTUIT;
4873             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4874                 r->extflags |= RXf_INTUIT_TAIL;
4875         }
4876         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4877         if ( (STRLEN)minlen < longest_float_length )
4878             minlen= longest_float_length;
4879         if ( (STRLEN)minlen < longest_fixed_length )
4880             minlen= longest_fixed_length;
4881         */
4882     }
4883     else {
4884         /* Several toplevels. Best we can is to set minlen. */
4885         I32 fake;
4886         struct regnode_charclass_class ch_class;
4887         I32 last_close = 0;
4888
4889         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4890
4891         scan = ri->program + 1;
4892         cl_init(pRExC_state, &ch_class);
4893         data.start_class = &ch_class;
4894         data.last_closep = &last_close;
4895
4896
4897         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4898             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4899
4900         CHECK_RESTUDY_GOTO;
4901
4902         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4903                 = r->float_substr = r->float_utf8 = NULL;
4904         if (!(data.start_class->flags & ANYOF_EOS)
4905             && !cl_is_anything(data.start_class))
4906         {
4907             const U32 n = add_data(pRExC_state, 1, "f");
4908
4909             Newx(RExC_rxi->data->data[n], 1,
4910                 struct regnode_charclass_class);
4911             StructCopy(data.start_class,
4912                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4913                        struct regnode_charclass_class);
4914             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4915             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4916             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4917                       regprop(r, sv, (regnode*)data.start_class);
4918                       PerlIO_printf(Perl_debug_log,
4919                                     "synthetic stclass \"%s\".\n",
4920                                     SvPVX_const(sv));});
4921         }
4922     }
4923
4924     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4925        the "real" pattern. */
4926     DEBUG_OPTIMISE_r({
4927         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4928                       (IV)minlen, (IV)r->minlen);
4929     });
4930     r->minlenret = minlen;
4931     if (r->minlen < minlen)
4932         r->minlen = minlen;
4933
4934     if (RExC_seen & REG_SEEN_GPOS)
4935         r->extflags |= RXf_GPOS_SEEN;
4936     if (RExC_seen & REG_SEEN_LOOKBEHIND)
4937         r->extflags |= RXf_LOOKBEHIND_SEEN;
4938     if (RExC_seen & REG_SEEN_EVAL)
4939         r->extflags |= RXf_EVAL_SEEN;
4940     if (RExC_seen & REG_SEEN_CANY)
4941         r->extflags |= RXf_CANY_SEEN;
4942     if (RExC_seen & REG_SEEN_VERBARG)
4943         r->intflags |= PREGf_VERBARG_SEEN;
4944     if (RExC_seen & REG_SEEN_CUTGROUP)
4945         r->intflags |= PREGf_CUTGROUP_SEEN;
4946     if (RExC_paren_names)
4947         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4948     else
4949         RXp_PAREN_NAMES(r) = NULL;
4950
4951 #ifdef STUPID_PATTERN_CHECKS
4952     if (RX_PRELEN(rx) == 0)
4953         r->extflags |= RXf_NULL;
4954     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4955         /* XXX: this should happen BEFORE we compile */
4956         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4957     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4958         r->extflags |= RXf_WHITE;
4959     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4960         r->extflags |= RXf_START_ONLY;
4961 #else
4962     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4963             /* XXX: this should happen BEFORE we compile */
4964             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4965     else {
4966         regnode *first = ri->program + 1;
4967         U8 fop = OP(first);
4968         U8 nop = OP(NEXTOPER(first));
4969
4970         if (PL_regkind[fop] == NOTHING && nop == END)
4971             r->extflags |= RXf_NULL;
4972         else if (PL_regkind[fop] == BOL && nop == END)
4973             r->extflags |= RXf_START_ONLY;
4974         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4975             r->extflags |= RXf_WHITE;
4976     }
4977 #endif
4978 #ifdef DEBUGGING
4979     if (RExC_paren_names) {
4980         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
4981         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4982     } else
4983 #endif
4984         ri->name_list_idx = 0;
4985
4986     if (RExC_recurse_count) {
4987         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4988             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4989             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4990         }
4991     }
4992     Newxz(r->offs, RExC_npar, regexp_paren_pair);
4993     /* assume we don't need to swap parens around before we match */
4994
4995     DEBUG_DUMP_r({
4996         PerlIO_printf(Perl_debug_log,"Final program:\n");
4997         regdump(r);
4998     });
4999 #ifdef RE_TRACK_PATTERN_OFFSETS
5000     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5001         const U32 len = ri->u.offsets[0];
5002         U32 i;
5003         GET_RE_DEBUG_FLAGS_DECL;
5004         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5005         for (i = 1; i <= len; i++) {
5006             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5007                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5008                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5009             }
5010         PerlIO_printf(Perl_debug_log, "\n");
5011     });
5012 #endif
5013     return rx;
5014 }
5015
5016 #undef RE_ENGINE_PTR
5017
5018
5019 SV*
5020 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5021                     const U32 flags)
5022 {
5023     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5024
5025     PERL_UNUSED_ARG(value);
5026
5027     if (flags & RXapif_FETCH) {
5028         return reg_named_buff_fetch(rx, key, flags);
5029     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5030         Perl_croak_no_modify(aTHX);
5031         return NULL;
5032     } else if (flags & RXapif_EXISTS) {
5033         return reg_named_buff_exists(rx, key, flags)
5034             ? &PL_sv_yes
5035             : &PL_sv_no;
5036     } else if (flags & RXapif_REGNAMES) {
5037         return reg_named_buff_all(rx, flags);
5038     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5039         return reg_named_buff_scalar(rx, flags);
5040     } else {
5041         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5042         return NULL;
5043     }
5044 }
5045
5046 SV*
5047 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5048                          const U32 flags)
5049 {
5050     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5051     PERL_UNUSED_ARG(lastkey);
5052
5053     if (flags & RXapif_FIRSTKEY)
5054         return reg_named_buff_firstkey(rx, flags);
5055     else if (flags & RXapif_NEXTKEY)
5056         return reg_named_buff_nextkey(rx, flags);
5057     else {
5058         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5059         return NULL;
5060     }
5061 }
5062
5063 SV*
5064 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5065                           const U32 flags)
5066 {
5067     AV *retarray = NULL;
5068     SV *ret;
5069     struct regexp *const rx = (struct regexp *)SvANY(r);
5070
5071     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5072
5073     if (flags & RXapif_ALL)
5074         retarray=newAV();
5075
5076     if (rx && RXp_PAREN_NAMES(rx)) {
5077         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5078         if (he_str) {
5079             IV i;
5080             SV* sv_dat=HeVAL(he_str);
5081             I32 *nums=(I32*)SvPVX(sv_dat);
5082             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5083                 if ((I32)(rx->nparens) >= nums[i]
5084                     && rx->offs[nums[i]].start != -1
5085                     && rx->offs[nums[i]].end != -1)
5086                 {
5087                     ret = newSVpvs("");
5088                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5089                     if (!retarray)
5090                         return ret;
5091                 } else {
5092                     ret = newSVsv(&PL_sv_undef);
5093                 }
5094                 if (retarray)
5095                     av_push(retarray, ret);
5096             }
5097             if (retarray)
5098                 return newRV_noinc(MUTABLE_SV(retarray));
5099         }
5100     }
5101     return NULL;
5102 }
5103
5104 bool
5105 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5106                            const U32 flags)
5107 {
5108     struct regexp *const rx = (struct regexp *)SvANY(r);
5109
5110     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5111
5112     if (rx && RXp_PAREN_NAMES(rx)) {
5113         if (flags & RXapif_ALL) {
5114             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5115         } else {
5116             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5117             if (sv) {
5118                 SvREFCNT_dec(sv);
5119                 return TRUE;
5120             } else {
5121                 return FALSE;
5122             }
5123         }
5124     } else {
5125         return FALSE;
5126     }
5127 }
5128
5129 SV*
5130 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5131 {
5132     struct regexp *const rx = (struct regexp *)SvANY(r);
5133
5134     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5135
5136     if ( rx && RXp_PAREN_NAMES(rx) ) {
5137         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5138
5139         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5140     } else {
5141         return FALSE;
5142     }
5143 }
5144
5145 SV*
5146 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5147 {
5148     struct regexp *const rx = (struct regexp *)SvANY(r);
5149     GET_RE_DEBUG_FLAGS_DECL;
5150
5151     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5152
5153     if (rx && RXp_PAREN_NAMES(rx)) {
5154         HV *hv = RXp_PAREN_NAMES(rx);
5155         HE *temphe;
5156         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5157             IV i;
5158             IV parno = 0;
5159             SV* sv_dat = HeVAL(temphe);
5160             I32 *nums = (I32*)SvPVX(sv_dat);
5161             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5162                 if ((I32)(rx->lastparen) >= nums[i] &&
5163                     rx->offs[nums[i]].start != -1 &&
5164                     rx->offs[nums[i]].end != -1)
5165                 {
5166                     parno = nums[i];
5167                     break;
5168                 }
5169             }
5170             if (parno || flags & RXapif_ALL) {
5171                 return newSVhek(HeKEY_hek(temphe));
5172             }
5173         }
5174     }
5175     return NULL;
5176 }
5177
5178 SV*
5179 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5180 {
5181     SV *ret;
5182     AV *av;
5183     I32 length;
5184     struct regexp *const rx = (struct regexp *)SvANY(r);
5185
5186     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5187
5188     if (rx && RXp_PAREN_NAMES(rx)) {
5189         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5190             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5191         } else if (flags & RXapif_ONE) {
5192             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5193             av = MUTABLE_AV(SvRV(ret));
5194             length = av_len(av);
5195             SvREFCNT_dec(ret);
5196             return newSViv(length + 1);
5197         } else {
5198             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5199             return NULL;
5200         }
5201     }
5202     return &PL_sv_undef;
5203 }
5204
5205 SV*
5206 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5207 {
5208     struct regexp *const rx = (struct regexp *)SvANY(r);
5209     AV *av = newAV();
5210
5211     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5212
5213     if (rx && RXp_PAREN_NAMES(rx)) {
5214         HV *hv= RXp_PAREN_NAMES(rx);
5215         HE *temphe;
5216         (void)hv_iterinit(hv);
5217         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5218             IV i;
5219             IV parno = 0;
5220             SV* sv_dat = HeVAL(temphe);
5221             I32 *nums = (I32*)SvPVX(sv_dat);
5222             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5223                 if ((I32)(rx->lastparen) >= nums[i] &&
5224                     rx->offs[nums[i]].start != -1 &&
5225                     rx->offs[nums[i]].end != -1)
5226                 {
5227                     parno = nums[i];
5228                     break;
5229                 }
5230             }
5231             if (parno || flags & RXapif_ALL) {
5232                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5233             }
5234         }
5235     }
5236
5237     return newRV_noinc(MUTABLE_SV(av));
5238 }
5239
5240 void
5241 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5242                              SV * const sv)
5243 {
5244     struct regexp *const rx = (struct regexp *)SvANY(r);
5245     char *s = NULL;
5246     I32 i = 0;
5247     I32 s1, t1;
5248
5249     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5250
5251     if (!rx->subbeg) {
5252         sv_setsv(sv,&PL_sv_undef);
5253         return;
5254     }
5255     else
5256     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5257         /* $` */
5258         i = rx->offs[0].start;
5259         s = rx->subbeg;
5260     }
5261     else
5262     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5263         /* $' */
5264         s = rx->subbeg + rx->offs[0].end;
5265         i = rx->sublen - rx->offs[0].end;
5266     }
5267     else
5268     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5269         (s1 = rx->offs[paren].start) != -1 &&
5270         (t1 = rx->offs[paren].end) != -1)
5271     {
5272         /* $& $1 ... */
5273         i = t1 - s1;
5274         s = rx->subbeg + s1;
5275     } else {
5276         sv_setsv(sv,&PL_sv_undef);
5277         return;
5278     }
5279     assert(rx->sublen >= (s - rx->subbeg) + i );
5280     if (i >= 0) {
5281         const int oldtainted = PL_tainted;
5282         TAINT_NOT;
5283         sv_setpvn(sv, s, i);
5284         PL_tainted = oldtainted;
5285         if ( (rx->extflags & RXf_CANY_SEEN)
5286             ? (RXp_MATCH_UTF8(rx)
5287                         && (!i || is_utf8_string((U8*)s, i)))
5288             : (RXp_MATCH_UTF8(rx)) )
5289         {
5290             SvUTF8_on(sv);
5291         }
5292         else
5293             SvUTF8_off(sv);
5294         if (PL_tainting) {
5295             if (RXp_MATCH_TAINTED(rx)) {
5296                 if (SvTYPE(sv) >= SVt_PVMG) {
5297                     MAGIC* const mg = SvMAGIC(sv);
5298                     MAGIC* mgt;
5299                     PL_tainted = 1;
5300                     SvMAGIC_set(sv, mg->mg_moremagic);
5301                     SvTAINT(sv);
5302                     if ((mgt = SvMAGIC(sv))) {
5303                         mg->mg_moremagic = mgt;
5304                         SvMAGIC_set(sv, mg);
5305                     }
5306                 } else {
5307                     PL_tainted = 1;
5308                     SvTAINT(sv);
5309                 }
5310             } else
5311                 SvTAINTED_off(sv);
5312         }
5313     } else {
5314         sv_setsv(sv,&PL_sv_undef);
5315         return;
5316     }
5317 }
5318
5319 void
5320 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5321                                                          SV const * const value)
5322 {
5323     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5324
5325     PERL_UNUSED_ARG(rx);
5326     PERL_UNUSED_ARG(paren);
5327     PERL_UNUSED_ARG(value);
5328
5329     if (!PL_localizing)
5330         Perl_croak_no_modify(aTHX);
5331 }
5332
5333 I32
5334 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5335                               const I32 paren)
5336 {
5337     struct regexp *const rx = (struct regexp *)SvANY(r);
5338     I32 i;
5339     I32 s1, t1;
5340
5341     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5342
5343     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5344         switch (paren) {
5345       /* $` / ${^PREMATCH} */
5346       case RX_BUFF_IDX_PREMATCH:
5347         if (rx->offs[0].start != -1) {
5348                         i = rx->offs[0].start;
5349                         if (i > 0) {
5350                                 s1 = 0;
5351                                 t1 = i;
5352                                 goto getlen;
5353                         }
5354             }
5355         return 0;
5356       /* $' / ${^POSTMATCH} */
5357       case RX_BUFF_IDX_POSTMATCH:
5358             if (rx->offs[0].end != -1) {
5359                         i = rx->sublen - rx->offs[0].end;
5360                         if (i > 0) {
5361                                 s1 = rx->offs[0].end;
5362                                 t1 = rx->sublen;
5363                                 goto getlen;
5364                         }
5365             }
5366         return 0;
5367       /* $& / ${^MATCH}, $1, $2, ... */
5368       default:
5369             if (paren <= (I32)rx->nparens &&
5370             (s1 = rx->offs[paren].start) != -1 &&
5371             (t1 = rx->offs[paren].end) != -1)
5372             {
5373             i = t1 - s1;
5374             goto getlen;
5375         } else {
5376             if (ckWARN(WARN_UNINITIALIZED))
5377                 report_uninit((const SV *)sv);
5378             return 0;
5379         }
5380     }
5381   getlen:
5382     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5383         const char * const s = rx->subbeg + s1;
5384         const U8 *ep;
5385         STRLEN el;
5386
5387         i = t1 - s1;
5388         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5389                         i = el;
5390     }
5391     return i;
5392 }
5393
5394 SV*
5395 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5396 {
5397     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5398         PERL_UNUSED_ARG(rx);
5399         if (0)
5400             return NULL;
5401         else
5402             return newSVpvs("Regexp");
5403 }
5404
5405 /* Scans the name of a named buffer from the pattern.
5406  * If flags is REG_RSN_RETURN_NULL returns null.
5407  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5408  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5409  * to the parsed name as looked up in the RExC_paren_names hash.
5410  * If there is an error throws a vFAIL().. type exception.
5411  */
5412
5413 #define REG_RSN_RETURN_NULL    0
5414 #define REG_RSN_RETURN_NAME    1
5415 #define REG_RSN_RETURN_DATA    2
5416
5417 STATIC SV*
5418 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5419 {
5420     char *name_start = RExC_parse;
5421
5422     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5423
5424     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5425          /* skip IDFIRST by using do...while */
5426         if (UTF)
5427             do {
5428                 RExC_parse += UTF8SKIP(RExC_parse);
5429             } while (isALNUM_utf8((U8*)RExC_parse));
5430         else
5431             do {
5432                 RExC_parse++;
5433             } while (isALNUM(*RExC_parse));
5434     }
5435
5436     if ( flags ) {
5437         SV* sv_name
5438             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5439                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5440         if ( flags == REG_RSN_RETURN_NAME)
5441             return sv_name;
5442         else if (flags==REG_RSN_RETURN_DATA) {
5443             HE *he_str = NULL;
5444             SV *sv_dat = NULL;
5445             if ( ! sv_name )      /* should not happen*/
5446                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5447             if (RExC_paren_names)
5448                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5449             if ( he_str )
5450                 sv_dat = HeVAL(he_str);
5451             if ( ! sv_dat )
5452                 vFAIL("Reference to nonexistent named group");
5453             return sv_dat;
5454         }
5455         else {
5456             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5457         }
5458         /* NOT REACHED */
5459     }
5460     return NULL;
5461 }
5462
5463 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5464     int rem=(int)(RExC_end - RExC_parse);                       \
5465     int cut;                                                    \
5466     int num;                                                    \
5467     int iscut=0;                                                \
5468     if (rem>10) {                                               \
5469         rem=10;                                                 \
5470         iscut=1;                                                \
5471     }                                                           \
5472     cut=10-rem;                                                 \
5473     if (RExC_lastparse!=RExC_parse)                             \
5474         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5475             rem, RExC_parse,                                    \
5476             cut + 4,                                            \
5477             iscut ? "..." : "<"                                 \
5478         );                                                      \
5479     else                                                        \
5480         PerlIO_printf(Perl_debug_log,"%16s","");                \
5481                                                                 \
5482     if (SIZE_ONLY)                                              \
5483        num = RExC_size + 1;                                     \
5484     else                                                        \
5485        num=REG_NODE_NUM(RExC_emit);                             \
5486     if (RExC_lastnum!=num)                                      \
5487        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5488     else                                                        \
5489        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5490     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5491         (int)((depth*2)), "",                                   \
5492         (funcname)                                              \
5493     );                                                          \
5494     RExC_lastnum=num;                                           \
5495     RExC_lastparse=RExC_parse;                                  \
5496 })
5497
5498
5499
5500 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5501     DEBUG_PARSE_MSG((funcname));                            \
5502     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5503 })
5504 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5505     DEBUG_PARSE_MSG((funcname));                            \
5506     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5507 })
5508 /*
5509  - reg - regular expression, i.e. main body or parenthesized thing
5510  *
5511  * Caller must absorb opening parenthesis.
5512  *
5513  * Combining parenthesis handling with the base level of regular expression
5514  * is a trifle forced, but the need to tie the tails of the branches to what
5515  * follows makes it hard to avoid.
5516  */
5517 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5518 #ifdef DEBUGGING
5519 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5520 #else
5521 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5522 #endif
5523
5524 STATIC regnode *
5525 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5526     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5527 {
5528     dVAR;
5529     register regnode *ret;              /* Will be the head of the group. */
5530     register regnode *br;
5531     register regnode *lastbr;
5532     register regnode *ender = NULL;
5533     register I32 parno = 0;
5534     I32 flags;
5535     U32 oregflags = RExC_flags;
5536     bool have_branch = 0;
5537     bool is_open = 0;
5538     I32 freeze_paren = 0;
5539     I32 after_freeze = 0;
5540
5541     /* for (?g), (?gc), and (?o) warnings; warning
5542        about (?c) will warn about (?g) -- japhy    */
5543
5544 #define WASTED_O  0x01
5545 #define WASTED_G  0x02
5546 #define WASTED_C  0x04
5547 #define WASTED_GC (0x02|0x04)
5548     I32 wastedflags = 0x00;
5549
5550     char * parse_start = RExC_parse; /* MJD */
5551     char * const oregcomp_parse = RExC_parse;
5552
5553     GET_RE_DEBUG_FLAGS_DECL;
5554
5555     PERL_ARGS_ASSERT_REG;
5556     DEBUG_PARSE("reg ");
5557
5558     *flagp = 0;                         /* Tentatively. */
5559
5560
5561     /* Make an OPEN node, if parenthesized. */
5562     if (paren) {
5563         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5564             char *start_verb = RExC_parse;
5565             STRLEN verb_len = 0;
5566             char *start_arg = NULL;
5567             unsigned char op = 0;
5568             int argok = 1;
5569             int internal_argval = 0; /* internal_argval is only useful if !argok */
5570             while ( *RExC_parse && *RExC_parse != ')' ) {
5571                 if ( *RExC_parse == ':' ) {
5572                     start_arg = RExC_parse + 1;
5573                     break;
5574                 }
5575                 RExC_parse++;
5576             }
5577             ++start_verb;
5578             verb_len = RExC_parse - start_verb;
5579             if ( start_arg ) {
5580                 RExC_parse++;
5581                 while ( *RExC_parse && *RExC_parse != ')' )
5582                     RExC_parse++;
5583                 if ( *RExC_parse != ')' )
5584                     vFAIL("Unterminated verb pattern argument");
5585                 if ( RExC_parse == start_arg )
5586                     start_arg = NULL;
5587             } else {
5588                 if ( *RExC_parse != ')' )
5589                     vFAIL("Unterminated verb pattern");
5590             }
5591
5592             switch ( *start_verb ) {
5593             case 'A':  /* (*ACCEPT) */
5594                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5595                     op = ACCEPT;
5596                     internal_argval = RExC_nestroot;
5597                 }
5598                 break;
5599             case 'C':  /* (*COMMIT) */
5600                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5601                     op = COMMIT;
5602                 break;
5603             case 'F':  /* (*FAIL) */
5604                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5605                     op = OPFAIL;
5606                     argok = 0;
5607                 }
5608                 break;
5609             case ':':  /* (*:NAME) */
5610             case 'M':  /* (*MARK:NAME) */
5611                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5612                     op = MARKPOINT;
5613                     argok = -1;
5614                 }
5615                 break;
5616             case 'P':  /* (*PRUNE) */
5617                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5618                     op = PRUNE;
5619                 break;
5620             case 'S':   /* (*SKIP) */
5621                 if ( memEQs(start_verb,verb_len,"SKIP") )
5622                     op = SKIP;
5623                 break;
5624             case 'T':  /* (*THEN) */
5625                 /* [19:06] <TimToady> :: is then */
5626                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5627                     op = CUTGROUP;
5628                     RExC_seen |= REG_SEEN_CUTGROUP;
5629                 }
5630                 break;
5631             }
5632             if ( ! op ) {
5633                 RExC_parse++;
5634                 vFAIL3("Unknown verb pattern '%.*s'",
5635                     verb_len, start_verb);
5636             }
5637             if ( argok ) {
5638                 if ( start_arg && internal_argval ) {
5639                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5640                         verb_len, start_verb);
5641                 } else if ( argok < 0 && !start_arg ) {
5642                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5643                         verb_len, start_verb);
5644                 } else {
5645                     ret = reganode(pRExC_state, op, internal_argval);
5646                     if ( ! internal_argval && ! SIZE_ONLY ) {
5647                         if (start_arg) {
5648                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5649                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5650                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5651                             ret->flags = 0;
5652                         } else {
5653                             ret->flags = 1;
5654                         }
5655                     }
5656                 }
5657                 if (!internal_argval)
5658                     RExC_seen |= REG_SEEN_VERBARG;
5659             } else if ( start_arg ) {
5660                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5661                         verb_len, start_verb);
5662             } else {
5663                 ret = reg_node(pRExC_state, op);
5664             }
5665             nextchar(pRExC_state);
5666             return ret;
5667         } else
5668         if (*RExC_parse == '?') { /* (?...) */
5669             bool is_logical = 0;
5670             const char * const seqstart = RExC_parse;
5671
5672             RExC_parse++;
5673             paren = *RExC_parse++;
5674             ret = NULL;                 /* For look-ahead/behind. */
5675             switch (paren) {
5676
5677             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5678                 paren = *RExC_parse++;
5679                 if ( paren == '<')         /* (?P<...>) named capture */
5680                     goto named_capture;
5681                 else if (paren == '>') {   /* (?P>name) named recursion */
5682                     goto named_recursion;
5683                 }
5684                 else if (paren == '=') {   /* (?P=...)  named backref */
5685                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5686                        you change this make sure you change that */
5687                     char* name_start = RExC_parse;
5688                     U32 num = 0;
5689                     SV *sv_dat = reg_scan_name(pRExC_state,
5690                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5691                     if (RExC_parse == name_start || *RExC_parse != ')')
5692                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5693
5694                     if (!SIZE_ONLY) {
5695                         num = add_data( pRExC_state, 1, "S" );
5696                         RExC_rxi->data->data[num]=(void*)sv_dat;
5697                         SvREFCNT_inc_simple_void(sv_dat);
5698                     }
5699                     RExC_sawback = 1;
5700                     ret = reganode(pRExC_state,
5701                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5702                            num);
5703                     *flagp |= HASWIDTH;
5704
5705                     Set_Node_Offset(ret, parse_start+1);
5706                     Set_Node_Cur_Length(ret); /* MJD */
5707
5708                     nextchar(pRExC_state);
5709                     return ret;
5710                 }
5711                 RExC_parse++;
5712                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5713                 /*NOTREACHED*/
5714             case '<':           /* (?<...) */
5715                 if (*RExC_parse == '!')
5716                     paren = ',';
5717                 else if (*RExC_parse != '=')
5718               named_capture:
5719                 {               /* (?<...>) */
5720                     char *name_start;
5721                     SV *svname;
5722                     paren= '>';
5723             case '\'':          /* (?'...') */
5724                     name_start= RExC_parse;
5725                     svname = reg_scan_name(pRExC_state,
5726                         SIZE_ONLY ?  /* reverse test from the others */
5727                         REG_RSN_RETURN_NAME :
5728                         REG_RSN_RETURN_NULL);
5729                     if (RExC_parse == name_start) {
5730                         RExC_parse++;
5731                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5732                         /*NOTREACHED*/
5733                     }
5734                     if (*RExC_parse != paren)
5735                         vFAIL2("Sequence (?%c... not terminated",
5736                             paren=='>' ? '<' : paren);
5737                     if (SIZE_ONLY) {
5738                         HE *he_str;
5739                         SV *sv_dat = NULL;
5740                         if (!svname) /* shouldnt happen */
5741                             Perl_croak(aTHX_
5742                                 "panic: reg_scan_name returned NULL");
5743                         if (!RExC_paren_names) {
5744                             RExC_paren_names= newHV();
5745                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5746 #ifdef DEBUGGING
5747                             RExC_paren_name_list= newAV();
5748                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5749 #endif
5750                         }
5751                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5752                         if ( he_str )
5753                             sv_dat = HeVAL(he_str);
5754                         if ( ! sv_dat ) {
5755                             /* croak baby croak */
5756                             Perl_croak(aTHX_
5757                                 "panic: paren_name hash element allocation failed");
5758                         } else if ( SvPOK(sv_dat) ) {
5759                             /* (?|...) can mean we have dupes so scan to check
5760                                its already been stored. Maybe a flag indicating
5761                                we are inside such a construct would be useful,
5762                                but the arrays are likely to be quite small, so
5763                                for now we punt -- dmq */
5764                             IV count = SvIV(sv_dat);
5765                             I32 *pv = (I32*)SvPVX(sv_dat);
5766                             IV i;
5767                             for ( i = 0 ; i < count ; i++ ) {
5768                                 if ( pv[i] == RExC_npar ) {
5769                                     count = 0;
5770                                     break;
5771                                 }
5772                             }
5773                             if ( count ) {
5774                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5775                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5776                                 pv[count] = RExC_npar;
5777                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5778                             }
5779                         } else {
5780                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5781                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5782                             SvIOK_on(sv_dat);
5783                             SvIV_set(sv_dat, 1);
5784                         }
5785 #ifdef DEBUGGING
5786                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5787                             SvREFCNT_dec(svname);
5788 #endif
5789
5790                         /*sv_dump(sv_dat);*/
5791                     }
5792                     nextchar(pRExC_state);
5793                     paren = 1;
5794                     goto capturing_parens;
5795                 }
5796                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5797                 RExC_parse++;
5798             case '=':           /* (?=...) */
5799                 RExC_seen_zerolen++;
5800                         break;
5801             case '!':           /* (?!...) */
5802                 RExC_seen_zerolen++;
5803                 if (*RExC_parse == ')') {
5804                     ret=reg_node(pRExC_state, OPFAIL);
5805                     nextchar(pRExC_state);
5806                     return ret;
5807                 }
5808                 break;
5809             case '|':           /* (?|...) */
5810                 /* branch reset, behave like a (?:...) except that
5811                    buffers in alternations share the same numbers */
5812                 paren = ':';
5813                 after_freeze = freeze_paren = RExC_npar;
5814                 break;
5815             case ':':           /* (?:...) */
5816             case '>':           /* (?>...) */
5817                 break;
5818             case '$':           /* (?$...) */
5819             case '@':           /* (?@...) */
5820                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5821                 break;
5822             case '#':           /* (?#...) */
5823                 while (*RExC_parse && *RExC_parse != ')')
5824                     RExC_parse++;
5825                 if (*RExC_parse != ')')
5826                     FAIL("Sequence (?#... not terminated");
5827                 nextchar(pRExC_state);
5828                 *flagp = TRYAGAIN;
5829                 return NULL;
5830             case '0' :           /* (?0) */
5831             case 'R' :           /* (?R) */
5832                 if (*RExC_parse != ')')
5833                     FAIL("Sequence (?R) not terminated");
5834                 ret = reg_node(pRExC_state, GOSTART);
5835                 *flagp |= POSTPONED;
5836                 nextchar(pRExC_state);
5837                 return ret;
5838                 /*notreached*/
5839             { /* named and numeric backreferences */
5840                 I32 num;
5841             case '&':            /* (?&NAME) */
5842                 parse_start = RExC_parse - 1;
5843               named_recursion:
5844                 {
5845                     SV *sv_dat = reg_scan_name(pRExC_state,
5846                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5847                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5848                 }
5849                 goto gen_recurse_regop;
5850                 /* NOT REACHED */
5851             case '+':
5852                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5853                     RExC_parse++;
5854                     vFAIL("Illegal pattern");
5855                 }
5856                 goto parse_recursion;
5857                 /* NOT REACHED*/
5858             case '-': /* (?-1) */
5859                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5860                     RExC_parse--; /* rewind to let it be handled later */
5861                     goto parse_flags;
5862                 }
5863                 /*FALLTHROUGH */
5864             case '1': case '2': case '3': case '4': /* (?1) */
5865             case '5': case '6': case '7': case '8': case '9':
5866                 RExC_parse--;
5867               parse_recursion:
5868                 num = atoi(RExC_parse);
5869                 parse_start = RExC_parse - 1; /* MJD */
5870                 if (*RExC_parse == '-')
5871                     RExC_parse++;
5872                 while (isDIGIT(*RExC_parse))
5873                         RExC_parse++;
5874                 if (*RExC_parse!=')')
5875                     vFAIL("Expecting close bracket");
5876
5877               gen_recurse_regop:
5878                 if ( paren == '-' ) {
5879                     /*
5880                     Diagram of capture buffer numbering.
5881                     Top line is the normal capture buffer numbers
5882                     Botton line is the negative indexing as from
5883                     the X (the (?-2))
5884
5885                     +   1 2    3 4 5 X          6 7
5886                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5887                     -   5 4    3 2 1 X          x x
5888
5889                     */
5890                     num = RExC_npar + num;
5891                     if (num < 1)  {
5892                         RExC_parse++;
5893                         vFAIL("Reference to nonexistent group");
5894                     }
5895                 } else if ( paren == '+' ) {
5896                     num = RExC_npar + num - 1;
5897                 }
5898
5899                 ret = reganode(pRExC_state, GOSUB, num);
5900                 if (!SIZE_ONLY) {
5901                     if (num > (I32)RExC_rx->nparens) {
5902                         RExC_parse++;
5903                         vFAIL("Reference to nonexistent group");
5904                     }
5905                     ARG2L_SET( ret, RExC_recurse_count++);
5906                     RExC_emit++;
5907                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5908                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5909                 } else {
5910                     RExC_size++;
5911                 }
5912                 RExC_seen |= REG_SEEN_RECURSE;
5913                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5914                 Set_Node_Offset(ret, parse_start); /* MJD */
5915
5916                 *flagp |= POSTPONED;
5917                 nextchar(pRExC_state);
5918                 return ret;
5919             } /* named and numeric backreferences */
5920             /* NOT REACHED */
5921
5922             case '?':           /* (??...) */
5923                 is_logical = 1;
5924                 if (*RExC_parse != '{') {
5925                     RExC_parse++;
5926                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5927                     /*NOTREACHED*/
5928                 }
5929                 *flagp |= POSTPONED;
5930                 paren = *RExC_parse++;
5931                 /* FALL THROUGH */
5932             case '{':           /* (?{...}) */
5933             {
5934                 I32 count = 1;
5935                 U32 n = 0;
5936                 char c;
5937                 char *s = RExC_parse;
5938
5939                 RExC_seen_zerolen++;
5940                 RExC_seen |= REG_SEEN_EVAL;
5941                 while (count && (c = *RExC_parse)) {
5942                     if (c == '\\') {
5943                         if (RExC_parse[1])
5944                             RExC_parse++;
5945                     }
5946                     else if (c == '{')
5947                         count++;
5948                     else if (c == '}')
5949                         count--;
5950                     RExC_parse++;
5951                 }
5952                 if (*RExC_parse != ')') {
5953                     RExC_parse = s;
5954                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5955                 }
5956                 if (!SIZE_ONLY) {
5957                     PAD *pad;
5958                     OP_4tree *sop, *rop;
5959                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5960
5961                     ENTER;
5962                     Perl_save_re_context(aTHX);
5963                     rop = sv_compile_2op(sv, &sop, "re", &pad);
5964                     sop->op_private |= OPpREFCOUNTED;
5965                     /* re_dup will OpREFCNT_inc */
5966                     OpREFCNT_set(sop, 1);
5967                     LEAVE;
5968
5969                     n = add_data(pRExC_state, 3, "nop");
5970                     RExC_rxi->data->data[n] = (void*)rop;
5971                     RExC_rxi->data->data[n+1] = (void*)sop;
5972                     RExC_rxi->data->data[n+2] = (void*)pad;
5973                     SvREFCNT_dec(sv);
5974                 }
5975                 else {                                          /* First pass */
5976                     if (PL_reginterp_cnt < ++RExC_seen_evals
5977                         && IN_PERL_RUNTIME)
5978                         /* No compiled RE interpolated, has runtime
5979                            components ===> unsafe.  */
5980                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
5981                     if (PL_tainting && PL_tainted)
5982                         FAIL("Eval-group in insecure regular expression");
5983 #if PERL_VERSION > 8
5984                     if (IN_PERL_COMPILETIME)
5985                         PL_cv_has_eval = 1;
5986 #endif
5987                 }
5988
5989                 nextchar(pRExC_state);
5990                 if (is_logical) {
5991                     ret = reg_node(pRExC_state, LOGICAL);
5992                     if (!SIZE_ONLY)
5993                         ret->flags = 2;
5994                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5995                     /* deal with the length of this later - MJD */
5996                     return ret;
5997                 }
5998                 ret = reganode(pRExC_state, EVAL, n);
5999                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6000                 Set_Node_Offset(ret, parse_start);
6001                 return ret;
6002             }
6003             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6004             {
6005                 int is_define= 0;
6006                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6007                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6008                         || RExC_parse[1] == '<'
6009                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6010                         I32 flag;
6011
6012                         ret = reg_node(pRExC_state, LOGICAL);
6013                         if (!SIZE_ONLY)
6014                             ret->flags = 1;
6015                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6016                         goto insert_if;
6017                     }
6018                 }
6019                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6020                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6021                 {
6022                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6023                     char *name_start= RExC_parse++;
6024                     U32 num = 0;
6025                     SV *sv_dat=reg_scan_name(pRExC_state,
6026                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6027                     if (RExC_parse == name_start || *RExC_parse != ch)
6028                         vFAIL2("Sequence (?(%c... not terminated",
6029                             (ch == '>' ? '<' : ch));
6030                     RExC_parse++;
6031                     if (!SIZE_ONLY) {
6032                         num = add_data( pRExC_state, 1, "S" );
6033                         RExC_rxi->data->data[num]=(void*)sv_dat;
6034                         SvREFCNT_inc_simple_void(sv_dat);
6035                     }
6036                     ret = reganode(pRExC_state,NGROUPP,num);
6037                     goto insert_if_check_paren;
6038                 }
6039                 else if (RExC_parse[0] == 'D' &&
6040                          RExC_parse[1] == 'E' &&
6041                          RExC_parse[2] == 'F' &&
6042                          RExC_parse[3] == 'I' &&
6043                          RExC_parse[4] == 'N' &&
6044                          RExC_parse[5] == 'E')
6045                 {
6046                     ret = reganode(pRExC_state,DEFINEP,0);
6047                     RExC_parse +=6 ;
6048                     is_define = 1;
6049                     goto insert_if_check_paren;
6050                 }
6051                 else if (RExC_parse[0] == 'R') {
6052                     RExC_parse++;
6053                     parno = 0;
6054                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6055                         parno = atoi(RExC_parse++);
6056                         while (isDIGIT(*RExC_parse))
6057                             RExC_parse++;
6058                     } else if (RExC_parse[0] == '&') {
6059                         SV *sv_dat;
6060                         RExC_parse++;
6061                         sv_dat = reg_scan_name(pRExC_state,
6062                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6063                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6064                     }
6065                     ret = reganode(pRExC_state,INSUBP,parno);
6066                     goto insert_if_check_paren;
6067                 }
6068                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6069                     /* (?(1)...) */
6070                     char c;
6071                     parno = atoi(RExC_parse++);
6072
6073                     while (isDIGIT(*RExC_parse))
6074                         RExC_parse++;
6075                     ret = reganode(pRExC_state, GROUPP, parno);
6076
6077                  insert_if_check_paren:
6078                     if ((c = *nextchar(pRExC_state)) != ')')
6079                         vFAIL("Switch condition not recognized");
6080                   insert_if:
6081                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6082                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6083                     if (br == NULL)
6084                         br = reganode(pRExC_state, LONGJMP, 0);
6085                     else
6086                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6087                     c = *nextchar(pRExC_state);
6088                     if (flags&HASWIDTH)
6089                         *flagp |= HASWIDTH;
6090                     if (c == '|') {
6091                         if (is_define)
6092                             vFAIL("(?(DEFINE)....) does not allow branches");
6093                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6094                         regbranch(pRExC_state, &flags, 1,depth+1);
6095                         REGTAIL(pRExC_state, ret, lastbr);
6096                         if (flags&HASWIDTH)
6097                             *flagp |= HASWIDTH;
6098                         c = *nextchar(pRExC_state);
6099                     }
6100                     else
6101                         lastbr = NULL;
6102                     if (c != ')')
6103                         vFAIL("Switch (?(condition)... contains too many branches");
6104                     ender = reg_node(pRExC_state, TAIL);
6105                     REGTAIL(pRExC_state, br, ender);
6106                     if (lastbr) {
6107                         REGTAIL(pRExC_state, lastbr, ender);
6108                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6109                     }
6110                     else
6111                         REGTAIL(pRExC_state, ret, ender);
6112                     RExC_size++; /* XXX WHY do we need this?!!
6113                                     For large programs it seems to be required
6114                                     but I can't figure out why. -- dmq*/
6115                     return ret;
6116                 }
6117                 else {
6118                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6119                 }
6120             }
6121             case 0:
6122                 RExC_parse--; /* for vFAIL to print correctly */
6123                 vFAIL("Sequence (? incomplete");
6124                 break;
6125             default:
6126                 --RExC_parse;
6127                 parse_flags:      /* (?i) */
6128             {
6129                 U32 posflags = 0, negflags = 0;
6130                 U32 *flagsp = &posflags;
6131
6132                 while (*RExC_parse) {
6133                     /* && strchr("iogcmsx", *RExC_parse) */
6134                     /* (?g), (?gc) and (?o) are useless here
6135                        and must be globally applied -- japhy */
6136                     switch (*RExC_parse) {
6137                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6138                     case ONCE_PAT_MOD: /* 'o' */
6139                     case GLOBAL_PAT_MOD: /* 'g' */
6140                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6141                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6142                             if (! (wastedflags & wflagbit) ) {
6143                                 wastedflags |= wflagbit;
6144                                 vWARN5(
6145                                     RExC_parse + 1,
6146                                     "Useless (%s%c) - %suse /%c modifier",
6147                                     flagsp == &negflags ? "?-" : "?",
6148                                     *RExC_parse,
6149                                     flagsp == &negflags ? "don't " : "",
6150                                     *RExC_parse
6151                                 );
6152                             }
6153                         }
6154                         break;
6155
6156                     case CONTINUE_PAT_MOD: /* 'c' */
6157                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6158                             if (! (wastedflags & WASTED_C) ) {
6159                                 wastedflags |= WASTED_GC;
6160                                 vWARN3(
6161                                     RExC_parse + 1,
6162                                     "Useless (%sc) - %suse /gc modifier",
6163                                     flagsp == &negflags ? "?-" : "?",
6164                                     flagsp == &negflags ? "don't " : ""
6165                                 );
6166                             }
6167                         }
6168                         break;
6169                     case KEEPCOPY_PAT_MOD: /* 'p' */
6170                         if (flagsp == &negflags) {
6171                             if (SIZE_ONLY)
6172                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6173                         } else {
6174                             *flagsp |= RXf_PMf_KEEPCOPY;
6175                         }
6176                         break;
6177                     case '-':
6178                         if (flagsp == &negflags) {
6179                             RExC_parse++;
6180                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6181                             /*NOTREACHED*/
6182                         }
6183                         flagsp = &negflags;
6184                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6185                         break;
6186                     case ':':
6187                         paren = ':';
6188                         /*FALLTHROUGH*/
6189                     case ')':
6190                         RExC_flags |= posflags;
6191                         RExC_flags &= ~negflags;
6192                         if (paren != ':') {
6193                             oregflags |= posflags;
6194                             oregflags &= ~negflags;
6195                         }
6196                         nextchar(pRExC_state);
6197                         if (paren != ':') {
6198                             *flagp = TRYAGAIN;
6199                             return NULL;
6200                         } else {
6201                             ret = NULL;
6202                             goto parse_rest;
6203                         }
6204                         /*NOTREACHED*/
6205                     default:
6206                         RExC_parse++;
6207                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6208                         /*NOTREACHED*/
6209                     }
6210                     ++RExC_parse;
6211                 }
6212             }} /* one for the default block, one for the switch */
6213         }
6214         else {                  /* (...) */
6215           capturing_parens:
6216             parno = RExC_npar;
6217             RExC_npar++;
6218
6219             ret = reganode(pRExC_state, OPEN, parno);
6220             if (!SIZE_ONLY ){
6221                 if (!RExC_nestroot)
6222                     RExC_nestroot = parno;
6223                 if (RExC_seen & REG_SEEN_RECURSE
6224                     && !RExC_open_parens[parno-1])
6225                 {
6226                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6227                         "Setting open paren #%"IVdf" to %d\n",
6228                         (IV)parno, REG_NODE_NUM(ret)));
6229                     RExC_open_parens[parno-1]= ret;
6230                 }
6231             }
6232             Set_Node_Length(ret, 1); /* MJD */
6233             Set_Node_Offset(ret, RExC_parse); /* MJD */
6234             is_open = 1;
6235         }
6236     }
6237     else                        /* ! paren */
6238         ret = NULL;
6239
6240    parse_rest:
6241     /* Pick up the branches, linking them together. */
6242     parse_start = RExC_parse;   /* MJD */
6243     br = regbranch(pRExC_state, &flags, 1,depth+1);
6244
6245     if (freeze_paren) {
6246         if (RExC_npar > after_freeze)
6247             after_freeze = RExC_npar;
6248         RExC_npar = freeze_paren;
6249     }
6250
6251     /*     branch_len = (paren != 0); */
6252
6253     if (br == NULL)
6254         return(NULL);
6255     if (*RExC_parse == '|') {
6256         if (!SIZE_ONLY && RExC_extralen) {
6257             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6258         }
6259         else {                  /* MJD */
6260             reginsert(pRExC_state, BRANCH, br, depth+1);
6261             Set_Node_Length(br, paren != 0);
6262             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6263         }
6264         have_branch = 1;
6265         if (SIZE_ONLY)
6266             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6267     }
6268     else if (paren == ':') {
6269         *flagp |= flags&SIMPLE;
6270     }
6271     if (is_open) {                              /* Starts with OPEN. */
6272         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6273     }
6274     else if (paren != '?')              /* Not Conditional */
6275         ret = br;
6276     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6277     lastbr = br;
6278     while (*RExC_parse == '|') {
6279         if (!SIZE_ONLY && RExC_extralen) {
6280             ender = reganode(pRExC_state, LONGJMP,0);
6281             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6282         }
6283         if (SIZE_ONLY)
6284             RExC_extralen += 2;         /* Account for LONGJMP. */
6285         nextchar(pRExC_state);
6286         if (freeze_paren) {
6287             if (RExC_npar > after_freeze)
6288                 after_freeze = RExC_npar;
6289             RExC_npar = freeze_paren;
6290         }
6291         br = regbranch(pRExC_state, &flags, 0, depth+1);
6292
6293         if (br == NULL)
6294             return(NULL);
6295         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6296         lastbr = br;
6297         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6298     }
6299
6300     if (have_branch || paren != ':') {
6301         /* Make a closing node, and hook it on the end. */
6302         switch (paren) {
6303         case ':':
6304             ender = reg_node(pRExC_state, TAIL);
6305             break;
6306         case 1:
6307             ender = reganode(pRExC_state, CLOSE, parno);
6308             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6309                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6310                         "Setting close paren #%"IVdf" to %d\n",
6311                         (IV)parno, REG_NODE_NUM(ender)));
6312                 RExC_close_parens[parno-1]= ender;
6313                 if (RExC_nestroot == parno)
6314                     RExC_nestroot = 0;
6315             }
6316             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6317             Set_Node_Length(ender,1); /* MJD */
6318             break;
6319         case '<':
6320         case ',':
6321         case '=':
6322         case '!':
6323             *flagp &= ~HASWIDTH;
6324             /* FALL THROUGH */
6325         case '>':
6326             ender = reg_node(pRExC_state, SUCCEED);
6327             break;
6328         case 0:
6329             ender = reg_node(pRExC_state, END);
6330             if (!SIZE_ONLY) {
6331                 assert(!RExC_opend); /* there can only be one! */
6332                 RExC_opend = ender;
6333             }
6334             break;
6335         }
6336         REGTAIL(pRExC_state, lastbr, ender);
6337
6338         if (have_branch && !SIZE_ONLY) {
6339             if (depth==1)
6340                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6341
6342             /* Hook the tails of the branches to the closing node. */
6343             for (br = ret; br; br = regnext(br)) {
6344                 const U8 op = PL_regkind[OP(br)];
6345                 if (op == BRANCH) {
6346                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6347                 }
6348                 else if (op == BRANCHJ) {
6349                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6350                 }
6351             }
6352         }
6353     }
6354
6355     {
6356         const char *p;
6357         static const char parens[] = "=!<,>";
6358
6359         if (paren && (p = strchr(parens, paren))) {
6360             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6361             int flag = (p - parens) > 1;
6362
6363             if (paren == '>')
6364                 node = SUSPEND, flag = 0;
6365             reginsert(pRExC_state, node,ret, depth+1);
6366             Set_Node_Cur_Length(ret);
6367             Set_Node_Offset(ret, parse_start + 1);
6368             ret->flags = flag;
6369             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6370         }
6371     }
6372
6373     /* Check for proper termination. */
6374     if (paren) {
6375         RExC_flags = oregflags;
6376         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6377             RExC_parse = oregcomp_parse;
6378             vFAIL("Unmatched (");
6379         }
6380     }
6381     else if (!paren && RExC_parse < RExC_end) {
6382         if (*RExC_parse == ')') {
6383             RExC_parse++;
6384             vFAIL("Unmatched )");
6385         }
6386         else
6387             FAIL("Junk on end of regexp");      /* "Can't happen". */
6388         /* NOTREACHED */
6389     }
6390     if (after_freeze)
6391         RExC_npar = after_freeze;
6392     return(ret);
6393 }
6394
6395 /*
6396  - regbranch - one alternative of an | operator
6397  *
6398  * Implements the concatenation operator.
6399  */
6400 STATIC regnode *
6401 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6402 {
6403     dVAR;
6404     register regnode *ret;
6405     register regnode *chain = NULL;
6406     register regnode *latest;
6407     I32 flags = 0, c = 0;
6408     GET_RE_DEBUG_FLAGS_DECL;
6409
6410     PERL_ARGS_ASSERT_REGBRANCH;
6411
6412     DEBUG_PARSE("brnc");
6413
6414     if (first)
6415         ret = NULL;
6416     else {
6417         if (!SIZE_ONLY && RExC_extralen)
6418             ret = reganode(pRExC_state, BRANCHJ,0);
6419         else {
6420             ret = reg_node(pRExC_state, BRANCH);
6421             Set_Node_Length(ret, 1);
6422         }
6423     }
6424
6425     if (!first && SIZE_ONLY)
6426         RExC_extralen += 1;                     /* BRANCHJ */
6427
6428     *flagp = WORST;                     /* Tentatively. */
6429
6430     RExC_parse--;
6431     nextchar(pRExC_state);
6432     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6433         flags &= ~TRYAGAIN;
6434         latest = regpiece(pRExC_state, &flags,depth+1);
6435         if (latest == NULL) {
6436             if (flags & TRYAGAIN)
6437                 continue;
6438             return(NULL);
6439         }
6440         else if (ret == NULL)
6441             ret = latest;
6442         *flagp |= flags&(HASWIDTH|POSTPONED);
6443         if (chain == NULL)      /* First piece. */
6444             *flagp |= flags&SPSTART;
6445         else {
6446             RExC_naughty++;
6447             REGTAIL(pRExC_state, chain, latest);
6448         }
6449         chain = latest;
6450         c++;
6451     }
6452     if (chain == NULL) {        /* Loop ran zero times. */
6453         chain = reg_node(pRExC_state, NOTHING);
6454         if (ret == NULL)
6455             ret = chain;
6456     }
6457     if (c == 1) {
6458         *flagp |= flags&SIMPLE;
6459     }
6460
6461     return ret;
6462 }
6463
6464 /*
6465  - regpiece - something followed by possible [*+?]
6466  *
6467  * Note that the branching code sequences used for ? and the general cases
6468  * of * and + are somewhat optimized:  they use the same NOTHING node as
6469  * both the endmarker for their branch list and the body of the last branch.
6470  * It might seem that this node could be dispensed with entirely, but the
6471  * endmarker role is not redundant.
6472  */
6473 STATIC regnode *
6474 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6475 {
6476     dVAR;
6477     register regnode *ret;
6478     register char op;
6479     register char *next;
6480     I32 flags;
6481     const char * const origparse = RExC_parse;
6482     I32 min;
6483     I32 max = REG_INFTY;
6484     char *parse_start;
6485     const char *maxpos = NULL;
6486     GET_RE_DEBUG_FLAGS_DECL;
6487
6488     PERL_ARGS_ASSERT_REGPIECE;
6489
6490     DEBUG_PARSE("piec");
6491
6492     ret = regatom(pRExC_state, &flags,depth+1);
6493     if (ret == NULL) {
6494         if (flags & TRYAGAIN)
6495             *flagp |= TRYAGAIN;
6496         return(NULL);
6497     }
6498
6499     op = *RExC_parse;
6500
6501     if (op == '{' && regcurly(RExC_parse)) {
6502         maxpos = NULL;
6503         parse_start = RExC_parse; /* MJD */
6504         next = RExC_parse + 1;
6505         while (isDIGIT(*next) || *next == ',') {
6506             if (*next == ',') {
6507                 if (maxpos)
6508                     break;
6509                 else
6510                     maxpos = next;
6511             }
6512             next++;
6513         }
6514         if (*next == '}') {             /* got one */
6515             if (!maxpos)
6516                 maxpos = next;
6517             RExC_parse++;
6518             min = atoi(RExC_parse);
6519             if (*maxpos == ',')
6520                 maxpos++;
6521             else
6522                 maxpos = RExC_parse;
6523             max = atoi(maxpos);
6524             if (!max && *maxpos != '0')
6525                 max = REG_INFTY;                /* meaning "infinity" */
6526             else if (max >= REG_INFTY)
6527                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6528             RExC_parse = next;
6529             nextchar(pRExC_state);
6530
6531         do_curly:
6532             if ((flags&SIMPLE)) {
6533                 RExC_naughty += 2 + RExC_naughty / 2;
6534                 reginsert(pRExC_state, CURLY, ret, depth+1);
6535                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6536                 Set_Node_Cur_Length(ret);
6537             }
6538             else {
6539                 regnode * const w = reg_node(pRExC_state, WHILEM);
6540
6541                 w->flags = 0;
6542                 REGTAIL(pRExC_state, ret, w);
6543                 if (!SIZE_ONLY && RExC_extralen) {
6544                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6545                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6546                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6547                 }
6548                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6549                                 /* MJD hk */
6550                 Set_Node_Offset(ret, parse_start+1);
6551                 Set_Node_Length(ret,
6552                                 op == '{' ? (RExC_parse - parse_start) : 1);
6553
6554                 if (!SIZE_ONLY && RExC_extralen)
6555                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6556                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6557                 if (SIZE_ONLY)
6558                     RExC_whilem_seen++, RExC_extralen += 3;
6559                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6560             }
6561             ret->flags = 0;
6562
6563             if (min > 0)
6564                 *flagp = WORST;
6565             if (max > 0)
6566                 *flagp |= HASWIDTH;
6567             if (max < min)
6568                 vFAIL("Can't do {n,m} with n > m");
6569             if (!SIZE_ONLY) {
6570                 ARG1_SET(ret, (U16)min);
6571                 ARG2_SET(ret, (U16)max);
6572             }
6573
6574             goto nest_check;
6575         }
6576     }
6577
6578     if (!ISMULT1(op)) {
6579         *flagp = flags;
6580         return(ret);
6581     }
6582
6583 #if 0                           /* Now runtime fix should be reliable. */
6584
6585     /* if this is reinstated, don't forget to put this back into perldiag:
6586
6587             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6588
6589            (F) The part of the regexp subject to either the * or + quantifier
6590            could match an empty string. The {#} shows in the regular
6591            expression about where the problem was discovered.
6592
6593     */
6594
6595     if (!(flags&HASWIDTH) && op != '?')
6596       vFAIL("Regexp *+ operand could be empty");
6597 #endif
6598
6599     parse_start = RExC_parse;
6600     nextchar(pRExC_state);
6601
6602     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6603
6604     if (op == '*' && (flags&SIMPLE)) {
6605         reginsert(pRExC_state, STAR, ret, depth+1);
6606         ret->flags = 0;
6607         RExC_naughty += 4;
6608     }
6609     else if (op == '*') {
6610         min = 0;
6611         goto do_curly;
6612     }
6613     else if (op == '+' && (flags&SIMPLE)) {
6614         reginsert(pRExC_state, PLUS, ret, depth+1);
6615         ret->flags = 0;
6616         RExC_naughty += 3;
6617     }
6618     else if (op == '+') {
6619         min = 1;
6620         goto do_curly;
6621     }
6622     else if (op == '?') {
6623         min = 0; max = 1;
6624         goto do_curly;
6625     }
6626   nest_check:
6627     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6628         ckWARN3reg(RExC_parse,
6629                    "%.*s matches null string many times",
6630                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6631                    origparse);
6632     }
6633
6634     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6635         nextchar(pRExC_state);
6636         reginsert(pRExC_state, MINMOD, ret, depth+1);
6637         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6638     }
6639 #ifndef REG_ALLOW_MINMOD_SUSPEND
6640     else
6641 #endif
6642     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6643         regnode *ender;
6644         nextchar(pRExC_state);
6645         ender = reg_node(pRExC_state, SUCCEED);
6646         REGTAIL(pRExC_state, ret, ender);
6647         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6648         ret->flags = 0;
6649         ender = reg_node(pRExC_state, TAIL);
6650         REGTAIL(pRExC_state, ret, ender);
6651         /*ret= ender;*/
6652     }
6653
6654     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6655         RExC_parse++;
6656         vFAIL("Nested quantifiers");
6657     }
6658
6659     return(ret);
6660 }
6661
6662
6663 /* reg_namedseq(pRExC_state,UVp)
6664
6665    This is expected to be called by a parser routine that has
6666    recognized '\N' and needs to handle the rest. RExC_parse is
6667    expected to point at the first char following the N at the time
6668    of the call.
6669
6670    The \N may be inside (indicated by valuep not being NULL) or outside a
6671    character class.
6672
6673    \N may begin either a named sequence, or if outside a character class, mean
6674    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6675    attempted to decide which, and in the case of a named sequence converted it
6676    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6677    where c1... are the characters in the sequence.  For single-quoted regexes,
6678    the tokenizer passes the \N sequence through unchanged; this code will not
6679    attempt to determine this nor expand those.  The net effect is that if the
6680    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6681    signals that this \N occurrence means to match a non-newline.
6682
6683    Only the \N{U+...} form should occur in a character class, for the same
6684    reason that '.' inside a character class means to just match a period: it
6685    just doesn't make sense.
6686
6687    If valuep is non-null then it is assumed that we are parsing inside
6688    of a charclass definition and the first codepoint in the resolved
6689    string is returned via *valuep and the routine will return NULL.
6690    In this mode if a multichar string is returned from the charnames
6691    handler, a warning will be issued, and only the first char in the
6692    sequence will be examined. If the string returned is zero length
6693    then the value of *valuep is undefined and NON-NULL will
6694    be returned to indicate failure. (This will NOT be a valid pointer
6695    to a regnode.)
6696
6697    If valuep is null then it is assumed that we are parsing normal text and a
6698    new EXACT node is inserted into the program containing the resolved string,
6699    and a pointer to the new node is returned.  But if the string is zero length
6700    a NOTHING node is emitted instead.
6701
6702    On success RExC_parse is set to the char following the endbrace.
6703    Parsing failures will generate a fatal error via vFAIL(...)
6704  */
6705 STATIC regnode *
6706 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6707 {
6708     char * endbrace;    /* '}' following the name */
6709     regnode *ret = NULL;
6710 #ifdef DEBUGGING
6711     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6712 #endif
6713     char* p;
6714
6715     GET_RE_DEBUG_FLAGS_DECL;
6716
6717     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6718
6719     GET_RE_DEBUG_FLAGS;
6720
6721     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6722      * modifier.  The other meaning does not */
6723     p = (RExC_flags & RXf_PMf_EXTENDED)
6724         ? regwhite( pRExC_state, RExC_parse )
6725         : RExC_parse;
6726
6727     /* Disambiguate between \N meaning a named character versus \N meaning
6728      * [^\n].  The former is assumed when it can't be the latter. */
6729     if (*p != '{' || regcurly(p)) {
6730         RExC_parse = p;
6731         if (valuep) {
6732             /* no bare \N in a charclass */
6733             vFAIL("\\N in a character class must be a named character: \\N{...}");
6734         }
6735         nextchar(pRExC_state);
6736         ret = reg_node(pRExC_state, REG_ANY);
6737         *flagp |= HASWIDTH|SIMPLE;
6738         RExC_naughty++;
6739         RExC_parse--;
6740         Set_Node_Length(ret, 1); /* MJD */
6741         return ret;
6742     }
6743
6744     /* Here, we have decided it should be a named sequence */
6745
6746     /* The test above made sure that the next real character is a '{', but
6747      * under the /x modifier, it could be separated by space (or a comment and
6748      * \n) and this is not allowed (for consistency with \x{...} and the
6749      * tokenizer handling of \N{NAME}). */
6750     if (*RExC_parse != '{') {
6751         vFAIL("Missing braces on \\N{}");
6752     }
6753
6754     RExC_parse++;       /* Skip past the '{' */
6755
6756     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6757         || ! (endbrace == RExC_parse            /* nothing between the {} */
6758               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
6759                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6760     {
6761         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
6762         vFAIL("\\N{NAME} must be resolved by the lexer");
6763     }
6764
6765     if (endbrace == RExC_parse) {   /* empty: \N{} */
6766         if (! valuep) {
6767             RExC_parse = endbrace + 1;
6768             return reg_node(pRExC_state,NOTHING);
6769         }
6770
6771         if (SIZE_ONLY) {
6772             ckWARNreg(RExC_parse,
6773                     "Ignoring zero length \\N{} in character class"
6774             );
6775             RExC_parse = endbrace + 1;
6776         }
6777         *valuep = 0;
6778         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6779     }
6780
6781     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
6782     RExC_parse += 2;    /* Skip past the 'U+' */
6783
6784     if (valuep) {   /* In a bracketed char class */
6785         /* We only pay attention to the first char of
6786         multichar strings being returned. I kinda wonder
6787         if this makes sense as it does change the behaviour
6788         from earlier versions, OTOH that behaviour was broken
6789         as well. XXX Solution is to recharacterize as
6790         [rest-of-class]|multi1|multi2... */
6791
6792         STRLEN length_of_hex;
6793         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6794             | PERL_SCAN_DISALLOW_PREFIX
6795             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6796
6797         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6798         if (endchar < endbrace) {
6799             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6800         }
6801
6802         length_of_hex = (STRLEN)(endchar - RExC_parse);
6803         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6804
6805         /* The tokenizer should have guaranteed validity, but it's possible to
6806          * bypass it by using single quoting, so check */
6807         if (length_of_hex == 0
6808             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6809         {
6810             RExC_parse += length_of_hex;        /* Includes all the valid */
6811             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
6812                             ? UTF8SKIP(RExC_parse)
6813                             : 1;
6814             /* Guard against malformed utf8 */
6815             if (RExC_parse >= endchar) RExC_parse = endchar;
6816             vFAIL("Invalid hexadecimal number in \\N{U+...}");
6817         }
6818
6819         RExC_parse = endbrace + 1;
6820         if (endchar == endbrace) return NULL;
6821
6822         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
6823     }
6824     else {      /* Not a char class */
6825         char *s;            /* String to put in generated EXACT node */
6826         STRLEN len = 0;     /* Its current byte length */
6827         char *endchar;      /* Points to '.' or '}' ending cur char in the input
6828                                stream */
6829
6830         ret = reg_node(pRExC_state,
6831                         (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6832         s= STRING(ret);
6833
6834         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
6835          * the input which is of the form now 'c1.c2.c3...}' until find the
6836          * ending brace or exceed length 255.  The characters that exceed this
6837          * limit are dropped.  The limit could be relaxed should it become
6838          * desirable by reparsing this as (?:\N{NAME}), so could generate
6839          * multiple EXACT nodes, as is done for just regular input.  But this
6840          * is primarily a named character, and not intended to be a huge long
6841          * string, so 255 bytes should be good enough */
6842         while (1) {
6843             STRLEN length_of_hex;
6844             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6845                             | PERL_SCAN_DISALLOW_PREFIX
6846                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6847             UV cp;  /* Ord of current character */
6848
6849             /* Code points are separated by dots.  If none, there is only one
6850              * code point, and is terminated by the brace */
6851             endchar = RExC_parse + strcspn(RExC_parse, ".}");
6852
6853             /* The values are Unicode even on EBCDIC machines */
6854             length_of_hex = (STRLEN)(endchar - RExC_parse);
6855             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6856             if ( length_of_hex == 0
6857                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6858             {
6859                 RExC_parse += length_of_hex;        /* Includes all the valid */
6860                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
6861                                 ? UTF8SKIP(RExC_parse)
6862                                 : 1;
6863                 /* Guard against malformed utf8 */
6864                 if (RExC_parse >= endchar) RExC_parse = endchar;
6865                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6866             }
6867
6868             if (! FOLD) {       /* Not folding, just append to the string */
6869                 STRLEN unilen;
6870
6871                 /* Quit before adding this character if would exceed limit */
6872                 if (len + UNISKIP(cp) > U8_MAX) break;
6873
6874                 unilen = reguni(pRExC_state, cp, s);
6875                 if (unilen > 0) {
6876                     s   += unilen;
6877                     len += unilen;
6878                 }
6879             } else {    /* Folding, output the folded equivalent */
6880                 STRLEN foldlen,numlen;
6881                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6882                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6883
6884                 /* Quit before exceeding size limit */
6885                 if (len + foldlen > U8_MAX) break;
6886
6887                 for (foldbuf = tmpbuf;
6888                     foldlen;
6889                     foldlen -= numlen)
6890                 {
6891                     cp = utf8_to_uvchr(foldbuf, &numlen);
6892                     if (numlen > 0) {
6893                         const STRLEN unilen = reguni(pRExC_state, cp, s);
6894                         s       += unilen;
6895                         len     += unilen;
6896                         /* In EBCDIC the numlen and unilen can differ. */
6897                         foldbuf += numlen;
6898                         if (numlen >= foldlen)
6899                             break;
6900                     }
6901                     else
6902                         break; /* "Can't happen." */
6903                 }
6904             }
6905
6906             /* Point to the beginning of the next character in the sequence. */
6907             RExC_parse = endchar + 1;
6908
6909             /* Quit if no more characters */
6910             if (RExC_parse >= endbrace) break;
6911         }
6912
6913
6914         if (SIZE_ONLY) {
6915             if (RExC_parse < endbrace) {
6916                 ckWARNreg(RExC_parse - 1,
6917                           "Using just the first characters returned by \\N{}");
6918             }
6919
6920             RExC_size += STR_SZ(len);
6921         } else {
6922             STR_LEN(ret) = len;
6923             RExC_emit += STR_SZ(len);
6924         }
6925
6926         RExC_parse = endbrace + 1;
6927
6928         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6929                                with malformed in t/re/pat_advanced.t */
6930         RExC_parse --;
6931         Set_Node_Cur_Length(ret); /* MJD */
6932         nextchar(pRExC_state);
6933     }
6934
6935     return ret;
6936 }
6937
6938
6939 /*
6940  * reg_recode
6941  *
6942  * It returns the code point in utf8 for the value in *encp.
6943  *    value: a code value in the source encoding
6944  *    encp:  a pointer to an Encode object
6945  *
6946  * If the result from Encode is not a single character,
6947  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6948  */
6949 STATIC UV
6950 S_reg_recode(pTHX_ const char value, SV **encp)
6951 {
6952     STRLEN numlen = 1;
6953     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6954     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6955     const STRLEN newlen = SvCUR(sv);
6956     UV uv = UNICODE_REPLACEMENT;
6957
6958     PERL_ARGS_ASSERT_REG_RECODE;
6959
6960     if (newlen)
6961         uv = SvUTF8(sv)
6962              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6963              : *(U8*)s;
6964
6965     if (!newlen || numlen != newlen) {
6966         uv = UNICODE_REPLACEMENT;
6967         *encp = NULL;
6968     }
6969     return uv;
6970 }
6971
6972
6973 /*
6974  - regatom - the lowest level
6975
6976    Try to identify anything special at the start of the pattern. If there
6977    is, then handle it as required. This may involve generating a single regop,
6978    such as for an assertion; or it may involve recursing, such as to
6979    handle a () structure.
6980
6981    If the string doesn't start with something special then we gobble up
6982    as much literal text as we can.
6983
6984    Once we have been able to handle whatever type of thing started the
6985    sequence, we return.
6986
6987    Note: we have to be careful with escapes, as they can be both literal
6988    and special, and in the case of \10 and friends can either, depending
6989    on context. Specifically there are two seperate switches for handling
6990    escape sequences, with the one for handling literal escapes requiring
6991    a dummy entry for all of the special escapes that are actually handled
6992    by the other.
6993 */
6994
6995 STATIC regnode *
6996 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6997 {
6998     dVAR;
6999     register regnode *ret = NULL;
7000     I32 flags;
7001     char *parse_start = RExC_parse;
7002     GET_RE_DEBUG_FLAGS_DECL;
7003     DEBUG_PARSE("atom");
7004     *flagp = WORST;             /* Tentatively. */
7005
7006     PERL_ARGS_ASSERT_REGATOM;
7007
7008 tryagain:
7009     switch ((U8)*RExC_parse) {
7010     case '^':
7011         RExC_seen_zerolen++;
7012         nextchar(pRExC_state);
7013         if (RExC_flags & RXf_PMf_MULTILINE)
7014             ret = reg_node(pRExC_state, MBOL);
7015         else if (RExC_flags & RXf_PMf_SINGLELINE)
7016             ret = reg_node(pRExC_state, SBOL);
7017         else
7018             ret = reg_node(pRExC_state, BOL);
7019         Set_Node_Length(ret, 1); /* MJD */
7020         break;
7021     case '$':
7022         nextchar(pRExC_state);
7023         if (*RExC_parse)
7024             RExC_seen_zerolen++;
7025         if (RExC_flags & RXf_PMf_MULTILINE)
7026             ret = reg_node(pRExC_state, MEOL);
7027         else if (RExC_flags & RXf_PMf_SINGLELINE)
7028             ret = reg_node(pRExC_state, SEOL);
7029         else
7030             ret = reg_node(pRExC_state, EOL);
7031         Set_Node_Length(ret, 1); /* MJD */
7032         break;
7033     case '.':
7034         nextchar(pRExC_state);
7035         if (RExC_flags & RXf_PMf_SINGLELINE)
7036             ret = reg_node(pRExC_state, SANY);
7037         else
7038             ret = reg_node(pRExC_state, REG_ANY);
7039         *flagp |= HASWIDTH|SIMPLE;
7040         RExC_naughty++;
7041         Set_Node_Length(ret, 1); /* MJD */
7042         break;
7043     case '[':
7044     {
7045         char * const oregcomp_parse = ++RExC_parse;
7046         ret = regclass(pRExC_state,depth+1);
7047         if (*RExC_parse != ']') {
7048             RExC_parse = oregcomp_parse;
7049             vFAIL("Unmatched [");
7050         }
7051         nextchar(pRExC_state);
7052         *flagp |= HASWIDTH|SIMPLE;
7053         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7054         break;
7055     }
7056     case '(':
7057         nextchar(pRExC_state);
7058         ret = reg(pRExC_state, 1, &flags,depth+1);
7059         if (ret == NULL) {
7060                 if (flags & TRYAGAIN) {
7061                     if (RExC_parse == RExC_end) {
7062                          /* Make parent create an empty node if needed. */
7063                         *flagp |= TRYAGAIN;
7064                         return(NULL);
7065                     }
7066                     goto tryagain;
7067                 }
7068                 return(NULL);
7069         }
7070         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7071         break;
7072     case '|':
7073     case ')':
7074         if (flags & TRYAGAIN) {
7075             *flagp |= TRYAGAIN;
7076             return NULL;
7077         }
7078         vFAIL("Internal urp");
7079                                 /* Supposed to be caught earlier. */
7080         break;
7081     case '{':
7082         if (!regcurly(RExC_parse)) {
7083             RExC_parse++;
7084             goto defchar;
7085         }
7086         /* FALL THROUGH */
7087     case '?':
7088     case '+':
7089     case '*':
7090         RExC_parse++;
7091         vFAIL("Quantifier follows nothing");
7092         break;
7093     case 0xDF:
7094     case 0xC3:
7095     case 0xCE:
7096         do_foldchar:
7097         if (!LOC && FOLD) {
7098             U32 len,cp;
7099             len=0; /* silence a spurious compiler warning */
7100             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7101                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7102                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7103                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7104                 Set_Node_Length(ret, 1); /* MJD */
7105                 nextchar(pRExC_state); /* kill whitespace under /x */
7106                 return ret;
7107             }
7108         }
7109         goto outer_default;
7110     case '\\':
7111         /* Special Escapes
7112
7113            This switch handles escape sequences that resolve to some kind
7114            of special regop and not to literal text. Escape sequnces that
7115            resolve to literal text are handled below in the switch marked
7116            "Literal Escapes".
7117
7118            Every entry in this switch *must* have a corresponding entry
7119            in the literal escape switch. However, the opposite is not
7120            required, as the default for this switch is to jump to the
7121            literal text handling code.
7122         */
7123         switch ((U8)*++RExC_parse) {
7124         case 0xDF:
7125         case 0xC3:
7126         case 0xCE:
7127                    goto do_foldchar;
7128         /* Special Escapes */
7129         case 'A':
7130             RExC_seen_zerolen++;
7131             ret = reg_node(pRExC_state, SBOL);
7132             *flagp |= SIMPLE;
7133             goto finish_meta_pat;
7134         case 'G':
7135             ret = reg_node(pRExC_state, GPOS);
7136             RExC_seen |= REG_SEEN_GPOS;
7137             *flagp |= SIMPLE;
7138             goto finish_meta_pat;
7139         case 'K':
7140             RExC_seen_zerolen++;
7141             ret = reg_node(pRExC_state, KEEPS);
7142             *flagp |= SIMPLE;
7143             /* XXX:dmq : disabling in-place substitution seems to
7144              * be necessary here to avoid cases of memory corruption, as
7145              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7146              */
7147             RExC_seen |= REG_SEEN_LOOKBEHIND;
7148             goto finish_meta_pat;
7149         case 'Z':
7150             ret = reg_node(pRExC_state, SEOL);
7151             *flagp |= SIMPLE;
7152             RExC_seen_zerolen++;                /* Do not optimize RE away */
7153             goto finish_meta_pat;
7154         case 'z':
7155             ret = reg_node(pRExC_state, EOS);
7156             *flagp |= SIMPLE;
7157             RExC_seen_zerolen++;                /* Do not optimize RE away */
7158             goto finish_meta_pat;
7159         case 'C':
7160             ret = reg_node(pRExC_state, CANY);
7161             RExC_seen |= REG_SEEN_CANY;
7162             *flagp |= HASWIDTH|SIMPLE;
7163             goto finish_meta_pat;
7164         case 'X':
7165             ret = reg_node(pRExC_state, CLUMP);
7166             *flagp |= HASWIDTH;
7167             goto finish_meta_pat;
7168         case 'w':
7169             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
7170             *flagp |= HASWIDTH|SIMPLE;
7171             goto finish_meta_pat;
7172         case 'W':
7173             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
7174             *flagp |= HASWIDTH|SIMPLE;
7175             goto finish_meta_pat;
7176         case 'b':
7177             RExC_seen_zerolen++;
7178             RExC_seen |= REG_SEEN_LOOKBEHIND;
7179             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
7180             *flagp |= SIMPLE;
7181             goto finish_meta_pat;
7182         case 'B':
7183             RExC_seen_zerolen++;
7184             RExC_seen |= REG_SEEN_LOOKBEHIND;
7185             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
7186             *flagp |= SIMPLE;
7187             goto finish_meta_pat;
7188         case 's':
7189             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
7190             *flagp |= HASWIDTH|SIMPLE;
7191             goto finish_meta_pat;
7192         case 'S':
7193             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
7194             *flagp |= HASWIDTH|SIMPLE;
7195             goto finish_meta_pat;
7196         case 'd':
7197             ret = reg_node(pRExC_state, DIGIT);
7198             *flagp |= HASWIDTH|SIMPLE;
7199             goto finish_meta_pat;
7200         case 'D':
7201             ret = reg_node(pRExC_state, NDIGIT);
7202             *flagp |= HASWIDTH|SIMPLE;
7203             goto finish_meta_pat;
7204         case 'R':
7205             ret = reg_node(pRExC_state, LNBREAK);
7206             *flagp |= HASWIDTH|SIMPLE;
7207             goto finish_meta_pat;
7208         case 'h':
7209             ret = reg_node(pRExC_state, HORIZWS);
7210             *flagp |= HASWIDTH|SIMPLE;
7211             goto finish_meta_pat;
7212         case 'H':
7213             ret = reg_node(pRExC_state, NHORIZWS);
7214             *flagp |= HASWIDTH|SIMPLE;
7215             goto finish_meta_pat;
7216         case 'v':
7217             ret = reg_node(pRExC_state, VERTWS);
7218             *flagp |= HASWIDTH|SIMPLE;
7219             goto finish_meta_pat;
7220         case 'V':
7221             ret = reg_node(pRExC_state, NVERTWS);
7222             *flagp |= HASWIDTH|SIMPLE;
7223          finish_meta_pat:
7224             nextchar(pRExC_state);
7225             Set_Node_Length(ret, 2); /* MJD */
7226             break;
7227         case 'p':
7228         case 'P':
7229             {
7230                 char* const oldregxend = RExC_end;
7231 #ifdef DEBUGGING
7232                 char* parse_start = RExC_parse - 2;
7233 #endif
7234
7235                 if (RExC_parse[1] == '{') {
7236                   /* a lovely hack--pretend we saw [\pX] instead */
7237                     RExC_end = strchr(RExC_parse, '}');
7238                     if (!RExC_end) {
7239                         const U8 c = (U8)*RExC_parse;
7240                         RExC_parse += 2;
7241                         RExC_end = oldregxend;
7242                         vFAIL2("Missing right brace on \\%c{}", c);
7243                     }
7244                     RExC_end++;
7245                 }
7246                 else {
7247                     RExC_end = RExC_parse + 2;
7248                     if (RExC_end > oldregxend)
7249                         RExC_end = oldregxend;
7250                 }
7251                 RExC_parse--;
7252
7253                 ret = regclass(pRExC_state,depth+1);
7254
7255                 RExC_end = oldregxend;
7256                 RExC_parse--;
7257
7258                 Set_Node_Offset(ret, parse_start + 2);
7259                 Set_Node_Cur_Length(ret);
7260                 nextchar(pRExC_state);
7261                 *flagp |= HASWIDTH|SIMPLE;
7262             }
7263             break;
7264         case 'N':
7265             /* Handle \N and \N{NAME} here and not below because it can be
7266             multicharacter. join_exact() will join them up later on.
7267             Also this makes sure that things like /\N{BLAH}+/ and
7268             \N{BLAH} being multi char Just Happen. dmq*/
7269             ++RExC_parse;
7270             ret= reg_namedseq(pRExC_state, NULL, flagp);
7271             break;
7272         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7273         parse_named_seq:
7274         {
7275             char ch= RExC_parse[1];
7276             if (ch != '<' && ch != '\'' && ch != '{') {
7277                 RExC_parse++;
7278                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7279             } else {
7280                 /* this pretty much dupes the code for (?P=...) in reg(), if
7281                    you change this make sure you change that */
7282                 char* name_start = (RExC_parse += 2);
7283                 U32 num = 0;
7284                 SV *sv_dat = reg_scan_name(pRExC_state,
7285                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7286                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7287                 if (RExC_parse == name_start || *RExC_parse != ch)
7288                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7289
7290                 if (!SIZE_ONLY) {
7291                     num = add_data( pRExC_state, 1, "S" );
7292                     RExC_rxi->data->data[num]=(void*)sv_dat;
7293                     SvREFCNT_inc_simple_void(sv_dat);
7294                 }
7295
7296                 RExC_sawback = 1;
7297                 ret = reganode(pRExC_state,
7298                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7299                            num);
7300                 *flagp |= HASWIDTH;
7301
7302                 /* override incorrect value set in reganode MJD */
7303                 Set_Node_Offset(ret, parse_start+1);
7304                 Set_Node_Cur_Length(ret); /* MJD */
7305                 nextchar(pRExC_state);
7306
7307             }
7308             break;
7309         }
7310         case 'g':
7311         case '1': case '2': case '3': case '4':
7312         case '5': case '6': case '7': case '8': case '9':
7313             {
7314                 I32 num;
7315                 bool isg = *RExC_parse == 'g';
7316                 bool isrel = 0;
7317                 bool hasbrace = 0;
7318                 if (isg) {
7319                     RExC_parse++;
7320                     if (*RExC_parse == '{') {
7321                         RExC_parse++;
7322                         hasbrace = 1;
7323                     }
7324                     if (*RExC_parse == '-') {
7325                         RExC_parse++;
7326                         isrel = 1;
7327                     }
7328                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7329                         if (isrel) RExC_parse--;
7330                         RExC_parse -= 2;
7331                         goto parse_named_seq;
7332                 }   }
7333                 num = atoi(RExC_parse);
7334                 if (isg && num == 0)
7335                     vFAIL("Reference to invalid group 0");
7336                 if (isrel) {
7337                     num = RExC_npar - num;
7338                     if (num < 1)
7339                         vFAIL("Reference to nonexistent or unclosed group");
7340                 }
7341                 if (!isg && num > 9 && num >= RExC_npar)
7342                     goto defchar;
7343                 else {
7344                     char * const parse_start = RExC_parse - 1; /* MJD */
7345                     while (isDIGIT(*RExC_parse))
7346                         RExC_parse++;
7347                     if (parse_start == RExC_parse - 1)
7348                         vFAIL("Unterminated \\g... pattern");
7349                     if (hasbrace) {
7350                         if (*RExC_parse != '}')
7351                             vFAIL("Unterminated \\g{...} pattern");
7352                         RExC_parse++;
7353                     }
7354                     if (!SIZE_ONLY) {
7355                         if (num > (I32)RExC_rx->nparens)
7356                             vFAIL("Reference to nonexistent group");
7357                     }
7358                     RExC_sawback = 1;
7359                     ret = reganode(pRExC_state,
7360                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7361                                    num);
7362                     *flagp |= HASWIDTH;
7363
7364                     /* override incorrect value set in reganode MJD */
7365                     Set_Node_Offset(ret, parse_start+1);
7366                     Set_Node_Cur_Length(ret); /* MJD */
7367                     RExC_parse--;
7368                     nextchar(pRExC_state);
7369                 }
7370             }
7371             break;
7372         case '\0':
7373             if (RExC_parse >= RExC_end)
7374                 FAIL("Trailing \\");
7375             /* FALL THROUGH */
7376         default:
7377             /* Do not generate "unrecognized" warnings here, we fall
7378                back into the quick-grab loop below */
7379             parse_start--;
7380             goto defchar;
7381         }
7382         break;
7383
7384     case '#':
7385         if (RExC_flags & RXf_PMf_EXTENDED) {
7386             if ( reg_skipcomment( pRExC_state ) )
7387                 goto tryagain;
7388         }
7389         /* FALL THROUGH */
7390
7391     default:
7392         outer_default:{
7393             register STRLEN len;
7394             register UV ender;
7395             register char *p;
7396             char *s;
7397             STRLEN foldlen;
7398             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7399
7400             parse_start = RExC_parse - 1;
7401
7402             RExC_parse++;
7403
7404         defchar:
7405             ender = 0;
7406             ret = reg_node(pRExC_state,
7407                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7408             s = STRING(ret);
7409             for (len = 0, p = RExC_parse - 1;
7410               len < 127 && p < RExC_end;
7411               len++)
7412             {
7413                 char * const oldp = p;
7414
7415                 if (RExC_flags & RXf_PMf_EXTENDED)
7416                     p = regwhite( pRExC_state, p );
7417                 switch ((U8)*p) {
7418                 case 0xDF:
7419                 case 0xC3:
7420                 case 0xCE:
7421                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7422                                 goto normal_default;
7423                 case '^':
7424                 case '$':
7425                 case '.':
7426                 case '[':
7427                 case '(':
7428                 case ')':
7429                 case '|':
7430                     goto loopdone;
7431                 case '\\':
7432                     /* Literal Escapes Switch
7433
7434                        This switch is meant to handle escape sequences that
7435                        resolve to a literal character.
7436
7437                        Every escape sequence that represents something
7438                        else, like an assertion or a char class, is handled
7439                        in the switch marked 'Special Escapes' above in this
7440                        routine, but also has an entry here as anything that
7441                        isn't explicitly mentioned here will be treated as
7442                        an unescaped equivalent literal.
7443                     */
7444
7445                     switch ((U8)*++p) {
7446                     /* These are all the special escapes. */
7447                     case 0xDF:
7448                     case 0xC3:
7449                     case 0xCE:
7450                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7451                                 goto normal_default;
7452                     case 'A':             /* Start assertion */
7453                     case 'b': case 'B':   /* Word-boundary assertion*/
7454                     case 'C':             /* Single char !DANGEROUS! */
7455                     case 'd': case 'D':   /* digit class */
7456                     case 'g': case 'G':   /* generic-backref, pos assertion */
7457                     case 'h': case 'H':   /* HORIZWS */
7458                     case 'k': case 'K':   /* named backref, keep marker */
7459                     case 'N':             /* named char sequence */
7460                     case 'p': case 'P':   /* Unicode property */
7461                               case 'R':   /* LNBREAK */
7462                     case 's': case 'S':   /* space class */
7463                     case 'v': case 'V':   /* VERTWS */
7464                     case 'w': case 'W':   /* word class */
7465                     case 'X':             /* eXtended Unicode "combining character sequence" */
7466                     case 'z': case 'Z':   /* End of line/string assertion */
7467                         --p;
7468                         goto loopdone;
7469
7470                     /* Anything after here is an escape that resolves to a
7471                        literal. (Except digits, which may or may not)
7472                      */
7473                     case 'n':
7474                         ender = '\n';
7475                         p++;
7476                         break;
7477                     case 'r':
7478                         ender = '\r';
7479                         p++;
7480                         break;
7481                     case 't':
7482                         ender = '\t';
7483                         p++;
7484                         break;
7485                     case 'f':
7486                         ender = '\f';
7487                         p++;
7488                         break;
7489                     case 'e':
7490                           ender = ASCII_TO_NATIVE('\033');
7491                         p++;
7492                         break;
7493                     case 'a':
7494                           ender = ASCII_TO_NATIVE('\007');
7495                         p++;
7496                         break;
7497                     case 'o':
7498                         {
7499                             STRLEN brace_len = len;
7500                             UV result;
7501                             const char* error_msg;
7502
7503                             bool valid = grok_bslash_o(p,
7504                                                        &result,
7505                                                        &brace_len,
7506                                                        &error_msg,
7507                                                        1);
7508                             p += brace_len;
7509                             if (! valid) {
7510                                 RExC_parse = p; /* going to die anyway; point
7511                                                    to exact spot of failure */
7512                                 vFAIL(error_msg);
7513                             }
7514                             else
7515                             {
7516                                 ender = result;
7517                             }
7518                             if (PL_encoding && ender < 0x100) {
7519                                 goto recode_encoding;
7520                             }
7521                             if (ender > 0xff) {
7522                                 REQUIRE_UTF8;
7523                             }
7524                             break;
7525                         }
7526                     case 'x':
7527                         if (*++p == '{') {
7528                             char* const e = strchr(p, '}');
7529
7530                             if (!e) {
7531                                 RExC_parse = p + 1;
7532                                 vFAIL("Missing right brace on \\x{}");
7533                             }
7534                             else {
7535                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7536                                     | PERL_SCAN_DISALLOW_PREFIX;
7537                                 STRLEN numlen = e - p - 1;
7538                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7539                                 if (ender > 0xff)
7540                                     REQUIRE_UTF8;
7541                                 p = e + 1;
7542                             }
7543                         }
7544                         else {
7545                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7546                             STRLEN numlen = 2;
7547                             ender = grok_hex(p, &numlen, &flags, NULL);
7548                             p += numlen;
7549                         }
7550                         if (PL_encoding && ender < 0x100)
7551                             goto recode_encoding;
7552                         break;
7553                     case 'c':
7554                         p++;
7555                         ender = grok_bslash_c(*p++, SIZE_ONLY);
7556                         break;
7557                     case '0': case '1': case '2': case '3':case '4':
7558                     case '5': case '6': case '7': case '8':case '9':
7559                         if (*p == '0' ||
7560                             (isOCTAL(p[1]) && atoi(p) >= RExC_npar))
7561                         {
7562                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7563                             STRLEN numlen = 3;
7564                             ender = grok_oct(p, &numlen, &flags, NULL);
7565                             if (ender > 0xff) {
7566                                 REQUIRE_UTF8;
7567                             }
7568                             p += numlen;
7569                         }
7570                         else {
7571                             --p;
7572                             goto loopdone;
7573                         }
7574                         if (PL_encoding && ender < 0x100)
7575                             goto recode_encoding;
7576                         break;
7577                     recode_encoding:
7578                         {
7579                             SV* enc = PL_encoding;
7580                             ender = reg_recode((const char)(U8)ender, &enc);
7581                             if (!enc && SIZE_ONLY)
7582                                 ckWARNreg(p, "Invalid escape in the specified encoding");
7583                             REQUIRE_UTF8;
7584                         }
7585                         break;
7586                     case '\0':
7587                         if (p >= RExC_end)
7588                             FAIL("Trailing \\");
7589                         /* FALL THROUGH */
7590                     default:
7591                         if (!SIZE_ONLY&& isALPHA(*p))
7592                             ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7593                         goto normal_default;
7594                     }
7595                     break;
7596                 default:
7597                   normal_default:
7598                     if (UTF8_IS_START(*p) && UTF) {
7599                         STRLEN numlen;
7600                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7601                                                &numlen, UTF8_ALLOW_DEFAULT);
7602                         p += numlen;
7603                     }
7604                     else
7605                         ender = *p++;
7606                     break;
7607                 }
7608                 if ( RExC_flags & RXf_PMf_EXTENDED)
7609                     p = regwhite( pRExC_state, p );
7610                 if (UTF && FOLD) {
7611                     /* Prime the casefolded buffer. */
7612                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7613                 }
7614                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7615                     if (len)
7616                         p = oldp;
7617                     else if (UTF) {
7618                          if (FOLD) {
7619                               /* Emit all the Unicode characters. */
7620                               STRLEN numlen;
7621                               for (foldbuf = tmpbuf;
7622                                    foldlen;
7623                                    foldlen -= numlen) {
7624                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7625                                    if (numlen > 0) {
7626                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7627                                         s       += unilen;
7628                                         len     += unilen;
7629                                         /* In EBCDIC the numlen
7630                                          * and unilen can differ. */
7631                                         foldbuf += numlen;
7632                                         if (numlen >= foldlen)
7633                                              break;
7634                                    }
7635                                    else
7636                                         break; /* "Can't happen." */
7637                               }
7638                          }
7639                          else {
7640                               const STRLEN unilen = reguni(pRExC_state, ender, s);
7641                               if (unilen > 0) {
7642                                    s   += unilen;
7643                                    len += unilen;
7644                               }
7645                          }
7646                     }
7647                     else {
7648                         len++;
7649                         REGC((char)ender, s++);
7650                     }
7651                     break;
7652                 }
7653                 if (UTF) {
7654                      if (FOLD) {
7655                           /* Emit all the Unicode characters. */
7656                           STRLEN numlen;
7657                           for (foldbuf = tmpbuf;
7658                                foldlen;
7659                                foldlen -= numlen) {
7660                                ender = utf8_to_uvchr(foldbuf, &numlen);
7661                                if (numlen > 0) {
7662                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
7663                                     len     += unilen;
7664                                     s       += unilen;
7665                                     /* In EBCDIC the numlen
7666                                      * and unilen can differ. */
7667                                     foldbuf += numlen;
7668                                     if (numlen >= foldlen)
7669                                          break;
7670                                }
7671                                else
7672                                     break;
7673                           }
7674                      }
7675                      else {
7676                           const STRLEN unilen = reguni(pRExC_state, ender, s);
7677                           if (unilen > 0) {
7678                                s   += unilen;
7679                                len += unilen;
7680                           }
7681                      }
7682                      len--;
7683                 }
7684                 else
7685                     REGC((char)ender, s++);
7686             }
7687         loopdone:
7688             RExC_parse = p - 1;
7689             Set_Node_Cur_Length(ret); /* MJD */
7690             nextchar(pRExC_state);
7691             {
7692                 /* len is STRLEN which is unsigned, need to copy to signed */
7693                 IV iv = len;
7694                 if (iv < 0)
7695                     vFAIL("Internal disaster");
7696             }
7697             if (len > 0)
7698                 *flagp |= HASWIDTH;
7699             if (len == 1 && UNI_IS_INVARIANT(ender))
7700                 *flagp |= SIMPLE;
7701
7702             if (SIZE_ONLY)
7703                 RExC_size += STR_SZ(len);
7704             else {
7705                 STR_LEN(ret) = len;
7706                 RExC_emit += STR_SZ(len);
7707             }
7708         }
7709         break;
7710     }
7711
7712     return(ret);
7713 }
7714
7715 STATIC char *
7716 S_regwhite( RExC_state_t *pRExC_state, char *p )
7717 {
7718     const char *e = RExC_end;
7719
7720     PERL_ARGS_ASSERT_REGWHITE;
7721
7722     while (p < e) {
7723         if (isSPACE(*p))
7724             ++p;
7725         else if (*p == '#') {
7726             bool ended = 0;
7727             do {
7728                 if (*p++ == '\n') {
7729                     ended = 1;
7730                     break;
7731                 }
7732             } while (p < e);
7733             if (!ended)
7734                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7735         }
7736         else
7737             break;
7738     }
7739     return p;
7740 }
7741
7742 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7743    Character classes ([:foo:]) can also be negated ([:^foo:]).
7744    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7745    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7746    but trigger failures because they are currently unimplemented. */
7747
7748 #define POSIXCC_DONE(c)   ((c) == ':')
7749 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7750 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7751
7752 STATIC I32
7753 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7754 {
7755     dVAR;
7756     I32 namedclass = OOB_NAMEDCLASS;
7757
7758     PERL_ARGS_ASSERT_REGPPOSIXCC;
7759
7760     if (value == '[' && RExC_parse + 1 < RExC_end &&
7761         /* I smell either [: or [= or [. -- POSIX has been here, right? */
7762         POSIXCC(UCHARAT(RExC_parse))) {
7763         const char c = UCHARAT(RExC_parse);
7764         char* const s = RExC_parse++;
7765
7766         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7767             RExC_parse++;
7768         if (RExC_parse == RExC_end)
7769             /* Grandfather lone [:, [=, [. */
7770             RExC_parse = s;
7771         else {
7772             const char* const t = RExC_parse++; /* skip over the c */
7773             assert(*t == c);
7774
7775             if (UCHARAT(RExC_parse) == ']') {
7776                 const char *posixcc = s + 1;
7777                 RExC_parse++; /* skip over the ending ] */
7778
7779                 if (*s == ':') {
7780                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7781                     const I32 skip = t - posixcc;
7782
7783                     /* Initially switch on the length of the name.  */
7784                     switch (skip) {
7785                     case 4:
7786                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7787                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7788                         break;
7789                     case 5:
7790                         /* Names all of length 5.  */
7791                         /* alnum alpha ascii blank cntrl digit graph lower
7792                            print punct space upper  */
7793                         /* Offset 4 gives the best switch position.  */
7794                         switch (posixcc[4]) {
7795                         case 'a':
7796                             if (memEQ(posixcc, "alph", 4)) /* alpha */
7797                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7798                             break;
7799                         case 'e':
7800                             if (memEQ(posixcc, "spac", 4)) /* space */
7801                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7802                             break;
7803                         case 'h':
7804                             if (memEQ(posixcc, "grap", 4)) /* graph */
7805                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7806                             break;
7807                         case 'i':
7808                             if (memEQ(posixcc, "asci", 4)) /* ascii */
7809                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7810                             break;
7811                         case 'k':
7812                             if (memEQ(posixcc, "blan", 4)) /* blank */
7813                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7814                             break;
7815                         case 'l':
7816                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7817                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7818                             break;
7819                         case 'm':
7820                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
7821                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7822                             break;
7823                         case 'r':
7824                             if (memEQ(posixcc, "lowe", 4)) /* lower */
7825                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7826                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
7827                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7828                             break;
7829                         case 't':
7830                             if (memEQ(posixcc, "digi", 4)) /* digit */
7831                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7832                             else if (memEQ(posixcc, "prin", 4)) /* print */
7833                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7834                             else if (memEQ(posixcc, "punc", 4)) /* punct */
7835                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7836                             break;
7837                         }
7838                         break;
7839                     case 6:
7840                         if (memEQ(posixcc, "xdigit", 6))
7841                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7842                         break;
7843                     }
7844
7845                     if (namedclass == OOB_NAMEDCLASS)
7846                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7847                                       t - s - 1, s + 1);
7848                     assert (posixcc[skip] == ':');
7849                     assert (posixcc[skip+1] == ']');
7850                 } else if (!SIZE_ONLY) {
7851                     /* [[=foo=]] and [[.foo.]] are still future. */
7852
7853                     /* adjust RExC_parse so the warning shows after
7854                        the class closes */
7855                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7856                         RExC_parse++;
7857                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7858                 }
7859             } else {
7860                 /* Maternal grandfather:
7861                  * "[:" ending in ":" but not in ":]" */
7862                 RExC_parse = s;
7863             }
7864         }
7865     }
7866
7867     return namedclass;
7868 }
7869
7870 STATIC void
7871 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7872 {
7873     dVAR;
7874
7875     PERL_ARGS_ASSERT_CHECKPOSIXCC;
7876
7877     if (POSIXCC(UCHARAT(RExC_parse))) {
7878         const char *s = RExC_parse;
7879         const char  c = *s++;
7880
7881         while (isALNUM(*s))
7882             s++;
7883         if (*s && c == *s && s[1] == ']') {
7884             ckWARN3reg(s+2,
7885                        "POSIX syntax [%c %c] belongs inside character classes",
7886                        c, c);
7887
7888             /* [[=foo=]] and [[.foo.]] are still future. */
7889             if (POSIXCC_NOTYET(c)) {
7890                 /* adjust RExC_parse so the error shows after
7891                    the class closes */
7892                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7893                     NOOP;
7894                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7895             }
7896         }
7897     }
7898 }
7899
7900
7901 #define _C_C_T_(NAME,TEST,WORD)                         \
7902 ANYOF_##NAME:                                           \
7903     if (LOC)                                            \
7904         ANYOF_CLASS_SET(ret, ANYOF_##NAME);             \
7905     else {                                              \
7906         for (value = 0; value < 256; value++)           \
7907             if (TEST)                                   \
7908                 ANYOF_BITMAP_SET(ret, value);           \
7909     }                                                   \
7910     yesno = '+';                                        \
7911     what = WORD;                                        \
7912     break;                                              \
7913 case ANYOF_N##NAME:                                     \
7914     if (LOC)                                            \
7915         ANYOF_CLASS_SET(ret, ANYOF_N##NAME);            \
7916     else {                                              \
7917         for (value = 0; value < 256; value++)           \
7918             if (!TEST)                                  \
7919                 ANYOF_BITMAP_SET(ret, value);           \
7920     }                                                   \
7921     yesno = '!';                                        \
7922     what = WORD;                                        \
7923     break
7924
7925 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
7926 ANYOF_##NAME:                                           \
7927         for (value = 0; value < 256; value++)           \
7928             if (TEST)                                   \
7929                 ANYOF_BITMAP_SET(ret, value);           \
7930     yesno = '+';                                        \
7931     what = WORD;                                        \
7932     break;                                              \
7933 case ANYOF_N##NAME:                                     \
7934         for (value = 0; value < 256; value++)           \
7935             if (!TEST)                                  \
7936                 ANYOF_BITMAP_SET(ret, value);           \
7937     yesno = '!';                                        \
7938     what = WORD;                                        \
7939     break
7940
7941 /*
7942    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7943    so that it is possible to override the option here without having to
7944    rebuild the entire core. as we are required to do if we change regcomp.h
7945    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7946 */
7947 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7948 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7949 #endif
7950
7951 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7952 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7953 #else
7954 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7955 #endif
7956
7957 /*
7958    parse a class specification and produce either an ANYOF node that
7959    matches the pattern or if the pattern matches a single char only and
7960    that char is < 256 and we are case insensitive then we produce an
7961    EXACT node instead.
7962 */
7963
7964 STATIC regnode *
7965 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7966 {
7967     dVAR;
7968     register UV nextvalue;
7969     register IV prevvalue = OOB_UNICODE;
7970     register IV range = 0;
7971     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7972     register regnode *ret;
7973     STRLEN numlen;
7974     IV namedclass;
7975     char *rangebegin = NULL;
7976     bool need_class = 0;
7977     SV *listsv = NULL;
7978     UV n;
7979     bool optimize_invert   = TRUE;
7980     AV* unicode_alternate  = NULL;
7981 #ifdef EBCDIC
7982     UV literal_endpoint = 0;
7983 #endif
7984     UV stored = 0;  /* number of chars stored in the class */
7985
7986     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7987         case we need to change the emitted regop to an EXACT. */
7988     const char * orig_parse = RExC_parse;
7989     GET_RE_DEBUG_FLAGS_DECL;
7990
7991     PERL_ARGS_ASSERT_REGCLASS;
7992 #ifndef DEBUGGING
7993     PERL_UNUSED_ARG(depth);
7994 #endif
7995
7996     DEBUG_PARSE("clas");
7997
7998     /* Assume we are going to generate an ANYOF node. */
7999     ret = reganode(pRExC_state, ANYOF, 0);
8000
8001     if (!SIZE_ONLY)
8002         ANYOF_FLAGS(ret) = 0;
8003
8004     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
8005         RExC_naughty++;
8006         RExC_parse++;
8007         if (!SIZE_ONLY)
8008             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8009     }
8010
8011     if (SIZE_ONLY) {
8012         RExC_size += ANYOF_SKIP;
8013         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8014     }
8015     else {
8016         RExC_emit += ANYOF_SKIP;
8017         if (FOLD)
8018             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8019         if (LOC)
8020             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8021         ANYOF_BITMAP_ZERO(ret);
8022         listsv = newSVpvs("# comment\n");
8023     }
8024
8025     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8026
8027     if (!SIZE_ONLY && POSIXCC(nextvalue))
8028         checkposixcc(pRExC_state);
8029
8030     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8031     if (UCHARAT(RExC_parse) == ']')
8032         goto charclassloop;
8033
8034 parseit:
8035     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8036
8037     charclassloop:
8038
8039         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8040
8041         if (!range)
8042             rangebegin = RExC_parse;
8043         if (UTF) {
8044             value = utf8n_to_uvchr((U8*)RExC_parse,
8045                                    RExC_end - RExC_parse,
8046                                    &numlen, UTF8_ALLOW_DEFAULT);
8047             RExC_parse += numlen;
8048         }
8049         else
8050             value = UCHARAT(RExC_parse++);
8051
8052         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8053         if (value == '[' && POSIXCC(nextvalue))
8054             namedclass = regpposixcc(pRExC_state, value);
8055         else if (value == '\\') {
8056             if (UTF) {
8057                 value = utf8n_to_uvchr((U8*)RExC_parse,
8058                                    RExC_end - RExC_parse,
8059                                    &numlen, UTF8_ALLOW_DEFAULT);
8060                 RExC_parse += numlen;
8061             }
8062             else
8063                 value = UCHARAT(RExC_parse++);
8064             /* Some compilers cannot handle switching on 64-bit integer
8065              * values, therefore value cannot be an UV.  Yes, this will
8066              * be a problem later if we want switch on Unicode.
8067              * A similar issue a little bit later when switching on
8068              * namedclass. --jhi */
8069             switch ((I32)value) {
8070             case 'w':   namedclass = ANYOF_ALNUM;       break;
8071             case 'W':   namedclass = ANYOF_NALNUM;      break;
8072             case 's':   namedclass = ANYOF_SPACE;       break;
8073             case 'S':   namedclass = ANYOF_NSPACE;      break;
8074             case 'd':   namedclass = ANYOF_DIGIT;       break;
8075             case 'D':   namedclass = ANYOF_NDIGIT;      break;
8076             case 'v':   namedclass = ANYOF_VERTWS;      break;
8077             case 'V':   namedclass = ANYOF_NVERTWS;     break;
8078             case 'h':   namedclass = ANYOF_HORIZWS;     break;
8079             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
8080             case 'N':  /* Handle \N{NAME} in class */
8081                 {
8082                     /* We only pay attention to the first char of
8083                     multichar strings being returned. I kinda wonder
8084                     if this makes sense as it does change the behaviour
8085                     from earlier versions, OTOH that behaviour was broken
8086                     as well. */
8087                     UV v; /* value is register so we cant & it /grrr */
8088                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8089                         goto parseit;
8090                     }
8091                     value= v;
8092                 }
8093                 break;
8094             case 'p':
8095             case 'P':
8096                 {
8097                 char *e;
8098                 if (RExC_parse >= RExC_end)
8099                     vFAIL2("Empty \\%c{}", (U8)value);
8100                 if (*RExC_parse == '{') {
8101                     const U8 c = (U8)value;
8102                     e = strchr(RExC_parse++, '}');
8103                     if (!e)
8104                         vFAIL2("Missing right brace on \\%c{}", c);
8105                     while (isSPACE(UCHARAT(RExC_parse)))
8106                         RExC_parse++;
8107                     if (e == RExC_parse)
8108                         vFAIL2("Empty \\%c{}", c);
8109                     n = e - RExC_parse;
8110                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8111                         n--;
8112                 }
8113                 else {
8114                     e = RExC_parse;
8115                     n = 1;
8116                 }
8117                 if (!SIZE_ONLY) {
8118                     if (UCHARAT(RExC_parse) == '^') {
8119                          RExC_parse++;
8120                          n--;
8121                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8122                          while (isSPACE(UCHARAT(RExC_parse))) {
8123                               RExC_parse++;
8124                               n--;
8125                          }
8126                     }
8127                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8128                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8129                 }
8130                 RExC_parse = e + 1;
8131                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8132                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8133                 }
8134                 break;
8135             case 'n':   value = '\n';                   break;
8136             case 'r':   value = '\r';                   break;
8137             case 't':   value = '\t';                   break;
8138             case 'f':   value = '\f';                   break;
8139             case 'b':   value = '\b';                   break;
8140             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8141             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8142             case 'o':
8143                 RExC_parse--;   /* function expects to be pointed at the 'o' */
8144                 {
8145                     const char* error_msg;
8146                     bool valid = grok_bslash_o(RExC_parse,
8147                                                &value,
8148                                                &numlen,
8149                                                &error_msg,
8150                                                SIZE_ONLY);
8151                     RExC_parse += numlen;
8152                     if (! valid) {
8153                         vFAIL(error_msg);
8154                     }
8155                 }
8156                 if (PL_encoding && value < 0x100) {
8157                     goto recode_encoding;
8158                 }
8159                 break;
8160             case 'x':
8161                 if (*RExC_parse == '{') {
8162                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8163                         | PERL_SCAN_DISALLOW_PREFIX;
8164                     char * const e = strchr(RExC_parse++, '}');
8165                     if (!e)
8166                         vFAIL("Missing right brace on \\x{}");
8167
8168                     numlen = e - RExC_parse;
8169                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8170                     RExC_parse = e + 1;
8171                 }
8172                 else {
8173                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8174                     numlen = 2;
8175                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8176                     RExC_parse += numlen;
8177                 }
8178                 if (PL_encoding && value < 0x100)
8179                     goto recode_encoding;
8180                 break;
8181             case 'c':
8182                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8183                 break;
8184             case '0': case '1': case '2': case '3': case '4':
8185             case '5': case '6': case '7':
8186                 {
8187                     /* Take 1-3 octal digits */
8188                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8189                     numlen = 3;
8190                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8191                     RExC_parse += numlen;
8192                     if (PL_encoding && value < 0x100)
8193                         goto recode_encoding;
8194                     break;
8195                 }
8196             recode_encoding:
8197                 {
8198                     SV* enc = PL_encoding;
8199                     value = reg_recode((const char)(U8)value, &enc);
8200                     if (!enc && SIZE_ONLY)
8201                         ckWARNreg(RExC_parse,
8202                                   "Invalid escape in the specified encoding");
8203                     break;
8204                 }
8205             default:
8206                 /* Allow \_ to not give an error */
8207                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8208                     ckWARN2reg(RExC_parse,
8209                                "Unrecognized escape \\%c in character class passed through",
8210                                (int)value);
8211                 }
8212                 break;
8213             }
8214         } /* end of \blah */
8215 #ifdef EBCDIC
8216         else
8217             literal_endpoint++;
8218 #endif
8219
8220         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8221
8222             if (!SIZE_ONLY && !need_class)
8223                 ANYOF_CLASS_ZERO(ret);
8224
8225             need_class = 1;
8226
8227             /* a bad range like a-\d, a-[:digit:] ? */
8228             if (range) {
8229                 if (!SIZE_ONLY) {
8230                     const int w =
8231                         RExC_parse >= rangebegin ?
8232                         RExC_parse - rangebegin : 0;
8233                     ckWARN4reg(RExC_parse,
8234                                "False [] range \"%*.*s\"",
8235                                w, w, rangebegin);
8236
8237                     if (prevvalue < 256) {
8238                         ANYOF_BITMAP_SET(ret, prevvalue);
8239                         ANYOF_BITMAP_SET(ret, '-');
8240                     }
8241                     else {
8242                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8243                         Perl_sv_catpvf(aTHX_ listsv,
8244                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8245                     }
8246                 }
8247
8248                 range = 0; /* this was not a true range */
8249             }
8250
8251
8252
8253             if (!SIZE_ONLY) {
8254                 const char *what = NULL;
8255                 char yesno = 0;
8256
8257                 if (namedclass > OOB_NAMEDCLASS)
8258                     optimize_invert = FALSE;
8259                 /* Possible truncation here but in some 64-bit environments
8260                  * the compiler gets heartburn about switch on 64-bit values.
8261                  * A similar issue a little earlier when switching on value.
8262                  * --jhi */
8263                 switch ((I32)namedclass) {
8264
8265                 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8266                 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8267                 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8268                 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8269                 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8270                 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8271                 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8272                 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8273                 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8274                 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8275 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8276                 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8277                 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8278 #else
8279                 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8280                 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8281 #endif
8282                 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8283                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8284                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8285                 case ANYOF_ASCII:
8286                     if (LOC)
8287                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8288                     else {
8289 #ifndef EBCDIC
8290                         for (value = 0; value < 128; value++)
8291                             ANYOF_BITMAP_SET(ret, value);
8292 #else  /* EBCDIC */
8293                         for (value = 0; value < 256; value++) {
8294                             if (isASCII(value))
8295                                 ANYOF_BITMAP_SET(ret, value);
8296                         }
8297 #endif /* EBCDIC */
8298                     }
8299                     yesno = '+';
8300                     what = "ASCII";
8301                     break;
8302                 case ANYOF_NASCII:
8303                     if (LOC)
8304                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8305                     else {
8306 #ifndef EBCDIC
8307                         for (value = 128; value < 256; value++)
8308                             ANYOF_BITMAP_SET(ret, value);
8309 #else  /* EBCDIC */
8310                         for (value = 0; value < 256; value++) {
8311                             if (!isASCII(value))
8312                                 ANYOF_BITMAP_SET(ret, value);
8313                         }
8314 #endif /* EBCDIC */
8315                     }
8316                     yesno = '!';
8317                     what = "ASCII";
8318                     break;
8319                 case ANYOF_DIGIT:
8320                     if (LOC)
8321                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8322                     else {
8323                         /* consecutive digits assumed */
8324                         for (value = '0'; value <= '9'; value++)
8325                             ANYOF_BITMAP_SET(ret, value);
8326                     }
8327                     yesno = '+';
8328                     what = POSIX_CC_UNI_NAME("Digit");
8329                     break;
8330                 case ANYOF_NDIGIT:
8331                     if (LOC)
8332                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8333                     else {
8334                         /* consecutive digits assumed */
8335                         for (value = 0; value < '0'; value++)
8336                             ANYOF_BITMAP_SET(ret, value);
8337                         for (value = '9' + 1; value < 256; value++)
8338                             ANYOF_BITMAP_SET(ret, value);
8339                     }
8340                     yesno = '!';
8341                     what = POSIX_CC_UNI_NAME("Digit");
8342                     break;
8343                 case ANYOF_MAX:
8344                     /* this is to handle \p and \P */
8345                     break;
8346                 default:
8347                     vFAIL("Invalid [::] class");
8348                     break;
8349                 }
8350                 if (what) {
8351                     /* Strings such as "+utf8::isWord\n" */
8352                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8353                 }
8354                 if (LOC)
8355                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8356                 continue;
8357             }
8358         } /* end of namedclass \blah */
8359
8360         if (range) {
8361             if (prevvalue > (IV)value) /* b-a */ {
8362                 const int w = RExC_parse - rangebegin;
8363                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8364                 range = 0; /* not a valid range */
8365             }
8366         }
8367         else {
8368             prevvalue = value; /* save the beginning of the range */
8369             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8370                 RExC_parse[1] != ']') {
8371                 RExC_parse++;
8372
8373                 /* a bad range like \w-, [:word:]- ? */
8374                 if (namedclass > OOB_NAMEDCLASS) {
8375                     if (ckWARN(WARN_REGEXP)) {
8376                         const int w =
8377                             RExC_parse >= rangebegin ?
8378                             RExC_parse - rangebegin : 0;
8379                         vWARN4(RExC_parse,
8380                                "False [] range \"%*.*s\"",
8381                                w, w, rangebegin);
8382                     }
8383                     if (!SIZE_ONLY)
8384                         ANYOF_BITMAP_SET(ret, '-');
8385                 } else
8386                     range = 1;  /* yeah, it's a range! */
8387                 continue;       /* but do it the next time */
8388             }
8389         }
8390
8391         /* now is the next time */
8392         /*stored += (value - prevvalue + 1);*/
8393         if (!SIZE_ONLY) {
8394             if (prevvalue < 256) {
8395                 const IV ceilvalue = value < 256 ? value : 255;
8396                 IV i;
8397 #ifdef EBCDIC
8398                 /* In EBCDIC [\x89-\x91] should include
8399                  * the \x8e but [i-j] should not. */
8400                 if (literal_endpoint == 2 &&
8401                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8402                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8403                 {
8404                     if (isLOWER(prevvalue)) {
8405                         for (i = prevvalue; i <= ceilvalue; i++)
8406                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8407                                 stored++;
8408                                 ANYOF_BITMAP_SET(ret, i);
8409                             }
8410                     } else {
8411                         for (i = prevvalue; i <= ceilvalue; i++)
8412                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8413                                 stored++;
8414                                 ANYOF_BITMAP_SET(ret, i);
8415                             }
8416                     }
8417                 }
8418                 else
8419 #endif
8420                       for (i = prevvalue; i <= ceilvalue; i++) {
8421                         if (!ANYOF_BITMAP_TEST(ret,i)) {
8422                             stored++;
8423                             ANYOF_BITMAP_SET(ret, i);
8424                         }
8425                       }
8426           }
8427           if (value > 255 || UTF) {
8428                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8429                 const UV natvalue      = NATIVE_TO_UNI(value);
8430                 stored+=2; /* can't optimize this class */
8431                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8432                 if (prevnatvalue < natvalue) { /* what about > ? */
8433                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8434                                    prevnatvalue, natvalue);
8435                 }
8436                 else if (prevnatvalue == natvalue) {
8437                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8438                     if (FOLD) {
8439                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8440                          STRLEN foldlen;
8441                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8442
8443 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8444                          if (RExC_precomp[0] == ':' &&
8445                              RExC_precomp[1] == '[' &&
8446                              (f == 0xDF || f == 0x92)) {
8447                              f = NATIVE_TO_UNI(f);
8448                         }
8449 #endif
8450                          /* If folding and foldable and a single
8451                           * character, insert also the folded version
8452                           * to the charclass. */
8453                          if (f != value) {
8454 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8455                              if ((RExC_precomp[0] == ':' &&
8456                                   RExC_precomp[1] == '[' &&
8457                                   (f == 0xA2 &&
8458                                    (value == 0xFB05 || value == 0xFB06))) ?
8459                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8460                                  foldlen == (STRLEN)UNISKIP(f) )
8461 #else
8462                               if (foldlen == (STRLEN)UNISKIP(f))
8463 #endif
8464                                   Perl_sv_catpvf(aTHX_ listsv,
8465                                                  "%04"UVxf"\n", f);
8466                               else {
8467                                   /* Any multicharacter foldings
8468                                    * require the following transform:
8469                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8470                                    * where E folds into "pq" and F folds
8471                                    * into "rst", all other characters
8472                                    * fold to single characters.  We save
8473                                    * away these multicharacter foldings,
8474                                    * to be later saved as part of the
8475                                    * additional "s" data. */
8476                                   SV *sv;
8477
8478                                   if (!unicode_alternate)
8479                                       unicode_alternate = newAV();
8480                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8481                                                      TRUE);
8482                                   av_push(unicode_alternate, sv);
8483                               }
8484                          }
8485
8486                          /* If folding and the value is one of the Greek
8487                           * sigmas insert a few more sigmas to make the
8488                           * folding rules of the sigmas to work right.
8489                           * Note that not all the possible combinations
8490                           * are handled here: some of them are handled
8491                           * by the standard folding rules, and some of
8492                           * them (literal or EXACTF cases) are handled
8493                           * during runtime in regexec.c:S_find_byclass(). */
8494                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8495                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8496                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8497                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8498                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8499                          }
8500                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8501                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8502                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8503                     }
8504                 }
8505             }
8506 #ifdef EBCDIC
8507             literal_endpoint = 0;
8508 #endif
8509         }
8510
8511         range = 0; /* this range (if it was one) is done now */
8512     }
8513
8514     if (need_class) {
8515         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8516         if (SIZE_ONLY)
8517             RExC_size += ANYOF_CLASS_ADD_SKIP;
8518         else
8519             RExC_emit += ANYOF_CLASS_ADD_SKIP;
8520     }
8521
8522
8523     if (SIZE_ONLY)
8524         return ret;
8525     /****** !SIZE_ONLY AFTER HERE *********/
8526
8527     if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8528         && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8529     ) {
8530         /* optimize single char class to an EXACT node
8531            but *only* when its not a UTF/high char  */
8532         const char * cur_parse= RExC_parse;
8533         RExC_emit = (regnode *)orig_emit;
8534         RExC_parse = (char *)orig_parse;
8535         ret = reg_node(pRExC_state,
8536                        (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8537         RExC_parse = (char *)cur_parse;
8538         *STRING(ret)= (char)value;
8539         STR_LEN(ret)= 1;
8540         RExC_emit += STR_SZ(1);
8541         SvREFCNT_dec(listsv);
8542         return ret;
8543     }
8544     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8545     if ( /* If the only flag is folding (plus possibly inversion). */
8546         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8547        ) {
8548         for (value = 0; value < 256; ++value) {
8549             if (ANYOF_BITMAP_TEST(ret, value)) {
8550                 UV fold = PL_fold[value];
8551
8552                 if (fold != value)
8553                     ANYOF_BITMAP_SET(ret, fold);
8554             }
8555         }
8556         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8557     }
8558
8559     /* optimize inverted simple patterns (e.g. [^a-z]) */
8560     if (optimize_invert &&
8561         /* If the only flag is inversion. */
8562         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8563         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8564             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8565         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8566     }
8567     {
8568         AV * const av = newAV();
8569         SV *rv;
8570         /* The 0th element stores the character class description
8571          * in its textual form: used later (regexec.c:Perl_regclass_swash())
8572          * to initialize the appropriate swash (which gets stored in
8573          * the 1st element), and also useful for dumping the regnode.
8574          * The 2nd element stores the multicharacter foldings,
8575          * used later (regexec.c:S_reginclass()). */
8576         av_store(av, 0, listsv);
8577         av_store(av, 1, NULL);
8578         av_store(av, 2, MUTABLE_SV(unicode_alternate));
8579         rv = newRV_noinc(MUTABLE_SV(av));
8580         n = add_data(pRExC_state, 1, "s");
8581         RExC_rxi->data->data[n] = (void*)rv;
8582         ARG_SET(ret, n);
8583     }
8584     return ret;
8585 }
8586 #undef _C_C_T_
8587
8588
8589 /* reg_skipcomment()
8590
8591    Absorbs an /x style # comments from the input stream.
8592    Returns true if there is more text remaining in the stream.
8593    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8594    terminates the pattern without including a newline.
8595
8596    Note its the callers responsibility to ensure that we are
8597    actually in /x mode
8598
8599 */
8600
8601 STATIC bool
8602 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8603 {
8604     bool ended = 0;
8605
8606     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8607
8608     while (RExC_parse < RExC_end)
8609         if (*RExC_parse++ == '\n') {
8610             ended = 1;
8611             break;
8612         }
8613     if (!ended) {
8614         /* we ran off the end of the pattern without ending
8615            the comment, so we have to add an \n when wrapping */
8616         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8617         return 0;
8618     } else
8619         return 1;
8620 }
8621
8622 /* nextchar()
8623
8624    Advance that parse position, and optionall absorbs
8625    "whitespace" from the inputstream.
8626
8627    Without /x "whitespace" means (?#...) style comments only,
8628    with /x this means (?#...) and # comments and whitespace proper.
8629
8630    Returns the RExC_parse point from BEFORE the scan occurs.
8631
8632    This is the /x friendly way of saying RExC_parse++.
8633 */
8634
8635 STATIC char*
8636 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8637 {
8638     char* const retval = RExC_parse++;
8639
8640     PERL_ARGS_ASSERT_NEXTCHAR;
8641
8642     for (;;) {
8643         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8644                 RExC_parse[2] == '#') {
8645             while (*RExC_parse != ')') {
8646                 if (RExC_parse == RExC_end)
8647                     FAIL("Sequence (?#... not terminated");
8648                 RExC_parse++;
8649             }
8650             RExC_parse++;
8651             continue;
8652         }
8653         if (RExC_flags & RXf_PMf_EXTENDED) {
8654             if (isSPACE(*RExC_parse)) {
8655                 RExC_parse++;
8656                 continue;
8657             }
8658             else if (*RExC_parse == '#') {
8659                 if ( reg_skipcomment( pRExC_state ) )
8660                     continue;
8661             }
8662         }
8663         return retval;
8664     }
8665 }
8666
8667 /*
8668 - reg_node - emit a node
8669 */
8670 STATIC regnode *                        /* Location. */
8671 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8672 {
8673     dVAR;
8674     register regnode *ptr;
8675     regnode * const ret = RExC_emit;
8676     GET_RE_DEBUG_FLAGS_DECL;
8677
8678     PERL_ARGS_ASSERT_REG_NODE;
8679
8680     if (SIZE_ONLY) {
8681         SIZE_ALIGN(RExC_size);
8682         RExC_size += 1;
8683         return(ret);
8684     }
8685     if (RExC_emit >= RExC_emit_bound)
8686         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8687
8688     NODE_ALIGN_FILL(ret);
8689     ptr = ret;
8690     FILL_ADVANCE_NODE(ptr, op);
8691     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8692 #ifdef RE_TRACK_PATTERN_OFFSETS
8693     if (RExC_offsets) {         /* MJD */
8694         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8695               "reg_node", __LINE__,
8696               PL_reg_name[op],
8697               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8698                 ? "Overwriting end of array!\n" : "OK",
8699               (UV)(RExC_emit - RExC_emit_start),
8700               (UV)(RExC_parse - RExC_start),
8701               (UV)RExC_offsets[0]));
8702         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8703     }
8704 #endif
8705     RExC_emit = ptr;
8706     return(ret);
8707 }
8708
8709 /*
8710 - reganode - emit a node with an argument
8711 */
8712 STATIC regnode *                        /* Location. */
8713 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8714 {
8715     dVAR;
8716     register regnode *ptr;
8717     regnode * const ret = RExC_emit;
8718     GET_RE_DEBUG_FLAGS_DECL;
8719
8720     PERL_ARGS_ASSERT_REGANODE;
8721
8722     if (SIZE_ONLY) {
8723         SIZE_ALIGN(RExC_size);
8724         RExC_size += 2;
8725         /*
8726            We can't do this:
8727
8728            assert(2==regarglen[op]+1);
8729
8730            Anything larger than this has to allocate the extra amount.
8731            If we changed this to be:
8732
8733            RExC_size += (1 + regarglen[op]);
8734
8735            then it wouldn't matter. Its not clear what side effect
8736            might come from that so its not done so far.
8737            -- dmq
8738         */
8739         return(ret);
8740     }
8741     if (RExC_emit >= RExC_emit_bound)
8742         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8743
8744     NODE_ALIGN_FILL(ret);
8745     ptr = ret;
8746     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8747     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8748 #ifdef RE_TRACK_PATTERN_OFFSETS
8749     if (RExC_offsets) {         /* MJD */
8750         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8751               "reganode",
8752               __LINE__,
8753               PL_reg_name[op],
8754               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8755               "Overwriting end of array!\n" : "OK",
8756               (UV)(RExC_emit - RExC_emit_start),
8757               (UV)(RExC_parse - RExC_start),
8758               (UV)RExC_offsets[0]));
8759         Set_Cur_Node_Offset;
8760     }
8761 #endif
8762     RExC_emit = ptr;
8763     return(ret);
8764 }
8765
8766 /*
8767 - reguni - emit (if appropriate) a Unicode character
8768 */
8769 STATIC STRLEN
8770 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8771 {
8772     dVAR;
8773
8774     PERL_ARGS_ASSERT_REGUNI;
8775
8776     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8777 }
8778
8779 /*
8780 - reginsert - insert an operator in front of already-emitted operand
8781 *
8782 * Means relocating the operand.
8783 */
8784 STATIC void
8785 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8786 {
8787     dVAR;
8788     register regnode *src;
8789     register regnode *dst;
8790     register regnode *place;
8791     const int offset = regarglen[(U8)op];
8792     const int size = NODE_STEP_REGNODE + offset;
8793     GET_RE_DEBUG_FLAGS_DECL;
8794
8795     PERL_ARGS_ASSERT_REGINSERT;
8796     PERL_UNUSED_ARG(depth);
8797 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8798     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8799     if (SIZE_ONLY) {
8800         RExC_size += size;
8801         return;
8802     }
8803
8804     src = RExC_emit;
8805     RExC_emit += size;
8806     dst = RExC_emit;
8807     if (RExC_open_parens) {
8808         int paren;
8809         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8810         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8811             if ( RExC_open_parens[paren] >= opnd ) {
8812                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8813                 RExC_open_parens[paren] += size;
8814             } else {
8815                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8816             }
8817             if ( RExC_close_parens[paren] >= opnd ) {
8818                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8819                 RExC_close_parens[paren] += size;
8820             } else {
8821                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8822             }
8823         }
8824     }
8825
8826     while (src > opnd) {
8827         StructCopy(--src, --dst, regnode);
8828 #ifdef RE_TRACK_PATTERN_OFFSETS
8829         if (RExC_offsets) {     /* MJD 20010112 */
8830             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8831                   "reg_insert",
8832                   __LINE__,
8833                   PL_reg_name[op],
8834                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8835                     ? "Overwriting end of array!\n" : "OK",
8836                   (UV)(src - RExC_emit_start),
8837                   (UV)(dst - RExC_emit_start),
8838                   (UV)RExC_offsets[0]));
8839             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8840             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8841         }
8842 #endif
8843     }
8844
8845
8846     place = opnd;               /* Op node, where operand used to be. */
8847 #ifdef RE_TRACK_PATTERN_OFFSETS
8848     if (RExC_offsets) {         /* MJD */
8849         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8850               "reginsert",
8851               __LINE__,
8852               PL_reg_name[op],
8853               (UV)(place - RExC_emit_start) > RExC_offsets[0]
8854               ? "Overwriting end of array!\n" : "OK",
8855               (UV)(place - RExC_emit_start),
8856               (UV)(RExC_parse - RExC_start),
8857               (UV)RExC_offsets[0]));
8858         Set_Node_Offset(place, RExC_parse);
8859         Set_Node_Length(place, 1);
8860     }
8861 #endif
8862     src = NEXTOPER(place);
8863     FILL_ADVANCE_NODE(place, op);
8864     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
8865     Zero(src, offset, regnode);
8866 }
8867
8868 /*
8869 - regtail - set the next-pointer at the end of a node chain of p to val.
8870 - SEE ALSO: regtail_study
8871 */
8872 /* TODO: All three parms should be const */
8873 STATIC void
8874 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8875 {
8876     dVAR;
8877     register regnode *scan;
8878     GET_RE_DEBUG_FLAGS_DECL;
8879
8880     PERL_ARGS_ASSERT_REGTAIL;
8881 #ifndef DEBUGGING
8882     PERL_UNUSED_ARG(depth);
8883 #endif
8884
8885     if (SIZE_ONLY)
8886         return;
8887
8888     /* Find last node. */
8889     scan = p;
8890     for (;;) {
8891         regnode * const temp = regnext(scan);
8892         DEBUG_PARSE_r({
8893             SV * const mysv=sv_newmortal();
8894             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8895             regprop(RExC_rx, mysv, scan);
8896             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8897                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8898                     (temp == NULL ? "->" : ""),
8899                     (temp == NULL ? PL_reg_name[OP(val)] : "")
8900             );
8901         });
8902         if (temp == NULL)
8903             break;
8904         scan = temp;
8905     }
8906
8907     if (reg_off_by_arg[OP(scan)]) {
8908         ARG_SET(scan, val - scan);
8909     }
8910     else {
8911         NEXT_OFF(scan) = val - scan;
8912     }
8913 }
8914
8915 #ifdef DEBUGGING
8916 /*
8917 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8918 - Look for optimizable sequences at the same time.
8919 - currently only looks for EXACT chains.
8920
8921 This is expermental code. The idea is to use this routine to perform
8922 in place optimizations on branches and groups as they are constructed,
8923 with the long term intention of removing optimization from study_chunk so
8924 that it is purely analytical.
8925
8926 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8927 to control which is which.
8928
8929 */
8930 /* TODO: All four parms should be const */
8931
8932 STATIC U8
8933 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8934 {
8935     dVAR;
8936     register regnode *scan;
8937     U8 exact = PSEUDO;
8938 #ifdef EXPERIMENTAL_INPLACESCAN
8939     I32 min = 0;
8940 #endif
8941     GET_RE_DEBUG_FLAGS_DECL;
8942
8943     PERL_ARGS_ASSERT_REGTAIL_STUDY;
8944
8945
8946     if (SIZE_ONLY)
8947         return exact;
8948
8949     /* Find last node. */
8950
8951     scan = p;
8952     for (;;) {
8953         regnode * const temp = regnext(scan);
8954 #ifdef EXPERIMENTAL_INPLACESCAN
8955         if (PL_regkind[OP(scan)] == EXACT)
8956             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8957                 return EXACT;
8958 #endif
8959         if ( exact ) {
8960             switch (OP(scan)) {
8961                 case EXACT:
8962                 case EXACTF:
8963                 case EXACTFL:
8964                         if( exact == PSEUDO )
8965                             exact= OP(scan);
8966                         else if ( exact != OP(scan) )
8967                             exact= 0;
8968                 case NOTHING:
8969                     break;
8970                 default:
8971                     exact= 0;
8972             }
8973         }
8974         DEBUG_PARSE_r({
8975             SV * const mysv=sv_newmortal();
8976             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8977             regprop(RExC_rx, mysv, scan);
8978             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8979                 SvPV_nolen_const(mysv),
8980                 REG_NODE_NUM(scan),
8981                 PL_reg_name[exact]);
8982         });
8983         if (temp == NULL)
8984             break;
8985         scan = temp;
8986     }
8987     DEBUG_PARSE_r({
8988         SV * const mysv_val=sv_newmortal();
8989         DEBUG_PARSE_MSG("");
8990         regprop(RExC_rx, mysv_val, val);
8991         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8992                       SvPV_nolen_const(mysv_val),
8993                       (IV)REG_NODE_NUM(val),
8994                       (IV)(val - scan)
8995         );
8996     });
8997     if (reg_off_by_arg[OP(scan)]) {
8998         ARG_SET(scan, val - scan);
8999     }
9000     else {
9001         NEXT_OFF(scan) = val - scan;
9002     }
9003
9004     return exact;
9005 }
9006 #endif
9007
9008 /*
9009  - regcurly - a little FSA that accepts {\d+,?\d*}
9010  */
9011 #ifndef PERL_IN_XSUB_RE
9012 I32
9013 Perl_regcurly(register const char *s)
9014 {
9015     PERL_ARGS_ASSERT_REGCURLY;
9016
9017     if (*s++ != '{')
9018         return FALSE;
9019     if (!isDIGIT(*s))
9020         return FALSE;
9021     while (isDIGIT(*s))
9022         s++;
9023     if (*s == ',')
9024         s++;
9025     while (isDIGIT(*s))
9026         s++;
9027     if (*s != '}')
9028         return FALSE;
9029     return TRUE;
9030 }
9031 #endif
9032
9033 /*
9034  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9035  */
9036 #ifdef DEBUGGING
9037 static void
9038 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9039 {
9040     int bit;
9041     int set=0;
9042
9043     for (bit=0; bit<32; bit++) {
9044         if (flags & (1<<bit)) {
9045             if (!set++ && lead)
9046                 PerlIO_printf(Perl_debug_log, "%s",lead);
9047             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9048         }
9049     }
9050     if (lead)  {
9051         if (set)
9052             PerlIO_printf(Perl_debug_log, "\n");
9053         else
9054             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9055     }
9056 }
9057 #endif
9058
9059 void
9060 Perl_regdump(pTHX_ const regexp *r)
9061 {
9062 #ifdef DEBUGGING
9063     dVAR;
9064     SV * const sv = sv_newmortal();
9065     SV *dsv= sv_newmortal();
9066     RXi_GET_DECL(r,ri);
9067     GET_RE_DEBUG_FLAGS_DECL;
9068
9069     PERL_ARGS_ASSERT_REGDUMP;
9070
9071     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9072
9073     /* Header fields of interest. */
9074     if (r->anchored_substr) {
9075         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9076             RE_SV_DUMPLEN(r->anchored_substr), 30);
9077         PerlIO_printf(Perl_debug_log,
9078                       "anchored %s%s at %"IVdf" ",
9079                       s, RE_SV_TAIL(r->anchored_substr),
9080                       (IV)r->anchored_offset);
9081     } else if (r->anchored_utf8) {
9082         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9083             RE_SV_DUMPLEN(r->anchored_utf8), 30);
9084         PerlIO_printf(Perl_debug_log,
9085                       "anchored utf8 %s%s at %"IVdf" ",
9086                       s, RE_SV_TAIL(r->anchored_utf8),
9087                       (IV)r->anchored_offset);
9088     }
9089     if (r->float_substr) {
9090         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9091             RE_SV_DUMPLEN(r->float_substr), 30);
9092         PerlIO_printf(Perl_debug_log,
9093                       "floating %s%s at %"IVdf"..%"UVuf" ",
9094                       s, RE_SV_TAIL(r->float_substr),
9095                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9096     } else if (r->float_utf8) {
9097         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9098             RE_SV_DUMPLEN(r->float_utf8), 30);
9099         PerlIO_printf(Perl_debug_log,
9100                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9101                       s, RE_SV_TAIL(r->float_utf8),
9102                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9103     }
9104     if (r->check_substr || r->check_utf8)
9105         PerlIO_printf(Perl_debug_log,
9106                       (const char *)
9107                       (r->check_substr == r->float_substr
9108                        && r->check_utf8 == r->float_utf8
9109                        ? "(checking floating" : "(checking anchored"));
9110     if (r->extflags & RXf_NOSCAN)
9111         PerlIO_printf(Perl_debug_log, " noscan");
9112     if (r->extflags & RXf_CHECK_ALL)
9113         PerlIO_printf(Perl_debug_log, " isall");
9114     if (r->check_substr || r->check_utf8)
9115         PerlIO_printf(Perl_debug_log, ") ");
9116
9117     if (ri->regstclass) {
9118         regprop(r, sv, ri->regstclass);
9119         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9120     }
9121     if (r->extflags & RXf_ANCH) {
9122         PerlIO_printf(Perl_debug_log, "anchored");
9123         if (r->extflags & RXf_ANCH_BOL)
9124             PerlIO_printf(Perl_debug_log, "(BOL)");
9125         if (r->extflags & RXf_ANCH_MBOL)
9126             PerlIO_printf(Perl_debug_log, "(MBOL)");
9127         if (r->extflags & RXf_ANCH_SBOL)
9128             PerlIO_printf(Perl_debug_log, "(SBOL)");
9129         if (r->extflags & RXf_ANCH_GPOS)
9130             PerlIO_printf(Perl_debug_log, "(GPOS)");
9131         PerlIO_putc(Perl_debug_log, ' ');
9132     }
9133     if (r->extflags & RXf_GPOS_SEEN)
9134         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9135     if (r->intflags & PREGf_SKIP)
9136         PerlIO_printf(Perl_debug_log, "plus ");
9137     if (r->intflags & PREGf_IMPLICIT)
9138         PerlIO_printf(Perl_debug_log, "implicit ");
9139     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9140     if (r->extflags & RXf_EVAL_SEEN)
9141         PerlIO_printf(Perl_debug_log, "with eval ");
9142     PerlIO_printf(Perl_debug_log, "\n");
9143     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9144 #else
9145     PERL_ARGS_ASSERT_REGDUMP;
9146     PERL_UNUSED_CONTEXT;
9147     PERL_UNUSED_ARG(r);
9148 #endif  /* DEBUGGING */
9149 }
9150
9151 /*
9152 - regprop - printable representation of opcode
9153 */
9154 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9155 STMT_START { \
9156         if (do_sep) {                           \
9157             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9158             if (flags & ANYOF_INVERT)           \
9159                 /*make sure the invert info is in each */ \
9160                 sv_catpvs(sv, "^");             \
9161             do_sep = 0;                         \
9162         }                                       \
9163 } STMT_END
9164
9165 void
9166 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9167 {
9168 #ifdef DEBUGGING
9169     dVAR;
9170     register int k;
9171     RXi_GET_DECL(prog,progi);
9172     GET_RE_DEBUG_FLAGS_DECL;
9173
9174     PERL_ARGS_ASSERT_REGPROP;
9175
9176     sv_setpvs(sv, "");
9177
9178     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9179         /* It would be nice to FAIL() here, but this may be called from
9180            regexec.c, and it would be hard to supply pRExC_state. */
9181         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9182     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9183
9184     k = PL_regkind[OP(o)];
9185
9186     if (k == EXACT) {
9187         sv_catpvs(sv, " ");
9188         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9189          * is a crude hack but it may be the best for now since
9190          * we have no flag "this EXACTish node was UTF-8"
9191          * --jhi */
9192         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9193                   PERL_PV_ESCAPE_UNI_DETECT |
9194                   PERL_PV_PRETTY_ELLIPSES   |
9195                   PERL_PV_PRETTY_LTGT       |
9196                   PERL_PV_PRETTY_NOCLEAR
9197                   );
9198     } else if (k == TRIE) {
9199         /* print the details of the trie in dumpuntil instead, as
9200          * progi->data isn't available here */
9201         const char op = OP(o);
9202         const U32 n = ARG(o);
9203         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9204                (reg_ac_data *)progi->data->data[n] :
9205                NULL;
9206         const reg_trie_data * const trie
9207             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9208
9209         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9210         DEBUG_TRIE_COMPILE_r(
9211             Perl_sv_catpvf(aTHX_ sv,
9212                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9213                 (UV)trie->startstate,
9214                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9215                 (UV)trie->wordcount,
9216                 (UV)trie->minlen,
9217                 (UV)trie->maxlen,
9218                 (UV)TRIE_CHARCOUNT(trie),
9219                 (UV)trie->uniquecharcount
9220             )
9221         );
9222         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9223             int i;
9224             int rangestart = -1;
9225             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9226             sv_catpvs(sv, "[");
9227             for (i = 0; i <= 256; i++) {
9228                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9229                     if (rangestart == -1)
9230                         rangestart = i;
9231                 } else if (rangestart != -1) {
9232                     if (i <= rangestart + 3)
9233                         for (; rangestart < i; rangestart++)
9234                             put_byte(sv, rangestart);
9235                     else {
9236                         put_byte(sv, rangestart);
9237                         sv_catpvs(sv, "-");
9238                         put_byte(sv, i - 1);
9239                     }
9240                     rangestart = -1;
9241                 }
9242             }
9243             sv_catpvs(sv, "]");
9244         }
9245
9246     } else if (k == CURLY) {
9247         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9248             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9249         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9250     }
9251     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9252         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9253     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9254         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9255         if ( RXp_PAREN_NAMES(prog) ) {
9256             if ( k != REF || OP(o) < NREF) {
9257                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9258                 SV **name= av_fetch(list, ARG(o), 0 );
9259                 if (name)
9260                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9261             }
9262             else {
9263                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9264                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9265                 I32 *nums=(I32*)SvPVX(sv_dat);
9266                 SV **name= av_fetch(list, nums[0], 0 );
9267                 I32 n;
9268                 if (name) {
9269                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9270                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9271                                     (n ? "," : ""), (IV)nums[n]);
9272                     }
9273                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9274                 }
9275             }
9276         }
9277     } else if (k == GOSUB)
9278         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9279     else if (k == VERB) {
9280         if (!o->flags)
9281             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9282                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9283     } else if (k == LOGICAL)
9284         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9285     else if (k == FOLDCHAR)
9286         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9287     else if (k == ANYOF) {
9288         int i, rangestart = -1;
9289         const U8 flags = ANYOF_FLAGS(o);
9290         int do_sep = 0;
9291
9292         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9293         static const char * const anyofs[] = {
9294             "\\w",
9295             "\\W",
9296             "\\s",
9297             "\\S",
9298             "\\d",
9299             "\\D",
9300             "[:alnum:]",
9301             "[:^alnum:]",
9302             "[:alpha:]",
9303             "[:^alpha:]",
9304             "[:ascii:]",
9305             "[:^ascii:]",
9306             "[:cntrl:]",
9307             "[:^cntrl:]",
9308             "[:graph:]",
9309             "[:^graph:]",
9310             "[:lower:]",
9311             "[:^lower:]",
9312             "[:print:]",
9313             "[:^print:]",
9314             "[:punct:]",
9315             "[:^punct:]",
9316             "[:upper:]",
9317             "[:^upper:]",
9318             "[:xdigit:]",
9319             "[:^xdigit:]",
9320             "[:space:]",
9321             "[:^space:]",
9322             "[:blank:]",
9323             "[:^blank:]"
9324         };
9325
9326         if (flags & ANYOF_LOCALE)
9327             sv_catpvs(sv, "{loc}");
9328         if (flags & ANYOF_FOLD)
9329             sv_catpvs(sv, "{i}");
9330         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9331         if (flags & ANYOF_INVERT)
9332             sv_catpvs(sv, "^");
9333
9334         /* output what the standard cp 0-255 bitmap matches */
9335         for (i = 0; i <= 256; i++) {
9336             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9337                 if (rangestart == -1)
9338                     rangestart = i;
9339             } else if (rangestart != -1) {
9340                 if (i <= rangestart + 3)
9341                     for (; rangestart < i; rangestart++)
9342                         put_byte(sv, rangestart);
9343                 else {
9344                     put_byte(sv, rangestart);
9345                     sv_catpvs(sv, "-");
9346                     put_byte(sv, i - 1);
9347                 }
9348                 do_sep = 1;
9349                 rangestart = -1;
9350             }
9351         }
9352
9353         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9354         /* output any special charclass tests (used mostly under use locale) */
9355         if (o->flags & ANYOF_CLASS)
9356             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9357                 if (ANYOF_CLASS_TEST(o,i)) {
9358                     sv_catpv(sv, anyofs[i]);
9359                     do_sep = 1;
9360                 }
9361
9362         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9363
9364         /* output information about the unicode matching */
9365         if (flags & ANYOF_UNICODE)
9366             sv_catpvs(sv, "{unicode}");
9367         else if (flags & ANYOF_UNICODE_ALL)
9368             sv_catpvs(sv, "{unicode_all}");
9369
9370         {
9371             SV *lv;
9372             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9373
9374             if (lv) {
9375                 if (sw) {
9376                     U8 s[UTF8_MAXBYTES_CASE+1];
9377
9378                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9379                         uvchr_to_utf8(s, i);
9380
9381                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9382                             if (rangestart == -1)
9383                                 rangestart = i;
9384                         } else if (rangestart != -1) {
9385                             if (i <= rangestart + 3)
9386                                 for (; rangestart < i; rangestart++) {
9387                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9388                                     U8 *p;
9389                                     for(p = s; p < e; p++)
9390                                         put_byte(sv, *p);
9391                                 }
9392                             else {
9393                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9394                                 U8 *p;
9395                                 for (p = s; p < e; p++)
9396                                     put_byte(sv, *p);
9397                                 sv_catpvs(sv, "-");
9398                                 e = uvchr_to_utf8(s, i-1);
9399                                 for (p = s; p < e; p++)
9400                                     put_byte(sv, *p);
9401                                 }
9402                                 rangestart = -1;
9403                             }
9404                         }
9405
9406                     sv_catpvs(sv, "..."); /* et cetera */
9407                 }
9408
9409                 {
9410                     char *s = savesvpv(lv);
9411                     char * const origs = s;
9412
9413                     while (*s && *s != '\n')
9414                         s++;
9415
9416                     if (*s == '\n') {
9417                         const char * const t = ++s;
9418
9419                         while (*s) {
9420                             if (*s == '\n')
9421                                 *s = ' ';
9422                             s++;
9423                         }
9424                         if (s[-1] == ' ')
9425                             s[-1] = 0;
9426
9427                         sv_catpv(sv, t);
9428                     }
9429
9430                     Safefree(origs);
9431                 }
9432             }
9433         }
9434
9435         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9436     }
9437     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9438         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9439 #else
9440     PERL_UNUSED_CONTEXT;
9441     PERL_UNUSED_ARG(sv);
9442     PERL_UNUSED_ARG(o);
9443     PERL_UNUSED_ARG(prog);
9444 #endif  /* DEBUGGING */
9445 }
9446
9447 SV *
9448 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9449 {                               /* Assume that RE_INTUIT is set */
9450     dVAR;
9451     struct regexp *const prog = (struct regexp *)SvANY(r);
9452     GET_RE_DEBUG_FLAGS_DECL;
9453
9454     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9455     PERL_UNUSED_CONTEXT;
9456
9457     DEBUG_COMPILE_r(
9458         {
9459             const char * const s = SvPV_nolen_const(prog->check_substr
9460                       ? prog->check_substr : prog->check_utf8);
9461
9462             if (!PL_colorset) reginitcolors();
9463             PerlIO_printf(Perl_debug_log,
9464                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9465                       PL_colors[4],
9466                       prog->check_substr ? "" : "utf8 ",
9467                       PL_colors[5],PL_colors[0],
9468                       s,
9469                       PL_colors[1],
9470                       (strlen(s) > 60 ? "..." : ""));
9471         } );
9472
9473     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9474 }
9475
9476 /*
9477    pregfree()
9478
9479    handles refcounting and freeing the perl core regexp structure. When
9480    it is necessary to actually free the structure the first thing it
9481    does is call the 'free' method of the regexp_engine associated to to
9482    the regexp, allowing the handling of the void *pprivate; member
9483    first. (This routine is not overridable by extensions, which is why
9484    the extensions free is called first.)
9485
9486    See regdupe and regdupe_internal if you change anything here.
9487 */
9488 #ifndef PERL_IN_XSUB_RE
9489 void
9490 Perl_pregfree(pTHX_ REGEXP *r)
9491 {
9492     SvREFCNT_dec(r);
9493 }
9494
9495 void
9496 Perl_pregfree2(pTHX_ REGEXP *rx)
9497 {
9498     dVAR;
9499     struct regexp *const r = (struct regexp *)SvANY(rx);
9500     GET_RE_DEBUG_FLAGS_DECL;
9501
9502     PERL_ARGS_ASSERT_PREGFREE2;
9503
9504     if (r->mother_re) {
9505         ReREFCNT_dec(r->mother_re);
9506     } else {
9507         CALLREGFREE_PVT(rx); /* free the private data */
9508         SvREFCNT_dec(RXp_PAREN_NAMES(r));
9509     }
9510     if (r->substrs) {
9511         SvREFCNT_dec(r->anchored_substr);
9512         SvREFCNT_dec(r->anchored_utf8);
9513         SvREFCNT_dec(r->float_substr);
9514         SvREFCNT_dec(r->float_utf8);
9515         Safefree(r->substrs);
9516     }
9517     RX_MATCH_COPY_FREE(rx);
9518 #ifdef PERL_OLD_COPY_ON_WRITE
9519     SvREFCNT_dec(r->saved_copy);
9520 #endif
9521     Safefree(r->offs);
9522 }
9523
9524 /*  reg_temp_copy()
9525
9526     This is a hacky workaround to the structural issue of match results
9527     being stored in the regexp structure which is in turn stored in
9528     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9529     could be PL_curpm in multiple contexts, and could require multiple
9530     result sets being associated with the pattern simultaneously, such
9531     as when doing a recursive match with (??{$qr})
9532
9533     The solution is to make a lightweight copy of the regexp structure
9534     when a qr// is returned from the code executed by (??{$qr}) this
9535     lightweight copy doesnt actually own any of its data except for
9536     the starp/end and the actual regexp structure itself.
9537
9538 */
9539
9540
9541 REGEXP *
9542 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9543 {
9544     struct regexp *ret;
9545     struct regexp *const r = (struct regexp *)SvANY(rx);
9546     register const I32 npar = r->nparens+1;
9547
9548     PERL_ARGS_ASSERT_REG_TEMP_COPY;
9549
9550     if (!ret_x)
9551         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9552     ret = (struct regexp *)SvANY(ret_x);
9553
9554     (void)ReREFCNT_inc(rx);
9555     /* We can take advantage of the existing "copied buffer" mechanism in SVs
9556        by pointing directly at the buffer, but flagging that the allocated
9557        space in the copy is zero. As we've just done a struct copy, it's now
9558        a case of zero-ing that, rather than copying the current length.  */
9559     SvPV_set(ret_x, RX_WRAPPED(rx));
9560     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9561     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9562            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9563     SvLEN_set(ret_x, 0);
9564     SvSTASH_set(ret_x, NULL);
9565     SvMAGIC_set(ret_x, NULL);
9566     Newx(ret->offs, npar, regexp_paren_pair);
9567     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9568     if (r->substrs) {
9569         Newx(ret->substrs, 1, struct reg_substr_data);
9570         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9571
9572         SvREFCNT_inc_void(ret->anchored_substr);
9573         SvREFCNT_inc_void(ret->anchored_utf8);
9574         SvREFCNT_inc_void(ret->float_substr);
9575         SvREFCNT_inc_void(ret->float_utf8);
9576
9577         /* check_substr and check_utf8, if non-NULL, point to either their
9578            anchored or float namesakes, and don't hold a second reference.  */
9579     }
9580     RX_MATCH_COPIED_off(ret_x);
9581 #ifdef PERL_OLD_COPY_ON_WRITE
9582     ret->saved_copy = NULL;
9583 #endif
9584     ret->mother_re = rx;
9585
9586     return ret_x;
9587 }
9588 #endif
9589
9590 /* regfree_internal()
9591
9592    Free the private data in a regexp. This is overloadable by
9593    extensions. Perl takes care of the regexp structure in pregfree(),
9594    this covers the *pprivate pointer which technically perldoesnt
9595    know about, however of course we have to handle the
9596    regexp_internal structure when no extension is in use.
9597
9598    Note this is called before freeing anything in the regexp
9599    structure.
9600  */
9601
9602 void
9603 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9604 {
9605     dVAR;
9606     struct regexp *const r = (struct regexp *)SvANY(rx);
9607     RXi_GET_DECL(r,ri);
9608     GET_RE_DEBUG_FLAGS_DECL;
9609
9610     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9611
9612     DEBUG_COMPILE_r({
9613         if (!PL_colorset)
9614             reginitcolors();
9615         {
9616             SV *dsv= sv_newmortal();
9617             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9618                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9619             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9620                 PL_colors[4],PL_colors[5],s);
9621         }
9622     });
9623 #ifdef RE_TRACK_PATTERN_OFFSETS
9624     if (ri->u.offsets)
9625         Safefree(ri->u.offsets);             /* 20010421 MJD */
9626 #endif
9627     if (ri->data) {
9628         int n = ri->data->count;
9629         PAD* new_comppad = NULL;
9630         PAD* old_comppad;
9631         PADOFFSET refcnt;
9632
9633         while (--n >= 0) {
9634           /* If you add a ->what type here, update the comment in regcomp.h */
9635             switch (ri->data->what[n]) {
9636             case 'a':
9637             case 's':
9638             case 'S':
9639             case 'u':
9640                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9641                 break;
9642             case 'f':
9643                 Safefree(ri->data->data[n]);
9644                 break;
9645             case 'p':
9646                 new_comppad = MUTABLE_AV(ri->data->data[n]);
9647                 break;
9648             case 'o':
9649                 if (new_comppad == NULL)
9650                     Perl_croak(aTHX_ "panic: pregfree comppad");
9651                 PAD_SAVE_LOCAL(old_comppad,
9652                     /* Watch out for global destruction's random ordering. */
9653                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9654                 );
9655                 OP_REFCNT_LOCK;
9656                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9657                 OP_REFCNT_UNLOCK;
9658                 if (!refcnt)
9659                     op_free((OP_4tree*)ri->data->data[n]);
9660
9661                 PAD_RESTORE_LOCAL(old_comppad);
9662                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9663                 new_comppad = NULL;
9664                 break;
9665             case 'n':
9666                 break;
9667             case 'T':
9668                 { /* Aho Corasick add-on structure for a trie node.
9669                      Used in stclass optimization only */
9670                     U32 refcount;
9671                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9672                     OP_REFCNT_LOCK;
9673                     refcount = --aho->refcount;
9674                     OP_REFCNT_UNLOCK;
9675                     if ( !refcount ) {
9676                         PerlMemShared_free(aho->states);
9677                         PerlMemShared_free(aho->fail);
9678                          /* do this last!!!! */
9679                         PerlMemShared_free(ri->data->data[n]);
9680                         PerlMemShared_free(ri->regstclass);
9681                     }
9682                 }
9683                 break;
9684             case 't':
9685                 {
9686                     /* trie structure. */
9687                     U32 refcount;
9688                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9689                     OP_REFCNT_LOCK;
9690                     refcount = --trie->refcount;
9691                     OP_REFCNT_UNLOCK;
9692                     if ( !refcount ) {
9693                         PerlMemShared_free(trie->charmap);
9694                         PerlMemShared_free(trie->states);
9695                         PerlMemShared_free(trie->trans);
9696                         if (trie->bitmap)
9697                             PerlMemShared_free(trie->bitmap);
9698                         if (trie->jump)
9699                             PerlMemShared_free(trie->jump);
9700                         PerlMemShared_free(trie->wordinfo);
9701                         /* do this last!!!! */
9702                         PerlMemShared_free(ri->data->data[n]);
9703                     }
9704                 }
9705                 break;
9706             default:
9707                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9708             }
9709         }
9710         Safefree(ri->data->what);
9711         Safefree(ri->data);
9712     }
9713
9714     Safefree(ri);
9715 }
9716
9717 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
9718 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
9719 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
9720
9721 /*
9722    re_dup - duplicate a regexp.
9723
9724    This routine is expected to clone a given regexp structure. It is only
9725    compiled under USE_ITHREADS.
9726
9727    After all of the core data stored in struct regexp is duplicated
9728    the regexp_engine.dupe method is used to copy any private data
9729    stored in the *pprivate pointer. This allows extensions to handle
9730    any duplication it needs to do.
9731
9732    See pregfree() and regfree_internal() if you change anything here.
9733 */
9734 #if defined(USE_ITHREADS)
9735 #ifndef PERL_IN_XSUB_RE
9736 void
9737 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9738 {
9739     dVAR;
9740     I32 npar;
9741     const struct regexp *r = (const struct regexp *)SvANY(sstr);
9742     struct regexp *ret = (struct regexp *)SvANY(dstr);
9743
9744     PERL_ARGS_ASSERT_RE_DUP_GUTS;
9745
9746     npar = r->nparens+1;
9747     Newx(ret->offs, npar, regexp_paren_pair);
9748     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9749     if(ret->swap) {
9750         /* no need to copy these */
9751         Newx(ret->swap, npar, regexp_paren_pair);
9752     }
9753
9754     if (ret->substrs) {
9755         /* Do it this way to avoid reading from *r after the StructCopy().
9756            That way, if any of the sv_dup_inc()s dislodge *r from the L1
9757            cache, it doesn't matter.  */
9758         const bool anchored = r->check_substr
9759             ? r->check_substr == r->anchored_substr
9760             : r->check_utf8 == r->anchored_utf8;
9761         Newx(ret->substrs, 1, struct reg_substr_data);
9762         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9763
9764         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9765         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9766         ret->float_substr = sv_dup_inc(ret->float_substr, param);
9767         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9768
9769         /* check_substr and check_utf8, if non-NULL, point to either their
9770            anchored or float namesakes, and don't hold a second reference.  */
9771
9772         if (ret->check_substr) {
9773             if (anchored) {
9774                 assert(r->check_utf8 == r->anchored_utf8);
9775                 ret->check_substr = ret->anchored_substr;
9776                 ret->check_utf8 = ret->anchored_utf8;
9777             } else {
9778                 assert(r->check_substr == r->float_substr);
9779                 assert(r->check_utf8 == r->float_utf8);
9780                 ret->check_substr = ret->float_substr;
9781                 ret->check_utf8 = ret->float_utf8;
9782             }
9783         } else if (ret->check_utf8) {
9784             if (anchored) {
9785                 ret->check_utf8 = ret->anchored_utf8;
9786             } else {
9787                 ret->check_utf8 = ret->float_utf8;
9788             }
9789         }
9790     }
9791
9792     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9793
9794     if (ret->pprivate)
9795         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9796
9797     if (RX_MATCH_COPIED(dstr))
9798         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
9799     else
9800         ret->subbeg = NULL;
9801 #ifdef PERL_OLD_COPY_ON_WRITE
9802     ret->saved_copy = NULL;
9803 #endif
9804
9805     if (ret->mother_re) {
9806         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9807             /* Our storage points directly to our mother regexp, but that's
9808                1: a buffer in a different thread
9809                2: something we no longer hold a reference on
9810                so we need to copy it locally.  */
9811             /* Note we need to sue SvCUR() on our mother_re, because it, in
9812                turn, may well be pointing to its own mother_re.  */
9813             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9814                                    SvCUR(ret->mother_re)+1));
9815             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9816         }
9817         ret->mother_re      = NULL;
9818     }
9819     ret->gofs = 0;
9820 }
9821 #endif /* PERL_IN_XSUB_RE */
9822
9823 /*
9824    regdupe_internal()
9825
9826    This is the internal complement to regdupe() which is used to copy
9827    the structure pointed to by the *pprivate pointer in the regexp.
9828    This is the core version of the extension overridable cloning hook.
9829    The regexp structure being duplicated will be copied by perl prior
9830    to this and will be provided as the regexp *r argument, however
9831    with the /old/ structures pprivate pointer value. Thus this routine
9832    may override any copying normally done by perl.
9833
9834    It returns a pointer to the new regexp_internal structure.
9835 */
9836
9837 void *
9838 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9839 {
9840     dVAR;
9841     struct regexp *const r = (struct regexp *)SvANY(rx);
9842     regexp_internal *reti;
9843     int len, npar;
9844     RXi_GET_DECL(r,ri);
9845
9846     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9847
9848     npar = r->nparens+1;
9849     len = ProgLen(ri);
9850
9851     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9852     Copy(ri->program, reti->program, len+1, regnode);
9853
9854
9855     reti->regstclass = NULL;
9856
9857     if (ri->data) {
9858         struct reg_data *d;
9859         const int count = ri->data->count;
9860         int i;
9861
9862         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9863                 char, struct reg_data);
9864         Newx(d->what, count, U8);
9865
9866         d->count = count;
9867         for (i = 0; i < count; i++) {
9868             d->what[i] = ri->data->what[i];
9869             switch (d->what[i]) {
9870                 /* legal options are one of: sSfpontTua
9871                    see also regcomp.h and pregfree() */
9872             case 'a': /* actually an AV, but the dup function is identical.  */
9873             case 's':
9874             case 'S':
9875             case 'p': /* actually an AV, but the dup function is identical.  */
9876             case 'u': /* actually an HV, but the dup function is identical.  */
9877                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9878                 break;
9879             case 'f':
9880                 /* This is cheating. */
9881                 Newx(d->data[i], 1, struct regnode_charclass_class);
9882                 StructCopy(ri->data->data[i], d->data[i],
9883                             struct regnode_charclass_class);
9884                 reti->regstclass = (regnode*)d->data[i];
9885                 break;
9886             case 'o':
9887                 /* Compiled op trees are readonly and in shared memory,
9888                    and can thus be shared without duplication. */
9889                 OP_REFCNT_LOCK;
9890                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9891                 OP_REFCNT_UNLOCK;
9892                 break;
9893             case 'T':
9894                 /* Trie stclasses are readonly and can thus be shared
9895                  * without duplication. We free the stclass in pregfree
9896                  * when the corresponding reg_ac_data struct is freed.
9897                  */
9898                 reti->regstclass= ri->regstclass;
9899                 /* Fall through */
9900             case 't':
9901                 OP_REFCNT_LOCK;
9902                 ((reg_trie_data*)ri->data->data[i])->refcount++;
9903                 OP_REFCNT_UNLOCK;
9904                 /* Fall through */
9905             case 'n':
9906                 d->data[i] = ri->data->data[i];
9907                 break;
9908             default:
9909                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9910             }
9911         }
9912
9913         reti->data = d;
9914     }
9915     else
9916         reti->data = NULL;
9917
9918     reti->name_list_idx = ri->name_list_idx;
9919
9920 #ifdef RE_TRACK_PATTERN_OFFSETS
9921     if (ri->u.offsets) {
9922         Newx(reti->u.offsets, 2*len+1, U32);
9923         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9924     }
9925 #else
9926     SetProgLen(reti,len);
9927 #endif
9928
9929     return (void*)reti;
9930 }
9931
9932 #endif    /* USE_ITHREADS */
9933
9934 #ifndef PERL_IN_XSUB_RE
9935
9936 /*
9937  - regnext - dig the "next" pointer out of a node
9938  */
9939 regnode *
9940 Perl_regnext(pTHX_ register regnode *p)
9941 {
9942     dVAR;
9943     register I32 offset;
9944
9945     if (!p)
9946         return(NULL);
9947
9948     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
9949         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
9950     }
9951
9952     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9953     if (offset == 0)
9954         return(NULL);
9955
9956     return(p+offset);
9957 }
9958 #endif
9959
9960 STATIC void
9961 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9962 {
9963     va_list args;
9964     STRLEN l1 = strlen(pat1);
9965     STRLEN l2 = strlen(pat2);
9966     char buf[512];
9967     SV *msv;
9968     const char *message;
9969
9970     PERL_ARGS_ASSERT_RE_CROAK2;
9971
9972     if (l1 > 510)
9973         l1 = 510;
9974     if (l1 + l2 > 510)
9975         l2 = 510 - l1;
9976     Copy(pat1, buf, l1 , char);
9977     Copy(pat2, buf + l1, l2 , char);
9978     buf[l1 + l2] = '\n';
9979     buf[l1 + l2 + 1] = '\0';
9980 #ifdef I_STDARG
9981     /* ANSI variant takes additional second argument */
9982     va_start(args, pat2);
9983 #else
9984     va_start(args);
9985 #endif
9986     msv = vmess(buf, &args);
9987     va_end(args);
9988     message = SvPV_const(msv,l1);
9989     if (l1 > 512)
9990         l1 = 512;
9991     Copy(message, buf, l1 , char);
9992     buf[l1-1] = '\0';                   /* Overwrite \n */
9993     Perl_croak(aTHX_ "%s", buf);
9994 }
9995
9996 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
9997
9998 #ifndef PERL_IN_XSUB_RE
9999 void
10000 Perl_save_re_context(pTHX)
10001 {
10002     dVAR;
10003
10004     struct re_save_state *state;
10005
10006     SAVEVPTR(PL_curcop);
10007     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10008
10009     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10010     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10011     SSPUSHUV(SAVEt_RE_STATE);
10012
10013     Copy(&PL_reg_state, state, 1, struct re_save_state);
10014
10015     PL_reg_start_tmp = 0;
10016     PL_reg_start_tmpl = 0;
10017     PL_reg_oldsaved = NULL;
10018     PL_reg_oldsavedlen = 0;
10019     PL_reg_maxiter = 0;
10020     PL_reg_leftiter = 0;
10021     PL_reg_poscache = NULL;
10022     PL_reg_poscache_size = 0;
10023 #ifdef PERL_OLD_COPY_ON_WRITE
10024     PL_nrs = NULL;
10025 #endif
10026
10027     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10028     if (PL_curpm) {
10029         const REGEXP * const rx = PM_GETRE(PL_curpm);
10030         if (rx) {
10031             U32 i;
10032             for (i = 1; i <= RX_NPARENS(rx); i++) {
10033                 char digits[TYPE_CHARS(long)];
10034                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10035                 GV *const *const gvp
10036                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10037
10038                 if (gvp) {
10039                     GV * const gv = *gvp;
10040                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10041                         save_scalar(gv);
10042                 }
10043             }
10044         }
10045     }
10046 }
10047 #endif
10048
10049 static void
10050 clear_re(pTHX_ void *r)
10051 {
10052     dVAR;
10053     ReREFCNT_dec((REGEXP *)r);
10054 }
10055
10056 #ifdef DEBUGGING
10057
10058 STATIC void
10059 S_put_byte(pTHX_ SV *sv, int c)
10060 {
10061     PERL_ARGS_ASSERT_PUT_BYTE;
10062
10063     /* Our definition of isPRINT() ignores locales, so only bytes that are
10064        not part of UTF-8 are considered printable. I assume that the same
10065        holds for UTF-EBCDIC.
10066        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10067        which Wikipedia says:
10068
10069        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10070        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10071        identical, to the ASCII delete (DEL) or rubout control character.
10072        ) So the old condition can be simplified to !isPRINT(c)  */
10073     if (!isPRINT(c))
10074         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
10075     else {
10076         const char string = c;
10077         if (c == '-' || c == ']' || c == '\\' || c == '^')
10078             sv_catpvs(sv, "\\");
10079         sv_catpvn(sv, &string, 1);
10080     }
10081 }
10082
10083
10084 #define CLEAR_OPTSTART \
10085     if (optstart) STMT_START { \
10086             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10087             optstart=NULL; \
10088     } STMT_END
10089
10090 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10091
10092 STATIC const regnode *
10093 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10094             const regnode *last, const regnode *plast,
10095             SV* sv, I32 indent, U32 depth)
10096 {
10097     dVAR;
10098     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
10099     register const regnode *next;
10100     const regnode *optstart= NULL;
10101
10102     RXi_GET_DECL(r,ri);
10103     GET_RE_DEBUG_FLAGS_DECL;
10104
10105     PERL_ARGS_ASSERT_DUMPUNTIL;
10106
10107 #ifdef DEBUG_DUMPUNTIL
10108     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10109         last ? last-start : 0,plast ? plast-start : 0);
10110 #endif
10111
10112     if (plast && plast < last)
10113         last= plast;
10114
10115     while (PL_regkind[op] != END && (!last || node < last)) {
10116         /* While that wasn't END last time... */
10117         NODE_ALIGN(node);
10118         op = OP(node);
10119         if (op == CLOSE || op == WHILEM)
10120             indent--;
10121         next = regnext((regnode *)node);
10122
10123         /* Where, what. */
10124         if (OP(node) == OPTIMIZED) {
10125             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10126                 optstart = node;
10127             else
10128                 goto after_print;
10129         } else
10130             CLEAR_OPTSTART;
10131
10132         regprop(r, sv, node);
10133         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10134                       (int)(2*indent + 1), "", SvPVX_const(sv));
10135
10136         if (OP(node) != OPTIMIZED) {
10137             if (next == NULL)           /* Next ptr. */
10138                 PerlIO_printf(Perl_debug_log, " (0)");
10139             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10140                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10141             else
10142                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10143             (void)PerlIO_putc(Perl_debug_log, '\n');
10144         }
10145
10146       after_print:
10147         if (PL_regkind[(U8)op] == BRANCHJ) {
10148             assert(next);
10149             {
10150                 register const regnode *nnode = (OP(next) == LONGJMP
10151                                              ? regnext((regnode *)next)
10152                                              : next);
10153                 if (last && nnode > last)
10154                     nnode = last;
10155                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10156             }
10157         }
10158         else if (PL_regkind[(U8)op] == BRANCH) {
10159             assert(next);
10160             DUMPUNTIL(NEXTOPER(node), next);
10161         }
10162         else if ( PL_regkind[(U8)op]  == TRIE ) {
10163             const regnode *this_trie = node;
10164             const char op = OP(node);
10165             const U32 n = ARG(node);
10166             const reg_ac_data * const ac = op>=AHOCORASICK ?
10167                (reg_ac_data *)ri->data->data[n] :
10168                NULL;
10169             const reg_trie_data * const trie =
10170                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10171 #ifdef DEBUGGING
10172             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10173 #endif
10174             const regnode *nextbranch= NULL;
10175             I32 word_idx;
10176             sv_setpvs(sv, "");
10177             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10178                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10179
10180                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10181                    (int)(2*(indent+3)), "",
10182                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10183                             PL_colors[0], PL_colors[1],
10184                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10185                             PERL_PV_PRETTY_ELLIPSES    |
10186                             PERL_PV_PRETTY_LTGT
10187                             )
10188                             : "???"
10189                 );
10190                 if (trie->jump) {
10191                     U16 dist= trie->jump[word_idx+1];
10192                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10193                                   (UV)((dist ? this_trie + dist : next) - start));
10194                     if (dist) {
10195                         if (!nextbranch)
10196                             nextbranch= this_trie + trie->jump[0];
10197                         DUMPUNTIL(this_trie + dist, nextbranch);
10198                     }
10199                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10200                         nextbranch= regnext((regnode *)nextbranch);
10201                 } else {
10202                     PerlIO_printf(Perl_debug_log, "\n");
10203                 }
10204             }
10205             if (last && next > last)
10206                 node= last;
10207             else
10208                 node= next;
10209         }
10210         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10211             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10212                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10213         }
10214         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10215             assert(next);
10216             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10217         }
10218         else if ( op == PLUS || op == STAR) {
10219             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10220         }
10221         else if (op == ANYOF) {
10222             /* arglen 1 + class block */
10223             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10224                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10225             node = NEXTOPER(node);
10226         }
10227         else if (PL_regkind[(U8)op] == EXACT) {
10228             /* Literal string, where present. */
10229             node += NODE_SZ_STR(node) - 1;
10230             node = NEXTOPER(node);
10231         }
10232         else {
10233             node = NEXTOPER(node);
10234             node += regarglen[(U8)op];
10235         }
10236         if (op == CURLYX || op == OPEN)
10237             indent++;
10238     }
10239     CLEAR_OPTSTART;
10240 #ifdef DEBUG_DUMPUNTIL
10241     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10242 #endif
10243     return node;
10244 }
10245
10246 #endif  /* DEBUGGING */
10247
10248 /*
10249  * Local variables:
10250  * c-indentation-style: bsd
10251  * c-basic-offset: 4
10252  * indent-tabs-mode: t
10253  * End:
10254  *
10255  * ex: set ts=8 sts=4 sw=4 noet:
10256  */