regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to  pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #ifdef op
  89 #undef op
  90 #endif /* op */
  91
  92 #ifdef MSDOS
  93 #  if defined(BUGGY_MSC6)
  94  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  95 #    pragma optimize("a",off)
  96  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  97 #    pragma optimize("w",on )
  98 #  endif /* BUGGY_MSC6 */
  99 #endif /* MSDOS */
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105 typedef struct RExC_state_t {
 106     U32         flags;                  /* are we folding, multilining? */
 107     char        *precomp;               /* uncompiled string. */
 108     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 109     regexp      *rx;                    /* perl core regexp structure */
 110     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 111     char        *start;                 /* Start of input for compile */
 112     char        *end;                   /* End of input for compile */
 113     char        *parse;                 /* Input-scan pointer. */
 114     I32         whilem_seen;            /* number of WHILEM in this expr */
 115     regnode     *emit_start;            /* Start of emitted-code area */
 116     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 117     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 118     I32         naughty;                /* How bad is this pattern? */
 119     I32         sawback;                /* Did we see \1, ...? */
 120     U32         seen;
 121     I32         size;                   /* Code size. */
 122     I32         npar;                   /* Capture buffer count, (OPEN). */
 123     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 124     I32         nestroot;               /* root parens we are in - used by accept */
 125     I32         extralen;
 126     I32         seen_zerolen;
 127     I32         seen_evals;
 128     regnode     **open_parens;          /* pointers to open parens */
 129     regnode     **close_parens;         /* pointers to close parens */
 130     regnode     *opend;                 /* END node in program */
 131     I32         utf8;           /* whether the pattern is utf8 or not */
 132     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 133                                 /* XXX use this for future optimisation of case
 134                                  * where pattern must be upgraded to utf8. */
 135     HV          *charnames;             /* cache of named sequences */
 136     HV          *paren_names;           /* Paren names */
 137
 138     regnode     **recurse;              /* Recurse regops */
 139     I32         recurse_count;          /* Number of recurse regops */
 140 #if ADD_TO_REGEXEC
 141     char        *starttry;              /* -Dr: where regtry was called. */
 142 #define RExC_starttry   (pRExC_state->starttry)
 143 #endif
 144 #ifdef DEBUGGING
 145     const char  *lastparse;
 146     I32         lastnum;
 147     AV          *paren_name_list;       /* idx -> name */
 148 #define RExC_lastparse  (pRExC_state->lastparse)
 149 #define RExC_lastnum    (pRExC_state->lastnum)
 150 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 151 #endif
 152 } RExC_state_t;
 153
 154 #define RExC_flags      (pRExC_state->flags)
 155 #define RExC_precomp    (pRExC_state->precomp)
 156 #define RExC_rx_sv      (pRExC_state->rx_sv)
 157 #define RExC_rx         (pRExC_state->rx)
 158 #define RExC_rxi        (pRExC_state->rxi)
 159 #define RExC_start      (pRExC_state->start)
 160 #define RExC_end        (pRExC_state->end)
 161 #define RExC_parse      (pRExC_state->parse)
 162 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 163 #ifdef RE_TRACK_PATTERN_OFFSETS
 164 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 165 #endif
 166 #define RExC_emit       (pRExC_state->emit)
 167 #define RExC_emit_start (pRExC_state->emit_start)
 168 #define RExC_emit_bound (pRExC_state->emit_bound)
 169 #define RExC_naughty    (pRExC_state->naughty)
 170 #define RExC_sawback    (pRExC_state->sawback)
 171 #define RExC_seen       (pRExC_state->seen)
 172 #define RExC_size       (pRExC_state->size)
 173 #define RExC_npar       (pRExC_state->npar)
 174 #define RExC_nestroot   (pRExC_state->nestroot)
 175 #define RExC_extralen   (pRExC_state->extralen)
 176 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 177 #define RExC_seen_evals (pRExC_state->seen_evals)
 178 #define RExC_utf8       (pRExC_state->utf8)
 179 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 180 #define RExC_charnames  (pRExC_state->charnames)
 181 #define RExC_open_parens        (pRExC_state->open_parens)
 182 #define RExC_close_parens       (pRExC_state->close_parens)
 183 #define RExC_opend      (pRExC_state->opend)
 184 #define RExC_paren_names        (pRExC_state->paren_names)
 185 #define RExC_recurse    (pRExC_state->recurse)
 186 #define RExC_recurse_count      (pRExC_state->recurse_count)
 187
 188
 189 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 190 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 191         ((*s) == '{' && regcurly(s)))
 192
 193 #ifdef SPSTART
 194 #undef SPSTART          /* dratted cpp namespace... */
 195 #endif
 196 /*
 197  * Flags to be passed up and down.
 198  */
 199 #define WORST           0       /* Worst case. */
 200 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 201 #define SIMPLE          0x02    /* Simple enough to be STAR/PLUS operand. */
 202 #define SPSTART         0x04    /* Starts with * or +. */
 203 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 204 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 205
 206 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 207
 208 /* whether trie related optimizations are enabled */
 209 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 210 #define TRIE_STUDY_OPT
 211 #define FULL_TRIE_STUDY
 212 #define TRIE_STCLASS
 213 #endif
 214
 215
 216
 217 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 218 #define PBITVAL(paren) (1 << ((paren) & 7))
 219 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 220 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 221 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 222
 223
 224 /* About scan_data_t.
 225
 226   During optimisation we recurse through the regexp program performing
 227   various inplace (keyhole style) optimisations. In addition study_chunk
 228   and scan_commit populate this data structure with information about
 229   what strings MUST appear in the pattern. We look for the longest
 230   string that must appear for at a fixed location, and we look for the
 231   longest string that may appear at a floating location. So for instance
 232   in the pattern:
 233
 234     /FOO[xX]A.*B[xX]BAR/
 235
 236   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 237   strings (because they follow a .* construct). study_chunk will identify
 238   both FOO and BAR as being the longest fixed and floating strings respectively.
 239
 240   The strings can be composites, for instance
 241
 242      /(f)(o)(o)/
 243
 244   will result in a composite fixed substring 'foo'.
 245
 246   For each string some basic information is maintained:
 247
 248   - offset or min_offset
 249     This is the position the string must appear at, or not before.
 250     It also implicitly (when combined with minlenp) tells us how many
 251     character must match before the string we are searching.
 252     Likewise when combined with minlenp and the length of the string
 253     tells us how many characters must appear after the string we have
 254     found.
 255
 256   - max_offset
 257     Only used for floating strings. This is the rightmost point that
 258     the string can appear at. Ifset to I32 max it indicates that the
 259     string can occur infinitely far to the right.
 260
 261   - minlenp
 262     A pointer to the minimum length of the pattern that the string
 263     was found inside. This is important as in the case of positive
 264     lookahead or positive lookbehind we can have multiple patterns
 265     involved. Consider
 266
 267     /(?=FOO).*F/
 268
 269     The minimum length of the pattern overall is 3, the minimum length
 270     of the lookahead part is 3, but the minimum length of the part that
 271     will actually match is 1. So 'FOO's minimum length is 3, but the
 272     minimum length for the F is 1. This is important as the minimum length
 273     is used to determine offsets in front of and behind the string being
 274     looked for.  Since strings can be composites this is the length of the
 275     pattern at the time it was commited with a scan_commit. Note that
 276     the length is calculated by study_chunk, so that the minimum lengths
 277     are not known until the full pattern has been compiled, thus the
 278     pointer to the value.
 279
 280   - lookbehind
 281
 282     In the case of lookbehind the string being searched for can be
 283     offset past the start point of the final matching string.
 284     If this value was just blithely removed from the min_offset it would
 285     invalidate some of the calculations for how many chars must match
 286     before or after (as they are derived from min_offset and minlen and
 287     the length of the string being searched for).
 288     When the final pattern is compiled and the data is moved from the
 289     scan_data_t structure into the regexp structure the information
 290     about lookbehind is factored in, with the information that would
 291     have been lost precalculated in the end_shift field for the
 292     associated string.
 293
 294   The fields pos_min and pos_delta are used to store the minimum offset
 295   and the delta to the maximum offset at the current point in the pattern.
 296
 297 */
 298
 299 typedef struct scan_data_t {
 300     /*I32 len_min;      unused */
 301     /*I32 len_delta;    unused */
 302     I32 pos_min;
 303     I32 pos_delta;
 304     SV *last_found;
 305     I32 last_end;           /* min value, <0 unless valid. */
 306     I32 last_start_min;
 307     I32 last_start_max;
 308     SV **longest;           /* Either &l_fixed, or &l_float. */
 309     SV *longest_fixed;      /* longest fixed string found in pattern */
 310     I32 offset_fixed;       /* offset where it starts */
 311     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 312     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 313     SV *longest_float;      /* longest floating string found in pattern */
 314     I32 offset_float_min;   /* earliest point in string it can appear */
 315     I32 offset_float_max;   /* latest point in string it can appear */
 316     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 317     I32 lookbehind_float;   /* is the position of the string modified by LB */
 318     I32 flags;
 319     I32 whilem_c;
 320     I32 *last_closep;
 321     struct regnode_charclass_class *start_class;
 322 } scan_data_t;
 323
 324 /*
 325  * Forward declarations for pregcomp()'s friends.
 326  */
 327
 328 static const scan_data_t zero_scan_data =
 329   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 330
 331 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 332 #define SF_BEFORE_SEOL          0x0001
 333 #define SF_BEFORE_MEOL          0x0002
 334 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 335 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 336
 337 #ifdef NO_UNARY_PLUS
 338 #  define SF_FIX_SHIFT_EOL      (0+2)
 339 #  define SF_FL_SHIFT_EOL               (0+4)
 340 #else
 341 #  define SF_FIX_SHIFT_EOL      (+2)
 342 #  define SF_FL_SHIFT_EOL               (+4)
 343 #endif
 344
 345 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 346 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 347
 348 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 349 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 350 #define SF_IS_INF               0x0040
 351 #define SF_HAS_PAR              0x0080
 352 #define SF_IN_PAR               0x0100
 353 #define SF_HAS_EVAL             0x0200
 354 #define SCF_DO_SUBSTR           0x0400
 355 #define SCF_DO_STCLASS_AND      0x0800
 356 #define SCF_DO_STCLASS_OR       0x1000
 357 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 358 #define SCF_WHILEM_VISITED_POS  0x2000
 359
 360 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 361 #define SCF_SEEN_ACCEPT         0x8000
 362
 363 #define UTF (RExC_utf8 != 0)
 364 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
 365 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
 366
 367 #define OOB_UNICODE             12345678
 368 #define OOB_NAMEDCLASS          -1
 369
 370 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 371 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 372
 373
 374 /* length of regex to show in messages that don't mark a position within */
 375 #define RegexLengthToShowInErrorMessages 127
 376
 377 /*
 378  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 379  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 380  * op/pragma/warn/regcomp.
 381  */
 382 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 383 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 384
 385 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 386
 387 /*
 388  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 389  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 390  * "...".
 391  */
 392 #define _FAIL(code) STMT_START {                                        \
 393     const char *ellipses = "";                                          \
 394     IV len = RExC_end - RExC_precomp;                                   \
 395                                                                         \
 396     if (!SIZE_ONLY)                                                     \
 397         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 398     if (len > RegexLengthToShowInErrorMessages) {                       \
 399         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 400         len = RegexLengthToShowInErrorMessages - 10;                    \
 401         ellipses = "...";                                               \
 402     }                                                                   \
 403     code;                                                               \
 404 } STMT_END
 405
 406 #define FAIL(msg) _FAIL(                            \
 407     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 408             msg, (int)len, RExC_precomp, ellipses))
 409
 410 #define FAIL2(msg,arg) _FAIL(                       \
 411     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 412             arg, (int)len, RExC_precomp, ellipses))
 413
 414 /*
 415  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 416  */
 417 #define Simple_vFAIL(m) STMT_START {                                    \
 418     const IV offset = RExC_parse - RExC_precomp;                        \
 419     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 420             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 421 } STMT_END
 422
 423 /*
 424  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 425  */
 426 #define vFAIL(m) STMT_START {                           \
 427     if (!SIZE_ONLY)                                     \
 428         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 429     Simple_vFAIL(m);                                    \
 430 } STMT_END
 431
 432 /*
 433  * Like Simple_vFAIL(), but accepts two arguments.
 434  */
 435 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 436     const IV offset = RExC_parse - RExC_precomp;                        \
 437     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 438             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 439 } STMT_END
 440
 441 /*
 442  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 443  */
 444 #define vFAIL2(m,a1) STMT_START {                       \
 445     if (!SIZE_ONLY)                                     \
 446         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 447     Simple_vFAIL2(m, a1);                               \
 448 } STMT_END
 449
 450
 451 /*
 452  * Like Simple_vFAIL(), but accepts three arguments.
 453  */
 454 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 455     const IV offset = RExC_parse - RExC_precomp;                \
 456     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 457             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 458 } STMT_END
 459
 460 /*
 461  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 462  */
 463 #define vFAIL3(m,a1,a2) STMT_START {                    \
 464     if (!SIZE_ONLY)                                     \
 465         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 466     Simple_vFAIL3(m, a1, a2);                           \
 467 } STMT_END
 468
 469 /*
 470  * Like Simple_vFAIL(), but accepts four arguments.
 471  */
 472 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 473     const IV offset = RExC_parse - RExC_precomp;                \
 474     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 475             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 476 } STMT_END
 477
 478 #define vWARN(loc,m) STMT_START {                                       \
 479     const IV offset = loc - RExC_precomp;                               \
 480     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 481             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 482 } STMT_END
 483
 484 #define vWARNdep(loc,m) STMT_START {                                    \
 485     const IV offset = loc - RExC_precomp;                               \
 486     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
 487             "%s" REPORT_LOCATION,                                       \
 488             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 489 } STMT_END
 490
 491
 492 #define vWARN2(loc, m, a1) STMT_START {                                 \
 493     const IV offset = loc - RExC_precomp;                               \
 494     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 495             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 496 } STMT_END
 497
 498 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 499     const IV offset = loc - RExC_precomp;                               \
 500     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 501             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 502 } STMT_END
 503
 504 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 505     const IV offset = loc - RExC_precomp;                               \
 506     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 507             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 508 } STMT_END
 509
 510 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 511     const IV offset = loc - RExC_precomp;                               \
 512     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 513             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 514 } STMT_END
 515
 516
 517 /* Allow for side effects in s */
 518 #define REGC(c,s) STMT_START {                  \
 519     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 520 } STMT_END
 521
 522 /* Macros for recording node offsets.   20001227 mjd@plover.com
 523  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 524  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 525  * Element 0 holds the number n.
 526  * Position is 1 indexed.
 527  */
 528 #ifndef RE_TRACK_PATTERN_OFFSETS
 529 #define Set_Node_Offset_To_R(node,byte)
 530 #define Set_Node_Offset(node,byte)
 531 #define Set_Cur_Node_Offset
 532 #define Set_Node_Length_To_R(node,len)
 533 #define Set_Node_Length(node,len)
 534 #define Set_Node_Cur_Length(node)
 535 #define Node_Offset(n)
 536 #define Node_Length(n)
 537 #define Set_Node_Offset_Length(node,offset,len)
 538 #define ProgLen(ri) ri->u.proglen
 539 #define SetProgLen(ri,x) ri->u.proglen = x
 540 #else
 541 #define ProgLen(ri) ri->u.offsets[0]
 542 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 543 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 544     if (! SIZE_ONLY) {                                                  \
 545         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 546                     __LINE__, (int)(node), (int)(byte)));               \
 547         if((node) < 0) {                                                \
 548             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 549         } else {                                                        \
 550             RExC_offsets[2*(node)-1] = (byte);                          \
 551         }                                                               \
 552     }                                                                   \
 553 } STMT_END
 554
 555 #define Set_Node_Offset(node,byte) \
 556     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 557 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 558
 559 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 560     if (! SIZE_ONLY) {                                                  \
 561         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 562                 __LINE__, (int)(node), (int)(len)));                    \
 563         if((node) < 0) {                                                \
 564             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 565         } else {                                                        \
 566             RExC_offsets[2*(node)] = (len);                             \
 567         }                                                               \
 568     }                                                                   \
 569 } STMT_END
 570
 571 #define Set_Node_Length(node,len) \
 572     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 573 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 574 #define Set_Node_Cur_Length(node) \
 575     Set_Node_Length(node, RExC_parse - parse_start)
 576
 577 /* Get offsets and lengths */
 578 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 579 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 580
 581 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 582     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 583     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 584 } STMT_END
 585 #endif
 586
 587 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 588 #define EXPERIMENTAL_INPLACESCAN
 589 #endif /*RE_TRACK_PATTERN_OFFSETS*/
 590
 591 #define DEBUG_STUDYDATA(str,data,depth)                              \
 592 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 593     PerlIO_printf(Perl_debug_log,                                    \
 594         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 595         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 596         (int)(depth)*2, "",                                          \
 597         (IV)((data)->pos_min),                                       \
 598         (IV)((data)->pos_delta),                                     \
 599         (UV)((data)->flags),                                         \
 600         (IV)((data)->whilem_c),                                      \
 601         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 602         is_inf ? "INF " : ""                                         \
 603     );                                                               \
 604     if ((data)->last_found)                                          \
 605         PerlIO_printf(Perl_debug_log,                                \
 606             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 607             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 608             SvPVX_const((data)->last_found),                         \
 609             (IV)((data)->last_end),                                  \
 610             (IV)((data)->last_start_min),                            \
 611             (IV)((data)->last_start_max),                            \
 612             ((data)->longest &&                                      \
 613              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 614             SvPVX_const((data)->longest_fixed),                      \
 615             (IV)((data)->offset_fixed),                              \
 616             ((data)->longest &&                                      \
 617              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 618             SvPVX_const((data)->longest_float),                      \
 619             (IV)((data)->offset_float_min),                          \
 620             (IV)((data)->offset_float_max)                           \
 621         );                                                           \
 622     PerlIO_printf(Perl_debug_log,"\n");                              \
 623 });
 624
 625 static void clear_re(pTHX_ void *r);
 626
 627 /* Mark that we cannot extend a found fixed substring at this point.
 628    Update the longest found anchored substring and the longest found
 629    floating substrings if needed. */
 630
 631 STATIC void
 632 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 633 {
 634     const STRLEN l = CHR_SVLEN(data->last_found);
 635     const STRLEN old_l = CHR_SVLEN(*data->longest);
 636     GET_RE_DEBUG_FLAGS_DECL;
 637
 638     PERL_ARGS_ASSERT_SCAN_COMMIT;
 639
 640     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 641         SvSetMagicSV(*data->longest, data->last_found);
 642         if (*data->longest == data->longest_fixed) {
 643             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 644             if (data->flags & SF_BEFORE_EOL)
 645                 data->flags
 646                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 647             else
 648                 data->flags &= ~SF_FIX_BEFORE_EOL;
 649             data->minlen_fixed=minlenp;
 650             data->lookbehind_fixed=0;
 651         }
 652         else { /* *data->longest == data->longest_float */
 653             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 654             data->offset_float_max = (l
 655                                       ? data->last_start_max
 656                                       : data->pos_min + data->pos_delta);
 657             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 658                 data->offset_float_max = I32_MAX;
 659             if (data->flags & SF_BEFORE_EOL)
 660                 data->flags
 661                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 662             else
 663                 data->flags &= ~SF_FL_BEFORE_EOL;
 664             data->minlen_float=minlenp;
 665             data->lookbehind_float=0;
 666         }
 667     }
 668     SvCUR_set(data->last_found, 0);
 669     {
 670         SV * const sv = data->last_found;
 671         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 672             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 673             if (mg)
 674                 mg->mg_len = 0;
 675         }
 676     }
 677     data->last_end = -1;
 678     data->flags &= ~SF_BEFORE_EOL;
 679     DEBUG_STUDYDATA("commit: ",data,0);
 680 }
 681
 682 /* Can match anything (initialization) */
 683 STATIC void
 684 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 685 {
 686     PERL_ARGS_ASSERT_CL_ANYTHING;
 687
 688     ANYOF_CLASS_ZERO(cl);
 689     ANYOF_BITMAP_SETALL(cl);
 690     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 691     if (LOC)
 692         cl->flags |= ANYOF_LOCALE;
 693 }
 694
 695 /* Can match anything (initialization) */
 696 STATIC int
 697 S_cl_is_anything(const struct regnode_charclass_class *cl)
 698 {
 699     int value;
 700
 701     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 702
 703     for (value = 0; value <= ANYOF_MAX; value += 2)
 704         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 705             return 1;
 706     if (!(cl->flags & ANYOF_UNICODE_ALL))
 707         return 0;
 708     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 709         return 0;
 710     return 1;
 711 }
 712
 713 /* Can match anything (initialization) */
 714 STATIC void
 715 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 716 {
 717     PERL_ARGS_ASSERT_CL_INIT;
 718
 719     Zero(cl, 1, struct regnode_charclass_class);
 720     cl->type = ANYOF;
 721     cl_anything(pRExC_state, cl);
 722 }
 723
 724 STATIC void
 725 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 726 {
 727     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 728
 729     Zero(cl, 1, struct regnode_charclass_class);
 730     cl->type = ANYOF;
 731     cl_anything(pRExC_state, cl);
 732     if (LOC)
 733         cl->flags |= ANYOF_LOCALE;
 734 }
 735
 736 /* 'And' a given class with another one.  Can create false positives */
 737 /* We assume that cl is not inverted */
 738 STATIC void
 739 S_cl_and(struct regnode_charclass_class *cl,
 740         const struct regnode_charclass_class *and_with)
 741 {
 742     PERL_ARGS_ASSERT_CL_AND;
 743
 744     assert(and_with->type == ANYOF);
 745     if (!(and_with->flags & ANYOF_CLASS)
 746         && !(cl->flags & ANYOF_CLASS)
 747         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 748         && !(and_with->flags & ANYOF_FOLD)
 749         && !(cl->flags & ANYOF_FOLD)) {
 750         int i;
 751
 752         if (and_with->flags & ANYOF_INVERT)
 753             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 754                 cl->bitmap[i] &= ~and_with->bitmap[i];
 755         else
 756             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 757                 cl->bitmap[i] &= and_with->bitmap[i];
 758     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 759     if (!(and_with->flags & ANYOF_EOS))
 760         cl->flags &= ~ANYOF_EOS;
 761
 762     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
 763         !(and_with->flags & ANYOF_INVERT)) {
 764         cl->flags &= ~ANYOF_UNICODE_ALL;
 765         cl->flags |= ANYOF_UNICODE;
 766         ARG_SET(cl, ARG(and_with));
 767     }
 768     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 769         !(and_with->flags & ANYOF_INVERT))
 770         cl->flags &= ~ANYOF_UNICODE_ALL;
 771     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
 772         !(and_with->flags & ANYOF_INVERT))
 773         cl->flags &= ~ANYOF_UNICODE;
 774 }
 775
 776 /* 'OR' a given class with another one.  Can create false positives */
 777 /* We assume that cl is not inverted */
 778 STATIC void
 779 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 780 {
 781     PERL_ARGS_ASSERT_CL_OR;
 782
 783     if (or_with->flags & ANYOF_INVERT) {
 784         /* We do not use
 785          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 786          *   <= (B1 | !B2) | (CL1 | !CL2)
 787          * which is wasteful if CL2 is small, but we ignore CL2:
 788          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 789          * XXXX Can we handle case-fold?  Unclear:
 790          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 791          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 792          */
 793         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 794              && !(or_with->flags & ANYOF_FOLD)
 795              && !(cl->flags & ANYOF_FOLD) ) {
 796             int i;
 797
 798             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 799                 cl->bitmap[i] |= ~or_with->bitmap[i];
 800         } /* XXXX: logic is complicated otherwise */
 801         else {
 802             cl_anything(pRExC_state, cl);
 803         }
 804     } else {
 805         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 806         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 807              && (!(or_with->flags & ANYOF_FOLD)
 808                  || (cl->flags & ANYOF_FOLD)) ) {
 809             int i;
 810
 811             /* OR char bitmap and class bitmap separately */
 812             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 813                 cl->bitmap[i] |= or_with->bitmap[i];
 814             if (or_with->flags & ANYOF_CLASS) {
 815                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 816                     cl->classflags[i] |= or_with->classflags[i];
 817                 cl->flags |= ANYOF_CLASS;
 818             }
 819         }
 820         else { /* XXXX: logic is complicated, leave it along for a moment. */
 821             cl_anything(pRExC_state, cl);
 822         }
 823     }
 824     if (or_with->flags & ANYOF_EOS)
 825         cl->flags |= ANYOF_EOS;
 826
 827     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
 828         ARG(cl) != ARG(or_with)) {
 829         cl->flags |= ANYOF_UNICODE_ALL;
 830         cl->flags &= ~ANYOF_UNICODE;
 831     }
 832     if (or_with->flags & ANYOF_UNICODE_ALL) {
 833         cl->flags |= ANYOF_UNICODE_ALL;
 834         cl->flags &= ~ANYOF_UNICODE;
 835     }
 836 }
 837
 838 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 839 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 840 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 841 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 842
 843
 844 #ifdef DEBUGGING
 845 /*
 846    dump_trie(trie,widecharmap,revcharmap)
 847    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 848    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 849
 850    These routines dump out a trie in a somewhat readable format.
 851    The _interim_ variants are used for debugging the interim
 852    tables that are used to generate the final compressed
 853    representation which is what dump_trie expects.
 854
 855    Part of the reason for their existance is to provide a form
 856    of documentation as to how the different representations function.
 857
 858 */
 859
 860 /*
 861   Dumps the final compressed table form of the trie to Perl_debug_log.
 862   Used for debugging make_trie().
 863 */
 864
 865 STATIC void
 866 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 867             AV *revcharmap, U32 depth)
 868 {
 869     U32 state;
 870     SV *sv=sv_newmortal();
 871     int colwidth= widecharmap ? 6 : 4;
 872     GET_RE_DEBUG_FLAGS_DECL;
 873
 874     PERL_ARGS_ASSERT_DUMP_TRIE;
 875
 876     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 877         (int)depth * 2 + 2,"",
 878         "Match","Base","Ofs" );
 879
 880     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 881         SV ** const tmp = av_fetch( revcharmap, state, 0);
 882         if ( tmp ) {
 883             PerlIO_printf( Perl_debug_log, "%*s",
 884                 colwidth,
 885                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 886                             PL_colors[0], PL_colors[1],
 887                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 888                             PERL_PV_ESCAPE_FIRSTCHAR
 889                 )
 890             );
 891         }
 892     }
 893     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 894         (int)depth * 2 + 2,"");
 895
 896     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 897         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 898     PerlIO_printf( Perl_debug_log, "\n");
 899
 900     for( state = 1 ; state < trie->statecount ; state++ ) {
 901         const U32 base = trie->states[ state ].trans.base;
 902
 903         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 904
 905         if ( trie->states[ state ].wordnum ) {
 906             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 907         } else {
 908             PerlIO_printf( Perl_debug_log, "%6s", "" );
 909         }
 910
 911         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 912
 913         if ( base ) {
 914             U32 ofs = 0;
 915
 916             while( ( base + ofs  < trie->uniquecharcount ) ||
 917                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 918                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 919                     ofs++;
 920
 921             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 922
 923             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 924                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 925                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 926                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 927                 {
 928                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 929                     colwidth,
 930                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 931                 } else {
 932                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 933                 }
 934             }
 935
 936             PerlIO_printf( Perl_debug_log, "]");
 937
 938         }
 939         PerlIO_printf( Perl_debug_log, "\n" );
 940     }
 941 }
 942 /*
 943   Dumps a fully constructed but uncompressed trie in list form.
 944   List tries normally only are used for construction when the number of
 945   possible chars (trie->uniquecharcount) is very high.
 946   Used for debugging make_trie().
 947 */
 948 STATIC void
 949 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 950                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
 951                          U32 depth)
 952 {
 953     U32 state;
 954     SV *sv=sv_newmortal();
 955     int colwidth= widecharmap ? 6 : 4;
 956     GET_RE_DEBUG_FLAGS_DECL;
 957
 958     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 959
 960     /* print out the table precompression.  */
 961     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
 962         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
 963         "------:-----+-----------------\n" );
 964
 965     for( state=1 ; state < next_alloc ; state ++ ) {
 966         U16 charid;
 967
 968         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
 969             (int)depth * 2 + 2,"", (UV)state  );
 970         if ( ! trie->states[ state ].wordnum ) {
 971             PerlIO_printf( Perl_debug_log, "%5s| ","");
 972         } else {
 973             PerlIO_printf( Perl_debug_log, "W%4x| ",
 974                 trie->states[ state ].wordnum
 975             );
 976         }
 977         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
 978             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
 979             if ( tmp ) {
 980                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
 981                     colwidth,
 982                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 983                             PL_colors[0], PL_colors[1],
 984                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 985                             PERL_PV_ESCAPE_FIRSTCHAR
 986                     ) ,
 987                     TRIE_LIST_ITEM(state,charid).forid,
 988                     (UV)TRIE_LIST_ITEM(state,charid).newstate
 989                 );
 990                 if (!(charid % 10))
 991                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
 992                         (int)((depth * 2) + 14), "");
 993             }
 994         }
 995         PerlIO_printf( Perl_debug_log, "\n");
 996     }
 997 }
 998
 999 /*
1000   Dumps a fully constructed but uncompressed trie in table form.
1001   This is the normal DFA style state transition table, with a few
1002   twists to facilitate compression later.
1003   Used for debugging make_trie().
1004 */
1005 STATIC void
1006 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1007                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1008                           U32 depth)
1009 {
1010     U32 state;
1011     U16 charid;
1012     SV *sv=sv_newmortal();
1013     int colwidth= widecharmap ? 6 : 4;
1014     GET_RE_DEBUG_FLAGS_DECL;
1015
1016     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1017
1018     /*
1019        print out the table precompression so that we can do a visual check
1020        that they are identical.
1021      */
1022
1023     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1024
1025     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1026         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1027         if ( tmp ) {
1028             PerlIO_printf( Perl_debug_log, "%*s",
1029                 colwidth,
1030                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1031                             PL_colors[0], PL_colors[1],
1032                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1033                             PERL_PV_ESCAPE_FIRSTCHAR
1034                 )
1035             );
1036         }
1037     }
1038
1039     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1040
1041     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1042         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1043     }
1044
1045     PerlIO_printf( Perl_debug_log, "\n" );
1046
1047     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1048
1049         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1050             (int)depth * 2 + 2,"",
1051             (UV)TRIE_NODENUM( state ) );
1052
1053         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1054             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1055             if (v)
1056                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1057             else
1058                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1059         }
1060         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1061             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1062         } else {
1063             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1064             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1065         }
1066     }
1067 }
1068
1069 #endif
1070
1071 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1072   startbranch: the first branch in the whole branch sequence
1073   first      : start branch of sequence of branch-exact nodes.
1074                May be the same as startbranch
1075   last       : Thing following the last branch.
1076                May be the same as tail.
1077   tail       : item following the branch sequence
1078   count      : words in the sequence
1079   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1080   depth      : indent depth
1081
1082 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1083
1084 A trie is an N'ary tree where the branches are determined by digital
1085 decomposition of the key. IE, at the root node you look up the 1st character and
1086 follow that branch repeat until you find the end of the branches. Nodes can be
1087 marked as "accepting" meaning they represent a complete word. Eg:
1088
1089   /he|she|his|hers/
1090
1091 would convert into the following structure. Numbers represent states, letters
1092 following numbers represent valid transitions on the letter from that state, if
1093 the number is in square brackets it represents an accepting state, otherwise it
1094 will be in parenthesis.
1095
1096       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1097       |    |
1098       |   (2)
1099       |    |
1100      (1)   +-i->(6)-+-s->[7]
1101       |
1102       +-s->(3)-+-h->(4)-+-e->[5]
1103
1104       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1105
1106 This shows that when matching against the string 'hers' we will begin at state 1
1107 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1108 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1109 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1110 single traverse. We store a mapping from accepting to state to which word was
1111 matched, and then when we have multiple possibilities we try to complete the
1112 rest of the regex in the order in which they occured in the alternation.
1113
1114 The only prior NFA like behaviour that would be changed by the TRIE support is
1115 the silent ignoring of duplicate alternations which are of the form:
1116
1117  / (DUPE|DUPE) X? (?{ ... }) Y /x
1118
1119 Thus EVAL blocks follwing a trie may be called a different number of times with
1120 and without the optimisation. With the optimisations dupes will be silently
1121 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1122 the following demonstrates:
1123
1124  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1125
1126 which prints out 'word' three times, but
1127
1128  'words'=~/(word|word|word)(?{ print $1 })S/
1129
1130 which doesnt print it out at all. This is due to other optimisations kicking in.
1131
1132 Example of what happens on a structural level:
1133
1134 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1135
1136    1: CURLYM[1] {1,32767}(18)
1137    5:   BRANCH(8)
1138    6:     EXACT <ac>(16)
1139    8:   BRANCH(11)
1140    9:     EXACT <ad>(16)
1141   11:   BRANCH(14)
1142   12:     EXACT <ab>(16)
1143   16:   SUCCEED(0)
1144   17:   NOTHING(18)
1145   18: END(0)
1146
1147 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1148 and should turn into:
1149
1150    1: CURLYM[1] {1,32767}(18)
1151    5:   TRIE(16)
1152         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1153           <ac>
1154           <ad>
1155           <ab>
1156   16:   SUCCEED(0)
1157   17:   NOTHING(18)
1158   18: END(0)
1159
1160 Cases where tail != last would be like /(?foo|bar)baz/:
1161
1162    1: BRANCH(4)
1163    2:   EXACT <foo>(8)
1164    4: BRANCH(7)
1165    5:   EXACT <bar>(8)
1166    7: TAIL(8)
1167    8: EXACT <baz>(10)
1168   10: END(0)
1169
1170 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1171 and would end up looking like:
1172
1173     1: TRIE(8)
1174       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1175         <foo>
1176         <bar>
1177    7: TAIL(8)
1178    8: EXACT <baz>(10)
1179   10: END(0)
1180
1181     d = uvuni_to_utf8_flags(d, uv, 0);
1182
1183 is the recommended Unicode-aware way of saying
1184
1185     *(d++) = uv;
1186 */
1187
1188 #define TRIE_STORE_REVCHAR                                                 \
1189     STMT_START {                                                           \
1190         if (UTF) {                                                         \
1191             SV *zlopp = newSV(2);                                          \
1192             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1193             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1194             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1195             SvPOK_on(zlopp);                                               \
1196             SvUTF8_on(zlopp);                                              \
1197             av_push(revcharmap, zlopp);                                    \
1198         } else {                                                           \
1199             char ooooff = (char)uvc;                                               \
1200             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1201         }                                                                  \
1202         } STMT_END
1203
1204 #define TRIE_READ_CHAR STMT_START {                                           \
1205     wordlen++;                                                                \
1206     if ( UTF ) {                                                              \
1207         if ( folder ) {                                                       \
1208             if ( foldlen > 0 ) {                                              \
1209                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1210                foldlen -= len;                                                \
1211                scan += len;                                                   \
1212                len = 0;                                                       \
1213             } else {                                                          \
1214                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1215                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1216                 foldlen -= UNISKIP( uvc );                                    \
1217                 scan = foldbuf + UNISKIP( uvc );                              \
1218             }                                                                 \
1219         } else {                                                              \
1220             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1221         }                                                                     \
1222     } else {                                                                  \
1223         uvc = (U32)*uc;                                                       \
1224         len = 1;                                                              \
1225     }                                                                         \
1226 } STMT_END
1227
1228
1229
1230 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1231     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1232         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1233         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1234     }                                                           \
1235     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1236     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1237     TRIE_LIST_CUR( state )++;                                   \
1238 } STMT_END
1239
1240 #define TRIE_LIST_NEW(state) STMT_START {                       \
1241     Newxz( trie->states[ state ].trans.list,               \
1242         4, reg_trie_trans_le );                                 \
1243      TRIE_LIST_CUR( state ) = 1;                                \
1244      TRIE_LIST_LEN( state ) = 4;                                \
1245 } STMT_END
1246
1247 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1248     U16 dupe= trie->states[ state ].wordnum;                    \
1249     regnode * const noper_next = regnext( noper );              \
1250                                                                 \
1251     if (trie->wordlen)                                          \
1252         trie->wordlen[ curword ] = wordlen;                     \
1253     DEBUG_r({                                                   \
1254         /* store the word for dumping */                        \
1255         SV* tmp;                                                \
1256         if (OP(noper) != NOTHING)                               \
1257             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1258         else                                                    \
1259             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1260         av_push( trie_words, tmp );                             \
1261     });                                                         \
1262                                                                 \
1263     curword++;                                                  \
1264                                                                 \
1265     if ( noper_next < tail ) {                                  \
1266         if (!trie->jump)                                        \
1267             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1268         trie->jump[curword] = (U16)(noper_next - convert);      \
1269         if (!jumper)                                            \
1270             jumper = noper_next;                                \
1271         if (!nextbranch)                                        \
1272             nextbranch= regnext(cur);                           \
1273     }                                                           \
1274                                                                 \
1275     if ( dupe ) {                                               \
1276         /* So it's a dupe. This means we need to maintain a   */\
1277         /* linked-list from the first to the next.            */\
1278         /* we only allocate the nextword buffer when there    */\
1279         /* a dupe, so first time we have to do the allocation */\
1280         if (!trie->nextword)                                    \
1281             trie->nextword = (U16 *)                                    \
1282                 PerlMemShared_calloc( word_count + 1, sizeof(U16));     \
1283         while ( trie->nextword[dupe] )                          \
1284             dupe= trie->nextword[dupe];                         \
1285         trie->nextword[dupe]= curword;                          \
1286     } else {                                                    \
1287         /* we haven't inserted this word yet.                */ \
1288         trie->states[ state ].wordnum = curword;                \
1289     }                                                           \
1290 } STMT_END
1291
1292
1293 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1294      ( ( base + charid >=  ucharcount                                   \
1295          && base + charid < ubound                                      \
1296          && state == trie->trans[ base - ucharcount + charid ].check    \
1297          && trie->trans[ base - ucharcount + charid ].next )            \
1298            ? trie->trans[ base - ucharcount + charid ].next             \
1299            : ( state==1 ? special : 0 )                                 \
1300       )
1301
1302 #define MADE_TRIE       1
1303 #define MADE_JUMP_TRIE  2
1304 #define MADE_EXACT_TRIE 4
1305
1306 STATIC I32
1307 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1308 {
1309     dVAR;
1310     /* first pass, loop through and scan words */
1311     reg_trie_data *trie;
1312     HV *widecharmap = NULL;
1313     AV *revcharmap = newAV();
1314     regnode *cur;
1315     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1316     STRLEN len = 0;
1317     UV uvc = 0;
1318     U16 curword = 0;
1319     U32 next_alloc = 0;
1320     regnode *jumper = NULL;
1321     regnode *nextbranch = NULL;
1322     regnode *convert = NULL;
1323     /* we just use folder as a flag in utf8 */
1324     const U8 * const folder = ( flags == EXACTF
1325                        ? PL_fold
1326                        : ( flags == EXACTFL
1327                            ? PL_fold_locale
1328                            : NULL
1329                          )
1330                      );
1331
1332 #ifdef DEBUGGING
1333     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1334     AV *trie_words = NULL;
1335     /* along with revcharmap, this only used during construction but both are
1336      * useful during debugging so we store them in the struct when debugging.
1337      */
1338 #else
1339     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1340     STRLEN trie_charcount=0;
1341 #endif
1342     SV *re_trie_maxbuff;
1343     GET_RE_DEBUG_FLAGS_DECL;
1344
1345     PERL_ARGS_ASSERT_MAKE_TRIE;
1346 #ifndef DEBUGGING
1347     PERL_UNUSED_ARG(depth);
1348 #endif
1349
1350     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1351     trie->refcount = 1;
1352     trie->startstate = 1;
1353     trie->wordcount = word_count;
1354     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1355     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1356     if (!(UTF && folder))
1357         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1358     DEBUG_r({
1359         trie_words = newAV();
1360     });
1361
1362     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1363     if (!SvIOK(re_trie_maxbuff)) {
1364         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1365     }
1366     DEBUG_OPTIMISE_r({
1367                 PerlIO_printf( Perl_debug_log,
1368                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1369                   (int)depth * 2 + 2, "",
1370                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1371                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1372                   (int)depth);
1373     });
1374
1375    /* Find the node we are going to overwrite */
1376     if ( first == startbranch && OP( last ) != BRANCH ) {
1377         /* whole branch chain */
1378         convert = first;
1379     } else {
1380         /* branch sub-chain */
1381         convert = NEXTOPER( first );
1382     }
1383
1384     /*  -- First loop and Setup --
1385
1386        We first traverse the branches and scan each word to determine if it
1387        contains widechars, and how many unique chars there are, this is
1388        important as we have to build a table with at least as many columns as we
1389        have unique chars.
1390
1391        We use an array of integers to represent the character codes 0..255
1392        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1393        native representation of the character value as the key and IV's for the
1394        coded index.
1395
1396        *TODO* If we keep track of how many times each character is used we can
1397        remap the columns so that the table compression later on is more
1398        efficient in terms of memory by ensuring most common value is in the
1399        middle and the least common are on the outside.  IMO this would be better
1400        than a most to least common mapping as theres a decent chance the most
1401        common letter will share a node with the least common, meaning the node
1402        will not be compressable. With a middle is most common approach the worst
1403        case is when we have the least common nodes twice.
1404
1405      */
1406
1407     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1408         regnode * const noper = NEXTOPER( cur );
1409         const U8 *uc = (U8*)STRING( noper );
1410         const U8 * const e  = uc + STR_LEN( noper );
1411         STRLEN foldlen = 0;
1412         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1413         const U8 *scan = (U8*)NULL;
1414         U32 wordlen      = 0;         /* required init */
1415         STRLEN chars = 0;
1416         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1417
1418         if (OP(noper) == NOTHING) {
1419             trie->minlen= 0;
1420             continue;
1421         }
1422         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1423             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1424                                           regardless of encoding */
1425
1426         for ( ; uc < e ; uc += len ) {
1427             TRIE_CHARCOUNT(trie)++;
1428             TRIE_READ_CHAR;
1429             chars++;
1430             if ( uvc < 256 ) {
1431                 if ( !trie->charmap[ uvc ] ) {
1432                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1433                     if ( folder )
1434                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1435                     TRIE_STORE_REVCHAR;
1436                 }
1437                 if ( set_bit ) {
1438                     /* store the codepoint in the bitmap, and if its ascii
1439                        also store its folded equivelent. */
1440                     TRIE_BITMAP_SET(trie,uvc);
1441
1442                     /* store the folded codepoint */
1443                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1444
1445                     if ( !UTF ) {
1446                         /* store first byte of utf8 representation of
1447                            codepoints in the 127 < uvc < 256 range */
1448                         if (127 < uvc && uvc < 192) {
1449                             TRIE_BITMAP_SET(trie,194);
1450                         } else if (191 < uvc ) {
1451                             TRIE_BITMAP_SET(trie,195);
1452                         /* && uvc < 256 -- we know uvc is < 256 already */
1453                         }
1454                     }
1455                     set_bit = 0; /* We've done our bit :-) */
1456                 }
1457             } else {
1458                 SV** svpp;
1459                 if ( !widecharmap )
1460                     widecharmap = newHV();
1461
1462                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1463
1464                 if ( !svpp )
1465                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1466
1467                 if ( !SvTRUE( *svpp ) ) {
1468                     sv_setiv( *svpp, ++trie->uniquecharcount );
1469                     TRIE_STORE_REVCHAR;
1470                 }
1471             }
1472         }
1473         if( cur == first ) {
1474             trie->minlen=chars;
1475             trie->maxlen=chars;
1476         } else if (chars < trie->minlen) {
1477             trie->minlen=chars;
1478         } else if (chars > trie->maxlen) {
1479             trie->maxlen=chars;
1480         }
1481
1482     } /* end first pass */
1483     DEBUG_TRIE_COMPILE_r(
1484         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1485                 (int)depth * 2 + 2,"",
1486                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1487                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1488                 (int)trie->minlen, (int)trie->maxlen )
1489     );
1490     trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1491
1492     /*
1493         We now know what we are dealing with in terms of unique chars and
1494         string sizes so we can calculate how much memory a naive
1495         representation using a flat table  will take. If it's over a reasonable
1496         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1497         conservative but potentially much slower representation using an array
1498         of lists.
1499
1500         At the end we convert both representations into the same compressed
1501         form that will be used in regexec.c for matching with. The latter
1502         is a form that cannot be used to construct with but has memory
1503         properties similar to the list form and access properties similar
1504         to the table form making it both suitable for fast searches and
1505         small enough that its feasable to store for the duration of a program.
1506
1507         See the comment in the code where the compressed table is produced
1508         inplace from the flat tabe representation for an explanation of how
1509         the compression works.
1510
1511     */
1512
1513
1514     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1515         /*
1516             Second Pass -- Array Of Lists Representation
1517
1518             Each state will be represented by a list of charid:state records
1519             (reg_trie_trans_le) the first such element holds the CUR and LEN
1520             points of the allocated array. (See defines above).
1521
1522             We build the initial structure using the lists, and then convert
1523             it into the compressed table form which allows faster lookups
1524             (but cant be modified once converted).
1525         */
1526
1527         STRLEN transcount = 1;
1528
1529         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1530             "%*sCompiling trie using list compiler\n",
1531             (int)depth * 2 + 2, ""));
1532
1533         trie->states = (reg_trie_state *)
1534             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1535                                   sizeof(reg_trie_state) );
1536         TRIE_LIST_NEW(1);
1537         next_alloc = 2;
1538
1539         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1540
1541             regnode * const noper = NEXTOPER( cur );
1542             U8 *uc           = (U8*)STRING( noper );
1543             const U8 * const e = uc + STR_LEN( noper );
1544             U32 state        = 1;         /* required init */
1545             U16 charid       = 0;         /* sanity init */
1546             U8 *scan         = (U8*)NULL; /* sanity init */
1547             STRLEN foldlen   = 0;         /* required init */
1548             U32 wordlen      = 0;         /* required init */
1549             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1550
1551             if (OP(noper) != NOTHING) {
1552                 for ( ; uc < e ; uc += len ) {
1553
1554                     TRIE_READ_CHAR;
1555
1556                     if ( uvc < 256 ) {
1557                         charid = trie->charmap[ uvc ];
1558                     } else {
1559                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1560                         if ( !svpp ) {
1561                             charid = 0;
1562                         } else {
1563                             charid=(U16)SvIV( *svpp );
1564                         }
1565                     }
1566                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1567                     if ( charid ) {
1568
1569                         U16 check;
1570                         U32 newstate = 0;
1571
1572                         charid--;
1573                         if ( !trie->states[ state ].trans.list ) {
1574                             TRIE_LIST_NEW( state );
1575                         }
1576                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1577                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1578                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1579                                 break;
1580                             }
1581                         }
1582                         if ( ! newstate ) {
1583                             newstate = next_alloc++;
1584                             TRIE_LIST_PUSH( state, charid, newstate );
1585                             transcount++;
1586                         }
1587                         state = newstate;
1588                     } else {
1589                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1590                     }
1591                 }
1592             }
1593             TRIE_HANDLE_WORD(state);
1594
1595         } /* end second pass */
1596
1597         /* next alloc is the NEXT state to be allocated */
1598         trie->statecount = next_alloc;
1599         trie->states = (reg_trie_state *)
1600             PerlMemShared_realloc( trie->states,
1601                                    next_alloc
1602                                    * sizeof(reg_trie_state) );
1603
1604         /* and now dump it out before we compress it */
1605         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1606                                                          revcharmap, next_alloc,
1607                                                          depth+1)
1608         );
1609
1610         trie->trans = (reg_trie_trans *)
1611             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1612         {
1613             U32 state;
1614             U32 tp = 0;
1615             U32 zp = 0;
1616
1617
1618             for( state=1 ; state < next_alloc ; state ++ ) {
1619                 U32 base=0;
1620
1621                 /*
1622                 DEBUG_TRIE_COMPILE_MORE_r(
1623                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1624                 );
1625                 */
1626
1627                 if (trie->states[state].trans.list) {
1628                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1629                     U16 maxid=minid;
1630                     U16 idx;
1631
1632                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1633                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1634                         if ( forid < minid ) {
1635                             minid=forid;
1636                         } else if ( forid > maxid ) {
1637                             maxid=forid;
1638                         }
1639                     }
1640                     if ( transcount < tp + maxid - minid + 1) {
1641                         transcount *= 2;
1642                         trie->trans = (reg_trie_trans *)
1643                             PerlMemShared_realloc( trie->trans,
1644                                                      transcount
1645                                                      * sizeof(reg_trie_trans) );
1646                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1647                     }
1648                     base = trie->uniquecharcount + tp - minid;
1649                     if ( maxid == minid ) {
1650                         U32 set = 0;
1651                         for ( ; zp < tp ; zp++ ) {
1652                             if ( ! trie->trans[ zp ].next ) {
1653                                 base = trie->uniquecharcount + zp - minid;
1654                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1655                                 trie->trans[ zp ].check = state;
1656                                 set = 1;
1657                                 break;
1658                             }
1659                         }
1660                         if ( !set ) {
1661                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1662                             trie->trans[ tp ].check = state;
1663                             tp++;
1664                             zp = tp;
1665                         }
1666                     } else {
1667                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1668                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1669                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1670                             trie->trans[ tid ].check = state;
1671                         }
1672                         tp += ( maxid - minid + 1 );
1673                     }
1674                     Safefree(trie->states[ state ].trans.list);
1675                 }
1676                 /*
1677                 DEBUG_TRIE_COMPILE_MORE_r(
1678                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1679                 );
1680                 */
1681                 trie->states[ state ].trans.base=base;
1682             }
1683             trie->lasttrans = tp + 1;
1684         }
1685     } else {
1686         /*
1687            Second Pass -- Flat Table Representation.
1688
1689            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1690            We know that we will need Charcount+1 trans at most to store the data
1691            (one row per char at worst case) So we preallocate both structures
1692            assuming worst case.
1693
1694            We then construct the trie using only the .next slots of the entry
1695            structs.
1696
1697            We use the .check field of the first entry of the node  temporarily to
1698            make compression both faster and easier by keeping track of how many non
1699            zero fields are in the node.
1700
1701            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1702            transition.
1703
1704            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1705            number representing the first entry of the node, and state as a
1706            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1707            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1708            are 2 entrys per node. eg:
1709
1710              A B       A B
1711           1. 2 4    1. 3 7
1712           2. 0 3    3. 0 5
1713           3. 0 0    5. 0 0
1714           4. 0 0    7. 0 0
1715
1716            The table is internally in the right hand, idx form. However as we also
1717            have to deal with the states array which is indexed by nodenum we have to
1718            use TRIE_NODENUM() to convert.
1719
1720         */
1721         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1722             "%*sCompiling trie using table compiler\n",
1723             (int)depth * 2 + 2, ""));
1724
1725         trie->trans = (reg_trie_trans *)
1726             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1727                                   * trie->uniquecharcount + 1,
1728                                   sizeof(reg_trie_trans) );
1729         trie->states = (reg_trie_state *)
1730             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1731                                   sizeof(reg_trie_state) );
1732         next_alloc = trie->uniquecharcount + 1;
1733
1734
1735         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1736
1737             regnode * const noper   = NEXTOPER( cur );
1738             const U8 *uc     = (U8*)STRING( noper );
1739             const U8 * const e = uc + STR_LEN( noper );
1740
1741             U32 state        = 1;         /* required init */
1742
1743             U16 charid       = 0;         /* sanity init */
1744             U32 accept_state = 0;         /* sanity init */
1745             U8 *scan         = (U8*)NULL; /* sanity init */
1746
1747             STRLEN foldlen   = 0;         /* required init */
1748             U32 wordlen      = 0;         /* required init */
1749             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1750
1751             if ( OP(noper) != NOTHING ) {
1752                 for ( ; uc < e ; uc += len ) {
1753
1754                     TRIE_READ_CHAR;
1755
1756                     if ( uvc < 256 ) {
1757                         charid = trie->charmap[ uvc ];
1758                     } else {
1759                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1760                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1761                     }
1762                     if ( charid ) {
1763                         charid--;
1764                         if ( !trie->trans[ state + charid ].next ) {
1765                             trie->trans[ state + charid ].next = next_alloc;
1766                             trie->trans[ state ].check++;
1767                             next_alloc += trie->uniquecharcount;
1768                         }
1769                         state = trie->trans[ state + charid ].next;
1770                     } else {
1771                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1772                     }
1773                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1774                 }
1775             }
1776             accept_state = TRIE_NODENUM( state );
1777             TRIE_HANDLE_WORD(accept_state);
1778
1779         } /* end second pass */
1780
1781         /* and now dump it out before we compress it */
1782         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1783                                                           revcharmap,
1784                                                           next_alloc, depth+1));
1785
1786         {
1787         /*
1788            * Inplace compress the table.*
1789
1790            For sparse data sets the table constructed by the trie algorithm will
1791            be mostly 0/FAIL transitions or to put it another way mostly empty.
1792            (Note that leaf nodes will not contain any transitions.)
1793
1794            This algorithm compresses the tables by eliminating most such
1795            transitions, at the cost of a modest bit of extra work during lookup:
1796
1797            - Each states[] entry contains a .base field which indicates the
1798            index in the state[] array wheres its transition data is stored.
1799
1800            - If .base is 0 there are no  valid transitions from that node.
1801
1802            - If .base is nonzero then charid is added to it to find an entry in
1803            the trans array.
1804
1805            -If trans[states[state].base+charid].check!=state then the
1806            transition is taken to be a 0/Fail transition. Thus if there are fail
1807            transitions at the front of the node then the .base offset will point
1808            somewhere inside the previous nodes data (or maybe even into a node
1809            even earlier), but the .check field determines if the transition is
1810            valid.
1811
1812            XXX - wrong maybe?
1813            The following process inplace converts the table to the compressed
1814            table: We first do not compress the root node 1,and mark its all its
1815            .check pointers as 1 and set its .base pointer as 1 as well. This
1816            allows to do a DFA construction from the compressed table later, and
1817            ensures that any .base pointers we calculate later are greater than
1818            0.
1819
1820            - We set 'pos' to indicate the first entry of the second node.
1821
1822            - We then iterate over the columns of the node, finding the first and
1823            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1824            and set the .check pointers accordingly, and advance pos
1825            appropriately and repreat for the next node. Note that when we copy
1826            the next pointers we have to convert them from the original
1827            NODEIDX form to NODENUM form as the former is not valid post
1828            compression.
1829
1830            - If a node has no transitions used we mark its base as 0 and do not
1831            advance the pos pointer.
1832
1833            - If a node only has one transition we use a second pointer into the
1834            structure to fill in allocated fail transitions from other states.
1835            This pointer is independent of the main pointer and scans forward
1836            looking for null transitions that are allocated to a state. When it
1837            finds one it writes the single transition into the "hole".  If the
1838            pointer doesnt find one the single transition is appended as normal.
1839
1840            - Once compressed we can Renew/realloc the structures to release the
1841            excess space.
1842
1843            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1844            specifically Fig 3.47 and the associated pseudocode.
1845
1846            demq
1847         */
1848         const U32 laststate = TRIE_NODENUM( next_alloc );
1849         U32 state, charid;
1850         U32 pos = 0, zp=0;
1851         trie->statecount = laststate;
1852
1853         for ( state = 1 ; state < laststate ; state++ ) {
1854             U8 flag = 0;
1855             const U32 stateidx = TRIE_NODEIDX( state );
1856             const U32 o_used = trie->trans[ stateidx ].check;
1857             U32 used = trie->trans[ stateidx ].check;
1858             trie->trans[ stateidx ].check = 0;
1859
1860             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1861                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1862                     if ( trie->trans[ stateidx + charid ].next ) {
1863                         if (o_used == 1) {
1864                             for ( ; zp < pos ; zp++ ) {
1865                                 if ( ! trie->trans[ zp ].next ) {
1866                                     break;
1867                                 }
1868                             }
1869                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1870                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1871                             trie->trans[ zp ].check = state;
1872                             if ( ++zp > pos ) pos = zp;
1873                             break;
1874                         }
1875                         used--;
1876                     }
1877                     if ( !flag ) {
1878                         flag = 1;
1879                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1880                     }
1881                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1882                     trie->trans[ pos ].check = state;
1883                     pos++;
1884                 }
1885             }
1886         }
1887         trie->lasttrans = pos + 1;
1888         trie->states = (reg_trie_state *)
1889             PerlMemShared_realloc( trie->states, laststate
1890                                    * sizeof(reg_trie_state) );
1891         DEBUG_TRIE_COMPILE_MORE_r(
1892                 PerlIO_printf( Perl_debug_log,
1893                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1894                     (int)depth * 2 + 2,"",
1895                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1896                     (IV)next_alloc,
1897                     (IV)pos,
1898                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1899             );
1900
1901         } /* end table compress */
1902     }
1903     DEBUG_TRIE_COMPILE_MORE_r(
1904             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1905                 (int)depth * 2 + 2, "",
1906                 (UV)trie->statecount,
1907                 (UV)trie->lasttrans)
1908     );
1909     /* resize the trans array to remove unused space */
1910     trie->trans = (reg_trie_trans *)
1911         PerlMemShared_realloc( trie->trans, trie->lasttrans
1912                                * sizeof(reg_trie_trans) );
1913
1914     /* and now dump out the compressed format */
1915     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1916
1917     {   /* Modify the program and insert the new TRIE node*/
1918         U8 nodetype =(U8)(flags & 0xFF);
1919         char *str=NULL;
1920
1921 #ifdef DEBUGGING
1922         regnode *optimize = NULL;
1923 #ifdef RE_TRACK_PATTERN_OFFSETS
1924
1925         U32 mjd_offset = 0;
1926         U32 mjd_nodelen = 0;
1927 #endif /* RE_TRACK_PATTERN_OFFSETS */
1928 #endif /* DEBUGGING */
1929         /*
1930            This means we convert either the first branch or the first Exact,
1931            depending on whether the thing following (in 'last') is a branch
1932            or not and whther first is the startbranch (ie is it a sub part of
1933            the alternation or is it the whole thing.)
1934            Assuming its a sub part we conver the EXACT otherwise we convert
1935            the whole branch sequence, including the first.
1936          */
1937         /* Find the node we are going to overwrite */
1938         if ( first != startbranch || OP( last ) == BRANCH ) {
1939             /* branch sub-chain */
1940             NEXT_OFF( first ) = (U16)(last - first);
1941 #ifdef RE_TRACK_PATTERN_OFFSETS
1942             DEBUG_r({
1943                 mjd_offset= Node_Offset((convert));
1944                 mjd_nodelen= Node_Length((convert));
1945             });
1946 #endif
1947             /* whole branch chain */
1948         }
1949 #ifdef RE_TRACK_PATTERN_OFFSETS
1950         else {
1951             DEBUG_r({
1952                 const  regnode *nop = NEXTOPER( convert );
1953                 mjd_offset= Node_Offset((nop));
1954                 mjd_nodelen= Node_Length((nop));
1955             });
1956         }
1957         DEBUG_OPTIMISE_r(
1958             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1959                 (int)depth * 2 + 2, "",
1960                 (UV)mjd_offset, (UV)mjd_nodelen)
1961         );
1962 #endif
1963         /* But first we check to see if there is a common prefix we can
1964            split out as an EXACT and put in front of the TRIE node.  */
1965         trie->startstate= 1;
1966         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
1967             U32 state;
1968             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1969                 U32 ofs = 0;
1970                 I32 idx = -1;
1971                 U32 count = 0;
1972                 const U32 base = trie->states[ state ].trans.base;
1973
1974                 if ( trie->states[state].wordnum )
1975                         count = 1;
1976
1977                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1978                     if ( ( base + ofs >= trie->uniquecharcount ) &&
1979                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1980                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1981                     {
1982                         if ( ++count > 1 ) {
1983                             SV **tmp = av_fetch( revcharmap, ofs, 0);
1984                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1985                             if ( state == 1 ) break;
1986                             if ( count == 2 ) {
1987                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1988                                 DEBUG_OPTIMISE_r(
1989                                     PerlIO_printf(Perl_debug_log,
1990                                         "%*sNew Start State=%"UVuf" Class: [",
1991                                         (int)depth * 2 + 2, "",
1992                                         (UV)state));
1993                                 if (idx >= 0) {
1994                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
1995                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1996
1997                                     TRIE_BITMAP_SET(trie,*ch);
1998                                     if ( folder )
1999                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2000                                     DEBUG_OPTIMISE_r(
2001                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2002                                     );
2003                                 }
2004                             }
2005                             TRIE_BITMAP_SET(trie,*ch);
2006                             if ( folder )
2007                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2008                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2009                         }
2010                         idx = ofs;
2011                     }
2012                 }
2013                 if ( count == 1 ) {
2014                     SV **tmp = av_fetch( revcharmap, idx, 0);
2015                     STRLEN len;
2016                     char *ch = SvPV( *tmp, len );
2017                     DEBUG_OPTIMISE_r({
2018                         SV *sv=sv_newmortal();
2019                         PerlIO_printf( Perl_debug_log,
2020                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2021                             (int)depth * 2 + 2, "",
2022                             (UV)state, (UV)idx,
2023                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2024                                 PL_colors[0], PL_colors[1],
2025                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2026                                 PERL_PV_ESCAPE_FIRSTCHAR
2027                             )
2028                         );
2029                     });
2030                     if ( state==1 ) {
2031                         OP( convert ) = nodetype;
2032                         str=STRING(convert);
2033                         STR_LEN(convert)=0;
2034                     }
2035                     STR_LEN(convert) += len;
2036                     while (len--)
2037                         *str++ = *ch++;
2038                 } else {
2039 #ifdef DEBUGGING
2040                     if (state>1)
2041                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2042 #endif
2043                     break;
2044                 }
2045             }
2046             if (str) {
2047                 regnode *n = convert+NODE_SZ_STR(convert);
2048                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2049                 trie->startstate = state;
2050                 trie->minlen -= (state - 1);
2051                 trie->maxlen -= (state - 1);
2052 #ifdef DEBUGGING
2053                /* At least the UNICOS C compiler choked on this
2054                 * being argument to DEBUG_r(), so let's just have
2055                 * it right here. */
2056                if (
2057 #ifdef PERL_EXT_RE_BUILD
2058                    1
2059 #else
2060                    DEBUG_r_TEST
2061 #endif
2062                    ) {
2063                    regnode *fix = convert;
2064                    U32 word = trie->wordcount;
2065                    mjd_nodelen++;
2066                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2067                    while( ++fix < n ) {
2068                        Set_Node_Offset_Length(fix, 0, 0);
2069                    }
2070                    while (word--) {
2071                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2072                        if (tmp) {
2073                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2074                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2075                            else
2076                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2077                        }
2078                    }
2079                }
2080 #endif
2081                 if (trie->maxlen) {
2082                     convert = n;
2083                 } else {
2084                     NEXT_OFF(convert) = (U16)(tail - convert);
2085                     DEBUG_r(optimize= n);
2086                 }
2087             }
2088         }
2089         if (!jumper)
2090             jumper = last;
2091         if ( trie->maxlen ) {
2092             NEXT_OFF( convert ) = (U16)(tail - convert);
2093             ARG_SET( convert, data_slot );
2094             /* Store the offset to the first unabsorbed branch in
2095                jump[0], which is otherwise unused by the jump logic.
2096                We use this when dumping a trie and during optimisation. */
2097             if (trie->jump)
2098                 trie->jump[0] = (U16)(nextbranch - convert);
2099
2100             /* XXXX */
2101             if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2102                  ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2103             {
2104                 OP( convert ) = TRIEC;
2105                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2106                 PerlMemShared_free(trie->bitmap);
2107                 trie->bitmap= NULL;
2108             } else
2109                 OP( convert ) = TRIE;
2110
2111             /* store the type in the flags */
2112             convert->flags = nodetype;
2113             DEBUG_r({
2114             optimize = convert
2115                       + NODE_STEP_REGNODE
2116                       + regarglen[ OP( convert ) ];
2117             });
2118             /* XXX We really should free up the resource in trie now,
2119                    as we won't use them - (which resources?) dmq */
2120         }
2121         /* needed for dumping*/
2122         DEBUG_r(if (optimize) {
2123             regnode *opt = convert;
2124
2125             while ( ++opt < optimize) {
2126                 Set_Node_Offset_Length(opt,0,0);
2127             }
2128             /*
2129                 Try to clean up some of the debris left after the
2130                 optimisation.
2131              */
2132             while( optimize < jumper ) {
2133                 mjd_nodelen += Node_Length((optimize));
2134                 OP( optimize ) = OPTIMIZED;
2135                 Set_Node_Offset_Length(optimize,0,0);
2136                 optimize++;
2137             }
2138             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2139         });
2140     } /* end node insert */
2141     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2142 #ifdef DEBUGGING
2143     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2144     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2145 #else
2146     SvREFCNT_dec(revcharmap);
2147 #endif
2148     return trie->jump
2149            ? MADE_JUMP_TRIE
2150            : trie->startstate>1
2151              ? MADE_EXACT_TRIE
2152              : MADE_TRIE;
2153 }
2154
2155 STATIC void
2156 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2157 {
2158 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2159
2160    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2161    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2162    ISBN 0-201-10088-6
2163
2164    We find the fail state for each state in the trie, this state is the longest proper
2165    suffix of the current states 'word' that is also a proper prefix of another word in our
2166    trie. State 1 represents the word '' and is the thus the default fail state. This allows
2167    the DFA not to have to restart after its tried and failed a word at a given point, it
2168    simply continues as though it had been matching the other word in the first place.
2169    Consider
2170       'abcdgu'=~/abcdefg|cdgu/
2171    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2172    fail, which would bring use to the state representing 'd' in the second word where we would
2173    try 'g' and succeed, prodceding to match 'cdgu'.
2174  */
2175  /* add a fail transition */
2176     const U32 trie_offset = ARG(source);
2177     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2178     U32 *q;
2179     const U32 ucharcount = trie->uniquecharcount;
2180     const U32 numstates = trie->statecount;
2181     const U32 ubound = trie->lasttrans + ucharcount;
2182     U32 q_read = 0;
2183     U32 q_write = 0;
2184     U32 charid;
2185     U32 base = trie->states[ 1 ].trans.base;
2186     U32 *fail;
2187     reg_ac_data *aho;
2188     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2189     GET_RE_DEBUG_FLAGS_DECL;
2190
2191     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2192 #ifndef DEBUGGING
2193     PERL_UNUSED_ARG(depth);
2194 #endif
2195
2196
2197     ARG_SET( stclass, data_slot );
2198     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2199     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2200     aho->trie=trie_offset;
2201     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2202     Copy( trie->states, aho->states, numstates, reg_trie_state );
2203     Newxz( q, numstates, U32);
2204     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2205     aho->refcount = 1;
2206     fail = aho->fail;
2207     /* initialize fail[0..1] to be 1 so that we always have
2208        a valid final fail state */
2209     fail[ 0 ] = fail[ 1 ] = 1;
2210
2211     for ( charid = 0; charid < ucharcount ; charid++ ) {
2212         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2213         if ( newstate ) {
2214             q[ q_write ] = newstate;
2215             /* set to point at the root */
2216             fail[ q[ q_write++ ] ]=1;
2217         }
2218     }
2219     while ( q_read < q_write) {
2220         const U32 cur = q[ q_read++ % numstates ];
2221         base = trie->states[ cur ].trans.base;
2222
2223         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2224             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2225             if (ch_state) {
2226                 U32 fail_state = cur;
2227                 U32 fail_base;
2228                 do {
2229                     fail_state = fail[ fail_state ];
2230                     fail_base = aho->states[ fail_state ].trans.base;
2231                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2232
2233                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2234                 fail[ ch_state ] = fail_state;
2235                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2236                 {
2237                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2238                 }
2239                 q[ q_write++ % numstates] = ch_state;
2240             }
2241         }
2242     }
2243     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2244        when we fail in state 1, this allows us to use the
2245        charclass scan to find a valid start char. This is based on the principle
2246        that theres a good chance the string being searched contains lots of stuff
2247        that cant be a start char.
2248      */
2249     fail[ 0 ] = fail[ 1 ] = 0;
2250     DEBUG_TRIE_COMPILE_r({
2251         PerlIO_printf(Perl_debug_log,
2252                       "%*sStclass Failtable (%"UVuf" states): 0",
2253                       (int)(depth * 2), "", (UV)numstates
2254         );
2255         for( q_read=1; q_read<numstates; q_read++ ) {
2256             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2257         }
2258         PerlIO_printf(Perl_debug_log, "\n");
2259     });
2260     Safefree(q);
2261     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2262 }
2263
2264
2265 /*
2266  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2267  * These need to be revisited when a newer toolchain becomes available.
2268  */
2269 #if defined(__sparc64__) && defined(__GNUC__)
2270 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2271 #       undef  SPARC64_GCC_WORKAROUND
2272 #       define SPARC64_GCC_WORKAROUND 1
2273 #   endif
2274 #endif
2275
2276 #define DEBUG_PEEP(str,scan,depth) \
2277     DEBUG_OPTIMISE_r({if (scan){ \
2278        SV * const mysv=sv_newmortal(); \
2279        regnode *Next = regnext(scan); \
2280        regprop(RExC_rx, mysv, scan); \
2281        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2282        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2283        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2284    }});
2285
2286
2287
2288
2289
2290 #define JOIN_EXACT(scan,min,flags) \
2291     if (PL_regkind[OP(scan)] == EXACT) \
2292         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2293
2294 STATIC U32
2295 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2296     /* Merge several consecutive EXACTish nodes into one. */
2297     regnode *n = regnext(scan);
2298     U32 stringok = 1;
2299     regnode *next = scan + NODE_SZ_STR(scan);
2300     U32 merged = 0;
2301     U32 stopnow = 0;
2302 #ifdef DEBUGGING
2303     regnode *stop = scan;
2304     GET_RE_DEBUG_FLAGS_DECL;
2305 #else
2306     PERL_UNUSED_ARG(depth);
2307 #endif
2308
2309     PERL_ARGS_ASSERT_JOIN_EXACT;
2310 #ifndef EXPERIMENTAL_INPLACESCAN
2311     PERL_UNUSED_ARG(flags);
2312     PERL_UNUSED_ARG(val);
2313 #endif
2314     DEBUG_PEEP("join",scan,depth);
2315
2316     /* Skip NOTHING, merge EXACT*. */
2317     while (n &&
2318            ( PL_regkind[OP(n)] == NOTHING ||
2319              (stringok && (OP(n) == OP(scan))))
2320            && NEXT_OFF(n)
2321            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2322
2323         if (OP(n) == TAIL || n > next)
2324             stringok = 0;
2325         if (PL_regkind[OP(n)] == NOTHING) {
2326             DEBUG_PEEP("skip:",n,depth);
2327             NEXT_OFF(scan) += NEXT_OFF(n);
2328             next = n + NODE_STEP_REGNODE;
2329 #ifdef DEBUGGING
2330             if (stringok)
2331                 stop = n;
2332 #endif
2333             n = regnext(n);
2334         }
2335         else if (stringok) {
2336             const unsigned int oldl = STR_LEN(scan);
2337             regnode * const nnext = regnext(n);
2338
2339             DEBUG_PEEP("merg",n,depth);
2340
2341             merged++;
2342             if (oldl + STR_LEN(n) > U8_MAX)
2343                 break;
2344             NEXT_OFF(scan) += NEXT_OFF(n);
2345             STR_LEN(scan) += STR_LEN(n);
2346             next = n + NODE_SZ_STR(n);
2347             /* Now we can overwrite *n : */
2348             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2349 #ifdef DEBUGGING
2350             stop = next - 1;
2351 #endif
2352             n = nnext;
2353             if (stopnow) break;
2354         }
2355
2356 #ifdef EXPERIMENTAL_INPLACESCAN
2357         if (flags && !NEXT_OFF(n)) {
2358             DEBUG_PEEP("atch", val, depth);
2359             if (reg_off_by_arg[OP(n)]) {
2360                 ARG_SET(n, val - n);
2361             }
2362             else {
2363                 NEXT_OFF(n) = val - n;
2364             }
2365             stopnow = 1;
2366         }
2367 #endif
2368     }
2369
2370     if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2371     /*
2372     Two problematic code points in Unicode casefolding of EXACT nodes:
2373
2374     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2375     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2376
2377     which casefold to
2378
2379     Unicode                      UTF-8
2380
2381     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2382     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2383
2384     This means that in case-insensitive matching (or "loose matching",
2385     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2386     length of the above casefolded versions) can match a target string
2387     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2388     This would rather mess up the minimum length computation.
2389
2390     What we'll do is to look for the tail four bytes, and then peek
2391     at the preceding two bytes to see whether we need to decrease
2392     the minimum length by four (six minus two).
2393
2394     Thanks to the design of UTF-8, there cannot be false matches:
2395     A sequence of valid UTF-8 bytes cannot be a subsequence of
2396     another valid sequence of UTF-8 bytes.
2397
2398     */
2399          char * const s0 = STRING(scan), *s, *t;
2400          char * const s1 = s0 + STR_LEN(scan) - 1;
2401          char * const s2 = s1 - 4;
2402 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2403          const char t0[] = "\xaf\x49\xaf\x42";
2404 #else
2405          const char t0[] = "\xcc\x88\xcc\x81";
2406 #endif
2407          const char * const t1 = t0 + 3;
2408
2409          for (s = s0 + 2;
2410               s < s2 && (t = ninstr(s, s1, t0, t1));
2411               s = t + 4) {
2412 #ifdef EBCDIC
2413               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2414                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2415 #else
2416               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2417                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2418 #endif
2419                    *min -= 4;
2420          }
2421     }
2422
2423 #ifdef DEBUGGING
2424     /* Allow dumping */
2425     n = scan + NODE_SZ_STR(scan);
2426     while (n <= stop) {
2427         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2428             OP(n) = OPTIMIZED;
2429             NEXT_OFF(n) = 0;
2430         }
2431         n++;
2432     }
2433 #endif
2434     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2435     return stopnow;
2436 }
2437
2438 /* REx optimizer.  Converts nodes into quickier variants "in place".
2439    Finds fixed substrings.  */
2440
2441 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2442    to the position after last scanned or to NULL. */
2443
2444 #define INIT_AND_WITHP \
2445     assert(!and_withp); \
2446     Newx(and_withp,1,struct regnode_charclass_class); \
2447     SAVEFREEPV(and_withp)
2448
2449 /* this is a chain of data about sub patterns we are processing that
2450    need to be handled seperately/specially in study_chunk. Its so
2451    we can simulate recursion without losing state.  */
2452 struct scan_frame;
2453 typedef struct scan_frame {
2454     regnode *last;  /* last node to process in this frame */
2455     regnode *next;  /* next node to process when last is reached */
2456     struct scan_frame *prev; /*previous frame*/
2457     I32 stop; /* what stopparen do we use */
2458 } scan_frame;
2459
2460
2461 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2462
2463 #define CASE_SYNST_FNC(nAmE)                                       \
2464 case nAmE:                                                         \
2465     if (flags & SCF_DO_STCLASS_AND) {                              \
2466             for (value = 0; value < 256; value++)                  \
2467                 if (!is_ ## nAmE ## _cp(value))                       \
2468                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2469     }                                                              \
2470     else {                                                         \
2471             for (value = 0; value < 256; value++)                  \
2472                 if (is_ ## nAmE ## _cp(value))                        \
2473                     ANYOF_BITMAP_SET(data->start_class, value);    \
2474     }                                                              \
2475     break;                                                         \
2476 case N ## nAmE:                                                    \
2477     if (flags & SCF_DO_STCLASS_AND) {                              \
2478             for (value = 0; value < 256; value++)                   \
2479                 if (is_ ## nAmE ## _cp(value))                         \
2480                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2481     }                                                               \
2482     else {                                                          \
2483             for (value = 0; value < 256; value++)                   \
2484                 if (!is_ ## nAmE ## _cp(value))                        \
2485                     ANYOF_BITMAP_SET(data->start_class, value);     \
2486     }                                                               \
2487     break
2488
2489
2490
2491 STATIC I32
2492 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2493                         I32 *minlenp, I32 *deltap,
2494                         regnode *last,
2495                         scan_data_t *data,
2496                         I32 stopparen,
2497                         U8* recursed,
2498                         struct regnode_charclass_class *and_withp,
2499                         U32 flags, U32 depth)
2500                         /* scanp: Start here (read-write). */
2501                         /* deltap: Write maxlen-minlen here. */
2502                         /* last: Stop before this one. */
2503                         /* data: string data about the pattern */
2504                         /* stopparen: treat close N as END */
2505                         /* recursed: which subroutines have we recursed into */
2506                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2507 {
2508     dVAR;
2509     I32 min = 0, pars = 0, code;
2510     regnode *scan = *scanp, *next;
2511     I32 delta = 0;
2512     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2513     int is_inf_internal = 0;            /* The studied chunk is infinite */
2514     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2515     scan_data_t data_fake;
2516     SV *re_trie_maxbuff = NULL;
2517     regnode *first_non_open = scan;
2518     I32 stopmin = I32_MAX;
2519     scan_frame *frame = NULL;
2520     GET_RE_DEBUG_FLAGS_DECL;
2521
2522     PERL_ARGS_ASSERT_STUDY_CHUNK;
2523
2524 #ifdef DEBUGGING
2525     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2526 #endif
2527
2528     if ( depth == 0 ) {
2529         while (first_non_open && OP(first_non_open) == OPEN)
2530             first_non_open=regnext(first_non_open);
2531     }
2532
2533
2534   fake_study_recurse:
2535     while ( scan && OP(scan) != END && scan < last ){
2536         /* Peephole optimizer: */
2537         DEBUG_STUDYDATA("Peep:", data,depth);
2538         DEBUG_PEEP("Peep",scan,depth);
2539         JOIN_EXACT(scan,&min,0);
2540
2541         /* Follow the next-chain of the current node and optimize
2542            away all the NOTHINGs from it.  */
2543         if (OP(scan) != CURLYX) {
2544             const int max = (reg_off_by_arg[OP(scan)]
2545                        ? I32_MAX
2546                        /* I32 may be smaller than U16 on CRAYs! */
2547                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2548             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2549             int noff;
2550             regnode *n = scan;
2551
2552             /* Skip NOTHING and LONGJMP. */
2553             while ((n = regnext(n))
2554                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2555                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2556                    && off + noff < max)
2557                 off += noff;
2558             if (reg_off_by_arg[OP(scan)])
2559                 ARG(scan) = off;
2560             else
2561                 NEXT_OFF(scan) = off;
2562         }
2563
2564
2565
2566         /* The principal pseudo-switch.  Cannot be a switch, since we
2567            look into several different things.  */
2568         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2569                    || OP(scan) == IFTHEN) {
2570             next = regnext(scan);
2571             code = OP(scan);
2572             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2573
2574             if (OP(next) == code || code == IFTHEN) {
2575                 /* NOTE - There is similar code to this block below for handling
2576                    TRIE nodes on a re-study.  If you change stuff here check there
2577                    too. */
2578                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2579                 struct regnode_charclass_class accum;
2580                 regnode * const startbranch=scan;
2581
2582                 if (flags & SCF_DO_SUBSTR)
2583                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2584                 if (flags & SCF_DO_STCLASS)
2585                     cl_init_zero(pRExC_state, &accum);
2586
2587                 while (OP(scan) == code) {
2588                     I32 deltanext, minnext, f = 0, fake;
2589                     struct regnode_charclass_class this_class;
2590
2591                     num++;
2592                     data_fake.flags = 0;
2593                     if (data) {
2594                         data_fake.whilem_c = data->whilem_c;
2595                         data_fake.last_closep = data->last_closep;
2596                     }
2597                     else
2598                         data_fake.last_closep = &fake;
2599
2600                     data_fake.pos_delta = delta;
2601                     next = regnext(scan);
2602                     scan = NEXTOPER(scan);
2603                     if (code != BRANCH)
2604                         scan = NEXTOPER(scan);
2605                     if (flags & SCF_DO_STCLASS) {
2606                         cl_init(pRExC_state, &this_class);
2607                         data_fake.start_class = &this_class;
2608                         f = SCF_DO_STCLASS_AND;
2609                     }
2610                     if (flags & SCF_WHILEM_VISITED_POS)
2611                         f |= SCF_WHILEM_VISITED_POS;
2612
2613                     /* we suppose the run is continuous, last=next...*/
2614                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2615                                           next, &data_fake,
2616                                           stopparen, recursed, NULL, f,depth+1);
2617                     if (min1 > minnext)
2618                         min1 = minnext;
2619                     if (max1 < minnext + deltanext)
2620                         max1 = minnext + deltanext;
2621                     if (deltanext == I32_MAX)
2622                         is_inf = is_inf_internal = 1;
2623                     scan = next;
2624                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2625                         pars++;
2626                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2627                         if ( stopmin > minnext)
2628                             stopmin = min + min1;
2629                         flags &= ~SCF_DO_SUBSTR;
2630                         if (data)
2631                             data->flags |= SCF_SEEN_ACCEPT;
2632                     }
2633                     if (data) {
2634                         if (data_fake.flags & SF_HAS_EVAL)
2635                             data->flags |= SF_HAS_EVAL;
2636                         data->whilem_c = data_fake.whilem_c;
2637                     }
2638                     if (flags & SCF_DO_STCLASS)
2639                         cl_or(pRExC_state, &accum, &this_class);
2640                 }
2641                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2642                     min1 = 0;
2643                 if (flags & SCF_DO_SUBSTR) {
2644                     data->pos_min += min1;
2645                     data->pos_delta += max1 - min1;
2646                     if (max1 != min1 || is_inf)
2647                         data->longest = &(data->longest_float);
2648                 }
2649                 min += min1;
2650                 delta += max1 - min1;
2651                 if (flags & SCF_DO_STCLASS_OR) {
2652                     cl_or(pRExC_state, data->start_class, &accum);
2653                     if (min1) {
2654                         cl_and(data->start_class, and_withp);
2655                         flags &= ~SCF_DO_STCLASS;
2656                     }
2657                 }
2658                 else if (flags & SCF_DO_STCLASS_AND) {
2659                     if (min1) {
2660                         cl_and(data->start_class, &accum);
2661                         flags &= ~SCF_DO_STCLASS;
2662                     }
2663                     else {
2664                         /* Switch to OR mode: cache the old value of
2665                          * data->start_class */
2666                         INIT_AND_WITHP;
2667                         StructCopy(data->start_class, and_withp,
2668                                    struct regnode_charclass_class);
2669                         flags &= ~SCF_DO_STCLASS_AND;
2670                         StructCopy(&accum, data->start_class,
2671                                    struct regnode_charclass_class);
2672                         flags |= SCF_DO_STCLASS_OR;
2673                         data->start_class->flags |= ANYOF_EOS;
2674                     }
2675                 }
2676
2677                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2678                 /* demq.
2679
2680                    Assuming this was/is a branch we are dealing with: 'scan' now
2681                    points at the item that follows the branch sequence, whatever
2682                    it is. We now start at the beginning of the sequence and look
2683                    for subsequences of
2684
2685                    BRANCH->EXACT=>x1
2686                    BRANCH->EXACT=>x2
2687                    tail
2688
2689                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2690
2691                    If we can find such a subseqence we need to turn the first
2692                    element into a trie and then add the subsequent branch exact
2693                    strings to the trie.
2694
2695                    We have two cases
2696
2697                      1. patterns where the whole set of branch can be converted.
2698
2699                      2. patterns where only a subset can be converted.
2700
2701                    In case 1 we can replace the whole set with a single regop
2702                    for the trie. In case 2 we need to keep the start and end
2703                    branchs so
2704
2705                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2706                      becomes BRANCH TRIE; BRANCH X;
2707
2708                   There is an additional case, that being where there is a
2709                   common prefix, which gets split out into an EXACT like node
2710                   preceding the TRIE node.
2711
2712                   If x(1..n)==tail then we can do a simple trie, if not we make
2713                   a "jump" trie, such that when we match the appropriate word
2714                   we "jump" to the appopriate tail node. Essentailly we turn
2715                   a nested if into a case structure of sorts.
2716
2717                 */
2718
2719                     int made=0;
2720                     if (!re_trie_maxbuff) {
2721                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2722                         if (!SvIOK(re_trie_maxbuff))
2723                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2724                     }
2725                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2726                         regnode *cur;
2727                         regnode *first = (regnode *)NULL;
2728                         regnode *last = (regnode *)NULL;
2729                         regnode *tail = scan;
2730                         U8 optype = 0;
2731                         U32 count=0;
2732
2733 #ifdef DEBUGGING
2734                         SV * const mysv = sv_newmortal();       /* for dumping */
2735 #endif
2736                         /* var tail is used because there may be a TAIL
2737                            regop in the way. Ie, the exacts will point to the
2738                            thing following the TAIL, but the last branch will
2739                            point at the TAIL. So we advance tail. If we
2740                            have nested (?:) we may have to move through several
2741                            tails.
2742                          */
2743
2744                         while ( OP( tail ) == TAIL ) {
2745                             /* this is the TAIL generated by (?:) */
2746                             tail = regnext( tail );
2747                         }
2748
2749
2750                         DEBUG_OPTIMISE_r({
2751                             regprop(RExC_rx, mysv, tail );
2752                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2753                                 (int)depth * 2 + 2, "",
2754                                 "Looking for TRIE'able sequences. Tail node is: ",
2755                                 SvPV_nolen_const( mysv )
2756                             );
2757                         });
2758
2759                         /*
2760
2761                            step through the branches, cur represents each
2762                            branch, noper is the first thing to be matched
2763                            as part of that branch and noper_next is the
2764                            regnext() of that node. if noper is an EXACT
2765                            and noper_next is the same as scan (our current
2766                            position in the regex) then the EXACT branch is
2767                            a possible optimization target. Once we have
2768                            two or more consequetive such branches we can
2769                            create a trie of the EXACT's contents and stich
2770                            it in place. If the sequence represents all of
2771                            the branches we eliminate the whole thing and
2772                            replace it with a single TRIE. If it is a
2773                            subsequence then we need to stitch it in. This
2774                            means the first branch has to remain, and needs
2775                            to be repointed at the item on the branch chain
2776                            following the last branch optimized. This could
2777                            be either a BRANCH, in which case the
2778                            subsequence is internal, or it could be the
2779                            item following the branch sequence in which
2780                            case the subsequence is at the end.
2781
2782                         */
2783
2784                         /* dont use tail as the end marker for this traverse */
2785                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2786                             regnode * const noper = NEXTOPER( cur );
2787 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2788                             regnode * const noper_next = regnext( noper );
2789 #endif
2790
2791                             DEBUG_OPTIMISE_r({
2792                                 regprop(RExC_rx, mysv, cur);
2793                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2794                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2795
2796                                 regprop(RExC_rx, mysv, noper);
2797                                 PerlIO_printf( Perl_debug_log, " -> %s",
2798                                     SvPV_nolen_const(mysv));
2799
2800                                 if ( noper_next ) {
2801                                   regprop(RExC_rx, mysv, noper_next );
2802                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2803                                     SvPV_nolen_const(mysv));
2804                                 }
2805                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2806                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2807                             });
2808                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2809                                          : PL_regkind[ OP( noper ) ] == EXACT )
2810                                   || OP(noper) == NOTHING )
2811 #ifdef NOJUMPTRIE
2812                                   && noper_next == tail
2813 #endif
2814                                   && count < U16_MAX)
2815                             {
2816                                 count++;
2817                                 if ( !first || optype == NOTHING ) {
2818                                     if (!first) first = cur;
2819                                     optype = OP( noper );
2820                                 } else {
2821                                     last = cur;
2822                                 }
2823                             } else {
2824 /*
2825     Currently we assume that the trie can handle unicode and ascii
2826     matches fold cased matches. If this proves true then the following
2827     define will prevent tries in this situation.
2828
2829     #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2830 */
2831 #define TRIE_TYPE_IS_SAFE 1
2832                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2833                                     make_trie( pRExC_state,
2834                                             startbranch, first, cur, tail, count,
2835                                             optype, depth+1 );
2836                                 }
2837                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2838 #ifdef NOJUMPTRIE
2839                                      && noper_next == tail
2840 #endif
2841                                 ){
2842                                     count = 1;
2843                                     first = cur;
2844                                     optype = OP( noper );
2845                                 } else {
2846                                     count = 0;
2847                                     first = NULL;
2848                                     optype = 0;
2849                                 }
2850                                 last = NULL;
2851                             }
2852                         }
2853                         DEBUG_OPTIMISE_r({
2854                             regprop(RExC_rx, mysv, cur);
2855                             PerlIO_printf( Perl_debug_log,
2856                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2857                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2858
2859                         });
2860
2861                         if ( last && TRIE_TYPE_IS_SAFE ) {
2862                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2863 #ifdef TRIE_STUDY_OPT
2864                             if ( ((made == MADE_EXACT_TRIE &&
2865                                  startbranch == first)
2866                                  || ( first_non_open == first )) &&
2867                                  depth==0 ) {
2868                                 flags |= SCF_TRIE_RESTUDY;
2869                                 if ( startbranch == first
2870                                      && scan == tail )
2871                                 {
2872                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2873                                 }
2874                             }
2875 #endif
2876                         }
2877                     }
2878
2879                 } /* do trie */
2880
2881             }
2882             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2883                 scan = NEXTOPER(NEXTOPER(scan));
2884             } else                      /* single branch is optimized. */
2885                 scan = NEXTOPER(scan);
2886             continue;
2887         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2888             scan_frame *newframe = NULL;
2889             I32 paren;
2890             regnode *start;
2891             regnode *end;
2892
2893             if (OP(scan) != SUSPEND) {
2894             /* set the pointer */
2895                 if (OP(scan) == GOSUB) {
2896                     paren = ARG(scan);
2897                     RExC_recurse[ARG2L(scan)] = scan;
2898                     start = RExC_open_parens[paren-1];
2899                     end   = RExC_close_parens[paren-1];
2900                 } else {
2901                     paren = 0;
2902                     start = RExC_rxi->program + 1;
2903                     end   = RExC_opend;
2904                 }
2905                 if (!recursed) {
2906                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2907                     SAVEFREEPV(recursed);
2908                 }
2909                 if (!PAREN_TEST(recursed,paren+1)) {
2910                     PAREN_SET(recursed,paren+1);
2911                     Newx(newframe,1,scan_frame);
2912                 } else {
2913                     if (flags & SCF_DO_SUBSTR) {
2914                         SCAN_COMMIT(pRExC_state,data,minlenp);
2915                         data->longest = &(data->longest_float);
2916                     }
2917                     is_inf = is_inf_internal = 1;
2918                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2919                         cl_anything(pRExC_state, data->start_class);
2920                     flags &= ~SCF_DO_STCLASS;
2921                 }
2922             } else {
2923                 Newx(newframe,1,scan_frame);
2924                 paren = stopparen;
2925                 start = scan+2;
2926                 end = regnext(scan);
2927             }
2928             if (newframe) {
2929                 assert(start);
2930                 assert(end);
2931                 SAVEFREEPV(newframe);
2932                 newframe->next = regnext(scan);
2933                 newframe->last = last;
2934                 newframe->stop = stopparen;
2935                 newframe->prev = frame;
2936
2937                 frame = newframe;
2938                 scan =  start;
2939                 stopparen = paren;
2940                 last = end;
2941
2942                 continue;
2943             }
2944         }
2945         else if (OP(scan) == EXACT) {
2946             I32 l = STR_LEN(scan);
2947             UV uc;
2948             if (UTF) {
2949                 const U8 * const s = (U8*)STRING(scan);
2950                 l = utf8_length(s, s + l);
2951                 uc = utf8_to_uvchr(s, NULL);
2952             } else {
2953                 uc = *((U8*)STRING(scan));
2954             }
2955             min += l;
2956             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2957                 /* The code below prefers earlier match for fixed
2958                    offset, later match for variable offset.  */
2959                 if (data->last_end == -1) { /* Update the start info. */
2960                     data->last_start_min = data->pos_min;
2961                     data->last_start_max = is_inf
2962                         ? I32_MAX : data->pos_min + data->pos_delta;
2963                 }
2964                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2965                 if (UTF)
2966                     SvUTF8_on(data->last_found);
2967                 {
2968                     SV * const sv = data->last_found;
2969                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2970                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
2971                     if (mg && mg->mg_len >= 0)
2972                         mg->mg_len += utf8_length((U8*)STRING(scan),
2973                                                   (U8*)STRING(scan)+STR_LEN(scan));
2974                 }
2975                 data->last_end = data->pos_min + l;
2976                 data->pos_min += l; /* As in the first entry. */
2977                 data->flags &= ~SF_BEFORE_EOL;
2978             }
2979             if (flags & SCF_DO_STCLASS_AND) {
2980                 /* Check whether it is compatible with what we know already! */
2981                 int compat = 1;
2982
2983                 if (uc >= 0x100 ||
2984                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2985                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
2986                     && (!(data->start_class->flags & ANYOF_FOLD)
2987                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2988                     )
2989                     compat = 0;
2990                 ANYOF_CLASS_ZERO(data->start_class);
2991                 ANYOF_BITMAP_ZERO(data->start_class);
2992                 if (compat)
2993                     ANYOF_BITMAP_SET(data->start_class, uc);
2994                 data->start_class->flags &= ~ANYOF_EOS;
2995                 if (uc < 0x100)
2996                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2997             }
2998             else if (flags & SCF_DO_STCLASS_OR) {
2999                 /* false positive possible if the class is case-folded */
3000                 if (uc < 0x100)
3001                     ANYOF_BITMAP_SET(data->start_class, uc);
3002                 else
3003                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3004                 data->start_class->flags &= ~ANYOF_EOS;
3005                 cl_and(data->start_class, and_withp);
3006             }
3007             flags &= ~SCF_DO_STCLASS;
3008         }
3009         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3010             I32 l = STR_LEN(scan);
3011             UV uc = *((U8*)STRING(scan));
3012
3013             /* Search for fixed substrings supports EXACT only. */
3014             if (flags & SCF_DO_SUBSTR) {
3015                 assert(data);
3016                 SCAN_COMMIT(pRExC_state, data, minlenp);
3017             }
3018             if (UTF) {
3019                 const U8 * const s = (U8 *)STRING(scan);
3020                 l = utf8_length(s, s + l);
3021                 uc = utf8_to_uvchr(s, NULL);
3022             }
3023             min += l;
3024             if (flags & SCF_DO_SUBSTR)
3025                 data->pos_min += l;
3026             if (flags & SCF_DO_STCLASS_AND) {
3027                 /* Check whether it is compatible with what we know already! */
3028                 int compat = 1;
3029
3030                 if (uc >= 0x100 ||
3031                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3032                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3033                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3034                     compat = 0;
3035                 ANYOF_CLASS_ZERO(data->start_class);
3036                 ANYOF_BITMAP_ZERO(data->start_class);
3037                 if (compat) {
3038                     ANYOF_BITMAP_SET(data->start_class, uc);
3039                     data->start_class->flags &= ~ANYOF_EOS;
3040                     data->start_class->flags |= ANYOF_FOLD;
3041                     if (OP(scan) == EXACTFL)
3042                         data->start_class->flags |= ANYOF_LOCALE;
3043                 }
3044             }
3045             else if (flags & SCF_DO_STCLASS_OR) {
3046                 if (data->start_class->flags & ANYOF_FOLD) {
3047                     /* false positive possible if the class is case-folded.
3048                        Assume that the locale settings are the same... */
3049                     if (uc < 0x100)
3050                         ANYOF_BITMAP_SET(data->start_class, uc);
3051                     data->start_class->flags &= ~ANYOF_EOS;
3052                 }
3053                 cl_and(data->start_class, and_withp);
3054             }
3055             flags &= ~SCF_DO_STCLASS;
3056         }
3057         else if (strchr((const char*)PL_varies,OP(scan))) {
3058             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3059             I32 f = flags, pos_before = 0;
3060             regnode * const oscan = scan;
3061             struct regnode_charclass_class this_class;
3062             struct regnode_charclass_class *oclass = NULL;
3063             I32 next_is_eval = 0;
3064
3065             switch (PL_regkind[OP(scan)]) {
3066             case WHILEM:                /* End of (?:...)* . */
3067                 scan = NEXTOPER(scan);
3068                 goto finish;
3069             case PLUS:
3070                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3071                     next = NEXTOPER(scan);
3072                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3073                         mincount = 1;
3074                         maxcount = REG_INFTY;
3075                         next = regnext(scan);
3076                         scan = NEXTOPER(scan);
3077                         goto do_curly;
3078                     }
3079                 }
3080                 if (flags & SCF_DO_SUBSTR)
3081                     data->pos_min++;
3082                 min++;
3083                 /* Fall through. */
3084             case STAR:
3085                 if (flags & SCF_DO_STCLASS) {
3086                     mincount = 0;
3087                     maxcount = REG_INFTY;
3088                     next = regnext(scan);
3089                     scan = NEXTOPER(scan);
3090                     goto do_curly;
3091                 }
3092                 is_inf = is_inf_internal = 1;
3093                 scan = regnext(scan);
3094                 if (flags & SCF_DO_SUBSTR) {
3095                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3096                     data->longest = &(data->longest_float);
3097                 }
3098                 goto optimize_curly_tail;
3099             case CURLY:
3100                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3101                     && (scan->flags == stopparen))
3102                 {
3103                     mincount = 1;
3104                     maxcount = 1;
3105                 } else {
3106                     mincount = ARG1(scan);
3107                     maxcount = ARG2(scan);
3108                 }
3109                 next = regnext(scan);
3110                 if (OP(scan) == CURLYX) {
3111                     I32 lp = (data ? *(data->last_closep) : 0);
3112                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3113                 }
3114                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3115                 next_is_eval = (OP(scan) == EVAL);
3116               do_curly:
3117                 if (flags & SCF_DO_SUBSTR) {
3118                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3119                     pos_before = data->pos_min;
3120                 }
3121                 if (data) {
3122                     fl = data->flags;
3123                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3124                     if (is_inf)
3125                         data->flags |= SF_IS_INF;
3126                 }
3127                 if (flags & SCF_DO_STCLASS) {
3128                     cl_init(pRExC_state, &this_class);
3129                     oclass = data->start_class;
3130                     data->start_class = &this_class;
3131                     f |= SCF_DO_STCLASS_AND;
3132                     f &= ~SCF_DO_STCLASS_OR;
3133                 }
3134                 /* These are the cases when once a subexpression
3135                    fails at a particular position, it cannot succeed
3136                    even after backtracking at the enclosing scope.
3137
3138                    XXXX what if minimal match and we are at the
3139                         initial run of {n,m}? */
3140                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3141                     f &= ~SCF_WHILEM_VISITED_POS;
3142
3143                 /* This will finish on WHILEM, setting scan, or on NULL: */
3144                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3145                                       last, data, stopparen, recursed, NULL,
3146                                       (mincount == 0
3147                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3148
3149                 if (flags & SCF_DO_STCLASS)
3150                     data->start_class = oclass;
3151                 if (mincount == 0 || minnext == 0) {
3152                     if (flags & SCF_DO_STCLASS_OR) {
3153                         cl_or(pRExC_state, data->start_class, &this_class);
3154                     }
3155                     else if (flags & SCF_DO_STCLASS_AND) {
3156                         /* Switch to OR mode: cache the old value of
3157                          * data->start_class */
3158                         INIT_AND_WITHP;
3159                         StructCopy(data->start_class, and_withp,
3160                                    struct regnode_charclass_class);
3161                         flags &= ~SCF_DO_STCLASS_AND;
3162                         StructCopy(&this_class, data->start_class,
3163                                    struct regnode_charclass_class);
3164                         flags |= SCF_DO_STCLASS_OR;
3165                         data->start_class->flags |= ANYOF_EOS;
3166                     }
3167                 } else {                /* Non-zero len */
3168                     if (flags & SCF_DO_STCLASS_OR) {
3169                         cl_or(pRExC_state, data->start_class, &this_class);
3170                         cl_and(data->start_class, and_withp);
3171                     }
3172                     else if (flags & SCF_DO_STCLASS_AND)
3173                         cl_and(data->start_class, &this_class);
3174                     flags &= ~SCF_DO_STCLASS;
3175                 }
3176                 if (!scan)              /* It was not CURLYX, but CURLY. */
3177                     scan = next;
3178                 if ( /* ? quantifier ok, except for (?{ ... }) */
3179                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3180                     && (minnext == 0) && (deltanext == 0)
3181                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3182                     && maxcount <= REG_INFTY/3 /* Complement check for big count */
3183                     && ckWARN(WARN_REGEXP))
3184                 {
3185                     vWARN(RExC_parse,
3186                           "Quantifier unexpected on zero-length expression");
3187                 }
3188
3189                 min += minnext * mincount;
3190                 is_inf_internal |= ((maxcount == REG_INFTY
3191                                      && (minnext + deltanext) > 0)
3192                                     || deltanext == I32_MAX);
3193                 is_inf |= is_inf_internal;
3194                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3195
3196                 /* Try powerful optimization CURLYX => CURLYN. */
3197                 if (  OP(oscan) == CURLYX && data
3198                       && data->flags & SF_IN_PAR
3199                       && !(data->flags & SF_HAS_EVAL)
3200                       && !deltanext && minnext == 1 ) {
3201                     /* Try to optimize to CURLYN.  */
3202                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3203                     regnode * const nxt1 = nxt;
3204 #ifdef DEBUGGING
3205                     regnode *nxt2;
3206 #endif
3207
3208                     /* Skip open. */
3209                     nxt = regnext(nxt);
3210                     if (!strchr((const char*)PL_simple,OP(nxt))
3211                         && !(PL_regkind[OP(nxt)] == EXACT
3212                              && STR_LEN(nxt) == 1))
3213                         goto nogo;
3214 #ifdef DEBUGGING
3215                     nxt2 = nxt;
3216 #endif
3217                     nxt = regnext(nxt);
3218                     if (OP(nxt) != CLOSE)
3219                         goto nogo;
3220                     if (RExC_open_parens) {
3221                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3222                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3223                     }
3224                     /* Now we know that nxt2 is the only contents: */
3225                     oscan->flags = (U8)ARG(nxt);
3226                     OP(oscan) = CURLYN;
3227                     OP(nxt1) = NOTHING; /* was OPEN. */
3228
3229 #ifdef DEBUGGING
3230                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3231                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3232                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3233                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3234                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3235                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3236 #endif
3237                 }
3238               nogo:
3239
3240                 /* Try optimization CURLYX => CURLYM. */
3241                 if (  OP(oscan) == CURLYX && data
3242                       && !(data->flags & SF_HAS_PAR)
3243                       && !(data->flags & SF_HAS_EVAL)
3244                       && !deltanext     /* atom is fixed width */
3245                       && minnext != 0   /* CURLYM can't handle zero width */
3246                 ) {
3247                     /* XXXX How to optimize if data == 0? */
3248                     /* Optimize to a simpler form.  */
3249                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3250                     regnode *nxt2;
3251
3252                     OP(oscan) = CURLYM;
3253                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3254                             && (OP(nxt2) != WHILEM))
3255                         nxt = nxt2;
3256                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3257                     /* Need to optimize away parenths. */
3258                     if (data->flags & SF_IN_PAR) {
3259                         /* Set the parenth number.  */
3260                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3261
3262                         if (OP(nxt) != CLOSE)
3263                             FAIL("Panic opt close");
3264                         oscan->flags = (U8)ARG(nxt);
3265                         if (RExC_open_parens) {
3266                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3267                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3268                         }
3269                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3270                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3271
3272 #ifdef DEBUGGING
3273                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3274                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3275                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3276                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3277 #endif
3278 #if 0
3279                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3280                             regnode *nnxt = regnext(nxt1);
3281
3282                             if (nnxt == nxt) {
3283                                 if (reg_off_by_arg[OP(nxt1)])
3284                                     ARG_SET(nxt1, nxt2 - nxt1);
3285                                 else if (nxt2 - nxt1 < U16_MAX)
3286                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3287                                 else
3288                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3289                             }
3290                             nxt1 = nnxt;
3291                         }
3292 #endif
3293                         /* Optimize again: */
3294                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3295                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3296                     }
3297                     else
3298                         oscan->flags = 0;
3299                 }
3300                 else if ((OP(oscan) == CURLYX)
3301                          && (flags & SCF_WHILEM_VISITED_POS)
3302                          /* See the comment on a similar expression above.
3303                             However, this time it not a subexpression
3304                             we care about, but the expression itself. */
3305                          && (maxcount == REG_INFTY)
3306                          && data && ++data->whilem_c < 16) {
3307                     /* This stays as CURLYX, we can put the count/of pair. */
3308                     /* Find WHILEM (as in regexec.c) */
3309                     regnode *nxt = oscan + NEXT_OFF(oscan);
3310
3311                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3312                         nxt += ARG(nxt);
3313                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3314                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3315                 }
3316                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3317                     pars++;
3318                 if (flags & SCF_DO_SUBSTR) {
3319                     SV *last_str = NULL;
3320                     int counted = mincount != 0;
3321
3322                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3323 #if defined(SPARC64_GCC_WORKAROUND)
3324                         I32 b = 0;
3325                         STRLEN l = 0;
3326                         const char *s = NULL;
3327                         I32 old = 0;
3328
3329                         if (pos_before >= data->last_start_min)
3330                             b = pos_before;
3331                         else
3332                             b = data->last_start_min;
3333
3334                         l = 0;
3335                         s = SvPV_const(data->last_found, l);
3336                         old = b - data->last_start_min;
3337
3338 #else
3339                         I32 b = pos_before >= data->last_start_min
3340                             ? pos_before : data->last_start_min;
3341                         STRLEN l;
3342                         const char * const s = SvPV_const(data->last_found, l);
3343                         I32 old = b - data->last_start_min;
3344 #endif
3345
3346                         if (UTF)
3347                             old = utf8_hop((U8*)s, old) - (U8*)s;
3348
3349                         l -= old;
3350                         /* Get the added string: */
3351                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3352                         if (deltanext == 0 && pos_before == b) {
3353                             /* What was added is a constant string */
3354                             if (mincount > 1) {
3355                                 SvGROW(last_str, (mincount * l) + 1);
3356                                 repeatcpy(SvPVX(last_str) + l,
3357                                           SvPVX_const(last_str), l, mincount - 1);
3358                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3359                                 /* Add additional parts. */
3360                                 SvCUR_set(data->last_found,
3361                                           SvCUR(data->last_found) - l);
3362                                 sv_catsv(data->last_found, last_str);
3363                                 {
3364                                     SV * sv = data->last_found;
3365                                     MAGIC *mg =
3366                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3367                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3368                                     if (mg && mg->mg_len >= 0)
3369                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3370                                 }
3371                                 data->last_end += l * (mincount - 1);
3372                             }
3373                         } else {
3374                             /* start offset must point into the last copy */
3375                             data->last_start_min += minnext * (mincount - 1);
3376                             data->last_start_max += is_inf ? I32_MAX
3377                                 : (maxcount - 1) * (minnext + data->pos_delta);
3378                         }
3379                     }
3380                     /* It is counted once already... */
3381                     data->pos_min += minnext * (mincount - counted);
3382                     data->pos_delta += - counted * deltanext +
3383                         (minnext + deltanext) * maxcount - minnext * mincount;
3384                     if (mincount != maxcount) {
3385                          /* Cannot extend fixed substrings found inside
3386                             the group.  */
3387                         SCAN_COMMIT(pRExC_state,data,minlenp);
3388                         if (mincount && last_str) {
3389                             SV * const sv = data->last_found;
3390                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3391                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3392
3393                             if (mg)
3394                                 mg->mg_len = -1;
3395                             sv_setsv(sv, last_str);
3396                             data->last_end = data->pos_min;
3397                             data->last_start_min =
3398                                 data->pos_min - CHR_SVLEN(last_str);
3399                             data->last_start_max = is_inf
3400                                 ? I32_MAX
3401                                 : data->pos_min + data->pos_delta
3402                                 - CHR_SVLEN(last_str);
3403                         }
3404                         data->longest = &(data->longest_float);
3405                     }
3406                     SvREFCNT_dec(last_str);
3407                 }
3408                 if (data && (fl & SF_HAS_EVAL))
3409                     data->flags |= SF_HAS_EVAL;
3410               optimize_curly_tail:
3411                 if (OP(oscan) != CURLYX) {
3412                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3413                            && NEXT_OFF(next))
3414                         NEXT_OFF(oscan) += NEXT_OFF(next);
3415                 }
3416                 continue;
3417             default:                    /* REF and CLUMP only? */
3418                 if (flags & SCF_DO_SUBSTR) {
3419                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3420                     data->longest = &(data->longest_float);
3421                 }
3422                 is_inf = is_inf_internal = 1;
3423                 if (flags & SCF_DO_STCLASS_OR)
3424                     cl_anything(pRExC_state, data->start_class);
3425                 flags &= ~SCF_DO_STCLASS;
3426                 break;
3427             }
3428         }
3429         else if (OP(scan) == LNBREAK) {
3430             if (flags & SCF_DO_STCLASS) {
3431                 int value = 0;
3432                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3433                 if (flags & SCF_DO_STCLASS_AND) {
3434                     for (value = 0; value < 256; value++)
3435                         if (!is_VERTWS_cp(value))
3436                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3437                 }
3438                 else {
3439                     for (value = 0; value < 256; value++)
3440                         if (is_VERTWS_cp(value))
3441                             ANYOF_BITMAP_SET(data->start_class, value);
3442                 }
3443                 if (flags & SCF_DO_STCLASS_OR)
3444                     cl_and(data->start_class, and_withp);
3445                 flags &= ~SCF_DO_STCLASS;
3446             }
3447             min += 1;
3448             delta += 1;
3449             if (flags & SCF_DO_SUBSTR) {
3450                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3451                 data->pos_min += 1;
3452                 data->pos_delta += 1;
3453                 data->longest = &(data->longest_float);
3454             }
3455
3456         }
3457         else if (OP(scan) == FOLDCHAR) {
3458             int d = ARG(scan)==0xDF ? 1 : 2;
3459             flags &= ~SCF_DO_STCLASS;
3460             min += 1;
3461             delta += d;
3462             if (flags & SCF_DO_SUBSTR) {
3463                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3464                 data->pos_min += 1;
3465                 data->pos_delta += d;
3466                 data->longest = &(data->longest_float);
3467             }
3468         }
3469         else if (strchr((const char*)PL_simple,OP(scan))) {
3470             int value = 0;
3471
3472             if (flags & SCF_DO_SUBSTR) {
3473                 SCAN_COMMIT(pRExC_state,data,minlenp);
3474                 data->pos_min++;
3475             }
3476             min++;
3477             if (flags & SCF_DO_STCLASS) {
3478                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3479
3480                 /* Some of the logic below assumes that switching
3481                    locale on will only add false positives. */
3482                 switch (PL_regkind[OP(scan)]) {
3483                 case SANY:
3484                 default:
3485                   do_default:
3486                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3487                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3488                         cl_anything(pRExC_state, data->start_class);
3489                     break;
3490                 case REG_ANY:
3491                     if (OP(scan) == SANY)
3492                         goto do_default;
3493                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3494                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3495                                  || (data->start_class->flags & ANYOF_CLASS));
3496                         cl_anything(pRExC_state, data->start_class);
3497                     }
3498                     if (flags & SCF_DO_STCLASS_AND || !value)
3499                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3500                     break;
3501                 case ANYOF:
3502                     if (flags & SCF_DO_STCLASS_AND)
3503                         cl_and(data->start_class,
3504                                (struct regnode_charclass_class*)scan);
3505                     else
3506                         cl_or(pRExC_state, data->start_class,
3507                               (struct regnode_charclass_class*)scan);
3508                     break;
3509                 case ALNUM:
3510                     if (flags & SCF_DO_STCLASS_AND) {
3511                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3512                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3513                             for (value = 0; value < 256; value++)
3514                                 if (!isALNUM(value))
3515                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3516                         }
3517                     }
3518                     else {
3519                         if (data->start_class->flags & ANYOF_LOCALE)
3520                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3521                         else {
3522                             for (value = 0; value < 256; value++)
3523                                 if (isALNUM(value))
3524                                     ANYOF_BITMAP_SET(data->start_class, value);
3525                         }
3526                     }
3527                     break;
3528                 case ALNUML:
3529                     if (flags & SCF_DO_STCLASS_AND) {
3530                         if (data->start_class->flags & ANYOF_LOCALE)
3531                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3532                     }
3533                     else {
3534                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3535                         data->start_class->flags |= ANYOF_LOCALE;
3536                     }
3537                     break;
3538                 case NALNUM:
3539                     if (flags & SCF_DO_STCLASS_AND) {
3540                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3541                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3542                             for (value = 0; value < 256; value++)
3543                                 if (isALNUM(value))
3544                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3545                         }
3546                     }
3547                     else {
3548                         if (data->start_class->flags & ANYOF_LOCALE)
3549                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3550                         else {
3551                             for (value = 0; value < 256; value++)
3552                                 if (!isALNUM(value))
3553                                     ANYOF_BITMAP_SET(data->start_class, value);
3554                         }
3555                     }
3556                     break;
3557                 case NALNUML:
3558                     if (flags & SCF_DO_STCLASS_AND) {
3559                         if (data->start_class->flags & ANYOF_LOCALE)
3560                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3561                     }
3562                     else {
3563                         data->start_class->flags |= ANYOF_LOCALE;
3564                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3565                     }
3566                     break;
3567                 case SPACE:
3568                     if (flags & SCF_DO_STCLASS_AND) {
3569                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3570                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3571                             for (value = 0; value < 256; value++)
3572                                 if (!isSPACE(value))
3573                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3574                         }
3575                     }
3576                     else {
3577                         if (data->start_class->flags & ANYOF_LOCALE)
3578                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3579                         else {
3580                             for (value = 0; value < 256; value++)
3581                                 if (isSPACE(value))
3582                                     ANYOF_BITMAP_SET(data->start_class, value);
3583                         }
3584                     }
3585                     break;
3586                 case SPACEL:
3587                     if (flags & SCF_DO_STCLASS_AND) {
3588                         if (data->start_class->flags & ANYOF_LOCALE)
3589                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3590                     }
3591                     else {
3592                         data->start_class->flags |= ANYOF_LOCALE;
3593                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3594                     }
3595                     break;
3596                 case NSPACE:
3597                     if (flags & SCF_DO_STCLASS_AND) {
3598                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3599                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3600                             for (value = 0; value < 256; value++)
3601                                 if (isSPACE(value))
3602                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3603                         }
3604                     }
3605                     else {
3606                         if (data->start_class->flags & ANYOF_LOCALE)
3607                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3608                         else {
3609                             for (value = 0; value < 256; value++)
3610                                 if (!isSPACE(value))
3611                                     ANYOF_BITMAP_SET(data->start_class, value);
3612                         }
3613                     }
3614                     break;
3615                 case NSPACEL:
3616                     if (flags & SCF_DO_STCLASS_AND) {
3617                         if (data->start_class->flags & ANYOF_LOCALE) {
3618                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3619                             for (value = 0; value < 256; value++)
3620                                 if (!isSPACE(value))
3621                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3622                         }
3623                     }
3624                     else {
3625                         data->start_class->flags |= ANYOF_LOCALE;
3626                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3627                     }
3628                     break;
3629                 case DIGIT:
3630                     if (flags & SCF_DO_STCLASS_AND) {
3631                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3632                         for (value = 0; value < 256; value++)
3633                             if (!isDIGIT(value))
3634                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3635                     }
3636                     else {
3637                         if (data->start_class->flags & ANYOF_LOCALE)
3638                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3639                         else {
3640                             for (value = 0; value < 256; value++)
3641                                 if (isDIGIT(value))
3642                                     ANYOF_BITMAP_SET(data->start_class, value);
3643                         }
3644                     }
3645                     break;
3646                 case NDIGIT:
3647                     if (flags & SCF_DO_STCLASS_AND) {
3648                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3649                         for (value = 0; value < 256; value++)
3650                             if (isDIGIT(value))
3651                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3652                     }
3653                     else {
3654                         if (data->start_class->flags & ANYOF_LOCALE)
3655                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3656                         else {
3657                             for (value = 0; value < 256; value++)
3658                                 if (!isDIGIT(value))
3659                                     ANYOF_BITMAP_SET(data->start_class, value);
3660                         }
3661                     }
3662                     break;
3663                 CASE_SYNST_FNC(VERTWS);
3664                 CASE_SYNST_FNC(HORIZWS);
3665
3666                 }
3667                 if (flags & SCF_DO_STCLASS_OR)
3668                     cl_and(data->start_class, and_withp);
3669                 flags &= ~SCF_DO_STCLASS;
3670             }
3671         }
3672         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3673             data->flags |= (OP(scan) == MEOL
3674                             ? SF_BEFORE_MEOL
3675                             : SF_BEFORE_SEOL);
3676         }
3677         else if (  PL_regkind[OP(scan)] == BRANCHJ
3678                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3679                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3680                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3681             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3682                 || OP(scan) == UNLESSM )
3683             {
3684                 /* Negative Lookahead/lookbehind
3685                    In this case we can't do fixed string optimisation.
3686                 */
3687
3688                 I32 deltanext, minnext, fake = 0;
3689                 regnode *nscan;
3690                 struct regnode_charclass_class intrnl;
3691                 int f = 0;
3692
3693                 data_fake.flags = 0;
3694                 if (data) {
3695                     data_fake.whilem_c = data->whilem_c;
3696                     data_fake.last_closep = data->last_closep;
3697                 }
3698                 else
3699                     data_fake.last_closep = &fake;
3700                 data_fake.pos_delta = delta;
3701                 if ( flags & SCF_DO_STCLASS && !scan->flags
3702                      && OP(scan) == IFMATCH ) { /* Lookahead */
3703                     cl_init(pRExC_state, &intrnl);
3704                     data_fake.start_class = &intrnl;
3705                     f |= SCF_DO_STCLASS_AND;
3706                 }
3707                 if (flags & SCF_WHILEM_VISITED_POS)
3708                     f |= SCF_WHILEM_VISITED_POS;
3709                 next = regnext(scan);
3710                 nscan = NEXTOPER(NEXTOPER(scan));
3711                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3712                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3713                 if (scan->flags) {
3714                     if (deltanext) {
3715                         FAIL("Variable length lookbehind not implemented");
3716                     }
3717                     else if (minnext > (I32)U8_MAX) {
3718                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3719                     }
3720                     scan->flags = (U8)minnext;
3721                 }
3722                 if (data) {
3723                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3724                         pars++;
3725                     if (data_fake.flags & SF_HAS_EVAL)
3726                         data->flags |= SF_HAS_EVAL;
3727                     data->whilem_c = data_fake.whilem_c;
3728                 }
3729                 if (f & SCF_DO_STCLASS_AND) {
3730                     if (flags & SCF_DO_STCLASS_OR) {
3731                         /* OR before, AND after: ideally we would recurse with
3732                          * data_fake to get the AND applied by study of the
3733                          * remainder of the pattern, and then derecurse;
3734                          * *** HACK *** for now just treat as "no information".
3735                          * See [perl #56690].
3736                          */
3737                         cl_init(pRExC_state, data->start_class);
3738                     }  else {
3739                         /* AND before and after: combine and continue */
3740                         const int was = (data->start_class->flags & ANYOF_EOS);
3741
3742                         cl_and(data->start_class, &intrnl);
3743                         if (was)
3744                             data->start_class->flags |= ANYOF_EOS;
3745                     }
3746                 }
3747             }
3748 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3749             else {
3750                 /* Positive Lookahead/lookbehind
3751                    In this case we can do fixed string optimisation,
3752                    but we must be careful about it. Note in the case of
3753                    lookbehind the positions will be offset by the minimum
3754                    length of the pattern, something we won't know about
3755                    until after the recurse.
3756                 */
3757                 I32 deltanext, fake = 0;
3758                 regnode *nscan;
3759                 struct regnode_charclass_class intrnl;
3760                 int f = 0;
3761                 /* We use SAVEFREEPV so that when the full compile
3762                     is finished perl will clean up the allocated
3763                     minlens when its all done. This was we don't
3764                     have to worry about freeing them when we know
3765                     they wont be used, which would be a pain.
3766                  */
3767                 I32 *minnextp;
3768                 Newx( minnextp, 1, I32 );
3769                 SAVEFREEPV(minnextp);
3770
3771                 if (data) {
3772                     StructCopy(data, &data_fake, scan_data_t);
3773                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3774                         f |= SCF_DO_SUBSTR;
3775                         if (scan->flags)
3776                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3777                         data_fake.last_found=newSVsv(data->last_found);
3778                     }
3779                 }
3780                 else
3781                     data_fake.last_closep = &fake;
3782                 data_fake.flags = 0;
3783                 data_fake.pos_delta = delta;
3784                 if (is_inf)
3785                     data_fake.flags |= SF_IS_INF;
3786                 if ( flags & SCF_DO_STCLASS && !scan->flags
3787                      && OP(scan) == IFMATCH ) { /* Lookahead */
3788                     cl_init(pRExC_state, &intrnl);
3789                     data_fake.start_class = &intrnl;
3790                     f |= SCF_DO_STCLASS_AND;
3791                 }
3792                 if (flags & SCF_WHILEM_VISITED_POS)
3793                     f |= SCF_WHILEM_VISITED_POS;
3794                 next = regnext(scan);
3795                 nscan = NEXTOPER(NEXTOPER(scan));
3796
3797                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3798                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3799                 if (scan->flags) {
3800                     if (deltanext) {
3801                         FAIL("Variable length lookbehind not implemented");
3802                     }
3803                     else if (*minnextp > (I32)U8_MAX) {
3804                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3805                     }
3806                     scan->flags = (U8)*minnextp;
3807                 }
3808
3809                 *minnextp += min;
3810
3811                 if (f & SCF_DO_STCLASS_AND) {
3812                     const int was = (data->start_class->flags & ANYOF_EOS);
3813
3814                     cl_and(data->start_class, &intrnl);
3815                     if (was)
3816                         data->start_class->flags |= ANYOF_EOS;
3817                 }
3818                 if (data) {
3819                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3820                         pars++;
3821                     if (data_fake.flags & SF_HAS_EVAL)
3822                         data->flags |= SF_HAS_EVAL;
3823                     data->whilem_c = data_fake.whilem_c;
3824                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3825                         if (RExC_rx->minlen<*minnextp)
3826                             RExC_rx->minlen=*minnextp;
3827                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3828                         SvREFCNT_dec(data_fake.last_found);
3829
3830                         if ( data_fake.minlen_fixed != minlenp )
3831                         {
3832                             data->offset_fixed= data_fake.offset_fixed;
3833                             data->minlen_fixed= data_fake.minlen_fixed;
3834                             data->lookbehind_fixed+= scan->flags;
3835                         }
3836                         if ( data_fake.minlen_float != minlenp )
3837                         {
3838                             data->minlen_float= data_fake.minlen_float;
3839                             data->offset_float_min=data_fake.offset_float_min;
3840                             data->offset_float_max=data_fake.offset_float_max;
3841                             data->lookbehind_float+= scan->flags;
3842                         }
3843                     }
3844                 }
3845
3846
3847             }
3848 #endif
3849         }
3850         else if (OP(scan) == OPEN) {
3851             if (stopparen != (I32)ARG(scan))
3852                 pars++;
3853         }
3854         else if (OP(scan) == CLOSE) {
3855             if (stopparen == (I32)ARG(scan)) {
3856                 break;
3857             }
3858             if ((I32)ARG(scan) == is_par) {
3859                 next = regnext(scan);
3860
3861                 if ( next && (OP(next) != WHILEM) && next < last)
3862                     is_par = 0;         /* Disable optimization */
3863             }
3864             if (data)
3865                 *(data->last_closep) = ARG(scan);
3866         }
3867         else if (OP(scan) == EVAL) {
3868                 if (data)
3869                     data->flags |= SF_HAS_EVAL;
3870         }
3871         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3872             if (flags & SCF_DO_SUBSTR) {
3873                 SCAN_COMMIT(pRExC_state,data,minlenp);
3874                 flags &= ~SCF_DO_SUBSTR;
3875             }
3876             if (data && OP(scan)==ACCEPT) {
3877                 data->flags |= SCF_SEEN_ACCEPT;
3878                 if (stopmin > min)
3879                     stopmin = min;
3880             }
3881         }
3882         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3883         {
3884                 if (flags & SCF_DO_SUBSTR) {
3885                     SCAN_COMMIT(pRExC_state,data,minlenp);
3886                     data->longest = &(data->longest_float);
3887                 }
3888                 is_inf = is_inf_internal = 1;
3889                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3890                     cl_anything(pRExC_state, data->start_class);
3891                 flags &= ~SCF_DO_STCLASS;
3892         }
3893         else if (OP(scan) == GPOS) {
3894             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3895                 !(delta || is_inf || (data && data->pos_delta)))
3896             {
3897                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3898                     RExC_rx->extflags |= RXf_ANCH_GPOS;
3899                 if (RExC_rx->gofs < (U32)min)
3900                     RExC_rx->gofs = min;
3901             } else {
3902                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3903                 RExC_rx->gofs = 0;
3904             }
3905         }
3906 #ifdef TRIE_STUDY_OPT
3907 #ifdef FULL_TRIE_STUDY
3908         else if (PL_regkind[OP(scan)] == TRIE) {
3909             /* NOTE - There is similar code to this block above for handling
3910                BRANCH nodes on the initial study.  If you change stuff here
3911                check there too. */
3912             regnode *trie_node= scan;
3913             regnode *tail= regnext(scan);
3914             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3915             I32 max1 = 0, min1 = I32_MAX;
3916             struct regnode_charclass_class accum;
3917
3918             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3919                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3920             if (flags & SCF_DO_STCLASS)
3921                 cl_init_zero(pRExC_state, &accum);
3922
3923             if (!trie->jump) {
3924                 min1= trie->minlen;
3925                 max1= trie->maxlen;
3926             } else {
3927                 const regnode *nextbranch= NULL;
3928                 U32 word;
3929
3930                 for ( word=1 ; word <= trie->wordcount ; word++)
3931                 {
3932                     I32 deltanext=0, minnext=0, f = 0, fake;
3933                     struct regnode_charclass_class this_class;
3934
3935                     data_fake.flags = 0;
3936                     if (data) {
3937                         data_fake.whilem_c = data->whilem_c;
3938                         data_fake.last_closep = data->last_closep;
3939                     }
3940                     else
3941                         data_fake.last_closep = &fake;
3942                     data_fake.pos_delta = delta;
3943                     if (flags & SCF_DO_STCLASS) {
3944                         cl_init(pRExC_state, &this_class);
3945                         data_fake.start_class = &this_class;
3946                         f = SCF_DO_STCLASS_AND;
3947                     }
3948                     if (flags & SCF_WHILEM_VISITED_POS)
3949                         f |= SCF_WHILEM_VISITED_POS;
3950
3951                     if (trie->jump[word]) {
3952                         if (!nextbranch)
3953                             nextbranch = trie_node + trie->jump[0];
3954                         scan= trie_node + trie->jump[word];
3955                         /* We go from the jump point to the branch that follows
3956                            it. Note this means we need the vestigal unused branches
3957                            even though they arent otherwise used.
3958                          */
3959                         minnext = study_chunk(pRExC_state, &scan, minlenp,
3960                             &deltanext, (regnode *)nextbranch, &data_fake,
3961                             stopparen, recursed, NULL, f,depth+1);
3962                     }
3963                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3964                         nextbranch= regnext((regnode*)nextbranch);
3965
3966                     if (min1 > (I32)(minnext + trie->minlen))
3967                         min1 = minnext + trie->minlen;
3968                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3969                         max1 = minnext + deltanext + trie->maxlen;
3970                     if (deltanext == I32_MAX)
3971                         is_inf = is_inf_internal = 1;
3972
3973                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3974                         pars++;
3975                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3976                         if ( stopmin > min + min1)
3977                             stopmin = min + min1;
3978                         flags &= ~SCF_DO_SUBSTR;
3979                         if (data)
3980                             data->flags |= SCF_SEEN_ACCEPT;
3981                     }
3982                     if (data) {
3983                         if (data_fake.flags & SF_HAS_EVAL)
3984                             data->flags |= SF_HAS_EVAL;
3985                         data->whilem_c = data_fake.whilem_c;
3986                     }
3987                     if (flags & SCF_DO_STCLASS)
3988                         cl_or(pRExC_state, &accum, &this_class);
3989                 }
3990             }
3991             if (flags & SCF_DO_SUBSTR) {
3992                 data->pos_min += min1;
3993                 data->pos_delta += max1 - min1;
3994                 if (max1 != min1 || is_inf)
3995                     data->longest = &(data->longest_float);
3996             }
3997             min += min1;
3998             delta += max1 - min1;
3999             if (flags & SCF_DO_STCLASS_OR) {
4000                 cl_or(pRExC_state, data->start_class, &accum);
4001                 if (min1) {
4002                     cl_and(data->start_class, and_withp);
4003                     flags &= ~SCF_DO_STCLASS;
4004                 }
4005             }
4006             else if (flags & SCF_DO_STCLASS_AND) {
4007                 if (min1) {
4008                     cl_and(data->start_class, &accum);
4009                     flags &= ~SCF_DO_STCLASS;
4010                 }
4011                 else {
4012                     /* Switch to OR mode: cache the old value of
4013                      * data->start_class */
4014                     INIT_AND_WITHP;
4015                     StructCopy(data->start_class, and_withp,
4016                                struct regnode_charclass_class);
4017                     flags &= ~SCF_DO_STCLASS_AND;
4018                     StructCopy(&accum, data->start_class,
4019                                struct regnode_charclass_class);
4020                     flags |= SCF_DO_STCLASS_OR;
4021                     data->start_class->flags |= ANYOF_EOS;
4022                 }
4023             }
4024             scan= tail;
4025             continue;
4026         }
4027 #else
4028         else if (PL_regkind[OP(scan)] == TRIE) {
4029             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4030             U8*bang=NULL;
4031
4032             min += trie->minlen;
4033             delta += (trie->maxlen - trie->minlen);
4034             flags &= ~SCF_DO_STCLASS; /* xxx */
4035             if (flags & SCF_DO_SUBSTR) {
4036                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4037                 data->pos_min += trie->minlen;
4038                 data->pos_delta += (trie->maxlen - trie->minlen);
4039                 if (trie->maxlen != trie->minlen)
4040                     data->longest = &(data->longest_float);
4041             }
4042             if (trie->jump) /* no more substrings -- for now /grr*/
4043                 flags &= ~SCF_DO_SUBSTR;
4044         }
4045 #endif /* old or new */
4046 #endif /* TRIE_STUDY_OPT */
4047
4048         /* Else: zero-length, ignore. */
4049         scan = regnext(scan);
4050     }
4051     if (frame) {
4052         last = frame->last;
4053         scan = frame->next;
4054         stopparen = frame->stop;
4055         frame = frame->prev;
4056         goto fake_study_recurse;
4057     }
4058
4059   finish:
4060     assert(!frame);
4061     DEBUG_STUDYDATA("pre-fin:",data,depth);
4062
4063     *scanp = scan;
4064     *deltap = is_inf_internal ? I32_MAX : delta;
4065     if (flags & SCF_DO_SUBSTR && is_inf)
4066         data->pos_delta = I32_MAX - data->pos_min;
4067     if (is_par > (I32)U8_MAX)
4068         is_par = 0;
4069     if (is_par && pars==1 && data) {
4070         data->flags |= SF_IN_PAR;
4071         data->flags &= ~SF_HAS_PAR;
4072     }
4073     else if (pars && data) {
4074         data->flags |= SF_HAS_PAR;
4075         data->flags &= ~SF_IN_PAR;
4076     }
4077     if (flags & SCF_DO_STCLASS_OR)
4078         cl_and(data->start_class, and_withp);
4079     if (flags & SCF_TRIE_RESTUDY)
4080         data->flags |=  SCF_TRIE_RESTUDY;
4081
4082     DEBUG_STUDYDATA("post-fin:",data,depth);
4083
4084     return min < stopmin ? min : stopmin;
4085 }
4086
4087 STATIC U32
4088 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4089 {
4090     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4091
4092     PERL_ARGS_ASSERT_ADD_DATA;
4093
4094     Renewc(RExC_rxi->data,
4095            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4096            char, struct reg_data);
4097     if(count)
4098         Renew(RExC_rxi->data->what, count + n, U8);
4099     else
4100         Newx(RExC_rxi->data->what, n, U8);
4101     RExC_rxi->data->count = count + n;
4102     Copy(s, RExC_rxi->data->what + count, n, U8);
4103     return count;
4104 }
4105
4106 /*XXX: todo make this not included in a non debugging perl */
4107 #ifndef PERL_IN_XSUB_RE
4108 void
4109 Perl_reginitcolors(pTHX)
4110 {
4111     dVAR;
4112     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4113     if (s) {
4114         char *t = savepv(s);
4115         int i = 0;
4116         PL_colors[0] = t;
4117         while (++i < 6) {
4118             t = strchr(t, '\t');
4119             if (t) {
4120                 *t = '\0';
4121                 PL_colors[i] = ++t;
4122             }
4123             else
4124                 PL_colors[i] = t = (char *)"";
4125         }
4126     } else {
4127         int i = 0;
4128         while (i < 6)
4129             PL_colors[i++] = (char *)"";
4130     }
4131     PL_colorset = 1;
4132 }
4133 #endif
4134
4135
4136 #ifdef TRIE_STUDY_OPT
4137 #define CHECK_RESTUDY_GOTO                                  \
4138         if (                                                \
4139               (data.flags & SCF_TRIE_RESTUDY)               \
4140               && ! restudied++                              \
4141         )     goto reStudy
4142 #else
4143 #define CHECK_RESTUDY_GOTO
4144 #endif
4145
4146 /*
4147  - pregcomp - compile a regular expression into internal code
4148  *
4149  * We can't allocate space until we know how big the compiled form will be,
4150  * but we can't compile it (and thus know how big it is) until we've got a
4151  * place to put the code.  So we cheat:  we compile it twice, once with code
4152  * generation turned off and size counting turned on, and once "for real".
4153  * This also means that we don't allocate space until we are sure that the
4154  * thing really will compile successfully, and we never have to move the
4155  * code and thus invalidate pointers into it.  (Note that it has to be in
4156  * one piece because free() must be able to free it all.) [NB: not true in perl]
4157  *
4158  * Beware that the optimization-preparation code in here knows about some
4159  * of the structure of the compiled regexp.  [I'll say.]
4160  */
4161
4162
4163
4164 #ifndef PERL_IN_XSUB_RE
4165 #define RE_ENGINE_PTR &PL_core_reg_engine
4166 #else
4167 extern const struct regexp_engine my_reg_engine;
4168 #define RE_ENGINE_PTR &my_reg_engine
4169 #endif
4170
4171 #ifndef PERL_IN_XSUB_RE
4172 REGEXP *
4173 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4174 {
4175     dVAR;
4176     HV * const table = GvHV(PL_hintgv);
4177
4178     PERL_ARGS_ASSERT_PREGCOMP;
4179
4180     /* Dispatch a request to compile a regexp to correct
4181        regexp engine. */
4182     if (table) {
4183         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4184         GET_RE_DEBUG_FLAGS_DECL;
4185         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4186             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4187             DEBUG_COMPILE_r({
4188                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4189                     SvIV(*ptr));
4190             });
4191             return CALLREGCOMP_ENG(eng, pattern, flags);
4192         }
4193     }
4194     return Perl_re_compile(aTHX_ pattern, flags);
4195 }
4196 #endif
4197
4198 REGEXP *
4199 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4200 {
4201     dVAR;
4202     REGEXP *rx;
4203     struct regexp *r;
4204     register regexp_internal *ri;
4205     STRLEN plen;
4206     char  *exp = SvPV(pattern, plen);
4207     char* xend = exp + plen;
4208     regnode *scan;
4209     I32 flags;
4210     I32 minlen = 0;
4211     I32 sawplus = 0;
4212     I32 sawopen = 0;
4213     scan_data_t data;
4214     RExC_state_t RExC_state;
4215     RExC_state_t * const pRExC_state = &RExC_state;
4216 #ifdef TRIE_STUDY_OPT
4217     int restudied= 0;
4218     RExC_state_t copyRExC_state;
4219 #endif
4220     GET_RE_DEBUG_FLAGS_DECL;
4221
4222     PERL_ARGS_ASSERT_RE_COMPILE;
4223
4224     DEBUG_r(if (!PL_colorset) reginitcolors());
4225
4226     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4227
4228     DEBUG_COMPILE_r({
4229         SV *dsv= sv_newmortal();
4230         RE_PV_QUOTED_DECL(s, RExC_utf8,
4231             dsv, exp, plen, 60);
4232         PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4233                        PL_colors[4],PL_colors[5],s);
4234     });
4235
4236 redo_first_pass:
4237     RExC_precomp = exp;
4238     RExC_flags = pm_flags;
4239     RExC_sawback = 0;
4240
4241     RExC_seen = 0;
4242     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4243     RExC_seen_evals = 0;
4244     RExC_extralen = 0;
4245
4246     /* First pass: determine size, legality. */
4247     RExC_parse = exp;
4248     RExC_start = exp;
4249     RExC_end = xend;
4250     RExC_naughty = 0;
4251     RExC_npar = 1;
4252     RExC_nestroot = 0;
4253     RExC_size = 0L;
4254     RExC_emit = &PL_regdummy;
4255     RExC_whilem_seen = 0;
4256     RExC_charnames = NULL;
4257     RExC_open_parens = NULL;
4258     RExC_close_parens = NULL;
4259     RExC_opend = NULL;
4260     RExC_paren_names = NULL;
4261 #ifdef DEBUGGING
4262     RExC_paren_name_list = NULL;
4263 #endif
4264     RExC_recurse = NULL;
4265     RExC_recurse_count = 0;
4266
4267 #if 0 /* REGC() is (currently) a NOP at the first pass.
4268        * Clever compilers notice this and complain. --jhi */
4269     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4270 #endif
4271     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4272     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4273         RExC_precomp = NULL;
4274         return(NULL);
4275     }
4276     if (RExC_utf8 && !RExC_orig_utf8) {
4277         /* It's possible to write a regexp in ascii that represents Unicode
4278         codepoints outside of the byte range, such as via \x{100}. If we
4279         detect such a sequence we have to convert the entire pattern to utf8
4280         and then recompile, as our sizing calculation will have been based
4281         on 1 byte == 1 character, but we will need to use utf8 to encode
4282         at least some part of the pattern, and therefore must convert the whole
4283         thing.
4284         XXX: somehow figure out how to make this less expensive...
4285         -- dmq */
4286         STRLEN len = plen;
4287         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4288             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4289         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4290         xend = exp + len;
4291         RExC_orig_utf8 = RExC_utf8;
4292         SAVEFREEPV(exp);
4293         goto redo_first_pass;
4294     }
4295     DEBUG_PARSE_r({
4296         PerlIO_printf(Perl_debug_log,
4297             "Required size %"IVdf" nodes\n"
4298             "Starting second pass (creation)\n",
4299             (IV)RExC_size);
4300         RExC_lastnum=0;
4301         RExC_lastparse=NULL;
4302     });
4303     /* Small enough for pointer-storage convention?
4304        If extralen==0, this means that we will not need long jumps. */
4305     if (RExC_size >= 0x10000L && RExC_extralen)
4306         RExC_size += RExC_extralen;
4307     else
4308         RExC_extralen = 0;
4309     if (RExC_whilem_seen > 15)
4310         RExC_whilem_seen = 15;
4311
4312     /* Allocate space and zero-initialize. Note, the two step process
4313        of zeroing when in debug mode, thus anything assigned has to
4314        happen after that */
4315     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4316     r = (struct regexp*)SvANY(rx);
4317     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4318          char, regexp_internal);
4319     if ( r == NULL || ri == NULL )
4320         FAIL("Regexp out of space");
4321 #ifdef DEBUGGING
4322     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4323     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4324 #else
4325     /* bulk initialize base fields with 0. */
4326     Zero(ri, sizeof(regexp_internal), char);
4327 #endif
4328
4329     /* non-zero initialization begins here */
4330     RXi_SET( r, ri );
4331     r->engine= RE_ENGINE_PTR;
4332     r->extflags = pm_flags;
4333     {
4334         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4335         bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4336         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4337         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4338                             >> RXf_PMf_STD_PMMOD_SHIFT);
4339         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4340         char *p;
4341         const STRLEN wraplen = plen + has_minus + has_p + has_runon
4342             + (sizeof(STD_PAT_MODS) - 1)
4343             + (sizeof("(?:)") - 1);
4344
4345         p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4346         SvCUR_set(rx, wraplen);
4347         SvPOK_on(rx);
4348         SvFLAGS(rx) |= SvUTF8(pattern);
4349         *p++='('; *p++='?';
4350         if (has_p)
4351             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4352         {
4353             char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4354             char *colon = r + 1;
4355             char ch;
4356
4357             while((ch = *fptr++)) {
4358                 if(reganch & 1)
4359                     *p++ = ch;
4360                 else
4361                     *r-- = ch;
4362                 reganch >>= 1;
4363             }
4364             if(has_minus) {
4365                 *r = '-';
4366                 p = colon;
4367             }
4368         }
4369
4370         *p++ = ':';
4371         Copy(RExC_precomp, p, plen, char);
4372         assert ((RX_WRAPPED(rx) - p) < 16);
4373         r->pre_prefix = p - RX_WRAPPED(rx);
4374         p += plen;
4375         if (has_runon)
4376             *p++ = '\n';
4377         *p++ = ')';
4378         *p = 0;
4379     }
4380
4381     r->intflags = 0;
4382     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4383
4384     if (RExC_seen & REG_SEEN_RECURSE) {
4385         Newxz(RExC_open_parens, RExC_npar,regnode *);
4386         SAVEFREEPV(RExC_open_parens);
4387         Newxz(RExC_close_parens,RExC_npar,regnode *);
4388         SAVEFREEPV(RExC_close_parens);
4389     }
4390
4391     /* Useful during FAIL. */
4392 #ifdef RE_TRACK_PATTERN_OFFSETS
4393     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4394     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4395                           "%s %"UVuf" bytes for offset annotations.\n",
4396                           ri->u.offsets ? "Got" : "Couldn't get",
4397                           (UV)((2*RExC_size+1) * sizeof(U32))));
4398 #endif
4399     SetProgLen(ri,RExC_size);
4400     RExC_rx_sv = rx;
4401     RExC_rx = r;
4402     RExC_rxi = ri;
4403     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4404
4405     /* Second pass: emit code. */
4406     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4407     RExC_parse = exp;
4408     RExC_end = xend;
4409     RExC_naughty = 0;
4410     RExC_npar = 1;
4411     RExC_emit_start = ri->program;
4412     RExC_emit = ri->program;
4413     RExC_emit_bound = ri->program + RExC_size + 1;
4414
4415     /* Store the count of eval-groups for security checks: */
4416     RExC_rx->seen_evals = RExC_seen_evals;
4417     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4418     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4419         ReREFCNT_dec(rx);
4420         return(NULL);
4421     }
4422     /* XXXX To minimize changes to RE engine we always allocate
4423        3-units-long substrs field. */
4424     Newx(r->substrs, 1, struct reg_substr_data);
4425     if (RExC_recurse_count) {
4426         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4427         SAVEFREEPV(RExC_recurse);
4428     }
4429
4430 reStudy:
4431     r->minlen = minlen = sawplus = sawopen = 0;
4432     Zero(r->substrs, 1, struct reg_substr_data);
4433
4434 #ifdef TRIE_STUDY_OPT
4435     if (!restudied) {
4436         StructCopy(&zero_scan_data, &data, scan_data_t);
4437         copyRExC_state = RExC_state;
4438     } else {
4439         U32 seen=RExC_seen;
4440         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4441
4442         RExC_state = copyRExC_state;
4443         if (seen & REG_TOP_LEVEL_BRANCHES)
4444             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4445         else
4446             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4447         if (data.last_found) {
4448             SvREFCNT_dec(data.longest_fixed);
4449             SvREFCNT_dec(data.longest_float);
4450             SvREFCNT_dec(data.last_found);
4451         }
4452         StructCopy(&zero_scan_data, &data, scan_data_t);
4453     }
4454 #else
4455     StructCopy(&zero_scan_data, &data, scan_data_t);
4456 #endif
4457
4458     /* Dig out information for optimizations. */
4459     r->extflags = RExC_flags; /* was pm_op */
4460     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4461
4462     if (UTF)
4463         SvUTF8_on(rx);  /* Unicode in it? */
4464     ri->regstclass = NULL;
4465     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4466         r->intflags |= PREGf_NAUGHTY;
4467     scan = ri->program + 1;             /* First BRANCH. */
4468
4469     /* testing for BRANCH here tells us whether there is "must appear"
4470        data in the pattern. If there is then we can use it for optimisations */
4471     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4472         I32 fake;
4473         STRLEN longest_float_length, longest_fixed_length;
4474         struct regnode_charclass_class ch_class; /* pointed to by data */
4475         int stclass_flag;
4476         I32 last_close = 0; /* pointed to by data */
4477         regnode *first= scan;
4478         regnode *first_next= regnext(first);
4479
4480         /*
4481          * Skip introductions and multiplicators >= 1
4482          * so that we can extract the 'meat' of the pattern that must
4483          * match in the large if() sequence following.
4484          * NOTE that EXACT is NOT covered here, as it is normally
4485          * picked up by the optimiser separately.
4486          *
4487          * This is unfortunate as the optimiser isnt handling lookahead
4488          * properly currently.
4489          *
4490          */
4491         while ((OP(first) == OPEN && (sawopen = 1)) ||
4492                /* An OR of *one* alternative - should not happen now. */
4493             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4494             /* for now we can't handle lookbehind IFMATCH*/
4495             (OP(first) == IFMATCH && !first->flags) ||
4496             (OP(first) == PLUS) ||
4497             (OP(first) == MINMOD) ||
4498                /* An {n,m} with n>0 */
4499             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4500             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4501         {
4502                 /*
4503                  * the only op that could be a regnode is PLUS, all the rest
4504                  * will be regnode_1 or regnode_2.
4505                  *
4506                  */
4507                 if (OP(first) == PLUS)
4508                     sawplus = 1;
4509                 else
4510                     first += regarglen[OP(first)];
4511
4512                 first = NEXTOPER(first);
4513                 first_next= regnext(first);
4514         }
4515
4516         /* Starting-point info. */
4517       again:
4518         DEBUG_PEEP("first:",first,0);
4519         /* Ignore EXACT as we deal with it later. */
4520         if (PL_regkind[OP(first)] == EXACT) {
4521             if (OP(first) == EXACT)
4522                 NOOP;   /* Empty, get anchored substr later. */
4523             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4524                 ri->regstclass = first;
4525         }
4526 #ifdef TRIE_STCLASS
4527         else if (PL_regkind[OP(first)] == TRIE &&
4528                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4529         {
4530             regnode *trie_op;
4531             /* this can happen only on restudy */
4532             if ( OP(first) == TRIE ) {
4533                 struct regnode_1 *trieop = (struct regnode_1 *)
4534                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4535                 StructCopy(first,trieop,struct regnode_1);
4536                 trie_op=(regnode *)trieop;
4537             } else {
4538                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4539                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4540                 StructCopy(first,trieop,struct regnode_charclass);
4541                 trie_op=(regnode *)trieop;
4542             }
4543             OP(trie_op)+=2;
4544             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4545             ri->regstclass = trie_op;
4546         }
4547 #endif
4548         else if (strchr((const char*)PL_simple,OP(first)))
4549             ri->regstclass = first;
4550         else if (PL_regkind[OP(first)] == BOUND ||
4551                  PL_regkind[OP(first)] == NBOUND)
4552             ri->regstclass = first;
4553         else if (PL_regkind[OP(first)] == BOL) {
4554             r->extflags |= (OP(first) == MBOL
4555                            ? RXf_ANCH_MBOL
4556                            : (OP(first) == SBOL
4557                               ? RXf_ANCH_SBOL
4558                               : RXf_ANCH_BOL));
4559             first = NEXTOPER(first);
4560             goto again;
4561         }
4562         else if (OP(first) == GPOS) {
4563             r->extflags |= RXf_ANCH_GPOS;
4564             first = NEXTOPER(first);
4565             goto again;
4566         }
4567         else if ((!sawopen || !RExC_sawback) &&
4568             (OP(first) == STAR &&
4569             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4570             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4571         {
4572             /* turn .* into ^.* with an implied $*=1 */
4573             const int type =
4574                 (OP(NEXTOPER(first)) == REG_ANY)
4575                     ? RXf_ANCH_MBOL
4576                     : RXf_ANCH_SBOL;
4577             r->extflags |= type;
4578             r->intflags |= PREGf_IMPLICIT;
4579             first = NEXTOPER(first);
4580             goto again;
4581         }
4582         if (sawplus && (!sawopen || !RExC_sawback)
4583             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4584             /* x+ must match at the 1st pos of run of x's */
4585             r->intflags |= PREGf_SKIP;
4586
4587         /* Scan is after the zeroth branch, first is atomic matcher. */
4588 #ifdef TRIE_STUDY_OPT
4589         DEBUG_PARSE_r(
4590             if (!restudied)
4591                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4592                               (IV)(first - scan + 1))
4593         );
4594 #else
4595         DEBUG_PARSE_r(
4596             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4597                 (IV)(first - scan + 1))
4598         );
4599 #endif
4600
4601
4602         /*
4603         * If there's something expensive in the r.e., find the
4604         * longest literal string that must appear and make it the
4605         * regmust.  Resolve ties in favor of later strings, since
4606         * the regstart check works with the beginning of the r.e.
4607         * and avoiding duplication strengthens checking.  Not a
4608         * strong reason, but sufficient in the absence of others.
4609         * [Now we resolve ties in favor of the earlier string if
4610         * it happens that c_offset_min has been invalidated, since the
4611         * earlier string may buy us something the later one won't.]
4612         */
4613
4614         data.longest_fixed = newSVpvs("");
4615         data.longest_float = newSVpvs("");
4616         data.last_found = newSVpvs("");
4617         data.longest = &(data.longest_fixed);
4618         first = scan;
4619         if (!ri->regstclass) {
4620             cl_init(pRExC_state, &ch_class);
4621             data.start_class = &ch_class;
4622             stclass_flag = SCF_DO_STCLASS_AND;
4623         } else                          /* XXXX Check for BOUND? */
4624             stclass_flag = 0;
4625         data.last_closep = &last_close;
4626
4627         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4628             &data, -1, NULL, NULL,
4629             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4630
4631
4632         CHECK_RESTUDY_GOTO;
4633
4634
4635         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4636              && data.last_start_min == 0 && data.last_end > 0
4637              && !RExC_seen_zerolen
4638              && !(RExC_seen & REG_SEEN_VERBARG)
4639              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4640             r->extflags |= RXf_CHECK_ALL;
4641         scan_commit(pRExC_state, &data,&minlen,0);
4642         SvREFCNT_dec(data.last_found);
4643
4644         /* Note that code very similar to this but for anchored string
4645            follows immediately below, changes may need to be made to both.
4646            Be careful.
4647          */
4648         longest_float_length = CHR_SVLEN(data.longest_float);
4649         if (longest_float_length
4650             || (data.flags & SF_FL_BEFORE_EOL
4651                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4652                     || (RExC_flags & RXf_PMf_MULTILINE))))
4653         {
4654             I32 t,ml;
4655
4656             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4657                 && data.offset_fixed == data.offset_float_min
4658                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4659                     goto remove_float;          /* As in (a)+. */
4660
4661             /* copy the information about the longest float from the reg_scan_data
4662                over to the program. */
4663             if (SvUTF8(data.longest_float)) {
4664                 r->float_utf8 = data.longest_float;
4665                 r->float_substr = NULL;
4666             } else {
4667                 r->float_substr = data.longest_float;
4668                 r->float_utf8 = NULL;
4669             }
4670             /* float_end_shift is how many chars that must be matched that
4671                follow this item. We calculate it ahead of time as once the
4672                lookbehind offset is added in we lose the ability to correctly
4673                calculate it.*/
4674             ml = data.minlen_float ? *(data.minlen_float)
4675                                    : (I32)longest_float_length;
4676             r->float_end_shift = ml - data.offset_float_min
4677                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4678                 + data.lookbehind_float;
4679             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4680             r->float_max_offset = data.offset_float_max;
4681             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4682                 r->float_max_offset -= data.lookbehind_float;
4683
4684             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4685                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4686                            || (RExC_flags & RXf_PMf_MULTILINE)));
4687             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4688         }
4689         else {
4690           remove_float:
4691             r->float_substr = r->float_utf8 = NULL;
4692             SvREFCNT_dec(data.longest_float);
4693             longest_float_length = 0;
4694         }
4695
4696         /* Note that code very similar to this but for floating string
4697            is immediately above, changes may need to be made to both.
4698            Be careful.
4699          */
4700         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4701         if (longest_fixed_length
4702             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4703                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4704                     || (RExC_flags & RXf_PMf_MULTILINE))))
4705         {
4706             I32 t,ml;
4707
4708             /* copy the information about the longest fixed
4709                from the reg_scan_data over to the program. */
4710             if (SvUTF8(data.longest_fixed)) {
4711                 r->anchored_utf8 = data.longest_fixed;
4712                 r->anchored_substr = NULL;
4713             } else {
4714                 r->anchored_substr = data.longest_fixed;
4715                 r->anchored_utf8 = NULL;
4716             }
4717             /* fixed_end_shift is how many chars that must be matched that
4718                follow this item. We calculate it ahead of time as once the
4719                lookbehind offset is added in we lose the ability to correctly
4720                calculate it.*/
4721             ml = data.minlen_fixed ? *(data.minlen_fixed)
4722                                    : (I32)longest_fixed_length;
4723             r->anchored_end_shift = ml - data.offset_fixed
4724                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4725                 + data.lookbehind_fixed;
4726             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4727
4728             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4729                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4730                      || (RExC_flags & RXf_PMf_MULTILINE)));
4731             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4732         }
4733         else {
4734             r->anchored_substr = r->anchored_utf8 = NULL;
4735             SvREFCNT_dec(data.longest_fixed);
4736             longest_fixed_length = 0;
4737         }
4738         if (ri->regstclass
4739             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4740             ri->regstclass = NULL;
4741         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4742             && stclass_flag
4743             && !(data.start_class->flags & ANYOF_EOS)
4744             && !cl_is_anything(data.start_class))
4745         {
4746             const U32 n = add_data(pRExC_state, 1, "f");
4747
4748             Newx(RExC_rxi->data->data[n], 1,
4749                 struct regnode_charclass_class);
4750             StructCopy(data.start_class,
4751                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4752                        struct regnode_charclass_class);
4753             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4754             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4755             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4756                       regprop(r, sv, (regnode*)data.start_class);
4757                       PerlIO_printf(Perl_debug_log,
4758                                     "synthetic stclass \"%s\".\n",
4759                                     SvPVX_const(sv));});
4760         }
4761
4762         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4763         if (longest_fixed_length > longest_float_length) {
4764             r->check_end_shift = r->anchored_end_shift;
4765             r->check_substr = r->anchored_substr;
4766             r->check_utf8 = r->anchored_utf8;
4767             r->check_offset_min = r->check_offset_max = r->anchored_offset;
4768             if (r->extflags & RXf_ANCH_SINGLE)
4769                 r->extflags |= RXf_NOSCAN;
4770         }
4771         else {
4772             r->check_end_shift = r->float_end_shift;
4773             r->check_substr = r->float_substr;
4774             r->check_utf8 = r->float_utf8;
4775             r->check_offset_min = r->float_min_offset;
4776             r->check_offset_max = r->float_max_offset;
4777         }
4778         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4779            This should be changed ASAP!  */
4780         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4781             r->extflags |= RXf_USE_INTUIT;
4782             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4783                 r->extflags |= RXf_INTUIT_TAIL;
4784         }
4785         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4786         if ( (STRLEN)minlen < longest_float_length )
4787             minlen= longest_float_length;
4788         if ( (STRLEN)minlen < longest_fixed_length )
4789             minlen= longest_fixed_length;
4790         */
4791     }
4792     else {
4793         /* Several toplevels. Best we can is to set minlen. */
4794         I32 fake;
4795         struct regnode_charclass_class ch_class;
4796         I32 last_close = 0;
4797
4798         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4799
4800         scan = ri->program + 1;
4801         cl_init(pRExC_state, &ch_class);
4802         data.start_class = &ch_class;
4803         data.last_closep = &last_close;
4804
4805
4806         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4807             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4808
4809         CHECK_RESTUDY_GOTO;
4810
4811         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4812                 = r->float_substr = r->float_utf8 = NULL;
4813         if (!(data.start_class->flags & ANYOF_EOS)
4814             && !cl_is_anything(data.start_class))
4815         {
4816             const U32 n = add_data(pRExC_state, 1, "f");
4817
4818             Newx(RExC_rxi->data->data[n], 1,
4819                 struct regnode_charclass_class);
4820             StructCopy(data.start_class,
4821                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4822                        struct regnode_charclass_class);
4823             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4824             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4825             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4826                       regprop(r, sv, (regnode*)data.start_class);
4827                       PerlIO_printf(Perl_debug_log,
4828                                     "synthetic stclass \"%s\".\n",
4829                                     SvPVX_const(sv));});
4830         }
4831     }
4832
4833     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4834        the "real" pattern. */
4835     DEBUG_OPTIMISE_r({
4836         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4837                       (IV)minlen, (IV)r->minlen);
4838     });
4839     r->minlenret = minlen;
4840     if (r->minlen < minlen)
4841         r->minlen = minlen;
4842
4843     if (RExC_seen & REG_SEEN_GPOS)
4844         r->extflags |= RXf_GPOS_SEEN;
4845     if (RExC_seen & REG_SEEN_LOOKBEHIND)
4846         r->extflags |= RXf_LOOKBEHIND_SEEN;
4847     if (RExC_seen & REG_SEEN_EVAL)
4848         r->extflags |= RXf_EVAL_SEEN;
4849     if (RExC_seen & REG_SEEN_CANY)
4850         r->extflags |= RXf_CANY_SEEN;
4851     if (RExC_seen & REG_SEEN_VERBARG)
4852         r->intflags |= PREGf_VERBARG_SEEN;
4853     if (RExC_seen & REG_SEEN_CUTGROUP)
4854         r->intflags |= PREGf_CUTGROUP_SEEN;
4855     if (RExC_paren_names)
4856         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4857     else
4858         RXp_PAREN_NAMES(r) = NULL;
4859
4860 #ifdef STUPID_PATTERN_CHECKS
4861     if (RX_PRELEN(rx) == 0)
4862         r->extflags |= RXf_NULL;
4863     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4864         /* XXX: this should happen BEFORE we compile */
4865         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4866     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4867         r->extflags |= RXf_WHITE;
4868     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4869         r->extflags |= RXf_START_ONLY;
4870 #else
4871     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4872             /* XXX: this should happen BEFORE we compile */
4873             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4874     else {
4875         regnode *first = ri->program + 1;
4876         U8 fop = OP(first);
4877         U8 nop = OP(NEXTOPER(first));
4878
4879         if (PL_regkind[fop] == NOTHING && nop == END)
4880             r->extflags |= RXf_NULL;
4881         else if (PL_regkind[fop] == BOL && nop == END)
4882             r->extflags |= RXf_START_ONLY;
4883         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4884             r->extflags |= RXf_WHITE;
4885     }
4886 #endif
4887 #ifdef DEBUGGING
4888     if (RExC_paren_names) {
4889         ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4890         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4891     } else
4892 #endif
4893         ri->name_list_idx = 0;
4894
4895     if (RExC_recurse_count) {
4896         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4897             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4898             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4899         }
4900     }
4901     Newxz(r->offs, RExC_npar, regexp_paren_pair);
4902     /* assume we don't need to swap parens around before we match */
4903
4904     DEBUG_DUMP_r({
4905         PerlIO_printf(Perl_debug_log,"Final program:\n");
4906         regdump(r);
4907     });
4908 #ifdef RE_TRACK_PATTERN_OFFSETS
4909     DEBUG_OFFSETS_r(if (ri->u.offsets) {
4910         const U32 len = ri->u.offsets[0];
4911         U32 i;
4912         GET_RE_DEBUG_FLAGS_DECL;
4913         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4914         for (i = 1; i <= len; i++) {
4915             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4916                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4917                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4918             }
4919         PerlIO_printf(Perl_debug_log, "\n");
4920     });
4921 #endif
4922     return rx;
4923 }
4924
4925 #undef RE_ENGINE_PTR
4926
4927
4928 SV*
4929 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4930                     const U32 flags)
4931 {
4932     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4933
4934     PERL_UNUSED_ARG(value);
4935
4936     if (flags & RXapif_FETCH) {
4937         return reg_named_buff_fetch(rx, key, flags);
4938     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4939         Perl_croak(aTHX_ "%s", PL_no_modify);
4940         return NULL;
4941     } else if (flags & RXapif_EXISTS) {
4942         return reg_named_buff_exists(rx, key, flags)
4943             ? &PL_sv_yes
4944             : &PL_sv_no;
4945     } else if (flags & RXapif_REGNAMES) {
4946         return reg_named_buff_all(rx, flags);
4947     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4948         return reg_named_buff_scalar(rx, flags);
4949     } else {
4950         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4951         return NULL;
4952     }
4953 }
4954
4955 SV*
4956 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4957                          const U32 flags)
4958 {
4959     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4960     PERL_UNUSED_ARG(lastkey);
4961
4962     if (flags & RXapif_FIRSTKEY)
4963         return reg_named_buff_firstkey(rx, flags);
4964     else if (flags & RXapif_NEXTKEY)
4965         return reg_named_buff_nextkey(rx, flags);
4966     else {
4967         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4968         return NULL;
4969     }
4970 }
4971
4972 SV*
4973 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4974                           const U32 flags)
4975 {
4976     AV *retarray = NULL;
4977     SV *ret;
4978     struct regexp *const rx = (struct regexp *)SvANY(r);
4979
4980     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4981
4982     if (flags & RXapif_ALL)
4983         retarray=newAV();
4984
4985     if (rx && RXp_PAREN_NAMES(rx)) {
4986         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
4987         if (he_str) {
4988             IV i;
4989             SV* sv_dat=HeVAL(he_str);
4990             I32 *nums=(I32*)SvPVX(sv_dat);
4991             for ( i=0; i<SvIVX(sv_dat); i++ ) {
4992                 if ((I32)(rx->nparens) >= nums[i]
4993                     && rx->offs[nums[i]].start != -1
4994                     && rx->offs[nums[i]].end != -1)
4995                 {
4996                     ret = newSVpvs("");
4997                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
4998                     if (!retarray)
4999                         return ret;
5000                 } else {
5001                     ret = newSVsv(&PL_sv_undef);
5002                 }
5003                 if (retarray)
5004                     av_push(retarray, ret);
5005             }
5006             if (retarray)
5007                 return newRV_noinc(MUTABLE_SV(retarray));
5008         }
5009     }
5010     return NULL;
5011 }
5012
5013 bool
5014 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5015                            const U32 flags)
5016 {
5017     struct regexp *const rx = (struct regexp *)SvANY(r);
5018
5019     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5020
5021     if (rx && RXp_PAREN_NAMES(rx)) {
5022         if (flags & RXapif_ALL) {
5023             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5024         } else {
5025             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5026             if (sv) {
5027                 SvREFCNT_dec(sv);
5028                 return TRUE;
5029             } else {
5030                 return FALSE;
5031             }
5032         }
5033     } else {
5034         return FALSE;
5035     }
5036 }
5037
5038 SV*
5039 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5040 {
5041     struct regexp *const rx = (struct regexp *)SvANY(r);
5042
5043     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5044
5045     if ( rx && RXp_PAREN_NAMES(rx) ) {
5046         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5047
5048         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5049     } else {
5050         return FALSE;
5051     }
5052 }
5053
5054 SV*
5055 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5056 {
5057     struct regexp *const rx = (struct regexp *)SvANY(r);
5058     GET_RE_DEBUG_FLAGS_DECL;
5059
5060     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5061
5062     if (rx && RXp_PAREN_NAMES(rx)) {
5063         HV *hv = RXp_PAREN_NAMES(rx);
5064         HE *temphe;
5065         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5066             IV i;
5067             IV parno = 0;
5068             SV* sv_dat = HeVAL(temphe);
5069             I32 *nums = (I32*)SvPVX(sv_dat);
5070             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5071                 if ((I32)(rx->lastparen) >= nums[i] &&
5072                     rx->offs[nums[i]].start != -1 &&
5073                     rx->offs[nums[i]].end != -1)
5074                 {
5075                     parno = nums[i];
5076                     break;
5077                 }
5078             }
5079             if (parno || flags & RXapif_ALL) {
5080                 return newSVhek(HeKEY_hek(temphe));
5081             }
5082         }
5083     }
5084     return NULL;
5085 }
5086
5087 SV*
5088 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5089 {
5090     SV *ret;
5091     AV *av;
5092     I32 length;
5093     struct regexp *const rx = (struct regexp *)SvANY(r);
5094
5095     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5096
5097     if (rx && RXp_PAREN_NAMES(rx)) {
5098         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5099             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5100         } else if (flags & RXapif_ONE) {
5101             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5102             av = MUTABLE_AV(SvRV(ret));
5103             length = av_len(av);
5104             SvREFCNT_dec(ret);
5105             return newSViv(length + 1);
5106         } else {
5107             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5108             return NULL;
5109         }
5110     }
5111     return &PL_sv_undef;
5112 }
5113
5114 SV*
5115 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5116 {
5117     struct regexp *const rx = (struct regexp *)SvANY(r);
5118     AV *av = newAV();
5119
5120     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5121
5122     if (rx && RXp_PAREN_NAMES(rx)) {
5123         HV *hv= RXp_PAREN_NAMES(rx);
5124         HE *temphe;
5125         (void)hv_iterinit(hv);
5126         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5127             IV i;
5128             IV parno = 0;
5129             SV* sv_dat = HeVAL(temphe);
5130             I32 *nums = (I32*)SvPVX(sv_dat);
5131             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5132                 if ((I32)(rx->lastparen) >= nums[i] &&
5133                     rx->offs[nums[i]].start != -1 &&
5134                     rx->offs[nums[i]].end != -1)
5135                 {
5136                     parno = nums[i];
5137                     break;
5138                 }
5139             }
5140             if (parno || flags & RXapif_ALL) {
5141                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5142             }
5143         }
5144     }
5145
5146     return newRV_noinc(MUTABLE_SV(av));
5147 }
5148
5149 void
5150 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5151                              SV * const sv)
5152 {
5153     struct regexp *const rx = (struct regexp *)SvANY(r);
5154     char *s = NULL;
5155     I32 i = 0;
5156     I32 s1, t1;
5157
5158     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5159
5160     if (!rx->subbeg) {
5161         sv_setsv(sv,&PL_sv_undef);
5162         return;
5163     }
5164     else
5165     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5166         /* $` */
5167         i = rx->offs[0].start;
5168         s = rx->subbeg;
5169     }
5170     else
5171     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5172         /* $' */
5173         s = rx->subbeg + rx->offs[0].end;
5174         i = rx->sublen - rx->offs[0].end;
5175     }
5176     else
5177     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5178         (s1 = rx->offs[paren].start) != -1 &&
5179         (t1 = rx->offs[paren].end) != -1)
5180     {
5181         /* $& $1 ... */
5182         i = t1 - s1;
5183         s = rx->subbeg + s1;
5184     } else {
5185         sv_setsv(sv,&PL_sv_undef);
5186         return;
5187     }
5188     assert(rx->sublen >= (s - rx->subbeg) + i );
5189     if (i >= 0) {
5190         const int oldtainted = PL_tainted;
5191         TAINT_NOT;
5192         sv_setpvn(sv, s, i);
5193         PL_tainted = oldtainted;
5194         if ( (rx->extflags & RXf_CANY_SEEN)
5195             ? (RXp_MATCH_UTF8(rx)
5196                         && (!i || is_utf8_string((U8*)s, i)))
5197             : (RXp_MATCH_UTF8(rx)) )
5198         {
5199             SvUTF8_on(sv);
5200         }
5201         else
5202             SvUTF8_off(sv);
5203         if (PL_tainting) {
5204             if (RXp_MATCH_TAINTED(rx)) {
5205                 if (SvTYPE(sv) >= SVt_PVMG) {
5206                     MAGIC* const mg = SvMAGIC(sv);
5207                     MAGIC* mgt;
5208                     PL_tainted = 1;
5209                     SvMAGIC_set(sv, mg->mg_moremagic);
5210                     SvTAINT(sv);
5211                     if ((mgt = SvMAGIC(sv))) {
5212                         mg->mg_moremagic = mgt;
5213                         SvMAGIC_set(sv, mg);
5214                     }
5215                 } else {
5216                     PL_tainted = 1;
5217                     SvTAINT(sv);
5218                 }
5219             } else
5220                 SvTAINTED_off(sv);
5221         }
5222     } else {
5223         sv_setsv(sv,&PL_sv_undef);
5224         return;
5225     }
5226 }
5227
5228 void
5229 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5230                                                          SV const * const value)
5231 {
5232     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5233
5234     PERL_UNUSED_ARG(rx);
5235     PERL_UNUSED_ARG(paren);
5236     PERL_UNUSED_ARG(value);
5237
5238     if (!PL_localizing)
5239         Perl_croak(aTHX_ "%s", PL_no_modify);
5240 }
5241
5242 I32
5243 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5244                               const I32 paren)
5245 {
5246     struct regexp *const rx = (struct regexp *)SvANY(r);
5247     I32 i;
5248     I32 s1, t1;
5249
5250     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5251
5252     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5253         switch (paren) {
5254       /* $` / ${^PREMATCH} */
5255       case RX_BUFF_IDX_PREMATCH:
5256         if (rx->offs[0].start != -1) {
5257                         i = rx->offs[0].start;
5258                         if (i > 0) {
5259                                 s1 = 0;
5260                                 t1 = i;
5261                                 goto getlen;
5262                         }
5263             }
5264         return 0;
5265       /* $' / ${^POSTMATCH} */
5266       case RX_BUFF_IDX_POSTMATCH:
5267             if (rx->offs[0].end != -1) {
5268                         i = rx->sublen - rx->offs[0].end;
5269                         if (i > 0) {
5270                                 s1 = rx->offs[0].end;
5271                                 t1 = rx->sublen;
5272                                 goto getlen;
5273                         }
5274             }
5275         return 0;
5276       /* $& / ${^MATCH}, $1, $2, ... */
5277       default:
5278             if (paren <= (I32)rx->nparens &&
5279             (s1 = rx->offs[paren].start) != -1 &&
5280             (t1 = rx->offs[paren].end) != -1)
5281             {
5282             i = t1 - s1;
5283             goto getlen;
5284         } else {
5285             if (ckWARN(WARN_UNINITIALIZED))
5286                 report_uninit((const SV *)sv);
5287             return 0;
5288         }
5289     }
5290   getlen:
5291     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5292         const char * const s = rx->subbeg + s1;
5293         const U8 *ep;
5294         STRLEN el;
5295
5296         i = t1 - s1;
5297         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5298                         i = el;
5299     }
5300     return i;
5301 }
5302
5303 SV*
5304 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5305 {
5306     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5307         PERL_UNUSED_ARG(rx);
5308         if (0)
5309             return NULL;
5310         else
5311             return newSVpvs("Regexp");
5312 }
5313
5314 /* Scans the name of a named buffer from the pattern.
5315  * If flags is REG_RSN_RETURN_NULL returns null.
5316  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5317  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5318  * to the parsed name as looked up in the RExC_paren_names hash.
5319  * If there is an error throws a vFAIL().. type exception.
5320  */
5321
5322 #define REG_RSN_RETURN_NULL    0
5323 #define REG_RSN_RETURN_NAME    1
5324 #define REG_RSN_RETURN_DATA    2
5325
5326 STATIC SV*
5327 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5328 {
5329     char *name_start = RExC_parse;
5330
5331     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5332
5333     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5334          /* skip IDFIRST by using do...while */
5335         if (UTF)
5336             do {
5337                 RExC_parse += UTF8SKIP(RExC_parse);
5338             } while (isALNUM_utf8((U8*)RExC_parse));
5339         else
5340             do {
5341                 RExC_parse++;
5342             } while (isALNUM(*RExC_parse));
5343     }
5344
5345     if ( flags ) {
5346         SV* sv_name
5347             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5348                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5349         if ( flags == REG_RSN_RETURN_NAME)
5350             return sv_name;
5351         else if (flags==REG_RSN_RETURN_DATA) {
5352             HE *he_str = NULL;
5353             SV *sv_dat = NULL;
5354             if ( ! sv_name )      /* should not happen*/
5355                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5356             if (RExC_paren_names)
5357                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5358             if ( he_str )
5359                 sv_dat = HeVAL(he_str);
5360             if ( ! sv_dat )
5361                 vFAIL("Reference to nonexistent named group");
5362             return sv_dat;
5363         }
5364         else {
5365             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5366         }
5367         /* NOT REACHED */
5368     }
5369     return NULL;
5370 }
5371
5372 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5373     int rem=(int)(RExC_end - RExC_parse);                       \
5374     int cut;                                                    \
5375     int num;                                                    \
5376     int iscut=0;                                                \
5377     if (rem>10) {                                               \
5378         rem=10;                                                 \
5379         iscut=1;                                                \
5380     }                                                           \
5381     cut=10-rem;                                                 \
5382     if (RExC_lastparse!=RExC_parse)                             \
5383         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5384             rem, RExC_parse,                                    \
5385             cut + 4,                                            \
5386             iscut ? "..." : "<"                                 \
5387         );                                                      \
5388     else                                                        \
5389         PerlIO_printf(Perl_debug_log,"%16s","");                \
5390                                                                 \
5391     if (SIZE_ONLY)                                              \
5392        num = RExC_size + 1;                                     \
5393     else                                                        \
5394        num=REG_NODE_NUM(RExC_emit);                             \
5395     if (RExC_lastnum!=num)                                      \
5396        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5397     else                                                        \
5398        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5399     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5400         (int)((depth*2)), "",                                   \
5401         (funcname)                                              \
5402     );                                                          \
5403     RExC_lastnum=num;                                           \
5404     RExC_lastparse=RExC_parse;                                  \
5405 })
5406
5407
5408
5409 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5410     DEBUG_PARSE_MSG((funcname));                            \
5411     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5412 })
5413 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5414     DEBUG_PARSE_MSG((funcname));                            \
5415     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5416 })
5417 /*
5418  - reg - regular expression, i.e. main body or parenthesized thing
5419  *
5420  * Caller must absorb opening parenthesis.
5421  *
5422  * Combining parenthesis handling with the base level of regular expression
5423  * is a trifle forced, but the need to tie the tails of the branches to what
5424  * follows makes it hard to avoid.
5425  */
5426 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5427 #ifdef DEBUGGING
5428 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5429 #else
5430 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5431 #endif
5432
5433 STATIC regnode *
5434 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5435     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5436 {
5437     dVAR;
5438     register regnode *ret;              /* Will be the head of the group. */
5439     register regnode *br;
5440     register regnode *lastbr;
5441     register regnode *ender = NULL;
5442     register I32 parno = 0;
5443     I32 flags;
5444     U32 oregflags = RExC_flags;
5445     bool have_branch = 0;
5446     bool is_open = 0;
5447     I32 freeze_paren = 0;
5448     I32 after_freeze = 0;
5449
5450     /* for (?g), (?gc), and (?o) warnings; warning
5451        about (?c) will warn about (?g) -- japhy    */
5452
5453 #define WASTED_O  0x01
5454 #define WASTED_G  0x02
5455 #define WASTED_C  0x04
5456 #define WASTED_GC (0x02|0x04)
5457     I32 wastedflags = 0x00;
5458
5459     char * parse_start = RExC_parse; /* MJD */
5460     char * const oregcomp_parse = RExC_parse;
5461
5462     GET_RE_DEBUG_FLAGS_DECL;
5463
5464     PERL_ARGS_ASSERT_REG;
5465     DEBUG_PARSE("reg ");
5466
5467     *flagp = 0;                         /* Tentatively. */
5468
5469
5470     /* Make an OPEN node, if parenthesized. */
5471     if (paren) {
5472         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5473             char *start_verb = RExC_parse;
5474             STRLEN verb_len = 0;
5475             char *start_arg = NULL;
5476             unsigned char op = 0;
5477             int argok = 1;
5478             int internal_argval = 0; /* internal_argval is only useful if !argok */
5479             while ( *RExC_parse && *RExC_parse != ')' ) {
5480                 if ( *RExC_parse == ':' ) {
5481                     start_arg = RExC_parse + 1;
5482                     break;
5483                 }
5484                 RExC_parse++;
5485             }
5486             ++start_verb;
5487             verb_len = RExC_parse - start_verb;
5488             if ( start_arg ) {
5489                 RExC_parse++;
5490                 while ( *RExC_parse && *RExC_parse != ')' )
5491                     RExC_parse++;
5492                 if ( *RExC_parse != ')' )
5493                     vFAIL("Unterminated verb pattern argument");
5494                 if ( RExC_parse == start_arg )
5495                     start_arg = NULL;
5496             } else {
5497                 if ( *RExC_parse != ')' )
5498                     vFAIL("Unterminated verb pattern");
5499             }
5500
5501             switch ( *start_verb ) {
5502             case 'A':  /* (*ACCEPT) */
5503                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5504                     op = ACCEPT;
5505                     internal_argval = RExC_nestroot;
5506                 }
5507                 break;
5508             case 'C':  /* (*COMMIT) */
5509                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5510                     op = COMMIT;
5511                 break;
5512             case 'F':  /* (*FAIL) */
5513                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5514                     op = OPFAIL;
5515                     argok = 0;
5516                 }
5517                 break;
5518             case ':':  /* (*:NAME) */
5519             case 'M':  /* (*MARK:NAME) */
5520                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5521                     op = MARKPOINT;
5522                     argok = -1;
5523                 }
5524                 break;
5525             case 'P':  /* (*PRUNE) */
5526                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5527                     op = PRUNE;
5528                 break;
5529             case 'S':   /* (*SKIP) */
5530                 if ( memEQs(start_verb,verb_len,"SKIP") )
5531                     op = SKIP;
5532                 break;
5533             case 'T':  /* (*THEN) */
5534                 /* [19:06] <TimToady> :: is then */
5535                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5536                     op = CUTGROUP;
5537                     RExC_seen |= REG_SEEN_CUTGROUP;
5538                 }
5539                 break;
5540             }
5541             if ( ! op ) {
5542                 RExC_parse++;
5543                 vFAIL3("Unknown verb pattern '%.*s'",
5544                     verb_len, start_verb);
5545             }
5546             if ( argok ) {
5547                 if ( start_arg && internal_argval ) {
5548                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5549                         verb_len, start_verb);
5550                 } else if ( argok < 0 && !start_arg ) {
5551                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5552                         verb_len, start_verb);
5553                 } else {
5554                     ret = reganode(pRExC_state, op, internal_argval);
5555                     if ( ! internal_argval && ! SIZE_ONLY ) {
5556                         if (start_arg) {
5557                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5558                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5559                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5560                             ret->flags = 0;
5561                         } else {
5562                             ret->flags = 1;
5563                         }
5564                     }
5565                 }
5566                 if (!internal_argval)
5567                     RExC_seen |= REG_SEEN_VERBARG;
5568             } else if ( start_arg ) {
5569                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5570                         verb_len, start_verb);
5571             } else {
5572                 ret = reg_node(pRExC_state, op);
5573             }
5574             nextchar(pRExC_state);
5575             return ret;
5576         } else
5577         if (*RExC_parse == '?') { /* (?...) */
5578             bool is_logical = 0;
5579             const char * const seqstart = RExC_parse;
5580
5581             RExC_parse++;
5582             paren = *RExC_parse++;
5583             ret = NULL;                 /* For look-ahead/behind. */
5584             switch (paren) {
5585
5586             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5587                 paren = *RExC_parse++;
5588                 if ( paren == '<')         /* (?P<...>) named capture */
5589                     goto named_capture;
5590                 else if (paren == '>') {   /* (?P>name) named recursion */
5591                     goto named_recursion;
5592                 }
5593                 else if (paren == '=') {   /* (?P=...)  named backref */
5594                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5595                        you change this make sure you change that */
5596                     char* name_start = RExC_parse;
5597                     U32 num = 0;
5598                     SV *sv_dat = reg_scan_name(pRExC_state,
5599                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5600                     if (RExC_parse == name_start || *RExC_parse != ')')
5601                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5602
5603                     if (!SIZE_ONLY) {
5604                         num = add_data( pRExC_state, 1, "S" );
5605                         RExC_rxi->data->data[num]=(void*)sv_dat;
5606                         SvREFCNT_inc_simple_void(sv_dat);
5607                     }
5608                     RExC_sawback = 1;
5609                     ret = reganode(pRExC_state,
5610                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5611                            num);
5612                     *flagp |= HASWIDTH;
5613
5614                     Set_Node_Offset(ret, parse_start+1);
5615                     Set_Node_Cur_Length(ret); /* MJD */
5616
5617                     nextchar(pRExC_state);
5618                     return ret;
5619                 }
5620                 RExC_parse++;
5621                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5622                 /*NOTREACHED*/
5623             case '<':           /* (?<...) */
5624                 if (*RExC_parse == '!')
5625                     paren = ',';
5626                 else if (*RExC_parse != '=')
5627               named_capture:
5628                 {               /* (?<...>) */
5629                     char *name_start;
5630                     SV *svname;
5631                     paren= '>';
5632             case '\'':          /* (?'...') */
5633                     name_start= RExC_parse;
5634                     svname = reg_scan_name(pRExC_state,
5635                         SIZE_ONLY ?  /* reverse test from the others */
5636                         REG_RSN_RETURN_NAME :
5637                         REG_RSN_RETURN_NULL);
5638                     if (RExC_parse == name_start) {
5639                         RExC_parse++;
5640                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5641                         /*NOTREACHED*/
5642                     }
5643                     if (*RExC_parse != paren)
5644                         vFAIL2("Sequence (?%c... not terminated",
5645                             paren=='>' ? '<' : paren);
5646                     if (SIZE_ONLY) {
5647                         HE *he_str;
5648                         SV *sv_dat = NULL;
5649                         if (!svname) /* shouldnt happen */
5650                             Perl_croak(aTHX_
5651                                 "panic: reg_scan_name returned NULL");
5652                         if (!RExC_paren_names) {
5653                             RExC_paren_names= newHV();
5654                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5655 #ifdef DEBUGGING
5656                             RExC_paren_name_list= newAV();
5657                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5658 #endif
5659                         }
5660                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5661                         if ( he_str )
5662                             sv_dat = HeVAL(he_str);
5663                         if ( ! sv_dat ) {
5664                             /* croak baby croak */
5665                             Perl_croak(aTHX_
5666                                 "panic: paren_name hash element allocation failed");
5667                         } else if ( SvPOK(sv_dat) ) {
5668                             /* (?|...) can mean we have dupes so scan to check
5669                                its already been stored. Maybe a flag indicating
5670                                we are inside such a construct would be useful,
5671                                but the arrays are likely to be quite small, so
5672                                for now we punt -- dmq */
5673                             IV count = SvIV(sv_dat);
5674                             I32 *pv = (I32*)SvPVX(sv_dat);
5675                             IV i;
5676                             for ( i = 0 ; i < count ; i++ ) {
5677                                 if ( pv[i] == RExC_npar ) {
5678                                     count = 0;
5679                                     break;
5680                                 }
5681                             }
5682                             if ( count ) {
5683                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5684                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5685                                 pv[count] = RExC_npar;
5686                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5687                             }
5688                         } else {
5689                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5690                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5691                             SvIOK_on(sv_dat);
5692                             SvIV_set(sv_dat, 1);
5693                         }
5694 #ifdef DEBUGGING
5695                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5696                             SvREFCNT_dec(svname);
5697 #endif
5698
5699                         /*sv_dump(sv_dat);*/
5700                     }
5701                     nextchar(pRExC_state);
5702                     paren = 1;
5703                     goto capturing_parens;
5704                 }
5705                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5706                 RExC_parse++;
5707             case '=':           /* (?=...) */
5708                 RExC_seen_zerolen++;
5709                         break;
5710             case '!':           /* (?!...) */
5711                 RExC_seen_zerolen++;
5712                 if (*RExC_parse == ')') {
5713                     ret=reg_node(pRExC_state, OPFAIL);
5714                     nextchar(pRExC_state);
5715                     return ret;
5716                 }
5717                 break;
5718             case '|':           /* (?|...) */
5719                 /* branch reset, behave like a (?:...) except that
5720                    buffers in alternations share the same numbers */
5721                 paren = ':';
5722                 after_freeze = freeze_paren = RExC_npar;
5723                 break;
5724             case ':':           /* (?:...) */
5725             case '>':           /* (?>...) */
5726                 break;
5727             case '$':           /* (?$...) */
5728             case '@':           /* (?@...) */
5729                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5730                 break;
5731             case '#':           /* (?#...) */
5732                 while (*RExC_parse && *RExC_parse != ')')
5733                     RExC_parse++;
5734                 if (*RExC_parse != ')')
5735                     FAIL("Sequence (?#... not terminated");
5736                 nextchar(pRExC_state);
5737                 *flagp = TRYAGAIN;
5738                 return NULL;
5739             case '0' :           /* (?0) */
5740             case 'R' :           /* (?R) */
5741                 if (*RExC_parse != ')')
5742                     FAIL("Sequence (?R) not terminated");
5743                 ret = reg_node(pRExC_state, GOSTART);
5744                 *flagp |= POSTPONED;
5745                 nextchar(pRExC_state);
5746                 return ret;
5747                 /*notreached*/
5748             { /* named and numeric backreferences */
5749                 I32 num;
5750             case '&':            /* (?&NAME) */
5751                 parse_start = RExC_parse - 1;
5752               named_recursion:
5753                 {
5754                     SV *sv_dat = reg_scan_name(pRExC_state,
5755                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5756                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5757                 }
5758                 goto gen_recurse_regop;
5759                 /* NOT REACHED */
5760             case '+':
5761                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5762                     RExC_parse++;
5763                     vFAIL("Illegal pattern");
5764                 }
5765                 goto parse_recursion;
5766                 /* NOT REACHED*/
5767             case '-': /* (?-1) */
5768                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5769                     RExC_parse--; /* rewind to let it be handled later */
5770                     goto parse_flags;
5771                 }
5772                 /*FALLTHROUGH */
5773             case '1': case '2': case '3': case '4': /* (?1) */
5774             case '5': case '6': case '7': case '8': case '9':
5775                 RExC_parse--;
5776               parse_recursion:
5777                 num = atoi(RExC_parse);
5778                 parse_start = RExC_parse - 1; /* MJD */
5779                 if (*RExC_parse == '-')
5780                     RExC_parse++;
5781                 while (isDIGIT(*RExC_parse))
5782                         RExC_parse++;
5783                 if (*RExC_parse!=')')
5784                     vFAIL("Expecting close bracket");
5785
5786               gen_recurse_regop:
5787                 if ( paren == '-' ) {
5788                     /*
5789                     Diagram of capture buffer numbering.
5790                     Top line is the normal capture buffer numbers
5791                     Botton line is the negative indexing as from
5792                     the X (the (?-2))
5793
5794                     +   1 2    3 4 5 X          6 7
5795                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5796                     -   5 4    3 2 1 X          x x
5797
5798                     */
5799                     num = RExC_npar + num;
5800                     if (num < 1)  {
5801                         RExC_parse++;
5802                         vFAIL("Reference to nonexistent group");
5803                     }
5804                 } else if ( paren == '+' ) {
5805                     num = RExC_npar + num - 1;
5806                 }
5807
5808                 ret = reganode(pRExC_state, GOSUB, num);
5809                 if (!SIZE_ONLY) {
5810                     if (num > (I32)RExC_rx->nparens) {
5811                         RExC_parse++;
5812                         vFAIL("Reference to nonexistent group");
5813                     }
5814                     ARG2L_SET( ret, RExC_recurse_count++);
5815                     RExC_emit++;
5816                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5817                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5818                 } else {
5819                     RExC_size++;
5820                 }
5821                 RExC_seen |= REG_SEEN_RECURSE;
5822                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5823                 Set_Node_Offset(ret, parse_start); /* MJD */
5824
5825                 *flagp |= POSTPONED;
5826                 nextchar(pRExC_state);
5827                 return ret;
5828             } /* named and numeric backreferences */
5829             /* NOT REACHED */
5830
5831             case '?':           /* (??...) */
5832                 is_logical = 1;
5833                 if (*RExC_parse != '{') {
5834                     RExC_parse++;
5835                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5836                     /*NOTREACHED*/
5837                 }
5838                 *flagp |= POSTPONED;
5839                 paren = *RExC_parse++;
5840                 /* FALL THROUGH */
5841             case '{':           /* (?{...}) */
5842             {
5843                 I32 count = 1;
5844                 U32 n = 0;
5845                 char c;
5846                 char *s = RExC_parse;
5847
5848                 RExC_seen_zerolen++;
5849                 RExC_seen |= REG_SEEN_EVAL;
5850                 while (count && (c = *RExC_parse)) {
5851                     if (c == '\\') {
5852                         if (RExC_parse[1])
5853                             RExC_parse++;
5854                     }
5855                     else if (c == '{')
5856                         count++;
5857                     else if (c == '}')
5858                         count--;
5859                     RExC_parse++;
5860                 }
5861                 if (*RExC_parse != ')') {
5862                     RExC_parse = s;
5863                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5864                 }
5865                 if (!SIZE_ONLY) {
5866                     PAD *pad;
5867                     OP_4tree *sop, *rop;
5868                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5869
5870                     ENTER;
5871                     Perl_save_re_context(aTHX);
5872                     rop = sv_compile_2op(sv, &sop, "re", &pad);
5873                     sop->op_private |= OPpREFCOUNTED;
5874                     /* re_dup will OpREFCNT_inc */
5875                     OpREFCNT_set(sop, 1);
5876                     LEAVE;
5877
5878                     n = add_data(pRExC_state, 3, "nop");
5879                     RExC_rxi->data->data[n] = (void*)rop;
5880                     RExC_rxi->data->data[n+1] = (void*)sop;
5881                     RExC_rxi->data->data[n+2] = (void*)pad;
5882                     SvREFCNT_dec(sv);
5883                 }
5884                 else {                                          /* First pass */
5885                     if (PL_reginterp_cnt < ++RExC_seen_evals
5886                         && IN_PERL_RUNTIME)
5887                         /* No compiled RE interpolated, has runtime
5888                            components ===> unsafe.  */
5889                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
5890                     if (PL_tainting && PL_tainted)
5891                         FAIL("Eval-group in insecure regular expression");
5892 #if PERL_VERSION > 8
5893                     if (IN_PERL_COMPILETIME)
5894                         PL_cv_has_eval = 1;
5895 #endif
5896                 }
5897
5898                 nextchar(pRExC_state);
5899                 if (is_logical) {
5900                     ret = reg_node(pRExC_state, LOGICAL);
5901                     if (!SIZE_ONLY)
5902                         ret->flags = 2;
5903                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5904                     /* deal with the length of this later - MJD */
5905                     return ret;
5906                 }
5907                 ret = reganode(pRExC_state, EVAL, n);
5908                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5909                 Set_Node_Offset(ret, parse_start);
5910                 return ret;
5911             }
5912             case '(':           /* (?(?{...})...) and (?(?=...)...) */
5913             {
5914                 int is_define= 0;
5915                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
5916                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5917                         || RExC_parse[1] == '<'
5918                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
5919                         I32 flag;
5920
5921                         ret = reg_node(pRExC_state, LOGICAL);
5922                         if (!SIZE_ONLY)
5923                             ret->flags = 1;
5924                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5925                         goto insert_if;
5926                     }
5927                 }
5928                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
5929                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5930                 {
5931                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
5932                     char *name_start= RExC_parse++;
5933                     U32 num = 0;
5934                     SV *sv_dat=reg_scan_name(pRExC_state,
5935                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5936                     if (RExC_parse == name_start || *RExC_parse != ch)
5937                         vFAIL2("Sequence (?(%c... not terminated",
5938                             (ch == '>' ? '<' : ch));
5939                     RExC_parse++;
5940                     if (!SIZE_ONLY) {
5941                         num = add_data( pRExC_state, 1, "S" );
5942                         RExC_rxi->data->data[num]=(void*)sv_dat;
5943                         SvREFCNT_inc_simple_void(sv_dat);
5944                     }
5945                     ret = reganode(pRExC_state,NGROUPP,num);
5946                     goto insert_if_check_paren;
5947                 }
5948                 else if (RExC_parse[0] == 'D' &&
5949                          RExC_parse[1] == 'E' &&
5950                          RExC_parse[2] == 'F' &&
5951                          RExC_parse[3] == 'I' &&
5952                          RExC_parse[4] == 'N' &&
5953                          RExC_parse[5] == 'E')
5954                 {
5955                     ret = reganode(pRExC_state,DEFINEP,0);
5956                     RExC_parse +=6 ;
5957                     is_define = 1;
5958                     goto insert_if_check_paren;
5959                 }
5960                 else if (RExC_parse[0] == 'R') {
5961                     RExC_parse++;
5962                     parno = 0;
5963                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5964                         parno = atoi(RExC_parse++);
5965                         while (isDIGIT(*RExC_parse))
5966                             RExC_parse++;
5967                     } else if (RExC_parse[0] == '&') {
5968                         SV *sv_dat;
5969                         RExC_parse++;
5970                         sv_dat = reg_scan_name(pRExC_state,
5971                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5972                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5973                     }
5974                     ret = reganode(pRExC_state,INSUBP,parno);
5975                     goto insert_if_check_paren;
5976                 }
5977                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5978                     /* (?(1)...) */
5979                     char c;
5980                     parno = atoi(RExC_parse++);
5981
5982                     while (isDIGIT(*RExC_parse))
5983                         RExC_parse++;
5984                     ret = reganode(pRExC_state, GROUPP, parno);
5985
5986                  insert_if_check_paren:
5987                     if ((c = *nextchar(pRExC_state)) != ')')
5988                         vFAIL("Switch condition not recognized");
5989                   insert_if:
5990                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5991                     br = regbranch(pRExC_state, &flags, 1,depth+1);
5992                     if (br == NULL)
5993                         br = reganode(pRExC_state, LONGJMP, 0);
5994                     else
5995                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5996                     c = *nextchar(pRExC_state);
5997                     if (flags&HASWIDTH)
5998                         *flagp |= HASWIDTH;
5999                     if (c == '|') {
6000                         if (is_define)
6001                             vFAIL("(?(DEFINE)....) does not allow branches");
6002                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6003                         regbranch(pRExC_state, &flags, 1,depth+1);
6004                         REGTAIL(pRExC_state, ret, lastbr);
6005                         if (flags&HASWIDTH)
6006                             *flagp |= HASWIDTH;
6007                         c = *nextchar(pRExC_state);
6008                     }
6009                     else
6010                         lastbr = NULL;
6011                     if (c != ')')
6012                         vFAIL("Switch (?(condition)... contains too many branches");
6013                     ender = reg_node(pRExC_state, TAIL);
6014                     REGTAIL(pRExC_state, br, ender);
6015                     if (lastbr) {
6016                         REGTAIL(pRExC_state, lastbr, ender);
6017                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6018                     }
6019                     else
6020                         REGTAIL(pRExC_state, ret, ender);
6021                     RExC_size++; /* XXX WHY do we need this?!!
6022                                     For large programs it seems to be required
6023                                     but I can't figure out why. -- dmq*/
6024                     return ret;
6025                 }
6026                 else {
6027                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6028                 }
6029             }
6030             case 0:
6031                 RExC_parse--; /* for vFAIL to print correctly */
6032                 vFAIL("Sequence (? incomplete");
6033                 break;
6034             default:
6035                 --RExC_parse;
6036                 parse_flags:      /* (?i) */
6037             {
6038                 U32 posflags = 0, negflags = 0;
6039                 U32 *flagsp = &posflags;
6040
6041                 while (*RExC_parse) {
6042                     /* && strchr("iogcmsx", *RExC_parse) */
6043                     /* (?g), (?gc) and (?o) are useless here
6044                        and must be globally applied -- japhy */
6045                     switch (*RExC_parse) {
6046                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6047                     case ONCE_PAT_MOD: /* 'o' */
6048                     case GLOBAL_PAT_MOD: /* 'g' */
6049                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6050                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6051                             if (! (wastedflags & wflagbit) ) {
6052                                 wastedflags |= wflagbit;
6053                                 vWARN5(
6054                                     RExC_parse + 1,
6055                                     "Useless (%s%c) - %suse /%c modifier",
6056                                     flagsp == &negflags ? "?-" : "?",
6057                                     *RExC_parse,
6058                                     flagsp == &negflags ? "don't " : "",
6059                                     *RExC_parse
6060                                 );
6061                             }
6062                         }
6063                         break;
6064
6065                     case CONTINUE_PAT_MOD: /* 'c' */
6066                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6067                             if (! (wastedflags & WASTED_C) ) {
6068                                 wastedflags |= WASTED_GC;
6069                                 vWARN3(
6070                                     RExC_parse + 1,
6071                                     "Useless (%sc) - %suse /gc modifier",
6072                                     flagsp == &negflags ? "?-" : "?",
6073                                     flagsp == &negflags ? "don't " : ""
6074                                 );
6075                             }
6076                         }
6077                         break;
6078                     case KEEPCOPY_PAT_MOD: /* 'p' */
6079                         if (flagsp == &negflags) {
6080                             if (SIZE_ONLY && ckWARN(WARN_REGEXP))
6081                                 vWARN(RExC_parse + 1,"Useless use of (?-p)");
6082                         } else {
6083                             *flagsp |= RXf_PMf_KEEPCOPY;
6084                         }
6085                         break;
6086                     case '-':
6087                         if (flagsp == &negflags) {
6088                             RExC_parse++;
6089                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6090                             /*NOTREACHED*/
6091                         }
6092                         flagsp = &negflags;
6093                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6094                         break;
6095                     case ':':
6096                         paren = ':';
6097                         /*FALLTHROUGH*/
6098                     case ')':
6099                         RExC_flags |= posflags;
6100                         RExC_flags &= ~negflags;
6101                         if (paren != ':') {
6102                             oregflags |= posflags;
6103                             oregflags &= ~negflags;
6104                         }
6105                         nextchar(pRExC_state);
6106                         if (paren != ':') {
6107                             *flagp = TRYAGAIN;
6108                             return NULL;
6109                         } else {
6110                             ret = NULL;
6111                             goto parse_rest;
6112                         }
6113                         /*NOTREACHED*/
6114                     default:
6115                         RExC_parse++;
6116                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6117                         /*NOTREACHED*/
6118                     }
6119                     ++RExC_parse;
6120                 }
6121             }} /* one for the default block, one for the switch */
6122         }
6123         else {                  /* (...) */
6124           capturing_parens:
6125             parno = RExC_npar;
6126             RExC_npar++;
6127
6128             ret = reganode(pRExC_state, OPEN, parno);
6129             if (!SIZE_ONLY ){
6130                 if (!RExC_nestroot)
6131                     RExC_nestroot = parno;
6132                 if (RExC_seen & REG_SEEN_RECURSE
6133                     && !RExC_open_parens[parno-1])
6134                 {
6135                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6136                         "Setting open paren #%"IVdf" to %d\n",
6137                         (IV)parno, REG_NODE_NUM(ret)));
6138                     RExC_open_parens[parno-1]= ret;
6139                 }
6140             }
6141             Set_Node_Length(ret, 1); /* MJD */
6142             Set_Node_Offset(ret, RExC_parse); /* MJD */
6143             is_open = 1;
6144         }
6145     }
6146     else                        /* ! paren */
6147         ret = NULL;
6148
6149    parse_rest:
6150     /* Pick up the branches, linking them together. */
6151     parse_start = RExC_parse;   /* MJD */
6152     br = regbranch(pRExC_state, &flags, 1,depth+1);
6153
6154     if (freeze_paren) {
6155         if (RExC_npar > after_freeze)
6156             after_freeze = RExC_npar;
6157         RExC_npar = freeze_paren;
6158     }
6159
6160     /*     branch_len = (paren != 0); */
6161
6162     if (br == NULL)
6163         return(NULL);
6164     if (*RExC_parse == '|') {
6165         if (!SIZE_ONLY && RExC_extralen) {
6166             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6167         }
6168         else {                  /* MJD */
6169             reginsert(pRExC_state, BRANCH, br, depth+1);
6170             Set_Node_Length(br, paren != 0);
6171             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6172         }
6173         have_branch = 1;
6174         if (SIZE_ONLY)
6175             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6176     }
6177     else if (paren == ':') {
6178         *flagp |= flags&SIMPLE;
6179     }
6180     if (is_open) {                              /* Starts with OPEN. */
6181         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6182     }
6183     else if (paren != '?')              /* Not Conditional */
6184         ret = br;
6185     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6186     lastbr = br;
6187     while (*RExC_parse == '|') {
6188         if (!SIZE_ONLY && RExC_extralen) {
6189             ender = reganode(pRExC_state, LONGJMP,0);
6190             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6191         }
6192         if (SIZE_ONLY)
6193             RExC_extralen += 2;         /* Account for LONGJMP. */
6194         nextchar(pRExC_state);
6195         if (freeze_paren) {
6196             if (RExC_npar > after_freeze)
6197                 after_freeze = RExC_npar;
6198             RExC_npar = freeze_paren;
6199         }
6200         br = regbranch(pRExC_state, &flags, 0, depth+1);
6201
6202         if (br == NULL)
6203             return(NULL);
6204         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6205         lastbr = br;
6206         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6207     }
6208
6209     if (have_branch || paren != ':') {
6210         /* Make a closing node, and hook it on the end. */
6211         switch (paren) {
6212         case ':':
6213             ender = reg_node(pRExC_state, TAIL);
6214             break;
6215         case 1:
6216             ender = reganode(pRExC_state, CLOSE, parno);
6217             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6218                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6219                         "Setting close paren #%"IVdf" to %d\n",
6220                         (IV)parno, REG_NODE_NUM(ender)));
6221                 RExC_close_parens[parno-1]= ender;
6222                 if (RExC_nestroot == parno)
6223                     RExC_nestroot = 0;
6224             }
6225             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6226             Set_Node_Length(ender,1); /* MJD */
6227             break;
6228         case '<':
6229         case ',':
6230         case '=':
6231         case '!':
6232             *flagp &= ~HASWIDTH;
6233             /* FALL THROUGH */
6234         case '>':
6235             ender = reg_node(pRExC_state, SUCCEED);
6236             break;
6237         case 0:
6238             ender = reg_node(pRExC_state, END);
6239             if (!SIZE_ONLY) {
6240                 assert(!RExC_opend); /* there can only be one! */
6241                 RExC_opend = ender;
6242             }
6243             break;
6244         }
6245         REGTAIL(pRExC_state, lastbr, ender);
6246
6247         if (have_branch && !SIZE_ONLY) {
6248             if (depth==1)
6249                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6250
6251             /* Hook the tails of the branches to the closing node. */
6252             for (br = ret; br; br = regnext(br)) {
6253                 const U8 op = PL_regkind[OP(br)];
6254                 if (op == BRANCH) {
6255                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6256                 }
6257                 else if (op == BRANCHJ) {
6258                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6259                 }
6260             }
6261         }
6262     }
6263
6264     {
6265         const char *p;
6266         static const char parens[] = "=!<,>";
6267
6268         if (paren && (p = strchr(parens, paren))) {
6269             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6270             int flag = (p - parens) > 1;
6271
6272             if (paren == '>')
6273                 node = SUSPEND, flag = 0;
6274             reginsert(pRExC_state, node,ret, depth+1);
6275             Set_Node_Cur_Length(ret);
6276             Set_Node_Offset(ret, parse_start + 1);
6277             ret->flags = flag;
6278             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6279         }
6280     }
6281
6282     /* Check for proper termination. */
6283     if (paren) {
6284         RExC_flags = oregflags;
6285         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6286             RExC_parse = oregcomp_parse;
6287             vFAIL("Unmatched (");
6288         }
6289     }
6290     else if (!paren && RExC_parse < RExC_end) {
6291         if (*RExC_parse == ')') {
6292             RExC_parse++;
6293             vFAIL("Unmatched )");
6294         }
6295         else
6296             FAIL("Junk on end of regexp");      /* "Can't happen". */
6297         /* NOTREACHED */
6298     }
6299     if (after_freeze)
6300         RExC_npar = after_freeze;
6301     return(ret);
6302 }
6303
6304 /*
6305  - regbranch - one alternative of an | operator
6306  *
6307  * Implements the concatenation operator.
6308  */
6309 STATIC regnode *
6310 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6311 {
6312     dVAR;
6313     register regnode *ret;
6314     register regnode *chain = NULL;
6315     register regnode *latest;
6316     I32 flags = 0, c = 0;
6317     GET_RE_DEBUG_FLAGS_DECL;
6318
6319     PERL_ARGS_ASSERT_REGBRANCH;
6320
6321     DEBUG_PARSE("brnc");
6322
6323     if (first)
6324         ret = NULL;
6325     else {
6326         if (!SIZE_ONLY && RExC_extralen)
6327             ret = reganode(pRExC_state, BRANCHJ,0);
6328         else {
6329             ret = reg_node(pRExC_state, BRANCH);
6330             Set_Node_Length(ret, 1);
6331         }
6332     }
6333
6334     if (!first && SIZE_ONLY)
6335         RExC_extralen += 1;                     /* BRANCHJ */
6336
6337     *flagp = WORST;                     /* Tentatively. */
6338
6339     RExC_parse--;
6340     nextchar(pRExC_state);
6341     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6342         flags &= ~TRYAGAIN;
6343         latest = regpiece(pRExC_state, &flags,depth+1);
6344         if (latest == NULL) {
6345             if (flags & TRYAGAIN)
6346                 continue;
6347             return(NULL);
6348         }
6349         else if (ret == NULL)
6350             ret = latest;
6351         *flagp |= flags&(HASWIDTH|POSTPONED);
6352         if (chain == NULL)      /* First piece. */
6353             *flagp |= flags&SPSTART;
6354         else {
6355             RExC_naughty++;
6356             REGTAIL(pRExC_state, chain, latest);
6357         }
6358         chain = latest;
6359         c++;
6360     }
6361     if (chain == NULL) {        /* Loop ran zero times. */
6362         chain = reg_node(pRExC_state, NOTHING);
6363         if (ret == NULL)
6364             ret = chain;
6365     }
6366     if (c == 1) {
6367         *flagp |= flags&SIMPLE;
6368     }
6369
6370     return ret;
6371 }
6372
6373 /*
6374  - regpiece - something followed by possible [*+?]
6375  *
6376  * Note that the branching code sequences used for ? and the general cases
6377  * of * and + are somewhat optimized:  they use the same NOTHING node as
6378  * both the endmarker for their branch list and the body of the last branch.
6379  * It might seem that this node could be dispensed with entirely, but the
6380  * endmarker role is not redundant.
6381  */
6382 STATIC regnode *
6383 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6384 {
6385     dVAR;
6386     register regnode *ret;
6387     register char op;
6388     register char *next;
6389     I32 flags;
6390     const char * const origparse = RExC_parse;
6391     I32 min;
6392     I32 max = REG_INFTY;
6393     char *parse_start;
6394     const char *maxpos = NULL;
6395     GET_RE_DEBUG_FLAGS_DECL;
6396
6397     PERL_ARGS_ASSERT_REGPIECE;
6398
6399     DEBUG_PARSE("piec");
6400
6401     ret = regatom(pRExC_state, &flags,depth+1);
6402     if (ret == NULL) {
6403         if (flags & TRYAGAIN)
6404             *flagp |= TRYAGAIN;
6405         return(NULL);
6406     }
6407
6408     op = *RExC_parse;
6409
6410     if (op == '{' && regcurly(RExC_parse)) {
6411         maxpos = NULL;
6412         parse_start = RExC_parse; /* MJD */
6413         next = RExC_parse + 1;
6414         while (isDIGIT(*next) || *next == ',') {
6415             if (*next == ',') {
6416                 if (maxpos)
6417                     break;
6418                 else
6419                     maxpos = next;
6420             }
6421             next++;
6422         }
6423         if (*next == '}') {             /* got one */
6424             if (!maxpos)
6425                 maxpos = next;
6426             RExC_parse++;
6427             min = atoi(RExC_parse);
6428             if (*maxpos == ',')
6429                 maxpos++;
6430             else
6431                 maxpos = RExC_parse;
6432             max = atoi(maxpos);
6433             if (!max && *maxpos != '0')
6434                 max = REG_INFTY;                /* meaning "infinity" */
6435             else if (max >= REG_INFTY)
6436                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6437             RExC_parse = next;
6438             nextchar(pRExC_state);
6439
6440         do_curly:
6441             if ((flags&SIMPLE)) {
6442                 RExC_naughty += 2 + RExC_naughty / 2;
6443                 reginsert(pRExC_state, CURLY, ret, depth+1);
6444                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6445                 Set_Node_Cur_Length(ret);
6446             }
6447             else {
6448                 regnode * const w = reg_node(pRExC_state, WHILEM);
6449
6450                 w->flags = 0;
6451                 REGTAIL(pRExC_state, ret, w);
6452                 if (!SIZE_ONLY && RExC_extralen) {
6453                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6454                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6455                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6456                 }
6457                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6458                                 /* MJD hk */
6459                 Set_Node_Offset(ret, parse_start+1);
6460                 Set_Node_Length(ret,
6461                                 op == '{' ? (RExC_parse - parse_start) : 1);
6462
6463                 if (!SIZE_ONLY && RExC_extralen)
6464                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6465                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6466                 if (SIZE_ONLY)
6467                     RExC_whilem_seen++, RExC_extralen += 3;
6468                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6469             }
6470             ret->flags = 0;
6471
6472             if (min > 0)
6473                 *flagp = WORST;
6474             if (max > 0)
6475                 *flagp |= HASWIDTH;
6476             if (max < min)
6477                 vFAIL("Can't do {n,m} with n > m");
6478             if (!SIZE_ONLY) {
6479                 ARG1_SET(ret, (U16)min);
6480                 ARG2_SET(ret, (U16)max);
6481             }
6482
6483             goto nest_check;
6484         }
6485     }
6486
6487     if (!ISMULT1(op)) {
6488         *flagp = flags;
6489         return(ret);
6490     }
6491
6492 #if 0                           /* Now runtime fix should be reliable. */
6493
6494     /* if this is reinstated, don't forget to put this back into perldiag:
6495
6496             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6497
6498            (F) The part of the regexp subject to either the * or + quantifier
6499            could match an empty string. The {#} shows in the regular
6500            expression about where the problem was discovered.
6501
6502     */
6503
6504     if (!(flags&HASWIDTH) && op != '?')
6505       vFAIL("Regexp *+ operand could be empty");
6506 #endif
6507
6508     parse_start = RExC_parse;
6509     nextchar(pRExC_state);
6510
6511     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6512
6513     if (op == '*' && (flags&SIMPLE)) {
6514         reginsert(pRExC_state, STAR, ret, depth+1);
6515         ret->flags = 0;
6516         RExC_naughty += 4;
6517     }
6518     else if (op == '*') {
6519         min = 0;
6520         goto do_curly;
6521     }
6522     else if (op == '+' && (flags&SIMPLE)) {
6523         reginsert(pRExC_state, PLUS, ret, depth+1);
6524         ret->flags = 0;
6525         RExC_naughty += 3;
6526     }
6527     else if (op == '+') {
6528         min = 1;
6529         goto do_curly;
6530     }
6531     else if (op == '?') {
6532         min = 0; max = 1;
6533         goto do_curly;
6534     }
6535   nest_check:
6536     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
6537         vWARN3(RExC_parse,
6538                "%.*s matches null string many times",
6539                (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6540                origparse);
6541     }
6542
6543     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6544         nextchar(pRExC_state);
6545         reginsert(pRExC_state, MINMOD, ret, depth+1);
6546         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6547     }
6548 #ifndef REG_ALLOW_MINMOD_SUSPEND
6549     else
6550 #endif
6551     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6552         regnode *ender;
6553         nextchar(pRExC_state);
6554         ender = reg_node(pRExC_state, SUCCEED);
6555         REGTAIL(pRExC_state, ret, ender);
6556         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6557         ret->flags = 0;
6558         ender = reg_node(pRExC_state, TAIL);
6559         REGTAIL(pRExC_state, ret, ender);
6560         /*ret= ender;*/
6561     }
6562
6563     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6564         RExC_parse++;
6565         vFAIL("Nested quantifiers");
6566     }
6567
6568     return(ret);
6569 }
6570
6571
6572 /* reg_namedseq(pRExC_state,UVp)
6573
6574    This is expected to be called by a parser routine that has
6575    recognized '\N' and needs to handle the rest. RExC_parse is
6576    expected to point at the first char following the N at the time
6577    of the call.
6578
6579    If valuep is non-null then it is assumed that we are parsing inside
6580    of a charclass definition and the first codepoint in the resolved
6581    string is returned via *valuep and the routine will return NULL.
6582    In this mode if a multichar string is returned from the charnames
6583    handler a warning will be issued, and only the first char in the
6584    sequence will be examined. If the string returned is zero length
6585    then the value of *valuep is undefined and NON-NULL will
6586    be returned to indicate failure. (This will NOT be a valid pointer
6587    to a regnode.)
6588
6589    If valuep is null then it is assumed that we are parsing normal text
6590    and inserts a new EXACT node into the program containing the resolved
6591    string and returns a pointer to the new node. If the string is
6592    zerolength a NOTHING node is emitted.
6593
6594    On success RExC_parse is set to the char following the endbrace.
6595    Parsing failures will generate a fatal errorvia vFAIL(...)
6596
6597    NOTE: We cache all results from the charnames handler locally in
6598    the RExC_charnames hash (created on first use) to prevent a charnames
6599    handler from playing silly-buggers and returning a short string and
6600    then a long string for a given pattern. Since the regexp program
6601    size is calculated during an initial parse this would result
6602    in a buffer overrun so we cache to prevent the charname result from
6603    changing during the course of the parse.
6604
6605  */
6606 STATIC regnode *
6607 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6608 {
6609     char * name;        /* start of the content of the name */
6610     char * endbrace;    /* endbrace following the name */
6611     SV *sv_str = NULL;
6612     SV *sv_name = NULL;
6613     STRLEN len; /* this has various purposes throughout the code */
6614     bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6615     regnode *ret = NULL;
6616
6617     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6618
6619     if (*RExC_parse != '{' ||
6620             (*RExC_parse == '{' && RExC_parse[1]
6621              && strchr("0123456789", RExC_parse[1])))
6622     {
6623         GET_RE_DEBUG_FLAGS_DECL;
6624         if (valuep)
6625             /* no bare \N in a charclass */
6626             vFAIL("Missing braces on \\N{}");
6627         GET_RE_DEBUG_FLAGS;
6628         nextchar(pRExC_state);
6629         ret = reg_node(pRExC_state, REG_ANY);
6630         *flagp |= HASWIDTH|SIMPLE;
6631         RExC_naughty++;
6632         RExC_parse--;
6633         Set_Node_Length(ret, 1); /* MJD */
6634         return ret;
6635     }
6636     name = RExC_parse+1;
6637     endbrace = strchr(RExC_parse, '}');
6638     if ( ! endbrace ) {
6639         RExC_parse++;
6640         vFAIL("Missing right brace on \\N{}");
6641     }
6642     RExC_parse = endbrace + 1;
6643
6644
6645     /* RExC_parse points at the beginning brace,
6646        endbrace points at the last */
6647     if ( name[0]=='U' && name[1]=='+' ) {
6648         /* its a "Unicode hex" notation {U+89AB} */
6649         I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6650             | PERL_SCAN_DISALLOW_PREFIX
6651             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6652         UV cp;
6653         len = (STRLEN)(endbrace - name - 2);
6654         cp = grok_hex(name + 2, &len, &fl, NULL);
6655         if ( len != (STRLEN)(endbrace - name - 2) ) {
6656             cp = 0xFFFD;
6657         }
6658         if ( valuep ) {
6659             if (cp > 0xff) RExC_utf8 = 1;
6660             *valuep = cp;
6661             return NULL;
6662         }
6663
6664         /* Need to convert to utf8 if either: won't fit into a byte, or the re
6665          * is going to be in utf8 and the representation changes under utf8. */
6666         if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
6667             U8 string[UTF8_MAXBYTES+1];
6668             U8 *tmps;
6669             RExC_utf8 = 1;
6670             tmps = uvuni_to_utf8(string, cp);
6671             sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
6672         } else {    /* Otherwise, no need for utf8, can skip that step */
6673             char string;
6674             string = (char)cp;
6675             sv_str= newSVpvn(&string, 1);
6676         }
6677     } else {
6678         /* fetch the charnames handler for this scope */
6679         HV * const table = GvHV(PL_hintgv);
6680         SV **cvp= table ?
6681             hv_fetchs(table, "charnames", FALSE) :
6682             NULL;
6683         SV *cv= cvp ? *cvp : NULL;
6684         HE *he_str;
6685         int count;
6686         /* create an SV with the name as argument */
6687         sv_name = newSVpvn(name, endbrace - name);
6688
6689         if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6690             vFAIL2("Constant(\\N{%s}) unknown: "
6691                   "(possibly a missing \"use charnames ...\")",
6692                   SvPVX(sv_name));
6693         }
6694         if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6695             vFAIL2("Constant(\\N{%s}): "
6696                   "$^H{charnames} is not defined",SvPVX(sv_name));
6697         }
6698
6699
6700
6701         if (!RExC_charnames) {
6702             /* make sure our cache is allocated */
6703             RExC_charnames = newHV();
6704             sv_2mortal(MUTABLE_SV(RExC_charnames));
6705         }
6706             /* see if we have looked this one up before */
6707         he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6708         if ( he_str ) {
6709             sv_str = HeVAL(he_str);
6710             cached = 1;
6711         } else {
6712             dSP ;
6713
6714             ENTER ;
6715             SAVETMPS ;
6716             PUSHMARK(SP) ;
6717
6718             XPUSHs(sv_name);
6719
6720             PUTBACK ;
6721
6722             count= call_sv(cv, G_SCALAR);
6723
6724             if (count == 1) { /* XXXX is this right? dmq */
6725                 sv_str = POPs;
6726                 SvREFCNT_inc_simple_void(sv_str);
6727             }
6728
6729             SPAGAIN ;
6730             PUTBACK ;
6731             FREETMPS ;
6732             LEAVE ;
6733
6734             if ( !sv_str || !SvOK(sv_str) ) {
6735                 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6736                       "did not return a defined value",SvPVX(sv_name));
6737             }
6738             if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6739                 cached = 1;
6740         }
6741     }
6742     if (valuep) {
6743         char *p = SvPV(sv_str, len);
6744         if (len) {
6745             STRLEN numlen = 1;
6746             if ( SvUTF8(sv_str) ) {
6747                 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6748                 if (*valuep > 0x7F)
6749                     RExC_utf8 = 1;
6750                 /* XXXX
6751                   We have to turn on utf8 for high bit chars otherwise
6752                   we get failures with
6753
6754                    "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6755                    "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6756
6757                   This is different from what \x{} would do with the same
6758                   codepoint, where the condition is > 0xFF.
6759                   - dmq
6760                 */
6761
6762
6763             } else {
6764                 *valuep = (UV)*p;
6765                 /* warn if we havent used the whole string? */
6766             }
6767             if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6768                 vWARN2(RExC_parse,
6769                     "Ignoring excess chars from \\N{%s} in character class",
6770                     SvPVX(sv_name)
6771                 );
6772             }
6773         } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6774             vWARN2(RExC_parse,
6775                     "Ignoring zero length \\N{%s} in character class",
6776                     SvPVX(sv_name)
6777                 );
6778         }
6779         if (sv_name)
6780             SvREFCNT_dec(sv_name);
6781         if (!cached)
6782             SvREFCNT_dec(sv_str);
6783         return len ? NULL : (regnode *)&len;
6784     } else if(SvCUR(sv_str)) {
6785
6786         char *s;
6787         char *p, *pend;
6788         STRLEN charlen = 1;
6789 #ifdef DEBUGGING
6790         char * parse_start = name-3; /* needed for the offsets */
6791 #endif
6792         GET_RE_DEBUG_FLAGS_DECL;     /* needed for the offsets */
6793
6794         ret = reg_node(pRExC_state,
6795             (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6796         s= STRING(ret);
6797
6798         if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6799             sv_utf8_upgrade(sv_str);
6800         } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6801             RExC_utf8= 1;
6802         }
6803
6804         p = SvPV(sv_str, len);
6805         pend = p + len;
6806         /* len is the length written, charlen is the size the char read */
6807         for ( len = 0; p < pend; p += charlen ) {
6808             if (UTF) {
6809                 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6810                 if (FOLD) {
6811                     STRLEN foldlen,numlen;
6812                     U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6813                     uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6814                     /* Emit all the Unicode characters. */
6815
6816                     for (foldbuf = tmpbuf;
6817                         foldlen;
6818                         foldlen -= numlen)
6819                     {
6820                         uvc = utf8_to_uvchr(foldbuf, &numlen);
6821                         if (numlen > 0) {
6822                             const STRLEN unilen = reguni(pRExC_state, uvc, s);
6823                             s       += unilen;
6824                             len     += unilen;
6825                             /* In EBCDIC the numlen
6826                             * and unilen can differ. */
6827                             foldbuf += numlen;
6828                             if (numlen >= foldlen)
6829                                 break;
6830                         }
6831                         else
6832                             break; /* "Can't happen." */
6833                     }
6834                 } else {
6835                     const STRLEN unilen = reguni(pRExC_state, uvc, s);
6836                     if (unilen > 0) {
6837                        s   += unilen;
6838                        len += unilen;
6839                     }
6840                 }
6841             } else {
6842                 len++;
6843                 REGC(*p, s++);
6844             }
6845         }
6846         if (SIZE_ONLY) {
6847             RExC_size += STR_SZ(len);
6848         } else {
6849             STR_LEN(ret) = len;
6850             RExC_emit += STR_SZ(len);
6851         }
6852         Set_Node_Cur_Length(ret); /* MJD */
6853         RExC_parse--;
6854         nextchar(pRExC_state);
6855     } else {    /* zero length */
6856         ret = reg_node(pRExC_state,NOTHING);
6857     }
6858     if (!cached) {
6859         SvREFCNT_dec(sv_str);
6860     }
6861     if (sv_name) {
6862         SvREFCNT_dec(sv_name);
6863     }
6864     return ret;
6865
6866 }
6867
6868
6869 /*
6870  * reg_recode
6871  *
6872  * It returns the code point in utf8 for the value in *encp.
6873  *    value: a code value in the source encoding
6874  *    encp:  a pointer to an Encode object
6875  *
6876  * If the result from Encode is not a single character,
6877  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6878  */
6879 STATIC UV
6880 S_reg_recode(pTHX_ const char value, SV **encp)
6881 {
6882     STRLEN numlen = 1;
6883     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6884     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6885     const STRLEN newlen = SvCUR(sv);
6886     UV uv = UNICODE_REPLACEMENT;
6887
6888     PERL_ARGS_ASSERT_REG_RECODE;
6889
6890     if (newlen)
6891         uv = SvUTF8(sv)
6892              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6893              : *(U8*)s;
6894
6895     if (!newlen || numlen != newlen) {
6896         uv = UNICODE_REPLACEMENT;
6897         *encp = NULL;
6898     }
6899     return uv;
6900 }
6901
6902
6903 /*
6904  - regatom - the lowest level
6905
6906    Try to identify anything special at the start of the pattern. If there
6907    is, then handle it as required. This may involve generating a single regop,
6908    such as for an assertion; or it may involve recursing, such as to
6909    handle a () structure.
6910
6911    If the string doesn't start with something special then we gobble up
6912    as much literal text as we can.
6913
6914    Once we have been able to handle whatever type of thing started the
6915    sequence, we return.
6916
6917    Note: we have to be careful with escapes, as they can be both literal
6918    and special, and in the case of \10 and friends can either, depending
6919    on context. Specifically there are two seperate switches for handling
6920    escape sequences, with the one for handling literal escapes requiring
6921    a dummy entry for all of the special escapes that are actually handled
6922    by the other.
6923 */
6924
6925 STATIC regnode *
6926 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6927 {
6928     dVAR;
6929     register regnode *ret = NULL;
6930     I32 flags;
6931     char *parse_start = RExC_parse;
6932     GET_RE_DEBUG_FLAGS_DECL;
6933     DEBUG_PARSE("atom");
6934     *flagp = WORST;             /* Tentatively. */
6935
6936     PERL_ARGS_ASSERT_REGATOM;
6937
6938 tryagain:
6939     switch ((U8)*RExC_parse) {
6940     case '^':
6941         RExC_seen_zerolen++;
6942         nextchar(pRExC_state);
6943         if (RExC_flags & RXf_PMf_MULTILINE)
6944             ret = reg_node(pRExC_state, MBOL);
6945         else if (RExC_flags & RXf_PMf_SINGLELINE)
6946             ret = reg_node(pRExC_state, SBOL);
6947         else
6948             ret = reg_node(pRExC_state, BOL);
6949         Set_Node_Length(ret, 1); /* MJD */
6950         break;
6951     case '$':
6952         nextchar(pRExC_state);
6953         if (*RExC_parse)
6954             RExC_seen_zerolen++;
6955         if (RExC_flags & RXf_PMf_MULTILINE)
6956             ret = reg_node(pRExC_state, MEOL);
6957         else if (RExC_flags & RXf_PMf_SINGLELINE)
6958             ret = reg_node(pRExC_state, SEOL);
6959         else
6960             ret = reg_node(pRExC_state, EOL);
6961         Set_Node_Length(ret, 1); /* MJD */
6962         break;
6963     case '.':
6964         nextchar(pRExC_state);
6965         if (RExC_flags & RXf_PMf_SINGLELINE)
6966             ret = reg_node(pRExC_state, SANY);
6967         else
6968             ret = reg_node(pRExC_state, REG_ANY);
6969         *flagp |= HASWIDTH|SIMPLE;
6970         RExC_naughty++;
6971         Set_Node_Length(ret, 1); /* MJD */
6972         break;
6973     case '[':
6974     {
6975         char * const oregcomp_parse = ++RExC_parse;
6976         ret = regclass(pRExC_state,depth+1);
6977         if (*RExC_parse != ']') {
6978             RExC_parse = oregcomp_parse;
6979             vFAIL("Unmatched [");
6980         }
6981         nextchar(pRExC_state);
6982         *flagp |= HASWIDTH|SIMPLE;
6983         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6984         break;
6985     }
6986     case '(':
6987         nextchar(pRExC_state);
6988         ret = reg(pRExC_state, 1, &flags,depth+1);
6989         if (ret == NULL) {
6990                 if (flags & TRYAGAIN) {
6991                     if (RExC_parse == RExC_end) {
6992                          /* Make parent create an empty node if needed. */
6993                         *flagp |= TRYAGAIN;
6994                         return(NULL);
6995                     }
6996                     goto tryagain;
6997                 }
6998                 return(NULL);
6999         }
7000         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7001         break;
7002     case '|':
7003     case ')':
7004         if (flags & TRYAGAIN) {
7005             *flagp |= TRYAGAIN;
7006             return NULL;
7007         }
7008         vFAIL("Internal urp");
7009                                 /* Supposed to be caught earlier. */
7010         break;
7011     case '{':
7012         if (!regcurly(RExC_parse)) {
7013             RExC_parse++;
7014             goto defchar;
7015         }
7016         /* FALL THROUGH */
7017     case '?':
7018     case '+':
7019     case '*':
7020         RExC_parse++;
7021         vFAIL("Quantifier follows nothing");
7022         break;
7023     case 0xDF:
7024     case 0xC3:
7025     case 0xCE:
7026         do_foldchar:
7027         if (!LOC && FOLD) {
7028             U32 len,cp;
7029             len=0; /* silence a spurious compiler warning */
7030             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7031                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7032                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7033                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7034                 Set_Node_Length(ret, 1); /* MJD */
7035                 nextchar(pRExC_state); /* kill whitespace under /x */
7036                 return ret;
7037             }
7038         }
7039         goto outer_default;
7040     case '\\':
7041         /* Special Escapes
7042
7043            This switch handles escape sequences that resolve to some kind
7044            of special regop and not to literal text. Escape sequnces that
7045            resolve to literal text are handled below in the switch marked
7046            "Literal Escapes".
7047
7048            Every entry in this switch *must* have a corresponding entry
7049            in the literal escape switch. However, the opposite is not
7050            required, as the default for this switch is to jump to the
7051            literal text handling code.
7052         */
7053         switch ((U8)*++RExC_parse) {
7054         case 0xDF:
7055         case 0xC3:
7056         case 0xCE:
7057                    goto do_foldchar;
7058         /* Special Escapes */
7059         case 'A':
7060             RExC_seen_zerolen++;
7061             ret = reg_node(pRExC_state, SBOL);
7062             *flagp |= SIMPLE;
7063             goto finish_meta_pat;
7064         case 'G':
7065             ret = reg_node(pRExC_state, GPOS);
7066             RExC_seen |= REG_SEEN_GPOS;
7067             *flagp |= SIMPLE;
7068             goto finish_meta_pat;
7069         case 'K':
7070             RExC_seen_zerolen++;
7071             ret = reg_node(pRExC_state, KEEPS);
7072             *flagp |= SIMPLE;
7073             /* XXX:dmq : disabling in-place substitution seems to
7074              * be necessary here to avoid cases of memory corruption, as
7075              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7076              */
7077             RExC_seen |= REG_SEEN_LOOKBEHIND;
7078             goto finish_meta_pat;
7079         case 'Z':
7080             ret = reg_node(pRExC_state, SEOL);
7081             *flagp |= SIMPLE;
7082             RExC_seen_zerolen++;                /* Do not optimize RE away */
7083             goto finish_meta_pat;
7084         case 'z':
7085             ret = reg_node(pRExC_state, EOS);
7086             *flagp |= SIMPLE;
7087             RExC_seen_zerolen++;                /* Do not optimize RE away */
7088             goto finish_meta_pat;
7089         case 'C':
7090             ret = reg_node(pRExC_state, CANY);
7091             RExC_seen |= REG_SEEN_CANY;
7092             *flagp |= HASWIDTH|SIMPLE;
7093             goto finish_meta_pat;
7094         case 'X':
7095             ret = reg_node(pRExC_state, CLUMP);
7096             *flagp |= HASWIDTH;
7097             goto finish_meta_pat;
7098         case 'w':
7099             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
7100             *flagp |= HASWIDTH|SIMPLE;
7101             goto finish_meta_pat;
7102         case 'W':
7103             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
7104             *flagp |= HASWIDTH|SIMPLE;
7105             goto finish_meta_pat;
7106         case 'b':
7107             RExC_seen_zerolen++;
7108             RExC_seen |= REG_SEEN_LOOKBEHIND;
7109             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
7110             *flagp |= SIMPLE;
7111             goto finish_meta_pat;
7112         case 'B':
7113             RExC_seen_zerolen++;
7114             RExC_seen |= REG_SEEN_LOOKBEHIND;
7115             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
7116             *flagp |= SIMPLE;
7117             goto finish_meta_pat;
7118         case 's':
7119             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
7120             *flagp |= HASWIDTH|SIMPLE;
7121             goto finish_meta_pat;
7122         case 'S':
7123             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
7124             *flagp |= HASWIDTH|SIMPLE;
7125             goto finish_meta_pat;
7126         case 'd':
7127             ret = reg_node(pRExC_state, DIGIT);
7128             *flagp |= HASWIDTH|SIMPLE;
7129             goto finish_meta_pat;
7130         case 'D':
7131             ret = reg_node(pRExC_state, NDIGIT);
7132             *flagp |= HASWIDTH|SIMPLE;
7133             goto finish_meta_pat;
7134         case 'R':
7135             ret = reg_node(pRExC_state, LNBREAK);
7136             *flagp |= HASWIDTH|SIMPLE;
7137             goto finish_meta_pat;
7138         case 'h':
7139             ret = reg_node(pRExC_state, HORIZWS);
7140             *flagp |= HASWIDTH|SIMPLE;
7141             goto finish_meta_pat;
7142         case 'H':
7143             ret = reg_node(pRExC_state, NHORIZWS);
7144             *flagp |= HASWIDTH|SIMPLE;
7145             goto finish_meta_pat;
7146         case 'v':
7147             ret = reg_node(pRExC_state, VERTWS);
7148             *flagp |= HASWIDTH|SIMPLE;
7149             goto finish_meta_pat;
7150         case 'V':
7151             ret = reg_node(pRExC_state, NVERTWS);
7152             *flagp |= HASWIDTH|SIMPLE;
7153          finish_meta_pat:
7154             nextchar(pRExC_state);
7155             Set_Node_Length(ret, 2); /* MJD */
7156             break;
7157         case 'p':
7158         case 'P':
7159             {
7160                 char* const oldregxend = RExC_end;
7161 #ifdef DEBUGGING
7162                 char* parse_start = RExC_parse - 2;
7163 #endif
7164
7165                 if (RExC_parse[1] == '{') {
7166                   /* a lovely hack--pretend we saw [\pX] instead */
7167                     RExC_end = strchr(RExC_parse, '}');
7168                     if (!RExC_end) {
7169                         const U8 c = (U8)*RExC_parse;
7170                         RExC_parse += 2;
7171                         RExC_end = oldregxend;
7172                         vFAIL2("Missing right brace on \\%c{}", c);
7173                     }
7174                     RExC_end++;
7175                 }
7176                 else {
7177                     RExC_end = RExC_parse + 2;
7178                     if (RExC_end > oldregxend)
7179                         RExC_end = oldregxend;
7180                 }
7181                 RExC_parse--;
7182
7183                 ret = regclass(pRExC_state,depth+1);
7184
7185                 RExC_end = oldregxend;
7186                 RExC_parse--;
7187
7188                 Set_Node_Offset(ret, parse_start + 2);
7189                 Set_Node_Cur_Length(ret);
7190                 nextchar(pRExC_state);
7191                 *flagp |= HASWIDTH|SIMPLE;
7192             }
7193             break;
7194         case 'N':
7195             /* Handle \N and \N{NAME} here and not below because it can be
7196             multicharacter. join_exact() will join them up later on.
7197             Also this makes sure that things like /\N{BLAH}+/ and
7198             \N{BLAH} being multi char Just Happen. dmq*/
7199             ++RExC_parse;
7200             ret= reg_namedseq(pRExC_state, NULL, flagp);
7201             break;
7202         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7203         parse_named_seq:
7204         {
7205             char ch= RExC_parse[1];
7206             if (ch != '<' && ch != '\'' && ch != '{') {
7207                 RExC_parse++;
7208                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7209             } else {
7210                 /* this pretty much dupes the code for (?P=...) in reg(), if
7211                    you change this make sure you change that */
7212                 char* name_start = (RExC_parse += 2);
7213                 U32 num = 0;
7214                 SV *sv_dat = reg_scan_name(pRExC_state,
7215                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7216                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7217                 if (RExC_parse == name_start || *RExC_parse != ch)
7218                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7219
7220                 if (!SIZE_ONLY) {
7221                     num = add_data( pRExC_state, 1, "S" );
7222                     RExC_rxi->data->data[num]=(void*)sv_dat;
7223                     SvREFCNT_inc_simple_void(sv_dat);
7224                 }
7225
7226                 RExC_sawback = 1;
7227                 ret = reganode(pRExC_state,
7228                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7229                            num);
7230                 *flagp |= HASWIDTH;
7231
7232                 /* override incorrect value set in reganode MJD */
7233                 Set_Node_Offset(ret, parse_start+1);
7234                 Set_Node_Cur_Length(ret); /* MJD */
7235                 nextchar(pRExC_state);
7236
7237             }
7238             break;
7239         }
7240         case 'g':
7241         case '1': case '2': case '3': case '4':
7242         case '5': case '6': case '7': case '8': case '9':
7243             {
7244                 I32 num;
7245                 bool isg = *RExC_parse == 'g';
7246                 bool isrel = 0;
7247                 bool hasbrace = 0;
7248                 if (isg) {
7249                     RExC_parse++;
7250                     if (*RExC_parse == '{') {
7251                         RExC_parse++;
7252                         hasbrace = 1;
7253                     }
7254                     if (*RExC_parse == '-') {
7255                         RExC_parse++;
7256                         isrel = 1;
7257                     }
7258                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7259                         if (isrel) RExC_parse--;
7260                         RExC_parse -= 2;
7261                         goto parse_named_seq;
7262                 }   }
7263                 num = atoi(RExC_parse);
7264                 if (isg && num == 0)
7265                     vFAIL("Reference to invalid group 0");
7266                 if (isrel) {
7267                     num = RExC_npar - num;
7268                     if (num < 1)
7269                         vFAIL("Reference to nonexistent or unclosed group");
7270                 }
7271                 if (!isg && num > 9 && num >= RExC_npar)
7272                     goto defchar;
7273                 else {
7274                     char * const parse_start = RExC_parse - 1; /* MJD */
7275                     while (isDIGIT(*RExC_parse))
7276                         RExC_parse++;
7277                     if (parse_start == RExC_parse - 1)
7278                         vFAIL("Unterminated \\g... pattern");
7279                     if (hasbrace) {
7280                         if (*RExC_parse != '}')
7281                             vFAIL("Unterminated \\g{...} pattern");
7282                         RExC_parse++;
7283                     }
7284                     if (!SIZE_ONLY) {
7285                         if (num > (I32)RExC_rx->nparens)
7286                             vFAIL("Reference to nonexistent group");
7287                     }
7288                     RExC_sawback = 1;
7289                     ret = reganode(pRExC_state,
7290                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7291                                    num);
7292                     *flagp |= HASWIDTH;
7293
7294                     /* override incorrect value set in reganode MJD */
7295                     Set_Node_Offset(ret, parse_start+1);
7296                     Set_Node_Cur_Length(ret); /* MJD */
7297                     RExC_parse--;
7298                     nextchar(pRExC_state);
7299                 }
7300             }
7301             break;
7302         case '\0':
7303             if (RExC_parse >= RExC_end)
7304                 FAIL("Trailing \\");
7305             /* FALL THROUGH */
7306         default:
7307             /* Do not generate "unrecognized" warnings here, we fall
7308                back into the quick-grab loop below */
7309             parse_start--;
7310             goto defchar;
7311         }
7312         break;
7313
7314     case '#':
7315         if (RExC_flags & RXf_PMf_EXTENDED) {
7316             if ( reg_skipcomment( pRExC_state ) )
7317                 goto tryagain;
7318         }
7319         /* FALL THROUGH */
7320
7321     default:
7322         outer_default:{
7323             register STRLEN len;
7324             register UV ender;
7325             register char *p;
7326             char *s;
7327             STRLEN foldlen;
7328             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7329
7330             parse_start = RExC_parse - 1;
7331
7332             RExC_parse++;
7333
7334         defchar:
7335             ender = 0;
7336             ret = reg_node(pRExC_state,
7337                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7338             s = STRING(ret);
7339             for (len = 0, p = RExC_parse - 1;
7340               len < 127 && p < RExC_end;
7341               len++)
7342             {
7343                 char * const oldp = p;
7344
7345                 if (RExC_flags & RXf_PMf_EXTENDED)
7346                     p = regwhite( pRExC_state, p );
7347                 switch ((U8)*p) {
7348                 case 0xDF:
7349                 case 0xC3:
7350                 case 0xCE:
7351                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7352                                 goto normal_default;
7353                 case '^':
7354                 case '$':
7355                 case '.':
7356                 case '[':
7357                 case '(':
7358                 case ')':
7359                 case '|':
7360                     goto loopdone;
7361                 case '\\':
7362                     /* Literal Escapes Switch
7363
7364                        This switch is meant to handle escape sequences that
7365                        resolve to a literal character.
7366
7367                        Every escape sequence that represents something
7368                        else, like an assertion or a char class, is handled
7369                        in the switch marked 'Special Escapes' above in this
7370                        routine, but also has an entry here as anything that
7371                        isn't explicitly mentioned here will be treated as
7372                        an unescaped equivalent literal.
7373                     */
7374
7375                     switch ((U8)*++p) {
7376                     /* These are all the special escapes. */
7377                     case 0xDF:
7378                     case 0xC3:
7379                     case 0xCE:
7380                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7381                                 goto normal_default;
7382                     case 'A':             /* Start assertion */
7383                     case 'b': case 'B':   /* Word-boundary assertion*/
7384                     case 'C':             /* Single char !DANGEROUS! */
7385                     case 'd': case 'D':   /* digit class */
7386                     case 'g': case 'G':   /* generic-backref, pos assertion */
7387                     case 'h': case 'H':   /* HORIZWS */
7388                     case 'k': case 'K':   /* named backref, keep marker */
7389                     case 'N':             /* named char sequence */
7390                     case 'p': case 'P':   /* Unicode property */
7391                               case 'R':   /* LNBREAK */
7392                     case 's': case 'S':   /* space class */
7393                     case 'v': case 'V':   /* VERTWS */
7394                     case 'w': case 'W':   /* word class */
7395                     case 'X':             /* eXtended Unicode "combining character sequence" */
7396                     case 'z': case 'Z':   /* End of line/string assertion */
7397                         --p;
7398                         goto loopdone;
7399
7400                     /* Anything after here is an escape that resolves to a
7401                        literal. (Except digits, which may or may not)
7402                      */
7403                     case 'n':
7404                         ender = '\n';
7405                         p++;
7406                         break;
7407                     case 'r':
7408                         ender = '\r';
7409                         p++;
7410                         break;
7411                     case 't':
7412                         ender = '\t';
7413                         p++;
7414                         break;
7415                     case 'f':
7416                         ender = '\f';
7417                         p++;
7418                         break;
7419                     case 'e':
7420                           ender = ASCII_TO_NATIVE('\033');
7421                         p++;
7422                         break;
7423                     case 'a':
7424                           ender = ASCII_TO_NATIVE('\007');
7425                         p++;
7426                         break;
7427                     case 'x':
7428                         if (*++p == '{') {
7429                             char* const e = strchr(p, '}');
7430
7431                             if (!e) {
7432                                 RExC_parse = p + 1;
7433                                 vFAIL("Missing right brace on \\x{}");
7434                             }
7435                             else {
7436                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7437                                     | PERL_SCAN_DISALLOW_PREFIX;
7438                                 STRLEN numlen = e - p - 1;
7439                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7440                                 if (ender > 0xff)
7441                                     RExC_utf8 = 1;
7442                                 p = e + 1;
7443                             }
7444                         }
7445                         else {
7446                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7447                             STRLEN numlen = 2;
7448                             ender = grok_hex(p, &numlen, &flags, NULL);
7449                             p += numlen;
7450                         }
7451                         if (PL_encoding && ender < 0x100)
7452                             goto recode_encoding;
7453                         break;
7454                     case 'c':
7455                         p++;
7456                         ender = UCHARAT(p++);
7457                         ender = toCTRL(ender);
7458                         break;
7459                     case '0': case '1': case '2': case '3':case '4':
7460                     case '5': case '6': case '7': case '8':case '9':
7461                         if (*p == '0' ||
7462                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7463                             I32 flags = 0;
7464                             STRLEN numlen = 3;
7465                             ender = grok_oct(p, &numlen, &flags, NULL);
7466
7467                             /* An octal above 0xff is interpreted differently
7468                              * depending on if the re is in utf8 or not.  If it
7469                              * is in utf8, the value will be itself, otherwise
7470                              * it is interpreted as modulo 0x100.  It has been
7471                              * decided to discourage the use of octal above the
7472                              * single-byte range.  For now, warn only when
7473                              * it ends up modulo */
7474                             if (SIZE_ONLY && ender >= 0x100
7475                                     && ! UTF && ! PL_encoding
7476                                     && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) {
7477                                 vWARNdep(p, "Use of octal value above 377 is deprecated");
7478                             }
7479                             p += numlen;
7480                         }
7481                         else {
7482                             --p;
7483                             goto loopdone;
7484                         }
7485                         if (PL_encoding && ender < 0x100)
7486                             goto recode_encoding;
7487                         break;
7488                     recode_encoding:
7489                         {
7490                             SV* enc = PL_encoding;
7491                             ender = reg_recode((const char)(U8)ender, &enc);
7492                             if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7493                                 vWARN(p, "Invalid escape in the specified encoding");
7494                             RExC_utf8 = 1;
7495                         }
7496                         break;
7497                     case '\0':
7498                         if (p >= RExC_end)
7499                             FAIL("Trailing \\");
7500                         /* FALL THROUGH */
7501                     default:
7502                         if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
7503                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7504                         goto normal_default;
7505                     }
7506                     break;
7507                 default:
7508                   normal_default:
7509                     if (UTF8_IS_START(*p) && UTF) {
7510                         STRLEN numlen;
7511                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7512                                                &numlen, UTF8_ALLOW_DEFAULT);
7513                         p += numlen;
7514                     }
7515                     else
7516                         ender = *p++;
7517                     break;
7518                 }
7519                 if ( RExC_flags & RXf_PMf_EXTENDED)
7520                     p = regwhite( pRExC_state, p );
7521                 if (UTF && FOLD) {
7522                     /* Prime the casefolded buffer. */
7523                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7524                 }
7525                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7526                     if (len)
7527                         p = oldp;
7528                     else if (UTF) {
7529                          if (FOLD) {
7530                               /* Emit all the Unicode characters. */
7531                               STRLEN numlen;
7532                               for (foldbuf = tmpbuf;
7533                                    foldlen;
7534                                    foldlen -= numlen) {
7535                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7536                                    if (numlen > 0) {
7537                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7538                                         s       += unilen;
7539                                         len     += unilen;
7540                                         /* In EBCDIC the numlen
7541                                          * and unilen can differ. */
7542                                         foldbuf += numlen;
7543                                         if (numlen >= foldlen)
7544                                              break;
7545                                    }
7546                                    else
7547                                         break; /* "Can't happen." */
7548                               }
7549                          }
7550                          else {
7551                               const STRLEN unilen = reguni(pRExC_state, ender, s);
7552                               if (unilen > 0) {
7553                                    s   += unilen;
7554                                    len += unilen;
7555                               }
7556                          }
7557                     }
7558                     else {
7559                         len++;
7560                         REGC((char)ender, s++);
7561                     }
7562                     break;
7563                 }
7564                 if (UTF) {
7565                      if (FOLD) {
7566                           /* Emit all the Unicode characters. */
7567                           STRLEN numlen;
7568                           for (foldbuf = tmpbuf;
7569                                foldlen;
7570                                foldlen -= numlen) {
7571                                ender = utf8_to_uvchr(foldbuf, &numlen);
7572                                if (numlen > 0) {
7573                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
7574                                     len     += unilen;
7575                                     s       += unilen;
7576                                     /* In EBCDIC the numlen
7577                                      * and unilen can differ. */
7578                                     foldbuf += numlen;
7579                                     if (numlen >= foldlen)
7580                                          break;
7581                                }
7582                                else
7583                                     break;
7584                           }
7585                      }
7586                      else {
7587                           const STRLEN unilen = reguni(pRExC_state, ender, s);
7588                           if (unilen > 0) {
7589                                s   += unilen;
7590                                len += unilen;
7591                           }
7592                      }
7593                      len--;
7594                 }
7595                 else
7596                     REGC((char)ender, s++);
7597             }
7598         loopdone:
7599             RExC_parse = p - 1;
7600             Set_Node_Cur_Length(ret); /* MJD */
7601             nextchar(pRExC_state);
7602             {
7603                 /* len is STRLEN which is unsigned, need to copy to signed */
7604                 IV iv = len;
7605                 if (iv < 0)
7606                     vFAIL("Internal disaster");
7607             }
7608             if (len > 0)
7609                 *flagp |= HASWIDTH;
7610             if (len == 1 && UNI_IS_INVARIANT(ender))
7611                 *flagp |= SIMPLE;
7612
7613             if (SIZE_ONLY)
7614                 RExC_size += STR_SZ(len);
7615             else {
7616                 STR_LEN(ret) = len;
7617                 RExC_emit += STR_SZ(len);
7618             }
7619         }
7620         break;
7621     }
7622
7623     return(ret);
7624 }
7625
7626 STATIC char *
7627 S_regwhite( RExC_state_t *pRExC_state, char *p )
7628 {
7629     const char *e = RExC_end;
7630
7631     PERL_ARGS_ASSERT_REGWHITE;
7632
7633     while (p < e) {
7634         if (isSPACE(*p))
7635             ++p;
7636         else if (*p == '#') {
7637             bool ended = 0;
7638             do {
7639                 if (*p++ == '\n') {
7640                     ended = 1;
7641                     break;
7642                 }
7643             } while (p < e);
7644             if (!ended)
7645                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7646         }
7647         else
7648             break;
7649     }
7650     return p;
7651 }
7652
7653 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7654    Character classes ([:foo:]) can also be negated ([:^foo:]).
7655    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7656    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7657    but trigger failures because they are currently unimplemented. */
7658
7659 #define POSIXCC_DONE(c)   ((c) == ':')
7660 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7661 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7662
7663 STATIC I32
7664 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7665 {
7666     dVAR;
7667     I32 namedclass = OOB_NAMEDCLASS;
7668
7669     PERL_ARGS_ASSERT_REGPPOSIXCC;
7670
7671     if (value == '[' && RExC_parse + 1 < RExC_end &&
7672         /* I smell either [: or [= or [. -- POSIX has been here, right? */
7673         POSIXCC(UCHARAT(RExC_parse))) {
7674         const char c = UCHARAT(RExC_parse);
7675         char* const s = RExC_parse++;
7676
7677         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7678             RExC_parse++;
7679         if (RExC_parse == RExC_end)
7680             /* Grandfather lone [:, [=, [. */
7681             RExC_parse = s;
7682         else {
7683             const char* const t = RExC_parse++; /* skip over the c */
7684             assert(*t == c);
7685
7686             if (UCHARAT(RExC_parse) == ']') {
7687                 const char *posixcc = s + 1;
7688                 RExC_parse++; /* skip over the ending ] */
7689
7690                 if (*s == ':') {
7691                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7692                     const I32 skip = t - posixcc;
7693
7694                     /* Initially switch on the length of the name.  */
7695                     switch (skip) {
7696                     case 4:
7697                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7698                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7699                         break;
7700                     case 5:
7701                         /* Names all of length 5.  */
7702                         /* alnum alpha ascii blank cntrl digit graph lower
7703                            print punct space upper  */
7704                         /* Offset 4 gives the best switch position.  */
7705                         switch (posixcc[4]) {
7706                         case 'a':
7707                             if (memEQ(posixcc, "alph", 4)) /* alpha */
7708                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7709                             break;
7710                         case 'e':
7711                             if (memEQ(posixcc, "spac", 4)) /* space */
7712                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7713                             break;
7714                         case 'h':
7715                             if (memEQ(posixcc, "grap", 4)) /* graph */
7716                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7717                             break;
7718                         case 'i':
7719                             if (memEQ(posixcc, "asci", 4)) /* ascii */
7720                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7721                             break;
7722                         case 'k':
7723                             if (memEQ(posixcc, "blan", 4)) /* blank */
7724                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7725                             break;
7726                         case 'l':
7727                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7728                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7729                             break;
7730                         case 'm':
7731                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
7732                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7733                             break;
7734                         case 'r':
7735                             if (memEQ(posixcc, "lowe", 4)) /* lower */
7736                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7737                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
7738                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7739                             break;
7740                         case 't':
7741                             if (memEQ(posixcc, "digi", 4)) /* digit */
7742                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7743                             else if (memEQ(posixcc, "prin", 4)) /* print */
7744                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7745                             else if (memEQ(posixcc, "punc", 4)) /* punct */
7746                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7747                             break;
7748                         }
7749                         break;
7750                     case 6:
7751                         if (memEQ(posixcc, "xdigit", 6))
7752                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7753                         break;
7754                     }
7755
7756                     if (namedclass == OOB_NAMEDCLASS)
7757                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7758                                       t - s - 1, s + 1);
7759                     assert (posixcc[skip] == ':');
7760                     assert (posixcc[skip+1] == ']');
7761                 } else if (!SIZE_ONLY) {
7762                     /* [[=foo=]] and [[.foo.]] are still future. */
7763
7764                     /* adjust RExC_parse so the warning shows after
7765                        the class closes */
7766                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7767                         RExC_parse++;
7768                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7769                 }
7770             } else {
7771                 /* Maternal grandfather:
7772                  * "[:" ending in ":" but not in ":]" */
7773                 RExC_parse = s;
7774             }
7775         }
7776     }
7777
7778     return namedclass;
7779 }
7780
7781 STATIC void
7782 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7783 {
7784     dVAR;
7785
7786     PERL_ARGS_ASSERT_CHECKPOSIXCC;
7787
7788     if (POSIXCC(UCHARAT(RExC_parse))) {
7789         const char *s = RExC_parse;
7790         const char  c = *s++;
7791
7792         while (isALNUM(*s))
7793             s++;
7794         if (*s && c == *s && s[1] == ']') {
7795             if (ckWARN(WARN_REGEXP))
7796                 vWARN3(s+2,
7797                         "POSIX syntax [%c %c] belongs inside character classes",
7798                         c, c);
7799
7800             /* [[=foo=]] and [[.foo.]] are still future. */
7801             if (POSIXCC_NOTYET(c)) {
7802                 /* adjust RExC_parse so the error shows after
7803                    the class closes */
7804                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7805                     NOOP;
7806                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7807             }
7808         }
7809     }
7810 }
7811
7812
7813 #define _C_C_T_(NAME,TEST,WORD)                         \
7814 ANYOF_##NAME:                                           \
7815     if (LOC)                                            \
7816         ANYOF_CLASS_SET(ret, ANYOF_##NAME);             \
7817     else {                                              \
7818         for (value = 0; value < 256; value++)           \
7819             if (TEST)                                   \
7820                 ANYOF_BITMAP_SET(ret, value);           \
7821     }                                                   \
7822     yesno = '+';                                        \
7823     what = WORD;                                        \
7824     break;                                              \
7825 case ANYOF_N##NAME:                                     \
7826     if (LOC)                                            \
7827         ANYOF_CLASS_SET(ret, ANYOF_N##NAME);            \
7828     else {                                              \
7829         for (value = 0; value < 256; value++)           \
7830             if (!TEST)                                  \
7831                 ANYOF_BITMAP_SET(ret, value);           \
7832     }                                                   \
7833     yesno = '!';                                        \
7834     what = WORD;                                        \
7835     break
7836
7837 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
7838 ANYOF_##NAME:                                           \
7839         for (value = 0; value < 256; value++)           \
7840             if (TEST)                                   \
7841                 ANYOF_BITMAP_SET(ret, value);           \
7842     yesno = '+';                                        \
7843     what = WORD;                                        \
7844     break;                                              \
7845 case ANYOF_N##NAME:                                     \
7846         for (value = 0; value < 256; value++)           \
7847             if (!TEST)                                  \
7848                 ANYOF_BITMAP_SET(ret, value);           \
7849     yesno = '!';                                        \
7850     what = WORD;                                        \
7851     break
7852
7853 /*
7854    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7855    so that it is possible to override the option here without having to
7856    rebuild the entire core. as we are required to do if we change regcomp.h
7857    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7858 */
7859 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7860 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7861 #endif
7862
7863 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7864 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7865 #else
7866 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7867 #endif
7868
7869 /*
7870    parse a class specification and produce either an ANYOF node that
7871    matches the pattern or if the pattern matches a single char only and
7872    that char is < 256 and we are case insensitive then we produce an
7873    EXACT node instead.
7874 */
7875
7876 STATIC regnode *
7877 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7878 {
7879     dVAR;
7880     register UV nextvalue;
7881     register IV prevvalue = OOB_UNICODE;
7882     register IV range = 0;
7883     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7884     register regnode *ret;
7885     STRLEN numlen;
7886     IV namedclass;
7887     char *rangebegin = NULL;
7888     bool need_class = 0;
7889     SV *listsv = NULL;
7890     UV n;
7891     bool optimize_invert   = TRUE;
7892     AV* unicode_alternate  = NULL;
7893 #ifdef EBCDIC
7894     UV literal_endpoint = 0;
7895 #endif
7896     UV stored = 0;  /* number of chars stored in the class */
7897
7898     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7899         case we need to change the emitted regop to an EXACT. */
7900     const char * orig_parse = RExC_parse;
7901     GET_RE_DEBUG_FLAGS_DECL;
7902
7903     PERL_ARGS_ASSERT_REGCLASS;
7904 #ifndef DEBUGGING
7905     PERL_UNUSED_ARG(depth);
7906 #endif
7907
7908     DEBUG_PARSE("clas");
7909
7910     /* Assume we are going to generate an ANYOF node. */
7911     ret = reganode(pRExC_state, ANYOF, 0);
7912
7913     if (!SIZE_ONLY)
7914         ANYOF_FLAGS(ret) = 0;
7915
7916     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
7917         RExC_naughty++;
7918         RExC_parse++;
7919         if (!SIZE_ONLY)
7920             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7921     }
7922
7923     if (SIZE_ONLY) {
7924         RExC_size += ANYOF_SKIP;
7925         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7926     }
7927     else {
7928         RExC_emit += ANYOF_SKIP;
7929         if (FOLD)
7930             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7931         if (LOC)
7932             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7933         ANYOF_BITMAP_ZERO(ret);
7934         listsv = newSVpvs("# comment\n");
7935     }
7936
7937     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7938
7939     if (!SIZE_ONLY && POSIXCC(nextvalue))
7940         checkposixcc(pRExC_state);
7941
7942     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7943     if (UCHARAT(RExC_parse) == ']')
7944         goto charclassloop;
7945
7946 parseit:
7947     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7948
7949     charclassloop:
7950
7951         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7952
7953         if (!range)
7954             rangebegin = RExC_parse;
7955         if (UTF) {
7956             value = utf8n_to_uvchr((U8*)RExC_parse,
7957                                    RExC_end - RExC_parse,
7958                                    &numlen, UTF8_ALLOW_DEFAULT);
7959             RExC_parse += numlen;
7960         }
7961         else
7962             value = UCHARAT(RExC_parse++);
7963
7964         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7965         if (value == '[' && POSIXCC(nextvalue))
7966             namedclass = regpposixcc(pRExC_state, value);
7967         else if (value == '\\') {
7968             if (UTF) {
7969                 value = utf8n_to_uvchr((U8*)RExC_parse,
7970                                    RExC_end - RExC_parse,
7971                                    &numlen, UTF8_ALLOW_DEFAULT);
7972                 RExC_parse += numlen;
7973             }
7974             else
7975                 value = UCHARAT(RExC_parse++);
7976             /* Some compilers cannot handle switching on 64-bit integer
7977              * values, therefore value cannot be an UV.  Yes, this will
7978              * be a problem later if we want switch on Unicode.
7979              * A similar issue a little bit later when switching on
7980              * namedclass. --jhi */
7981             switch ((I32)value) {
7982             case 'w':   namedclass = ANYOF_ALNUM;       break;
7983             case 'W':   namedclass = ANYOF_NALNUM;      break;
7984             case 's':   namedclass = ANYOF_SPACE;       break;
7985             case 'S':   namedclass = ANYOF_NSPACE;      break;
7986             case 'd':   namedclass = ANYOF_DIGIT;       break;
7987             case 'D':   namedclass = ANYOF_NDIGIT;      break;
7988             case 'v':   namedclass = ANYOF_VERTWS;      break;
7989             case 'V':   namedclass = ANYOF_NVERTWS;     break;
7990             case 'h':   namedclass = ANYOF_HORIZWS;     break;
7991             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
7992             case 'N':  /* Handle \N{NAME} in class */
7993                 {
7994                     /* We only pay attention to the first char of
7995                     multichar strings being returned. I kinda wonder
7996                     if this makes sense as it does change the behaviour
7997                     from earlier versions, OTOH that behaviour was broken
7998                     as well. */
7999                     UV v; /* value is register so we cant & it /grrr */
8000                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8001                         goto parseit;
8002                     }
8003                     value= v;
8004                 }
8005                 break;
8006             case 'p':
8007             case 'P':
8008                 {
8009                 char *e;
8010                 if (RExC_parse >= RExC_end)
8011                     vFAIL2("Empty \\%c{}", (U8)value);
8012                 if (*RExC_parse == '{') {
8013                     const U8 c = (U8)value;
8014                     e = strchr(RExC_parse++, '}');
8015                     if (!e)
8016                         vFAIL2("Missing right brace on \\%c{}", c);
8017                     while (isSPACE(UCHARAT(RExC_parse)))
8018                         RExC_parse++;
8019                     if (e == RExC_parse)
8020                         vFAIL2("Empty \\%c{}", c);
8021                     n = e - RExC_parse;
8022                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8023                         n--;
8024                 }
8025                 else {
8026                     e = RExC_parse;
8027                     n = 1;
8028                 }
8029                 if (!SIZE_ONLY) {
8030                     if (UCHARAT(RExC_parse) == '^') {
8031                          RExC_parse++;
8032                          n--;
8033                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8034                          while (isSPACE(UCHARAT(RExC_parse))) {
8035                               RExC_parse++;
8036                               n--;
8037                          }
8038                     }
8039                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8040                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8041                 }
8042                 RExC_parse = e + 1;
8043                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8044                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8045                 }
8046                 break;
8047             case 'n':   value = '\n';                   break;
8048             case 'r':   value = '\r';                   break;
8049             case 't':   value = '\t';                   break;
8050             case 'f':   value = '\f';                   break;
8051             case 'b':   value = '\b';                   break;
8052             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8053             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8054             case 'x':
8055                 if (*RExC_parse == '{') {
8056                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8057                         | PERL_SCAN_DISALLOW_PREFIX;
8058                     char * const e = strchr(RExC_parse++, '}');
8059                     if (!e)
8060                         vFAIL("Missing right brace on \\x{}");
8061
8062                     numlen = e - RExC_parse;
8063                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8064                     RExC_parse = e + 1;
8065                 }
8066                 else {
8067                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8068                     numlen = 2;
8069                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8070                     RExC_parse += numlen;
8071                 }
8072                 if (PL_encoding && value < 0x100)
8073                     goto recode_encoding;
8074                 break;
8075             case 'c':
8076                 value = UCHARAT(RExC_parse++);
8077                 value = toCTRL(value);
8078                 break;
8079             case '0': case '1': case '2': case '3': case '4':
8080             case '5': case '6': case '7': case '8': case '9':
8081                 {
8082                     I32 flags = 0;
8083                     numlen = 3;
8084                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8085                     RExC_parse += numlen;
8086                     if (PL_encoding && value < 0x100)
8087                         goto recode_encoding;
8088                     break;
8089                 }
8090             recode_encoding:
8091                 {
8092                     SV* enc = PL_encoding;
8093                     value = reg_recode((const char)(U8)value, &enc);
8094                     if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
8095                         vWARN(RExC_parse,
8096                               "Invalid escape in the specified encoding");
8097                     break;
8098                 }
8099             default:
8100                 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
8101                     vWARN2(RExC_parse,
8102                            "Unrecognized escape \\%c in character class passed through",
8103                            (int)value);
8104                 break;
8105             }
8106         } /* end of \blah */
8107 #ifdef EBCDIC
8108         else
8109             literal_endpoint++;
8110 #endif
8111
8112         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8113
8114             if (!SIZE_ONLY && !need_class)
8115                 ANYOF_CLASS_ZERO(ret);
8116
8117             need_class = 1;
8118
8119             /* a bad range like a-\d, a-[:digit:] ? */
8120             if (range) {
8121                 if (!SIZE_ONLY) {
8122                     if (ckWARN(WARN_REGEXP)) {
8123                         const int w =
8124                             RExC_parse >= rangebegin ?
8125                             RExC_parse - rangebegin : 0;
8126                         vWARN4(RExC_parse,
8127                                "False [] range \"%*.*s\"",
8128                                w, w, rangebegin);
8129                     }
8130                     if (prevvalue < 256) {
8131                         ANYOF_BITMAP_SET(ret, prevvalue);
8132                         ANYOF_BITMAP_SET(ret, '-');
8133                     }
8134                     else {
8135                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8136                         Perl_sv_catpvf(aTHX_ listsv,
8137                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8138                     }
8139                 }
8140
8141                 range = 0; /* this was not a true range */
8142             }
8143
8144
8145
8146             if (!SIZE_ONLY) {
8147                 const char *what = NULL;
8148                 char yesno = 0;
8149
8150                 if (namedclass > OOB_NAMEDCLASS)
8151                     optimize_invert = FALSE;
8152                 /* Possible truncation here but in some 64-bit environments
8153                  * the compiler gets heartburn about switch on 64-bit values.
8154                  * A similar issue a little earlier when switching on value.
8155                  * --jhi */
8156                 switch ((I32)namedclass) {
8157
8158                 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8159                 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8160                 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8161                 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8162                 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8163                 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8164                 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8165                 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8166                 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8167                 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8168 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8169                 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8170                 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8171 #else
8172                 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8173                 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8174 #endif
8175                 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8176                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8177                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8178                 case ANYOF_ASCII:
8179                     if (LOC)
8180                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8181                     else {
8182 #ifndef EBCDIC
8183                         for (value = 0; value < 128; value++)
8184                             ANYOF_BITMAP_SET(ret, value);
8185 #else  /* EBCDIC */
8186                         for (value = 0; value < 256; value++) {
8187                             if (isASCII(value))
8188                                 ANYOF_BITMAP_SET(ret, value);
8189                         }
8190 #endif /* EBCDIC */
8191                     }
8192                     yesno = '+';
8193                     what = "ASCII";
8194                     break;
8195                 case ANYOF_NASCII:
8196                     if (LOC)
8197                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8198                     else {
8199 #ifndef EBCDIC
8200                         for (value = 128; value < 256; value++)
8201                             ANYOF_BITMAP_SET(ret, value);
8202 #else  /* EBCDIC */
8203                         for (value = 0; value < 256; value++) {
8204                             if (!isASCII(value))
8205                                 ANYOF_BITMAP_SET(ret, value);
8206                         }
8207 #endif /* EBCDIC */
8208                     }
8209                     yesno = '!';
8210                     what = "ASCII";
8211                     break;
8212                 case ANYOF_DIGIT:
8213                     if (LOC)
8214                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8215                     else {
8216                         /* consecutive digits assumed */
8217                         for (value = '0'; value <= '9'; value++)
8218                             ANYOF_BITMAP_SET(ret, value);
8219                     }
8220                     yesno = '+';
8221                     what = POSIX_CC_UNI_NAME("Digit");
8222                     break;
8223                 case ANYOF_NDIGIT:
8224                     if (LOC)
8225                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8226                     else {
8227                         /* consecutive digits assumed */
8228                         for (value = 0; value < '0'; value++)
8229                             ANYOF_BITMAP_SET(ret, value);
8230                         for (value = '9' + 1; value < 256; value++)
8231                             ANYOF_BITMAP_SET(ret, value);
8232                     }
8233                     yesno = '!';
8234                     what = POSIX_CC_UNI_NAME("Digit");
8235                     break;
8236                 case ANYOF_MAX:
8237                     /* this is to handle \p and \P */
8238                     break;
8239                 default:
8240                     vFAIL("Invalid [::] class");
8241                     break;
8242                 }
8243                 if (what) {
8244                     /* Strings such as "+utf8::isWord\n" */
8245                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8246                 }
8247                 if (LOC)
8248                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8249                 continue;
8250             }
8251         } /* end of namedclass \blah */
8252
8253         if (range) {
8254             if (prevvalue > (IV)value) /* b-a */ {
8255                 const int w = RExC_parse - rangebegin;
8256                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8257                 range = 0; /* not a valid range */
8258             }
8259         }
8260         else {
8261             prevvalue = value; /* save the beginning of the range */
8262             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8263                 RExC_parse[1] != ']') {
8264                 RExC_parse++;
8265
8266                 /* a bad range like \w-, [:word:]- ? */
8267                 if (namedclass > OOB_NAMEDCLASS) {
8268                     if (ckWARN(WARN_REGEXP)) {
8269                         const int w =
8270                             RExC_parse >= rangebegin ?
8271                             RExC_parse - rangebegin : 0;
8272                         vWARN4(RExC_parse,
8273                                "False [] range \"%*.*s\"",
8274                                w, w, rangebegin);
8275                     }
8276                     if (!SIZE_ONLY)
8277                         ANYOF_BITMAP_SET(ret, '-');
8278                 } else
8279                     range = 1;  /* yeah, it's a range! */
8280                 continue;       /* but do it the next time */
8281             }
8282         }
8283
8284         /* now is the next time */
8285         /*stored += (value - prevvalue + 1);*/
8286         if (!SIZE_ONLY) {
8287             if (prevvalue < 256) {
8288                 const IV ceilvalue = value < 256 ? value : 255;
8289                 IV i;
8290 #ifdef EBCDIC
8291                 /* In EBCDIC [\x89-\x91] should include
8292                  * the \x8e but [i-j] should not. */
8293                 if (literal_endpoint == 2 &&
8294                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8295                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8296                 {
8297                     if (isLOWER(prevvalue)) {
8298                         for (i = prevvalue; i <= ceilvalue; i++)
8299                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8300                                 stored++;
8301                                 ANYOF_BITMAP_SET(ret, i);
8302                             }
8303                     } else {
8304                         for (i = prevvalue; i <= ceilvalue; i++)
8305                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8306                                 stored++;
8307                                 ANYOF_BITMAP_SET(ret, i);
8308                             }
8309                     }
8310                 }
8311                 else
8312 #endif
8313                       for (i = prevvalue; i <= ceilvalue; i++) {
8314                         if (!ANYOF_BITMAP_TEST(ret,i)) {
8315                             stored++;
8316                             ANYOF_BITMAP_SET(ret, i);
8317                         }
8318                       }
8319           }
8320           if (value > 255 || UTF) {
8321                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8322                 const UV natvalue      = NATIVE_TO_UNI(value);
8323                 stored+=2; /* can't optimize this class */
8324                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8325                 if (prevnatvalue < natvalue) { /* what about > ? */
8326                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8327                                    prevnatvalue, natvalue);
8328                 }
8329                 else if (prevnatvalue == natvalue) {
8330                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8331                     if (FOLD) {
8332                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8333                          STRLEN foldlen;
8334                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8335
8336 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8337                          if (RExC_precomp[0] == ':' &&
8338                              RExC_precomp[1] == '[' &&
8339                              (f == 0xDF || f == 0x92)) {
8340                              f = NATIVE_TO_UNI(f);
8341                         }
8342 #endif
8343                          /* If folding and foldable and a single
8344                           * character, insert also the folded version
8345                           * to the charclass. */
8346                          if (f != value) {
8347 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8348                              if ((RExC_precomp[0] == ':' &&
8349                                   RExC_precomp[1] == '[' &&
8350                                   (f == 0xA2 &&
8351                                    (value == 0xFB05 || value == 0xFB06))) ?
8352                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8353                                  foldlen == (STRLEN)UNISKIP(f) )
8354 #else
8355                               if (foldlen == (STRLEN)UNISKIP(f))
8356 #endif
8357                                   Perl_sv_catpvf(aTHX_ listsv,
8358                                                  "%04"UVxf"\n", f);
8359                               else {
8360                                   /* Any multicharacter foldings
8361                                    * require the following transform:
8362                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8363                                    * where E folds into "pq" and F folds
8364                                    * into "rst", all other characters
8365                                    * fold to single characters.  We save
8366                                    * away these multicharacter foldings,
8367                                    * to be later saved as part of the
8368                                    * additional "s" data. */
8369                                   SV *sv;
8370
8371                                   if (!unicode_alternate)
8372                                       unicode_alternate = newAV();
8373                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8374                                                      TRUE);
8375                                   av_push(unicode_alternate, sv);
8376                               }
8377                          }
8378
8379                          /* If folding and the value is one of the Greek
8380                           * sigmas insert a few more sigmas to make the
8381                           * folding rules of the sigmas to work right.
8382                           * Note that not all the possible combinations
8383                           * are handled here: some of them are handled
8384                           * by the standard folding rules, and some of
8385                           * them (literal or EXACTF cases) are handled
8386                           * during runtime in regexec.c:S_find_byclass(). */
8387                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8388                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8389                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8390                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8391                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8392                          }
8393                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8394                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8395                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8396                     }
8397                 }
8398             }
8399 #ifdef EBCDIC
8400             literal_endpoint = 0;
8401 #endif
8402         }
8403
8404         range = 0; /* this range (if it was one) is done now */
8405     }
8406
8407     if (need_class) {
8408         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8409         if (SIZE_ONLY)
8410             RExC_size += ANYOF_CLASS_ADD_SKIP;
8411         else
8412             RExC_emit += ANYOF_CLASS_ADD_SKIP;
8413     }
8414
8415
8416     if (SIZE_ONLY)
8417         return ret;
8418     /****** !SIZE_ONLY AFTER HERE *********/
8419
8420     if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8421         && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8422     ) {
8423         /* optimize single char class to an EXACT node
8424            but *only* when its not a UTF/high char  */
8425         const char * cur_parse= RExC_parse;
8426         RExC_emit = (regnode *)orig_emit;
8427         RExC_parse = (char *)orig_parse;
8428         ret = reg_node(pRExC_state,
8429                        (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8430         RExC_parse = (char *)cur_parse;
8431         *STRING(ret)= (char)value;
8432         STR_LEN(ret)= 1;
8433         RExC_emit += STR_SZ(1);
8434         if (listsv) {
8435             SvREFCNT_dec(listsv);
8436         }
8437         return ret;
8438     }
8439     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8440     if ( /* If the only flag is folding (plus possibly inversion). */
8441         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8442        ) {
8443         for (value = 0; value < 256; ++value) {
8444             if (ANYOF_BITMAP_TEST(ret, value)) {
8445                 UV fold = PL_fold[value];
8446
8447                 if (fold != value)
8448                     ANYOF_BITMAP_SET(ret, fold);
8449             }
8450         }
8451         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8452     }
8453
8454     /* optimize inverted simple patterns (e.g. [^a-z]) */
8455     if (optimize_invert &&
8456         /* If the only flag is inversion. */
8457         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8458         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8459             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8460         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8461     }
8462     {
8463         AV * const av = newAV();
8464         SV *rv;
8465         /* The 0th element stores the character class description
8466          * in its textual form: used later (regexec.c:Perl_regclass_swash())
8467          * to initialize the appropriate swash (which gets stored in
8468          * the 1st element), and also useful for dumping the regnode.
8469          * The 2nd element stores the multicharacter foldings,
8470          * used later (regexec.c:S_reginclass()). */
8471         av_store(av, 0, listsv);
8472         av_store(av, 1, NULL);
8473         av_store(av, 2, MUTABLE_SV(unicode_alternate));
8474         rv = newRV_noinc(MUTABLE_SV(av));
8475         n = add_data(pRExC_state, 1, "s");
8476         RExC_rxi->data->data[n] = (void*)rv;
8477         ARG_SET(ret, n);
8478     }
8479     return ret;
8480 }
8481 #undef _C_C_T_
8482
8483
8484 /* reg_skipcomment()
8485
8486    Absorbs an /x style # comments from the input stream.
8487    Returns true if there is more text remaining in the stream.
8488    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8489    terminates the pattern without including a newline.
8490
8491    Note its the callers responsibility to ensure that we are
8492    actually in /x mode
8493
8494 */
8495
8496 STATIC bool
8497 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8498 {
8499     bool ended = 0;
8500
8501     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8502
8503     while (RExC_parse < RExC_end)
8504         if (*RExC_parse++ == '\n') {
8505             ended = 1;
8506             break;
8507         }
8508     if (!ended) {
8509         /* we ran off the end of the pattern without ending
8510            the comment, so we have to add an \n when wrapping */
8511         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8512         return 0;
8513     } else
8514         return 1;
8515 }
8516
8517 /* nextchar()
8518
8519    Advance that parse position, and optionall absorbs
8520    "whitespace" from the inputstream.
8521
8522    Without /x "whitespace" means (?#...) style comments only,
8523    with /x this means (?#...) and # comments and whitespace proper.
8524
8525    Returns the RExC_parse point from BEFORE the scan occurs.
8526
8527    This is the /x friendly way of saying RExC_parse++.
8528 */
8529
8530 STATIC char*
8531 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8532 {
8533     char* const retval = RExC_parse++;
8534
8535     PERL_ARGS_ASSERT_NEXTCHAR;
8536
8537     for (;;) {
8538         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8539                 RExC_parse[2] == '#') {
8540             while (*RExC_parse != ')') {
8541                 if (RExC_parse == RExC_end)
8542                     FAIL("Sequence (?#... not terminated");
8543                 RExC_parse++;
8544             }
8545             RExC_parse++;
8546             continue;
8547         }
8548         if (RExC_flags & RXf_PMf_EXTENDED) {
8549             if (isSPACE(*RExC_parse)) {
8550                 RExC_parse++;
8551                 continue;
8552             }
8553             else if (*RExC_parse == '#') {
8554                 if ( reg_skipcomment( pRExC_state ) )
8555                     continue;
8556             }
8557         }
8558         return retval;
8559     }
8560 }
8561
8562 /*
8563 - reg_node - emit a node
8564 */
8565 STATIC regnode *                        /* Location. */
8566 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8567 {
8568     dVAR;
8569     register regnode *ptr;
8570     regnode * const ret = RExC_emit;
8571     GET_RE_DEBUG_FLAGS_DECL;
8572
8573     PERL_ARGS_ASSERT_REG_NODE;
8574
8575     if (SIZE_ONLY) {
8576         SIZE_ALIGN(RExC_size);
8577         RExC_size += 1;
8578         return(ret);
8579     }
8580     if (RExC_emit >= RExC_emit_bound)
8581         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8582
8583     NODE_ALIGN_FILL(ret);
8584     ptr = ret;
8585     FILL_ADVANCE_NODE(ptr, op);
8586     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8587 #ifdef RE_TRACK_PATTERN_OFFSETS
8588     if (RExC_offsets) {         /* MJD */
8589         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8590               "reg_node", __LINE__,
8591               PL_reg_name[op],
8592               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8593                 ? "Overwriting end of array!\n" : "OK",
8594               (UV)(RExC_emit - RExC_emit_start),
8595               (UV)(RExC_parse - RExC_start),
8596               (UV)RExC_offsets[0]));
8597         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8598     }
8599 #endif
8600     RExC_emit = ptr;
8601     return(ret);
8602 }
8603
8604 /*
8605 - reganode - emit a node with an argument
8606 */
8607 STATIC regnode *                        /* Location. */
8608 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8609 {
8610     dVAR;
8611     register regnode *ptr;
8612     regnode * const ret = RExC_emit;
8613     GET_RE_DEBUG_FLAGS_DECL;
8614
8615     PERL_ARGS_ASSERT_REGANODE;
8616
8617     if (SIZE_ONLY) {
8618         SIZE_ALIGN(RExC_size);
8619         RExC_size += 2;
8620         /*
8621            We can't do this:
8622
8623            assert(2==regarglen[op]+1);
8624
8625            Anything larger than this has to allocate the extra amount.
8626            If we changed this to be:
8627
8628            RExC_size += (1 + regarglen[op]);
8629
8630            then it wouldn't matter. Its not clear what side effect
8631            might come from that so its not done so far.
8632            -- dmq
8633         */
8634         return(ret);
8635     }
8636     if (RExC_emit >= RExC_emit_bound)
8637         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8638
8639     NODE_ALIGN_FILL(ret);
8640     ptr = ret;
8641     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8642     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8643 #ifdef RE_TRACK_PATTERN_OFFSETS
8644     if (RExC_offsets) {         /* MJD */
8645         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8646               "reganode",
8647               __LINE__,
8648               PL_reg_name[op],
8649               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8650               "Overwriting end of array!\n" : "OK",
8651               (UV)(RExC_emit - RExC_emit_start),
8652               (UV)(RExC_parse - RExC_start),
8653               (UV)RExC_offsets[0]));
8654         Set_Cur_Node_Offset;
8655     }
8656 #endif
8657     RExC_emit = ptr;
8658     return(ret);
8659 }
8660
8661 /*
8662 - reguni - emit (if appropriate) a Unicode character
8663 */
8664 STATIC STRLEN
8665 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8666 {
8667     dVAR;
8668
8669     PERL_ARGS_ASSERT_REGUNI;
8670
8671     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8672 }
8673
8674 /*
8675 - reginsert - insert an operator in front of already-emitted operand
8676 *
8677 * Means relocating the operand.
8678 */
8679 STATIC void
8680 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8681 {
8682     dVAR;
8683     register regnode *src;
8684     register regnode *dst;
8685     register regnode *place;
8686     const int offset = regarglen[(U8)op];
8687     const int size = NODE_STEP_REGNODE + offset;
8688     GET_RE_DEBUG_FLAGS_DECL;
8689
8690     PERL_ARGS_ASSERT_REGINSERT;
8691     PERL_UNUSED_ARG(depth);
8692 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8693     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8694     if (SIZE_ONLY) {
8695         RExC_size += size;
8696         return;
8697     }
8698
8699     src = RExC_emit;
8700     RExC_emit += size;
8701     dst = RExC_emit;
8702     if (RExC_open_parens) {
8703         int paren;
8704         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8705         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8706             if ( RExC_open_parens[paren] >= opnd ) {
8707                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8708                 RExC_open_parens[paren] += size;
8709             } else {
8710                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8711             }
8712             if ( RExC_close_parens[paren] >= opnd ) {
8713                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8714                 RExC_close_parens[paren] += size;
8715             } else {
8716                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8717             }
8718         }
8719     }
8720
8721     while (src > opnd) {
8722         StructCopy(--src, --dst, regnode);
8723 #ifdef RE_TRACK_PATTERN_OFFSETS
8724         if (RExC_offsets) {     /* MJD 20010112 */
8725             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8726                   "reg_insert",
8727                   __LINE__,
8728                   PL_reg_name[op],
8729                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8730                     ? "Overwriting end of array!\n" : "OK",
8731                   (UV)(src - RExC_emit_start),
8732                   (UV)(dst - RExC_emit_start),
8733                   (UV)RExC_offsets[0]));
8734             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8735             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8736         }
8737 #endif
8738     }
8739
8740
8741     place = opnd;               /* Op node, where operand used to be. */
8742 #ifdef RE_TRACK_PATTERN_OFFSETS
8743     if (RExC_offsets) {         /* MJD */
8744         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8745               "reginsert",
8746               __LINE__,
8747               PL_reg_name[op],
8748               (UV)(place - RExC_emit_start) > RExC_offsets[0]
8749               ? "Overwriting end of array!\n" : "OK",
8750               (UV)(place - RExC_emit_start),
8751               (UV)(RExC_parse - RExC_start),
8752               (UV)RExC_offsets[0]));
8753         Set_Node_Offset(place, RExC_parse);
8754         Set_Node_Length(place, 1);
8755     }
8756 #endif
8757     src = NEXTOPER(place);
8758     FILL_ADVANCE_NODE(place, op);
8759     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
8760     Zero(src, offset, regnode);
8761 }
8762
8763 /*
8764 - regtail - set the next-pointer at the end of a node chain of p to val.
8765 - SEE ALSO: regtail_study
8766 */
8767 /* TODO: All three parms should be const */
8768 STATIC void
8769 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8770 {
8771     dVAR;
8772     register regnode *scan;
8773     GET_RE_DEBUG_FLAGS_DECL;
8774
8775     PERL_ARGS_ASSERT_REGTAIL;
8776 #ifndef DEBUGGING
8777     PERL_UNUSED_ARG(depth);
8778 #endif
8779
8780     if (SIZE_ONLY)
8781         return;
8782
8783     /* Find last node. */
8784     scan = p;
8785     for (;;) {
8786         regnode * const temp = regnext(scan);
8787         DEBUG_PARSE_r({
8788             SV * const mysv=sv_newmortal();
8789             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8790             regprop(RExC_rx, mysv, scan);
8791             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8792                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8793                     (temp == NULL ? "->" : ""),
8794                     (temp == NULL ? PL_reg_name[OP(val)] : "")
8795             );
8796         });
8797         if (temp == NULL)
8798             break;
8799         scan = temp;
8800     }
8801
8802     if (reg_off_by_arg[OP(scan)]) {
8803         ARG_SET(scan, val - scan);
8804     }
8805     else {
8806         NEXT_OFF(scan) = val - scan;
8807     }
8808 }
8809
8810 #ifdef DEBUGGING
8811 /*
8812 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8813 - Look for optimizable sequences at the same time.
8814 - currently only looks for EXACT chains.
8815
8816 This is expermental code. The idea is to use this routine to perform
8817 in place optimizations on branches and groups as they are constructed,
8818 with the long term intention of removing optimization from study_chunk so
8819 that it is purely analytical.
8820
8821 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8822 to control which is which.
8823
8824 */
8825 /* TODO: All four parms should be const */
8826
8827 STATIC U8
8828 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8829 {
8830     dVAR;
8831     register regnode *scan;
8832     U8 exact = PSEUDO;
8833 #ifdef EXPERIMENTAL_INPLACESCAN
8834     I32 min = 0;
8835 #endif
8836     GET_RE_DEBUG_FLAGS_DECL;
8837
8838     PERL_ARGS_ASSERT_REGTAIL_STUDY;
8839
8840
8841     if (SIZE_ONLY)
8842         return exact;
8843
8844     /* Find last node. */
8845
8846     scan = p;
8847     for (;;) {
8848         regnode * const temp = regnext(scan);
8849 #ifdef EXPERIMENTAL_INPLACESCAN
8850         if (PL_regkind[OP(scan)] == EXACT)
8851             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8852                 return EXACT;
8853 #endif
8854         if ( exact ) {
8855             switch (OP(scan)) {
8856                 case EXACT:
8857                 case EXACTF:
8858                 case EXACTFL:
8859                         if( exact == PSEUDO )
8860                             exact= OP(scan);
8861                         else if ( exact != OP(scan) )
8862                             exact= 0;
8863                 case NOTHING:
8864                     break;
8865                 default:
8866                     exact= 0;
8867             }
8868         }
8869         DEBUG_PARSE_r({
8870             SV * const mysv=sv_newmortal();
8871             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8872             regprop(RExC_rx, mysv, scan);
8873             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8874                 SvPV_nolen_const(mysv),
8875                 REG_NODE_NUM(scan),
8876                 PL_reg_name[exact]);
8877         });
8878         if (temp == NULL)
8879             break;
8880         scan = temp;
8881     }
8882     DEBUG_PARSE_r({
8883         SV * const mysv_val=sv_newmortal();
8884         DEBUG_PARSE_MSG("");
8885         regprop(RExC_rx, mysv_val, val);
8886         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8887                       SvPV_nolen_const(mysv_val),
8888                       (IV)REG_NODE_NUM(val),
8889                       (IV)(val - scan)
8890         );
8891     });
8892     if (reg_off_by_arg[OP(scan)]) {
8893         ARG_SET(scan, val - scan);
8894     }
8895     else {
8896         NEXT_OFF(scan) = val - scan;
8897     }
8898
8899     return exact;
8900 }
8901 #endif
8902
8903 /*
8904  - regcurly - a little FSA that accepts {\d+,?\d*}
8905  */
8906 STATIC I32
8907 S_regcurly(register const char *s)
8908 {
8909     PERL_ARGS_ASSERT_REGCURLY;
8910
8911     if (*s++ != '{')
8912         return FALSE;
8913     if (!isDIGIT(*s))
8914         return FALSE;
8915     while (isDIGIT(*s))
8916         s++;
8917     if (*s == ',')
8918         s++;
8919     while (isDIGIT(*s))
8920         s++;
8921     if (*s != '}')
8922         return FALSE;
8923     return TRUE;
8924 }
8925
8926
8927 /*
8928  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8929  */
8930 #ifdef DEBUGGING
8931 static void
8932 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8933 {
8934     int bit;
8935     int set=0;
8936
8937     for (bit=0; bit<32; bit++) {
8938         if (flags & (1<<bit)) {
8939             if (!set++ && lead)
8940                 PerlIO_printf(Perl_debug_log, "%s",lead);
8941             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8942         }
8943     }
8944     if (lead)  {
8945         if (set)
8946             PerlIO_printf(Perl_debug_log, "\n");
8947         else
8948             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8949     }
8950 }
8951 #endif
8952
8953 void
8954 Perl_regdump(pTHX_ const regexp *r)
8955 {
8956 #ifdef DEBUGGING
8957     dVAR;
8958     SV * const sv = sv_newmortal();
8959     SV *dsv= sv_newmortal();
8960     RXi_GET_DECL(r,ri);
8961     GET_RE_DEBUG_FLAGS_DECL;
8962
8963     PERL_ARGS_ASSERT_REGDUMP;
8964
8965     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8966
8967     /* Header fields of interest. */
8968     if (r->anchored_substr) {
8969         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8970             RE_SV_DUMPLEN(r->anchored_substr), 30);
8971         PerlIO_printf(Perl_debug_log,
8972                       "anchored %s%s at %"IVdf" ",
8973                       s, RE_SV_TAIL(r->anchored_substr),
8974                       (IV)r->anchored_offset);
8975     } else if (r->anchored_utf8) {
8976         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8977             RE_SV_DUMPLEN(r->anchored_utf8), 30);
8978         PerlIO_printf(Perl_debug_log,
8979                       "anchored utf8 %s%s at %"IVdf" ",
8980                       s, RE_SV_TAIL(r->anchored_utf8),
8981                       (IV)r->anchored_offset);
8982     }
8983     if (r->float_substr) {
8984         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8985             RE_SV_DUMPLEN(r->float_substr), 30);
8986         PerlIO_printf(Perl_debug_log,
8987                       "floating %s%s at %"IVdf"..%"UVuf" ",
8988                       s, RE_SV_TAIL(r->float_substr),
8989                       (IV)r->float_min_offset, (UV)r->float_max_offset);
8990     } else if (r->float_utf8) {
8991         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8992             RE_SV_DUMPLEN(r->float_utf8), 30);
8993         PerlIO_printf(Perl_debug_log,
8994                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8995                       s, RE_SV_TAIL(r->float_utf8),
8996                       (IV)r->float_min_offset, (UV)r->float_max_offset);
8997     }
8998     if (r->check_substr || r->check_utf8)
8999         PerlIO_printf(Perl_debug_log,
9000                       (const char *)
9001                       (r->check_substr == r->float_substr
9002                        && r->check_utf8 == r->float_utf8
9003                        ? "(checking floating" : "(checking anchored"));
9004     if (r->extflags & RXf_NOSCAN)
9005         PerlIO_printf(Perl_debug_log, " noscan");
9006     if (r->extflags & RXf_CHECK_ALL)
9007         PerlIO_printf(Perl_debug_log, " isall");
9008     if (r->check_substr || r->check_utf8)
9009         PerlIO_printf(Perl_debug_log, ") ");
9010
9011     if (ri->regstclass) {
9012         regprop(r, sv, ri->regstclass);
9013         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9014     }
9015     if (r->extflags & RXf_ANCH) {
9016         PerlIO_printf(Perl_debug_log, "anchored");
9017         if (r->extflags & RXf_ANCH_BOL)
9018             PerlIO_printf(Perl_debug_log, "(BOL)");
9019         if (r->extflags & RXf_ANCH_MBOL)
9020             PerlIO_printf(Perl_debug_log, "(MBOL)");
9021         if (r->extflags & RXf_ANCH_SBOL)
9022             PerlIO_printf(Perl_debug_log, "(SBOL)");
9023         if (r->extflags & RXf_ANCH_GPOS)
9024             PerlIO_printf(Perl_debug_log, "(GPOS)");
9025         PerlIO_putc(Perl_debug_log, ' ');
9026     }
9027     if (r->extflags & RXf_GPOS_SEEN)
9028         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9029     if (r->intflags & PREGf_SKIP)
9030         PerlIO_printf(Perl_debug_log, "plus ");
9031     if (r->intflags & PREGf_IMPLICIT)
9032         PerlIO_printf(Perl_debug_log, "implicit ");
9033     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9034     if (r->extflags & RXf_EVAL_SEEN)
9035         PerlIO_printf(Perl_debug_log, "with eval ");
9036     PerlIO_printf(Perl_debug_log, "\n");
9037     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9038 #else
9039     PERL_ARGS_ASSERT_REGDUMP;
9040     PERL_UNUSED_CONTEXT;
9041     PERL_UNUSED_ARG(r);
9042 #endif  /* DEBUGGING */
9043 }
9044
9045 /*
9046 - regprop - printable representation of opcode
9047 */
9048 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9049 STMT_START { \
9050         if (do_sep) {                           \
9051             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9052             if (flags & ANYOF_INVERT)           \
9053                 /*make sure the invert info is in each */ \
9054                 sv_catpvs(sv, "^");             \
9055             do_sep = 0;                         \
9056         }                                       \
9057 } STMT_END
9058
9059 void
9060 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9061 {
9062 #ifdef DEBUGGING
9063     dVAR;
9064     register int k;
9065     RXi_GET_DECL(prog,progi);
9066     GET_RE_DEBUG_FLAGS_DECL;
9067
9068     PERL_ARGS_ASSERT_REGPROP;
9069
9070     sv_setpvs(sv, "");
9071
9072     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9073         /* It would be nice to FAIL() here, but this may be called from
9074            regexec.c, and it would be hard to supply pRExC_state. */
9075         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9076     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9077
9078     k = PL_regkind[OP(o)];
9079
9080     if (k == EXACT) {
9081         sv_catpvs(sv, " ");
9082         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9083          * is a crude hack but it may be the best for now since
9084          * we have no flag "this EXACTish node was UTF-8"
9085          * --jhi */
9086         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9087                   PERL_PV_ESCAPE_UNI_DETECT |
9088                   PERL_PV_PRETTY_ELLIPSES   |
9089                   PERL_PV_PRETTY_LTGT       |
9090                   PERL_PV_PRETTY_NOCLEAR
9091                   );
9092     } else if (k == TRIE) {
9093         /* print the details of the trie in dumpuntil instead, as
9094          * progi->data isn't available here */
9095         const char op = OP(o);
9096         const U32 n = ARG(o);
9097         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9098                (reg_ac_data *)progi->data->data[n] :
9099                NULL;
9100         const reg_trie_data * const trie
9101             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9102
9103         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9104         DEBUG_TRIE_COMPILE_r(
9105             Perl_sv_catpvf(aTHX_ sv,
9106                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9107                 (UV)trie->startstate,
9108                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9109                 (UV)trie->wordcount,
9110                 (UV)trie->minlen,
9111                 (UV)trie->maxlen,
9112                 (UV)TRIE_CHARCOUNT(trie),
9113                 (UV)trie->uniquecharcount
9114             )
9115         );
9116         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9117             int i;
9118             int rangestart = -1;
9119             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9120             sv_catpvs(sv, "[");
9121             for (i = 0; i <= 256; i++) {
9122                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9123                     if (rangestart == -1)
9124                         rangestart = i;
9125                 } else if (rangestart != -1) {
9126                     if (i <= rangestart + 3)
9127                         for (; rangestart < i; rangestart++)
9128                             put_byte(sv, rangestart);
9129                     else {
9130                         put_byte(sv, rangestart);
9131                         sv_catpvs(sv, "-");
9132                         put_byte(sv, i - 1);
9133                     }
9134                     rangestart = -1;
9135                 }
9136             }
9137             sv_catpvs(sv, "]");
9138         }
9139
9140     } else if (k == CURLY) {
9141         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9142             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9143         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9144     }
9145     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9146         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9147     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9148         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9149         if ( RXp_PAREN_NAMES(prog) ) {
9150             if ( k != REF || OP(o) < NREF) {
9151                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9152                 SV **name= av_fetch(list, ARG(o), 0 );
9153                 if (name)
9154                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9155             }
9156             else {
9157                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9158                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9159                 I32 *nums=(I32*)SvPVX(sv_dat);
9160                 SV **name= av_fetch(list, nums[0], 0 );
9161                 I32 n;
9162                 if (name) {
9163                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9164                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9165                                     (n ? "," : ""), (IV)nums[n]);
9166                     }
9167                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9168                 }
9169             }
9170         }
9171     } else if (k == GOSUB)
9172         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9173     else if (k == VERB) {
9174         if (!o->flags)
9175             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9176                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9177     } else if (k == LOGICAL)
9178         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9179     else if (k == FOLDCHAR)
9180         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9181     else if (k == ANYOF) {
9182         int i, rangestart = -1;
9183         const U8 flags = ANYOF_FLAGS(o);
9184         int do_sep = 0;
9185
9186         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9187         static const char * const anyofs[] = {
9188             "\\w",
9189             "\\W",
9190             "\\s",
9191             "\\S",
9192             "\\d",
9193             "\\D",
9194             "[:alnum:]",
9195             "[:^alnum:]",
9196             "[:alpha:]",
9197             "[:^alpha:]",
9198             "[:ascii:]",
9199             "[:^ascii:]",
9200             "[:cntrl:]",
9201             "[:^cntrl:]",
9202             "[:graph:]",
9203             "[:^graph:]",
9204             "[:lower:]",
9205             "[:^lower:]",
9206             "[:print:]",
9207             "[:^print:]",
9208             "[:punct:]",
9209             "[:^punct:]",
9210             "[:upper:]",
9211             "[:^upper:]",
9212             "[:xdigit:]",
9213             "[:^xdigit:]",
9214             "[:space:]",
9215             "[:^space:]",
9216             "[:blank:]",
9217             "[:^blank:]"
9218         };
9219
9220         if (flags & ANYOF_LOCALE)
9221             sv_catpvs(sv, "{loc}");
9222         if (flags & ANYOF_FOLD)
9223             sv_catpvs(sv, "{i}");
9224         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9225         if (flags & ANYOF_INVERT)
9226             sv_catpvs(sv, "^");
9227
9228         /* output what the standard cp 0-255 bitmap matches */
9229         for (i = 0; i <= 256; i++) {
9230             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9231                 if (rangestart == -1)
9232                     rangestart = i;
9233             } else if (rangestart != -1) {
9234                 if (i <= rangestart + 3)
9235                     for (; rangestart < i; rangestart++)
9236                         put_byte(sv, rangestart);
9237                 else {
9238                     put_byte(sv, rangestart);
9239                     sv_catpvs(sv, "-");
9240                     put_byte(sv, i - 1);
9241                 }
9242                 do_sep = 1;
9243                 rangestart = -1;
9244             }
9245         }
9246
9247         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9248         /* output any special charclass tests (used mostly under use locale) */
9249         if (o->flags & ANYOF_CLASS)
9250             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9251                 if (ANYOF_CLASS_TEST(o,i)) {
9252                     sv_catpv(sv, anyofs[i]);
9253                     do_sep = 1;
9254                 }
9255
9256         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9257
9258         /* output information about the unicode matching */
9259         if (flags & ANYOF_UNICODE)
9260             sv_catpvs(sv, "{unicode}");
9261         else if (flags & ANYOF_UNICODE_ALL)
9262             sv_catpvs(sv, "{unicode_all}");
9263
9264         {
9265             SV *lv;
9266             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9267
9268             if (lv) {
9269                 if (sw) {
9270                     U8 s[UTF8_MAXBYTES_CASE+1];
9271
9272                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9273                         uvchr_to_utf8(s, i);
9274
9275                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9276                             if (rangestart == -1)
9277                                 rangestart = i;
9278                         } else if (rangestart != -1) {
9279                             if (i <= rangestart + 3)
9280                                 for (; rangestart < i; rangestart++) {
9281                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9282                                     U8 *p;
9283                                     for(p = s; p < e; p++)
9284                                         put_byte(sv, *p);
9285                                 }
9286                             else {
9287                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9288                                 U8 *p;
9289                                 for (p = s; p < e; p++)
9290                                     put_byte(sv, *p);
9291                                 sv_catpvs(sv, "-");
9292                                 e = uvchr_to_utf8(s, i-1);
9293                                 for (p = s; p < e; p++)
9294                                     put_byte(sv, *p);
9295                                 }
9296                                 rangestart = -1;
9297                             }
9298                         }
9299
9300                     sv_catpvs(sv, "..."); /* et cetera */
9301                 }
9302
9303                 {
9304                     char *s = savesvpv(lv);
9305                     char * const origs = s;
9306
9307                     while (*s && *s != '\n')
9308                         s++;
9309
9310                     if (*s == '\n') {
9311                         const char * const t = ++s;
9312
9313                         while (*s) {
9314                             if (*s == '\n')
9315                                 *s = ' ';
9316                             s++;
9317                         }
9318                         if (s[-1] == ' ')
9319                             s[-1] = 0;
9320
9321                         sv_catpv(sv, t);
9322                     }
9323
9324                     Safefree(origs);
9325                 }
9326             }
9327         }
9328
9329         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9330     }
9331     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9332         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9333 #else
9334     PERL_UNUSED_CONTEXT;
9335     PERL_UNUSED_ARG(sv);
9336     PERL_UNUSED_ARG(o);
9337     PERL_UNUSED_ARG(prog);
9338 #endif  /* DEBUGGING */
9339 }
9340
9341 SV *
9342 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9343 {                               /* Assume that RE_INTUIT is set */
9344     dVAR;
9345     struct regexp *const prog = (struct regexp *)SvANY(r);
9346     GET_RE_DEBUG_FLAGS_DECL;
9347
9348     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9349     PERL_UNUSED_CONTEXT;
9350
9351     DEBUG_COMPILE_r(
9352         {
9353             const char * const s = SvPV_nolen_const(prog->check_substr
9354                       ? prog->check_substr : prog->check_utf8);
9355
9356             if (!PL_colorset) reginitcolors();
9357             PerlIO_printf(Perl_debug_log,
9358                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9359                       PL_colors[4],
9360                       prog->check_substr ? "" : "utf8 ",
9361                       PL_colors[5],PL_colors[0],
9362                       s,
9363                       PL_colors[1],
9364                       (strlen(s) > 60 ? "..." : ""));
9365         } );
9366
9367     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9368 }
9369
9370 /*
9371    pregfree()
9372
9373    handles refcounting and freeing the perl core regexp structure. When
9374    it is necessary to actually free the structure the first thing it
9375    does is call the 'free' method of the regexp_engine associated to to
9376    the regexp, allowing the handling of the void *pprivate; member
9377    first. (This routine is not overridable by extensions, which is why
9378    the extensions free is called first.)
9379
9380    See regdupe and regdupe_internal if you change anything here.
9381 */
9382 #ifndef PERL_IN_XSUB_RE
9383 void
9384 Perl_pregfree(pTHX_ REGEXP *r)
9385 {
9386     SvREFCNT_dec(r);
9387 }
9388
9389 void
9390 Perl_pregfree2(pTHX_ REGEXP *rx)
9391 {
9392     dVAR;
9393     struct regexp *const r = (struct regexp *)SvANY(rx);
9394     GET_RE_DEBUG_FLAGS_DECL;
9395
9396     PERL_ARGS_ASSERT_PREGFREE2;
9397
9398     if (r->mother_re) {
9399         ReREFCNT_dec(r->mother_re);
9400     } else {
9401         CALLREGFREE_PVT(rx); /* free the private data */
9402         if (RXp_PAREN_NAMES(r))
9403             SvREFCNT_dec(RXp_PAREN_NAMES(r));
9404     }
9405     if (r->substrs) {
9406         if (r->anchored_substr)
9407             SvREFCNT_dec(r->anchored_substr);
9408         if (r->anchored_utf8)
9409             SvREFCNT_dec(r->anchored_utf8);
9410         if (r->float_substr)
9411             SvREFCNT_dec(r->float_substr);
9412         if (r->float_utf8)
9413             SvREFCNT_dec(r->float_utf8);
9414         Safefree(r->substrs);
9415     }
9416     RX_MATCH_COPY_FREE(rx);
9417 #ifdef PERL_OLD_COPY_ON_WRITE
9418     if (r->saved_copy)
9419         SvREFCNT_dec(r->saved_copy);
9420 #endif
9421     Safefree(r->offs);
9422 }
9423
9424 /*  reg_temp_copy()
9425
9426     This is a hacky workaround to the structural issue of match results
9427     being stored in the regexp structure which is in turn stored in
9428     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9429     could be PL_curpm in multiple contexts, and could require multiple
9430     result sets being associated with the pattern simultaneously, such
9431     as when doing a recursive match with (??{$qr})
9432
9433     The solution is to make a lightweight copy of the regexp structure
9434     when a qr// is returned from the code executed by (??{$qr}) this
9435     lightweight copy doesnt actually own any of its data except for
9436     the starp/end and the actual regexp structure itself.
9437
9438 */
9439
9440
9441 REGEXP *
9442 Perl_reg_temp_copy (pTHX_ REGEXP *rx)
9443 {
9444     REGEXP *ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9445     struct regexp *ret = (struct regexp *)SvANY(ret_x);
9446     struct regexp *const r = (struct regexp *)SvANY(rx);
9447     register const I32 npar = r->nparens+1;
9448
9449     PERL_ARGS_ASSERT_REG_TEMP_COPY;
9450
9451     (void)ReREFCNT_inc(rx);
9452     /* We can take advantage of the existing "copied buffer" mechanism in SVs
9453        by pointing directly at the buffer, but flagging that the allocated
9454        space in the copy is zero. As we've just done a struct copy, it's now
9455        a case of zero-ing that, rather than copying the current length.  */
9456     SvPV_set(ret_x, RX_WRAPPED(rx));
9457     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9458     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9459            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9460     SvLEN_set(ret_x, 0);
9461     Newx(ret->offs, npar, regexp_paren_pair);
9462     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9463     if (r->substrs) {
9464         Newx(ret->substrs, 1, struct reg_substr_data);
9465         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9466
9467         SvREFCNT_inc_void(ret->anchored_substr);
9468         SvREFCNT_inc_void(ret->anchored_utf8);
9469         SvREFCNT_inc_void(ret->float_substr);
9470         SvREFCNT_inc_void(ret->float_utf8);
9471
9472         /* check_substr and check_utf8, if non-NULL, point to either their
9473            anchored or float namesakes, and don't hold a second reference.  */
9474     }
9475     RX_MATCH_COPIED_off(ret_x);
9476 #ifdef PERL_OLD_COPY_ON_WRITE
9477     ret->saved_copy = NULL;
9478 #endif
9479     ret->mother_re = rx;
9480
9481     return ret_x;
9482 }
9483 #endif
9484
9485 /* regfree_internal()
9486
9487    Free the private data in a regexp. This is overloadable by
9488    extensions. Perl takes care of the regexp structure in pregfree(),
9489    this covers the *pprivate pointer which technically perldoesnt
9490    know about, however of course we have to handle the
9491    regexp_internal structure when no extension is in use.
9492
9493    Note this is called before freeing anything in the regexp
9494    structure.
9495  */
9496
9497 void
9498 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9499 {
9500     dVAR;
9501     struct regexp *const r = (struct regexp *)SvANY(rx);
9502     RXi_GET_DECL(r,ri);
9503     GET_RE_DEBUG_FLAGS_DECL;
9504
9505     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9506
9507     DEBUG_COMPILE_r({
9508         if (!PL_colorset)
9509             reginitcolors();
9510         {
9511             SV *dsv= sv_newmortal();
9512             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9513                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9514             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9515                 PL_colors[4],PL_colors[5],s);
9516         }
9517     });
9518 #ifdef RE_TRACK_PATTERN_OFFSETS
9519     if (ri->u.offsets)
9520         Safefree(ri->u.offsets);             /* 20010421 MJD */
9521 #endif
9522     if (ri->data) {
9523         int n = ri->data->count;
9524         PAD* new_comppad = NULL;
9525         PAD* old_comppad;
9526         PADOFFSET refcnt;
9527
9528         while (--n >= 0) {
9529           /* If you add a ->what type here, update the comment in regcomp.h */
9530             switch (ri->data->what[n]) {
9531             case 's':
9532             case 'S':
9533             case 'u':
9534                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9535                 break;
9536             case 'f':
9537                 Safefree(ri->data->data[n]);
9538                 break;
9539             case 'p':
9540                 new_comppad = MUTABLE_AV(ri->data->data[n]);
9541                 break;
9542             case 'o':
9543                 if (new_comppad == NULL)
9544                     Perl_croak(aTHX_ "panic: pregfree comppad");
9545                 PAD_SAVE_LOCAL(old_comppad,
9546                     /* Watch out for global destruction's random ordering. */
9547                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9548                 );
9549                 OP_REFCNT_LOCK;
9550                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9551                 OP_REFCNT_UNLOCK;
9552                 if (!refcnt)
9553                     op_free((OP_4tree*)ri->data->data[n]);
9554
9555                 PAD_RESTORE_LOCAL(old_comppad);
9556                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9557                 new_comppad = NULL;
9558                 break;
9559             case 'n':
9560                 break;
9561             case 'T':
9562                 { /* Aho Corasick add-on structure for a trie node.
9563                      Used in stclass optimization only */
9564                     U32 refcount;
9565                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9566                     OP_REFCNT_LOCK;
9567                     refcount = --aho->refcount;
9568                     OP_REFCNT_UNLOCK;
9569                     if ( !refcount ) {
9570                         PerlMemShared_free(aho->states);
9571                         PerlMemShared_free(aho->fail);
9572                          /* do this last!!!! */
9573                         PerlMemShared_free(ri->data->data[n]);
9574                         PerlMemShared_free(ri->regstclass);
9575                     }
9576                 }
9577                 break;
9578             case 't':
9579                 {
9580                     /* trie structure. */
9581                     U32 refcount;
9582                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9583                     OP_REFCNT_LOCK;
9584                     refcount = --trie->refcount;
9585                     OP_REFCNT_UNLOCK;
9586                     if ( !refcount ) {
9587                         PerlMemShared_free(trie->charmap);
9588                         PerlMemShared_free(trie->states);
9589                         PerlMemShared_free(trie->trans);
9590                         if (trie->bitmap)
9591                             PerlMemShared_free(trie->bitmap);
9592                         if (trie->wordlen)
9593                             PerlMemShared_free(trie->wordlen);
9594                         if (trie->jump)
9595                             PerlMemShared_free(trie->jump);
9596                         if (trie->nextword)
9597                             PerlMemShared_free(trie->nextword);
9598                         /* do this last!!!! */
9599                         PerlMemShared_free(ri->data->data[n]);
9600                     }
9601                 }
9602                 break;
9603             default:
9604                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9605             }
9606         }
9607         Safefree(ri->data->what);
9608         Safefree(ri->data);
9609     }
9610
9611     Safefree(ri);
9612 }
9613
9614 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9615 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9616 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9617 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
9618
9619 /*
9620    re_dup - duplicate a regexp.
9621
9622    This routine is expected to clone a given regexp structure. It is only
9623    compiled under USE_ITHREADS.
9624
9625    After all of the core data stored in struct regexp is duplicated
9626    the regexp_engine.dupe method is used to copy any private data
9627    stored in the *pprivate pointer. This allows extensions to handle
9628    any duplication it needs to do.
9629
9630    See pregfree() and regfree_internal() if you change anything here.
9631 */
9632 #if defined(USE_ITHREADS)
9633 #ifndef PERL_IN_XSUB_RE
9634 void
9635 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9636 {
9637     dVAR;
9638     I32 npar;
9639     const struct regexp *r = (const struct regexp *)SvANY(sstr);
9640     struct regexp *ret = (struct regexp *)SvANY(dstr);
9641
9642     PERL_ARGS_ASSERT_RE_DUP_GUTS;
9643
9644     npar = r->nparens+1;
9645     Newx(ret->offs, npar, regexp_paren_pair);
9646     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9647     if(ret->swap) {
9648         /* no need to copy these */
9649         Newx(ret->swap, npar, regexp_paren_pair);
9650     }
9651
9652     if (ret->substrs) {
9653         /* Do it this way to avoid reading from *r after the StructCopy().
9654            That way, if any of the sv_dup_inc()s dislodge *r from the L1
9655            cache, it doesn't matter.  */
9656         const bool anchored = r->check_substr
9657             ? r->check_substr == r->anchored_substr
9658             : r->check_utf8 == r->anchored_utf8;
9659         Newx(ret->substrs, 1, struct reg_substr_data);
9660         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9661
9662         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9663         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9664         ret->float_substr = sv_dup_inc(ret->float_substr, param);
9665         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9666
9667         /* check_substr and check_utf8, if non-NULL, point to either their
9668            anchored or float namesakes, and don't hold a second reference.  */
9669
9670         if (ret->check_substr) {
9671             if (anchored) {
9672                 assert(r->check_utf8 == r->anchored_utf8);
9673                 ret->check_substr = ret->anchored_substr;
9674                 ret->check_utf8 = ret->anchored_utf8;
9675             } else {
9676                 assert(r->check_substr == r->float_substr);
9677                 assert(r->check_utf8 == r->float_utf8);
9678                 ret->check_substr = ret->float_substr;
9679                 ret->check_utf8 = ret->float_utf8;
9680             }
9681         } else if (ret->check_utf8) {
9682             if (anchored) {
9683                 ret->check_utf8 = ret->anchored_utf8;
9684             } else {
9685                 ret->check_utf8 = ret->float_utf8;
9686             }
9687         }
9688     }
9689
9690     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9691
9692     if (ret->pprivate)
9693         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9694
9695     if (RX_MATCH_COPIED(dstr))
9696         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
9697     else
9698         ret->subbeg = NULL;
9699 #ifdef PERL_OLD_COPY_ON_WRITE
9700     ret->saved_copy = NULL;
9701 #endif
9702
9703     ret->mother_re      = NULL;
9704     ret->gofs = 0;
9705 }
9706 #endif /* PERL_IN_XSUB_RE */
9707
9708 /*
9709    regdupe_internal()
9710
9711    This is the internal complement to regdupe() which is used to copy
9712    the structure pointed to by the *pprivate pointer in the regexp.
9713    This is the core version of the extension overridable cloning hook.
9714    The regexp structure being duplicated will be copied by perl prior
9715    to this and will be provided as the regexp *r argument, however
9716    with the /old/ structures pprivate pointer value. Thus this routine
9717    may override any copying normally done by perl.
9718
9719    It returns a pointer to the new regexp_internal structure.
9720 */
9721
9722 void *
9723 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9724 {
9725     dVAR;
9726     struct regexp *const r = (struct regexp *)SvANY(rx);
9727     regexp_internal *reti;
9728     int len, npar;
9729     RXi_GET_DECL(r,ri);
9730
9731     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9732
9733     npar = r->nparens+1;
9734     len = ProgLen(ri);
9735
9736     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9737     Copy(ri->program, reti->program, len+1, regnode);
9738
9739
9740     reti->regstclass = NULL;
9741
9742     if (ri->data) {
9743         struct reg_data *d;
9744         const int count = ri->data->count;
9745         int i;
9746
9747         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9748                 char, struct reg_data);
9749         Newx(d->what, count, U8);
9750
9751         d->count = count;
9752         for (i = 0; i < count; i++) {
9753             d->what[i] = ri->data->what[i];
9754             switch (d->what[i]) {
9755                 /* legal options are one of: sSfpontTu
9756                    see also regcomp.h and pregfree() */
9757             case 's':
9758             case 'S':
9759             case 'p': /* actually an AV, but the dup function is identical.  */
9760             case 'u': /* actually an HV, but the dup function is identical.  */
9761                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9762                 break;
9763             case 'f':
9764                 /* This is cheating. */
9765                 Newx(d->data[i], 1, struct regnode_charclass_class);
9766                 StructCopy(ri->data->data[i], d->data[i],
9767                             struct regnode_charclass_class);
9768                 reti->regstclass = (regnode*)d->data[i];
9769                 break;
9770             case 'o':
9771                 /* Compiled op trees are readonly and in shared memory,
9772                    and can thus be shared without duplication. */
9773                 OP_REFCNT_LOCK;
9774                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9775                 OP_REFCNT_UNLOCK;
9776                 break;
9777             case 'T':
9778                 /* Trie stclasses are readonly and can thus be shared
9779                  * without duplication. We free the stclass in pregfree
9780                  * when the corresponding reg_ac_data struct is freed.
9781                  */
9782                 reti->regstclass= ri->regstclass;
9783                 /* Fall through */
9784             case 't':
9785                 OP_REFCNT_LOCK;
9786                 ((reg_trie_data*)ri->data->data[i])->refcount++;
9787                 OP_REFCNT_UNLOCK;
9788                 /* Fall through */
9789             case 'n':
9790                 d->data[i] = ri->data->data[i];
9791                 break;
9792             default:
9793                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9794             }
9795         }
9796
9797         reti->data = d;
9798     }
9799     else
9800         reti->data = NULL;
9801
9802     reti->name_list_idx = ri->name_list_idx;
9803
9804 #ifdef RE_TRACK_PATTERN_OFFSETS
9805     if (ri->u.offsets) {
9806         Newx(reti->u.offsets, 2*len+1, U32);
9807         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9808     }
9809 #else
9810     SetProgLen(reti,len);
9811 #endif
9812
9813     return (void*)reti;
9814 }
9815
9816 #endif    /* USE_ITHREADS */
9817
9818 #ifndef PERL_IN_XSUB_RE
9819
9820 /*
9821  - regnext - dig the "next" pointer out of a node
9822  */
9823 regnode *
9824 Perl_regnext(pTHX_ register regnode *p)
9825 {
9826     dVAR;
9827     register I32 offset;
9828
9829     if (!p)
9830         return(NULL);
9831
9832     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9833     if (offset == 0)
9834         return(NULL);
9835
9836     return(p+offset);
9837 }
9838 #endif
9839
9840 STATIC void
9841 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9842 {
9843     va_list args;
9844     STRLEN l1 = strlen(pat1);
9845     STRLEN l2 = strlen(pat2);
9846     char buf[512];
9847     SV *msv;
9848     const char *message;
9849
9850     PERL_ARGS_ASSERT_RE_CROAK2;
9851
9852     if (l1 > 510)
9853         l1 = 510;
9854     if (l1 + l2 > 510)
9855         l2 = 510 - l1;
9856     Copy(pat1, buf, l1 , char);
9857     Copy(pat2, buf + l1, l2 , char);
9858     buf[l1 + l2] = '\n';
9859     buf[l1 + l2 + 1] = '\0';
9860 #ifdef I_STDARG
9861     /* ANSI variant takes additional second argument */
9862     va_start(args, pat2);
9863 #else
9864     va_start(args);
9865 #endif
9866     msv = vmess(buf, &args);
9867     va_end(args);
9868     message = SvPV_const(msv,l1);
9869     if (l1 > 512)
9870         l1 = 512;
9871     Copy(message, buf, l1 , char);
9872     buf[l1-1] = '\0';                   /* Overwrite \n */
9873     Perl_croak(aTHX_ "%s", buf);
9874 }
9875
9876 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
9877
9878 #ifndef PERL_IN_XSUB_RE
9879 void
9880 Perl_save_re_context(pTHX)
9881 {
9882     dVAR;
9883
9884     struct re_save_state *state;
9885
9886     SAVEVPTR(PL_curcop);
9887     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9888
9889     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9890     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9891     SSPUSHINT(SAVEt_RE_STATE);
9892
9893     Copy(&PL_reg_state, state, 1, struct re_save_state);
9894
9895     PL_reg_start_tmp = 0;
9896     PL_reg_start_tmpl = 0;
9897     PL_reg_oldsaved = NULL;
9898     PL_reg_oldsavedlen = 0;
9899     PL_reg_maxiter = 0;
9900     PL_reg_leftiter = 0;
9901     PL_reg_poscache = NULL;
9902     PL_reg_poscache_size = 0;
9903 #ifdef PERL_OLD_COPY_ON_WRITE
9904     PL_nrs = NULL;
9905 #endif
9906
9907     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9908     if (PL_curpm) {
9909         const REGEXP * const rx = PM_GETRE(PL_curpm);
9910         if (rx) {
9911             U32 i;
9912             for (i = 1; i <= RX_NPARENS(rx); i++) {
9913                 char digits[TYPE_CHARS(long)];
9914                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9915                 GV *const *const gvp
9916                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9917
9918                 if (gvp) {
9919                     GV * const gv = *gvp;
9920                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9921                         save_scalar(gv);
9922                 }
9923             }
9924         }
9925     }
9926 }
9927 #endif
9928
9929 static void
9930 clear_re(pTHX_ void *r)
9931 {
9932     dVAR;
9933     ReREFCNT_dec((REGEXP *)r);
9934 }
9935
9936 #ifdef DEBUGGING
9937
9938 STATIC void
9939 S_put_byte(pTHX_ SV *sv, int c)
9940 {
9941     PERL_ARGS_ASSERT_PUT_BYTE;
9942
9943     /* Our definition of isPRINT() ignores locales, so only bytes that are
9944        not part of UTF-8 are considered printable. I assume that the same
9945        holds for UTF-EBCDIC.
9946        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9947        which Wikipedia says:
9948
9949        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9950        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9951        identical, to the ASCII delete (DEL) or rubout control character.
9952        ) So the old condition can be simplified to !isPRINT(c)  */
9953     if (!isPRINT(c))
9954         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9955     else {
9956         const char string = c;
9957         if (c == '-' || c == ']' || c == '\\' || c == '^')
9958             sv_catpvs(sv, "\\");
9959         sv_catpvn(sv, &string, 1);
9960     }
9961 }
9962
9963
9964 #define CLEAR_OPTSTART \
9965     if (optstart) STMT_START { \
9966             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9967             optstart=NULL; \
9968     } STMT_END
9969
9970 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9971
9972 STATIC const regnode *
9973 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9974             const regnode *last, const regnode *plast,
9975             SV* sv, I32 indent, U32 depth)
9976 {
9977     dVAR;
9978     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
9979     register const regnode *next;
9980     const regnode *optstart= NULL;
9981
9982     RXi_GET_DECL(r,ri);
9983     GET_RE_DEBUG_FLAGS_DECL;
9984
9985     PERL_ARGS_ASSERT_DUMPUNTIL;
9986
9987 #ifdef DEBUG_DUMPUNTIL
9988     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9989         last ? last-start : 0,plast ? plast-start : 0);
9990 #endif
9991
9992     if (plast && plast < last)
9993         last= plast;
9994
9995     while (PL_regkind[op] != END && (!last || node < last)) {
9996         /* While that wasn't END last time... */
9997         NODE_ALIGN(node);
9998         op = OP(node);
9999         if (op == CLOSE || op == WHILEM)
10000             indent--;
10001         next = regnext((regnode *)node);
10002
10003         /* Where, what. */
10004         if (OP(node) == OPTIMIZED) {
10005             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10006                 optstart = node;
10007             else
10008                 goto after_print;
10009         } else
10010             CLEAR_OPTSTART;
10011
10012         regprop(r, sv, node);
10013         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10014                       (int)(2*indent + 1), "", SvPVX_const(sv));
10015
10016         if (OP(node) != OPTIMIZED) {
10017             if (next == NULL)           /* Next ptr. */
10018                 PerlIO_printf(Perl_debug_log, " (0)");
10019             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10020                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10021             else
10022                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10023             (void)PerlIO_putc(Perl_debug_log, '\n');
10024         }
10025
10026       after_print:
10027         if (PL_regkind[(U8)op] == BRANCHJ) {
10028             assert(next);
10029             {
10030                 register const regnode *nnode = (OP(next) == LONGJMP
10031                                              ? regnext((regnode *)next)
10032                                              : next);
10033                 if (last && nnode > last)
10034                     nnode = last;
10035                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10036             }
10037         }
10038         else if (PL_regkind[(U8)op] == BRANCH) {
10039             assert(next);
10040             DUMPUNTIL(NEXTOPER(node), next);
10041         }
10042         else if ( PL_regkind[(U8)op]  == TRIE ) {
10043             const regnode *this_trie = node;
10044             const char op = OP(node);
10045             const U32 n = ARG(node);
10046             const reg_ac_data * const ac = op>=AHOCORASICK ?
10047                (reg_ac_data *)ri->data->data[n] :
10048                NULL;
10049             const reg_trie_data * const trie =
10050                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10051 #ifdef DEBUGGING
10052             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10053 #endif
10054             const regnode *nextbranch= NULL;
10055             I32 word_idx;
10056             sv_setpvs(sv, "");
10057             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10058                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10059
10060                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10061                    (int)(2*(indent+3)), "",
10062                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10063                             PL_colors[0], PL_colors[1],
10064                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10065                             PERL_PV_PRETTY_ELLIPSES    |
10066                             PERL_PV_PRETTY_LTGT
10067                             )
10068                             : "???"
10069                 );
10070                 if (trie->jump) {
10071                     U16 dist= trie->jump[word_idx+1];
10072                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10073                                   (UV)((dist ? this_trie + dist : next) - start));
10074                     if (dist) {
10075                         if (!nextbranch)
10076                             nextbranch= this_trie + trie->jump[0];
10077                         DUMPUNTIL(this_trie + dist, nextbranch);
10078                     }
10079                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10080                         nextbranch= regnext((regnode *)nextbranch);
10081                 } else {
10082                     PerlIO_printf(Perl_debug_log, "\n");
10083                 }
10084             }
10085             if (last && next > last)
10086                 node= last;
10087             else
10088                 node= next;
10089         }
10090         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10091             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10092                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10093         }
10094         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10095             assert(next);
10096             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10097         }
10098         else if ( op == PLUS || op == STAR) {
10099             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10100         }
10101         else if (op == ANYOF) {
10102             /* arglen 1 + class block */
10103             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10104                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10105             node = NEXTOPER(node);
10106         }
10107         else if (PL_regkind[(U8)op] == EXACT) {
10108             /* Literal string, where present. */
10109             node += NODE_SZ_STR(node) - 1;
10110             node = NEXTOPER(node);
10111         }
10112         else {
10113             node = NEXTOPER(node);
10114             node += regarglen[(U8)op];
10115         }
10116         if (op == CURLYX || op == OPEN)
10117             indent++;
10118     }
10119     CLEAR_OPTSTART;
10120 #ifdef DEBUG_DUMPUNTIL
10121     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10122 #endif
10123     return node;
10124 }
10125
10126 #endif  /* DEBUGGING */
10127
10128 /*
10129  * Local variables:
10130  * c-indentation-style: bsd
10131  * c-basic-offset: 4
10132  * indent-tabs-mode: t
10133  * End:
10134  *
10135  * ex: set ts=8 sts=4 sw=4 noet:
10136  */