src/5013008/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     HV          *paren_names;           /* Paren names */
 138
 139     regnode     **recurse;              /* Recurse regops */
 140     I32         recurse_count;          /* Number of recurse regops */
 141 #if ADD_TO_REGEXEC
 142     char        *starttry;              /* -Dr: where regtry was called. */
 143 #define RExC_starttry   (pRExC_state->starttry)
 144 #endif
 145 #ifdef DEBUGGING
 146     const char  *lastparse;
 147     I32         lastnum;
 148     AV          *paren_name_list;       /* idx -> name */
 149 #define RExC_lastparse  (pRExC_state->lastparse)
 150 #define RExC_lastnum    (pRExC_state->lastnum)
 151 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 152 #endif
 153 } RExC_state_t;
 154
 155 #define RExC_flags      (pRExC_state->flags)
 156 #define RExC_precomp    (pRExC_state->precomp)
 157 #define RExC_rx_sv      (pRExC_state->rx_sv)
 158 #define RExC_rx         (pRExC_state->rx)
 159 #define RExC_rxi        (pRExC_state->rxi)
 160 #define RExC_start      (pRExC_state->start)
 161 #define RExC_end        (pRExC_state->end)
 162 #define RExC_parse      (pRExC_state->parse)
 163 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 164 #ifdef RE_TRACK_PATTERN_OFFSETS
 165 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 166 #endif
 167 #define RExC_emit       (pRExC_state->emit)
 168 #define RExC_emit_start (pRExC_state->emit_start)
 169 #define RExC_emit_bound (pRExC_state->emit_bound)
 170 #define RExC_naughty    (pRExC_state->naughty)
 171 #define RExC_sawback    (pRExC_state->sawback)
 172 #define RExC_seen       (pRExC_state->seen)
 173 #define RExC_size       (pRExC_state->size)
 174 #define RExC_npar       (pRExC_state->npar)
 175 #define RExC_nestroot   (pRExC_state->nestroot)
 176 #define RExC_extralen   (pRExC_state->extralen)
 177 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 178 #define RExC_seen_evals (pRExC_state->seen_evals)
 179 #define RExC_utf8       (pRExC_state->utf8)
 180 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 181 #define RExC_open_parens        (pRExC_state->open_parens)
 182 #define RExC_close_parens       (pRExC_state->close_parens)
 183 #define RExC_opend      (pRExC_state->opend)
 184 #define RExC_paren_names        (pRExC_state->paren_names)
 185 #define RExC_recurse    (pRExC_state->recurse)
 186 #define RExC_recurse_count      (pRExC_state->recurse_count)
 187
 188
 189 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 190 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 191         ((*s) == '{' && regcurly(s)))
 192
 193 #ifdef SPSTART
 194 #undef SPSTART          /* dratted cpp namespace... */
 195 #endif
 196 /*
 197  * Flags to be passed up and down.
 198  */
 199 #define WORST           0       /* Worst case. */
 200 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 201
 202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 203  * character, and if utf8, must be invariant. */
 204 #define SIMPLE          0x02
 205 #define SPSTART         0x04    /* Starts with * or +. */
 206 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 207 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 208
 209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 210
 211 /* whether trie related optimizations are enabled */
 212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 213 #define TRIE_STUDY_OPT
 214 #define FULL_TRIE_STUDY
 215 #define TRIE_STCLASS
 216 #endif
 217
 218
 219
 220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 221 #define PBITVAL(paren) (1 << ((paren) & 7))
 222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 225
 226 /* If not already in utf8, do a longjmp back to the beginning */
 227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 228 #define REQUIRE_UTF8    STMT_START {                                       \
 229                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 230                         } STMT_END
 231
 232 /* About scan_data_t.
 233
 234   During optimisation we recurse through the regexp program performing
 235   various inplace (keyhole style) optimisations. In addition study_chunk
 236   and scan_commit populate this data structure with information about
 237   what strings MUST appear in the pattern. We look for the longest
 238   string that must appear at a fixed location, and we look for the
 239   longest string that may appear at a floating location. So for instance
 240   in the pattern:
 241
 242     /FOO[xX]A.*B[xX]BAR/
 243
 244   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 245   strings (because they follow a .* construct). study_chunk will identify
 246   both FOO and BAR as being the longest fixed and floating strings respectively.
 247
 248   The strings can be composites, for instance
 249
 250      /(f)(o)(o)/
 251
 252   will result in a composite fixed substring 'foo'.
 253
 254   For each string some basic information is maintained:
 255
 256   - offset or min_offset
 257     This is the position the string must appear at, or not before.
 258     It also implicitly (when combined with minlenp) tells us how many
 259     characters must match before the string we are searching for.
 260     Likewise when combined with minlenp and the length of the string it
 261     tells us how many characters must appear after the string we have
 262     found.
 263
 264   - max_offset
 265     Only used for floating strings. This is the rightmost point that
 266     the string can appear at. If set to I32 max it indicates that the
 267     string can occur infinitely far to the right.
 268
 269   - minlenp
 270     A pointer to the minimum length of the pattern that the string
 271     was found inside. This is important as in the case of positive
 272     lookahead or positive lookbehind we can have multiple patterns
 273     involved. Consider
 274
 275     /(?=FOO).*F/
 276
 277     The minimum length of the pattern overall is 3, the minimum length
 278     of the lookahead part is 3, but the minimum length of the part that
 279     will actually match is 1. So 'FOO's minimum length is 3, but the
 280     minimum length for the F is 1. This is important as the minimum length
 281     is used to determine offsets in front of and behind the string being
 282     looked for.  Since strings can be composites this is the length of the
 283     pattern at the time it was commited with a scan_commit. Note that
 284     the length is calculated by study_chunk, so that the minimum lengths
 285     are not known until the full pattern has been compiled, thus the
 286     pointer to the value.
 287
 288   - lookbehind
 289
 290     In the case of lookbehind the string being searched for can be
 291     offset past the start point of the final matching string.
 292     If this value was just blithely removed from the min_offset it would
 293     invalidate some of the calculations for how many chars must match
 294     before or after (as they are derived from min_offset and minlen and
 295     the length of the string being searched for).
 296     When the final pattern is compiled and the data is moved from the
 297     scan_data_t structure into the regexp structure the information
 298     about lookbehind is factored in, with the information that would
 299     have been lost precalculated in the end_shift field for the
 300     associated string.
 301
 302   The fields pos_min and pos_delta are used to store the minimum offset
 303   and the delta to the maximum offset at the current point in the pattern.
 304
 305 */
 306
 307 typedef struct scan_data_t {
 308     /*I32 len_min;      unused */
 309     /*I32 len_delta;    unused */
 310     I32 pos_min;
 311     I32 pos_delta;
 312     SV *last_found;
 313     I32 last_end;           /* min value, <0 unless valid. */
 314     I32 last_start_min;
 315     I32 last_start_max;
 316     SV **longest;           /* Either &l_fixed, or &l_float. */
 317     SV *longest_fixed;      /* longest fixed string found in pattern */
 318     I32 offset_fixed;       /* offset where it starts */
 319     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 320     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 321     SV *longest_float;      /* longest floating string found in pattern */
 322     I32 offset_float_min;   /* earliest point in string it can appear */
 323     I32 offset_float_max;   /* latest point in string it can appear */
 324     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 325     I32 lookbehind_float;   /* is the position of the string modified by LB */
 326     I32 flags;
 327     I32 whilem_c;
 328     I32 *last_closep;
 329     struct regnode_charclass_class *start_class;
 330 } scan_data_t;
 331
 332 /*
 333  * Forward declarations for pregcomp()'s friends.
 334  */
 335
 336 static const scan_data_t zero_scan_data =
 337   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 338
 339 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 340 #define SF_BEFORE_SEOL          0x0001
 341 #define SF_BEFORE_MEOL          0x0002
 342 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 343 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 344
 345 #ifdef NO_UNARY_PLUS
 346 #  define SF_FIX_SHIFT_EOL      (0+2)
 347 #  define SF_FL_SHIFT_EOL               (0+4)
 348 #else
 349 #  define SF_FIX_SHIFT_EOL      (+2)
 350 #  define SF_FL_SHIFT_EOL               (+4)
 351 #endif
 352
 353 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 354 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 355
 356 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 357 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 358 #define SF_IS_INF               0x0040
 359 #define SF_HAS_PAR              0x0080
 360 #define SF_IN_PAR               0x0100
 361 #define SF_HAS_EVAL             0x0200
 362 #define SCF_DO_SUBSTR           0x0400
 363 #define SCF_DO_STCLASS_AND      0x0800
 364 #define SCF_DO_STCLASS_OR       0x1000
 365 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 366 #define SCF_WHILEM_VISITED_POS  0x2000
 367
 368 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 369 #define SCF_SEEN_ACCEPT         0x8000
 370
 371 #define UTF cBOOL(RExC_utf8)
 372 #define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
 373 #define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
 374 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 375
 376 #define OOB_UNICODE             12345678
 377 #define OOB_NAMEDCLASS          -1
 378
 379 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 380 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 381
 382
 383 /* length of regex to show in messages that don't mark a position within */
 384 #define RegexLengthToShowInErrorMessages 127
 385
 386 /*
 387  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 388  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 389  * op/pragma/warn/regcomp.
 390  */
 391 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 392 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 393
 394 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 395
 396 /*
 397  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 398  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 399  * "...".
 400  */
 401 #define _FAIL(code) STMT_START {                                        \
 402     const char *ellipses = "";                                          \
 403     IV len = RExC_end - RExC_precomp;                                   \
 404                                                                         \
 405     if (!SIZE_ONLY)                                                     \
 406         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 407     if (len > RegexLengthToShowInErrorMessages) {                       \
 408         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 409         len = RegexLengthToShowInErrorMessages - 10;                    \
 410         ellipses = "...";                                               \
 411     }                                                                   \
 412     code;                                                               \
 413 } STMT_END
 414
 415 #define FAIL(msg) _FAIL(                            \
 416     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 417             msg, (int)len, RExC_precomp, ellipses))
 418
 419 #define FAIL2(msg,arg) _FAIL(                       \
 420     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 421             arg, (int)len, RExC_precomp, ellipses))
 422
 423 /*
 424  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 425  */
 426 #define Simple_vFAIL(m) STMT_START {                                    \
 427     const IV offset = RExC_parse - RExC_precomp;                        \
 428     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 429             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 430 } STMT_END
 431
 432 /*
 433  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 434  */
 435 #define vFAIL(m) STMT_START {                           \
 436     if (!SIZE_ONLY)                                     \
 437         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 438     Simple_vFAIL(m);                                    \
 439 } STMT_END
 440
 441 /*
 442  * Like Simple_vFAIL(), but accepts two arguments.
 443  */
 444 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 445     const IV offset = RExC_parse - RExC_precomp;                        \
 446     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 447             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 448 } STMT_END
 449
 450 /*
 451  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 452  */
 453 #define vFAIL2(m,a1) STMT_START {                       \
 454     if (!SIZE_ONLY)                                     \
 455         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 456     Simple_vFAIL2(m, a1);                               \
 457 } STMT_END
 458
 459
 460 /*
 461  * Like Simple_vFAIL(), but accepts three arguments.
 462  */
 463 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 464     const IV offset = RExC_parse - RExC_precomp;                \
 465     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 466             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 467 } STMT_END
 468
 469 /*
 470  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 471  */
 472 #define vFAIL3(m,a1,a2) STMT_START {                    \
 473     if (!SIZE_ONLY)                                     \
 474         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 475     Simple_vFAIL3(m, a1, a2);                           \
 476 } STMT_END
 477
 478 /*
 479  * Like Simple_vFAIL(), but accepts four arguments.
 480  */
 481 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 482     const IV offset = RExC_parse - RExC_precomp;                \
 483     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 484             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 485 } STMT_END
 486
 487 #define ckWARNreg(loc,m) STMT_START {                                   \
 488     const IV offset = loc - RExC_precomp;                               \
 489     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 490             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 491 } STMT_END
 492
 493 #define ckWARNregdep(loc,m) STMT_START {                                \
 494     const IV offset = loc - RExC_precomp;                               \
 495     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 496             m REPORT_LOCATION,                                          \
 497             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 498 } STMT_END
 499
 500 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 501     const IV offset = loc - RExC_precomp;                               \
 502     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 503             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 504 } STMT_END
 505
 506 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 507     const IV offset = loc - RExC_precomp;                               \
 508     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 509             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 510 } STMT_END
 511
 512 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 515             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 516 } STMT_END
 517
 518 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 519     const IV offset = loc - RExC_precomp;                               \
 520     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 521             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 522 } STMT_END
 523
 524 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 525     const IV offset = loc - RExC_precomp;                               \
 526     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 527             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 528 } STMT_END
 529
 530 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 531     const IV offset = loc - RExC_precomp;                               \
 532     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 533             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 534 } STMT_END
 535
 536
 537 /* Allow for side effects in s */
 538 #define REGC(c,s) STMT_START {                  \
 539     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 540 } STMT_END
 541
 542 /* Macros for recording node offsets.   20001227 mjd@plover.com
 543  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 544  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 545  * Element 0 holds the number n.
 546  * Position is 1 indexed.
 547  */
 548 #ifndef RE_TRACK_PATTERN_OFFSETS
 549 #define Set_Node_Offset_To_R(node,byte)
 550 #define Set_Node_Offset(node,byte)
 551 #define Set_Cur_Node_Offset
 552 #define Set_Node_Length_To_R(node,len)
 553 #define Set_Node_Length(node,len)
 554 #define Set_Node_Cur_Length(node)
 555 #define Node_Offset(n)
 556 #define Node_Length(n)
 557 #define Set_Node_Offset_Length(node,offset,len)
 558 #define ProgLen(ri) ri->u.proglen
 559 #define SetProgLen(ri,x) ri->u.proglen = x
 560 #else
 561 #define ProgLen(ri) ri->u.offsets[0]
 562 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 563 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 564     if (! SIZE_ONLY) {                                                  \
 565         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 566                     __LINE__, (int)(node), (int)(byte)));               \
 567         if((node) < 0) {                                                \
 568             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 569         } else {                                                        \
 570             RExC_offsets[2*(node)-1] = (byte);                          \
 571         }                                                               \
 572     }                                                                   \
 573 } STMT_END
 574
 575 #define Set_Node_Offset(node,byte) \
 576     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 577 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 578
 579 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 580     if (! SIZE_ONLY) {                                                  \
 581         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 582                 __LINE__, (int)(node), (int)(len)));                    \
 583         if((node) < 0) {                                                \
 584             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 585         } else {                                                        \
 586             RExC_offsets[2*(node)] = (len);                             \
 587         }                                                               \
 588     }                                                                   \
 589 } STMT_END
 590
 591 #define Set_Node_Length(node,len) \
 592     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 593 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 594 #define Set_Node_Cur_Length(node) \
 595     Set_Node_Length(node, RExC_parse - parse_start)
 596
 597 /* Get offsets and lengths */
 598 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 599 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 600
 601 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 602     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 603     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 604 } STMT_END
 605 #endif
 606
 607 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 608 #define EXPERIMENTAL_INPLACESCAN
 609 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 610
 611 #define DEBUG_STUDYDATA(str,data,depth)                              \
 612 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 613     PerlIO_printf(Perl_debug_log,                                    \
 614         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 615         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 616         (int)(depth)*2, "",                                          \
 617         (IV)((data)->pos_min),                                       \
 618         (IV)((data)->pos_delta),                                     \
 619         (UV)((data)->flags),                                         \
 620         (IV)((data)->whilem_c),                                      \
 621         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 622         is_inf ? "INF " : ""                                         \
 623     );                                                               \
 624     if ((data)->last_found)                                          \
 625         PerlIO_printf(Perl_debug_log,                                \
 626             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 627             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 628             SvPVX_const((data)->last_found),                         \
 629             (IV)((data)->last_end),                                  \
 630             (IV)((data)->last_start_min),                            \
 631             (IV)((data)->last_start_max),                            \
 632             ((data)->longest &&                                      \
 633              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 634             SvPVX_const((data)->longest_fixed),                      \
 635             (IV)((data)->offset_fixed),                              \
 636             ((data)->longest &&                                      \
 637              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 638             SvPVX_const((data)->longest_float),                      \
 639             (IV)((data)->offset_float_min),                          \
 640             (IV)((data)->offset_float_max)                           \
 641         );                                                           \
 642     PerlIO_printf(Perl_debug_log,"\n");                              \
 643 });
 644
 645 static void clear_re(pTHX_ void *r);
 646
 647 /* Mark that we cannot extend a found fixed substring at this point.
 648    Update the longest found anchored substring and the longest found
 649    floating substrings if needed. */
 650
 651 STATIC void
 652 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 653 {
 654     const STRLEN l = CHR_SVLEN(data->last_found);
 655     const STRLEN old_l = CHR_SVLEN(*data->longest);
 656     GET_RE_DEBUG_FLAGS_DECL;
 657
 658     PERL_ARGS_ASSERT_SCAN_COMMIT;
 659
 660     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 661         SvSetMagicSV(*data->longest, data->last_found);
 662         if (*data->longest == data->longest_fixed) {
 663             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 664             if (data->flags & SF_BEFORE_EOL)
 665                 data->flags
 666                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 667             else
 668                 data->flags &= ~SF_FIX_BEFORE_EOL;
 669             data->minlen_fixed=minlenp;
 670             data->lookbehind_fixed=0;
 671         }
 672         else { /* *data->longest == data->longest_float */
 673             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 674             data->offset_float_max = (l
 675                                       ? data->last_start_max
 676                                       : data->pos_min + data->pos_delta);
 677             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 678                 data->offset_float_max = I32_MAX;
 679             if (data->flags & SF_BEFORE_EOL)
 680                 data->flags
 681                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 682             else
 683                 data->flags &= ~SF_FL_BEFORE_EOL;
 684             data->minlen_float=minlenp;
 685             data->lookbehind_float=0;
 686         }
 687     }
 688     SvCUR_set(data->last_found, 0);
 689     {
 690         SV * const sv = data->last_found;
 691         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 692             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 693             if (mg)
 694                 mg->mg_len = 0;
 695         }
 696     }
 697     data->last_end = -1;
 698     data->flags &= ~SF_BEFORE_EOL;
 699     DEBUG_STUDYDATA("commit: ",data,0);
 700 }
 701
 702 /* Can match anything (initialization) */
 703 STATIC void
 704 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 705 {
 706     PERL_ARGS_ASSERT_CL_ANYTHING;
 707
 708     ANYOF_CLASS_ZERO(cl);
 709     ANYOF_BITMAP_SETALL(cl);
 710     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 711     if (LOC)
 712         cl->flags |= ANYOF_LOCALE;
 713     cl->flags |= ANYOF_FOLD;
 714 }
 715
 716 /* Can match anything (initialization) */
 717 STATIC int
 718 S_cl_is_anything(const struct regnode_charclass_class *cl)
 719 {
 720     int value;
 721
 722     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 723
 724     for (value = 0; value <= ANYOF_MAX; value += 2)
 725         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 726             return 1;
 727     if (!(cl->flags & ANYOF_UNICODE_ALL))
 728         return 0;
 729     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 730         return 0;
 731     return 1;
 732 }
 733
 734 /* Can match anything (initialization) */
 735 STATIC void
 736 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 737 {
 738     PERL_ARGS_ASSERT_CL_INIT;
 739
 740     Zero(cl, 1, struct regnode_charclass_class);
 741     cl->type = ANYOF;
 742     cl_anything(pRExC_state, cl);
 743 }
 744
 745 STATIC void
 746 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 747 {
 748     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 749
 750     Zero(cl, 1, struct regnode_charclass_class);
 751     cl->type = ANYOF;
 752     cl_anything(pRExC_state, cl);
 753     if (LOC)
 754         cl->flags |= ANYOF_LOCALE;
 755 }
 756
 757 /* 'And' a given class with another one.  Can create false positives */
 758 /* We assume that cl is not inverted */
 759 STATIC void
 760 S_cl_and(struct regnode_charclass_class *cl,
 761         const struct regnode_charclass_class *and_with)
 762 {
 763     PERL_ARGS_ASSERT_CL_AND;
 764
 765     assert(and_with->type == ANYOF);
 766
 767     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 768         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 769         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 770         && !(and_with->flags & ANYOF_FOLD)
 771         && !(cl->flags & ANYOF_FOLD)) {
 772         int i;
 773
 774         if (and_with->flags & ANYOF_INVERT)
 775             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 776                 cl->bitmap[i] &= ~and_with->bitmap[i];
 777         else
 778             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 779                 cl->bitmap[i] &= and_with->bitmap[i];
 780     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 781     if (!(and_with->flags & ANYOF_EOS))
 782         cl->flags &= ~ANYOF_EOS;
 783
 784     if (!(and_with->flags & ANYOF_FOLD))
 785         cl->flags &= ~ANYOF_FOLD;
 786
 787     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
 788         !(and_with->flags & ANYOF_INVERT)) {
 789         cl->flags &= ~ANYOF_UNICODE_ALL;
 790         cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
 791                                                            only the one(s)
 792                                                            actually set */
 793         ARG_SET(cl, ARG(and_with));
 794     }
 795     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 796         !(and_with->flags & ANYOF_INVERT))
 797         cl->flags &= ~ANYOF_UNICODE_ALL;
 798     if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
 799         !(and_with->flags & ANYOF_INVERT))
 800         cl->flags &= ~ANYOF_NONBITMAP;
 801 }
 802
 803 /* 'OR' a given class with another one.  Can create false positives */
 804 /* We assume that cl is not inverted */
 805 STATIC void
 806 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 807 {
 808     PERL_ARGS_ASSERT_CL_OR;
 809
 810     if (or_with->flags & ANYOF_INVERT) {
 811         /* We do not use
 812          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 813          *   <= (B1 | !B2) | (CL1 | !CL2)
 814          * which is wasteful if CL2 is small, but we ignore CL2:
 815          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 816          * XXXX Can we handle case-fold?  Unclear:
 817          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 818          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 819          */
 820         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 821              && !(or_with->flags & ANYOF_FOLD)
 822              && !(cl->flags & ANYOF_FOLD) ) {
 823             int i;
 824
 825             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 826                 cl->bitmap[i] |= ~or_with->bitmap[i];
 827         } /* XXXX: logic is complicated otherwise */
 828         else {
 829             cl_anything(pRExC_state, cl);
 830         }
 831     } else {
 832         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 833         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 834              && (!(or_with->flags & ANYOF_FOLD)
 835                  || (cl->flags & ANYOF_FOLD)) ) {
 836             int i;
 837
 838             /* OR char bitmap and class bitmap separately */
 839             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 840                 cl->bitmap[i] |= or_with->bitmap[i];
 841             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 842                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 843                     cl->classflags[i] |= or_with->classflags[i];
 844                 cl->flags |= ANYOF_CLASS;
 845             }
 846         }
 847         else { /* XXXX: logic is complicated, leave it along for a moment. */
 848             cl_anything(pRExC_state, cl);
 849         }
 850     }
 851     if (or_with->flags & ANYOF_EOS)
 852         cl->flags |= ANYOF_EOS;
 853
 854     if (or_with->flags & ANYOF_FOLD)
 855         cl->flags |= ANYOF_FOLD;
 856
 857     /* If both nodes match something outside the bitmap, but what they match
 858      * outside is not the same pointer, and hence not easily compared, give up
 859      * and allow the start class to match everything outside the bitmap */
 860     if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
 861         ARG(cl) != ARG(or_with)) {
 862         cl->flags |= ANYOF_UNICODE_ALL;
 863     }
 864
 865     if (or_with->flags & ANYOF_UNICODE_ALL) {
 866         cl->flags |= ANYOF_UNICODE_ALL;
 867     }
 868 }
 869
 870 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 871 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 872 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 873 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 874
 875
 876 #ifdef DEBUGGING
 877 /*
 878    dump_trie(trie,widecharmap,revcharmap)
 879    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 880    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 881
 882    These routines dump out a trie in a somewhat readable format.
 883    The _interim_ variants are used for debugging the interim
 884    tables that are used to generate the final compressed
 885    representation which is what dump_trie expects.
 886
 887    Part of the reason for their existance is to provide a form
 888    of documentation as to how the different representations function.
 889
 890 */
 891
 892 /*
 893   Dumps the final compressed table form of the trie to Perl_debug_log.
 894   Used for debugging make_trie().
 895 */
 896
 897 STATIC void
 898 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 899             AV *revcharmap, U32 depth)
 900 {
 901     U32 state;
 902     SV *sv=sv_newmortal();
 903     int colwidth= widecharmap ? 6 : 4;
 904     U16 word;
 905     GET_RE_DEBUG_FLAGS_DECL;
 906
 907     PERL_ARGS_ASSERT_DUMP_TRIE;
 908
 909     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 910         (int)depth * 2 + 2,"",
 911         "Match","Base","Ofs" );
 912
 913     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 914         SV ** const tmp = av_fetch( revcharmap, state, 0);
 915         if ( tmp ) {
 916             PerlIO_printf( Perl_debug_log, "%*s",
 917                 colwidth,
 918                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 919                             PL_colors[0], PL_colors[1],
 920                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 921                             PERL_PV_ESCAPE_FIRSTCHAR
 922                 )
 923             );
 924         }
 925     }
 926     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 927         (int)depth * 2 + 2,"");
 928
 929     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 930         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 931     PerlIO_printf( Perl_debug_log, "\n");
 932
 933     for( state = 1 ; state < trie->statecount ; state++ ) {
 934         const U32 base = trie->states[ state ].trans.base;
 935
 936         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 937
 938         if ( trie->states[ state ].wordnum ) {
 939             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 940         } else {
 941             PerlIO_printf( Perl_debug_log, "%6s", "" );
 942         }
 943
 944         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 945
 946         if ( base ) {
 947             U32 ofs = 0;
 948
 949             while( ( base + ofs  < trie->uniquecharcount ) ||
 950                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 951                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 952                     ofs++;
 953
 954             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 955
 956             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 957                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 958                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 959                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 960                 {
 961                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 962                     colwidth,
 963                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 964                 } else {
 965                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 966                 }
 967             }
 968
 969             PerlIO_printf( Perl_debug_log, "]");
 970
 971         }
 972         PerlIO_printf( Perl_debug_log, "\n" );
 973     }
 974     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 975     for (word=1; word <= trie->wordcount; word++) {
 976         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 977             (int)word, (int)(trie->wordinfo[word].prev),
 978             (int)(trie->wordinfo[word].len));
 979     }
 980     PerlIO_printf(Perl_debug_log, "\n" );
 981 }
 982 /*
 983   Dumps a fully constructed but uncompressed trie in list form.
 984   List tries normally only are used for construction when the number of
 985   possible chars (trie->uniquecharcount) is very high.
 986   Used for debugging make_trie().
 987 */
 988 STATIC void
 989 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 990                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
 991                          U32 depth)
 992 {
 993     U32 state;
 994     SV *sv=sv_newmortal();
 995     int colwidth= widecharmap ? 6 : 4;
 996     GET_RE_DEBUG_FLAGS_DECL;
 997
 998     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 999
1000     /* print out the table precompression.  */
1001     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1002         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1003         "------:-----+-----------------\n" );
1004
1005     for( state=1 ; state < next_alloc ; state ++ ) {
1006         U16 charid;
1007
1008         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1009             (int)depth * 2 + 2,"", (UV)state  );
1010         if ( ! trie->states[ state ].wordnum ) {
1011             PerlIO_printf( Perl_debug_log, "%5s| ","");
1012         } else {
1013             PerlIO_printf( Perl_debug_log, "W%4x| ",
1014                 trie->states[ state ].wordnum
1015             );
1016         }
1017         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1018             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1019             if ( tmp ) {
1020                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1021                     colwidth,
1022                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1023                             PL_colors[0], PL_colors[1],
1024                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1025                             PERL_PV_ESCAPE_FIRSTCHAR
1026                     ) ,
1027                     TRIE_LIST_ITEM(state,charid).forid,
1028                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1029                 );
1030                 if (!(charid % 10))
1031                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1032                         (int)((depth * 2) + 14), "");
1033             }
1034         }
1035         PerlIO_printf( Perl_debug_log, "\n");
1036     }
1037 }
1038
1039 /*
1040   Dumps a fully constructed but uncompressed trie in table form.
1041   This is the normal DFA style state transition table, with a few
1042   twists to facilitate compression later.
1043   Used for debugging make_trie().
1044 */
1045 STATIC void
1046 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1047                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1048                           U32 depth)
1049 {
1050     U32 state;
1051     U16 charid;
1052     SV *sv=sv_newmortal();
1053     int colwidth= widecharmap ? 6 : 4;
1054     GET_RE_DEBUG_FLAGS_DECL;
1055
1056     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1057
1058     /*
1059        print out the table precompression so that we can do a visual check
1060        that they are identical.
1061      */
1062
1063     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1064
1065     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1066         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1067         if ( tmp ) {
1068             PerlIO_printf( Perl_debug_log, "%*s",
1069                 colwidth,
1070                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1071                             PL_colors[0], PL_colors[1],
1072                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1073                             PERL_PV_ESCAPE_FIRSTCHAR
1074                 )
1075             );
1076         }
1077     }
1078
1079     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1080
1081     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1082         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1083     }
1084
1085     PerlIO_printf( Perl_debug_log, "\n" );
1086
1087     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1088
1089         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1090             (int)depth * 2 + 2,"",
1091             (UV)TRIE_NODENUM( state ) );
1092
1093         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1094             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1095             if (v)
1096                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1097             else
1098                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1099         }
1100         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1101             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1102         } else {
1103             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1104             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1105         }
1106     }
1107 }
1108
1109 #endif
1110
1111
1112 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1113   startbranch: the first branch in the whole branch sequence
1114   first      : start branch of sequence of branch-exact nodes.
1115                May be the same as startbranch
1116   last       : Thing following the last branch.
1117                May be the same as tail.
1118   tail       : item following the branch sequence
1119   count      : words in the sequence
1120   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1121   depth      : indent depth
1122
1123 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1124
1125 A trie is an N'ary tree where the branches are determined by digital
1126 decomposition of the key. IE, at the root node you look up the 1st character and
1127 follow that branch repeat until you find the end of the branches. Nodes can be
1128 marked as "accepting" meaning they represent a complete word. Eg:
1129
1130   /he|she|his|hers/
1131
1132 would convert into the following structure. Numbers represent states, letters
1133 following numbers represent valid transitions on the letter from that state, if
1134 the number is in square brackets it represents an accepting state, otherwise it
1135 will be in parenthesis.
1136
1137       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1138       |    |
1139       |   (2)
1140       |    |
1141      (1)   +-i->(6)-+-s->[7]
1142       |
1143       +-s->(3)-+-h->(4)-+-e->[5]
1144
1145       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1146
1147 This shows that when matching against the string 'hers' we will begin at state 1
1148 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1149 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1150 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1151 single traverse. We store a mapping from accepting to state to which word was
1152 matched, and then when we have multiple possibilities we try to complete the
1153 rest of the regex in the order in which they occured in the alternation.
1154
1155 The only prior NFA like behaviour that would be changed by the TRIE support is
1156 the silent ignoring of duplicate alternations which are of the form:
1157
1158  / (DUPE|DUPE) X? (?{ ... }) Y /x
1159
1160 Thus EVAL blocks following a trie may be called a different number of times with
1161 and without the optimisation. With the optimisations dupes will be silently
1162 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1163 the following demonstrates:
1164
1165  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1166
1167 which prints out 'word' three times, but
1168
1169  'words'=~/(word|word|word)(?{ print $1 })S/
1170
1171 which doesnt print it out at all. This is due to other optimisations kicking in.
1172
1173 Example of what happens on a structural level:
1174
1175 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1176
1177    1: CURLYM[1] {1,32767}(18)
1178    5:   BRANCH(8)
1179    6:     EXACT <ac>(16)
1180    8:   BRANCH(11)
1181    9:     EXACT <ad>(16)
1182   11:   BRANCH(14)
1183   12:     EXACT <ab>(16)
1184   16:   SUCCEED(0)
1185   17:   NOTHING(18)
1186   18: END(0)
1187
1188 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1189 and should turn into:
1190
1191    1: CURLYM[1] {1,32767}(18)
1192    5:   TRIE(16)
1193         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1194           <ac>
1195           <ad>
1196           <ab>
1197   16:   SUCCEED(0)
1198   17:   NOTHING(18)
1199   18: END(0)
1200
1201 Cases where tail != last would be like /(?foo|bar)baz/:
1202
1203    1: BRANCH(4)
1204    2:   EXACT <foo>(8)
1205    4: BRANCH(7)
1206    5:   EXACT <bar>(8)
1207    7: TAIL(8)
1208    8: EXACT <baz>(10)
1209   10: END(0)
1210
1211 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1212 and would end up looking like:
1213
1214     1: TRIE(8)
1215       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1216         <foo>
1217         <bar>
1218    7: TAIL(8)
1219    8: EXACT <baz>(10)
1220   10: END(0)
1221
1222     d = uvuni_to_utf8_flags(d, uv, 0);
1223
1224 is the recommended Unicode-aware way of saying
1225
1226     *(d++) = uv;
1227 */
1228
1229 #define TRIE_STORE_REVCHAR                                                 \
1230     STMT_START {                                                           \
1231         if (UTF) {                                                         \
1232             SV *zlopp = newSV(2);                                          \
1233             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1234             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1235             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1236             SvPOK_on(zlopp);                                               \
1237             SvUTF8_on(zlopp);                                              \
1238             av_push(revcharmap, zlopp);                                    \
1239         } else {                                                           \
1240             char ooooff = (char)uvc;                                               \
1241             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1242         }                                                                  \
1243         } STMT_END
1244
1245 #define TRIE_READ_CHAR STMT_START {                                           \
1246     wordlen++;                                                                \
1247     if ( UTF ) {                                                              \
1248         if ( folder ) {                                                       \
1249             if ( foldlen > 0 ) {                                              \
1250                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1251                foldlen -= len;                                                \
1252                scan += len;                                                   \
1253                len = 0;                                                       \
1254             } else {                                                          \
1255                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1256                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1257                 foldlen -= UNISKIP( uvc );                                    \
1258                 scan = foldbuf + UNISKIP( uvc );                              \
1259             }                                                                 \
1260         } else {                                                              \
1261             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1262         }                                                                     \
1263     } else {                                                                  \
1264         uvc = (U32)*uc;                                                       \
1265         len = 1;                                                              \
1266     }                                                                         \
1267 } STMT_END
1268
1269
1270
1271 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1272     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1273         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1274         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1275     }                                                           \
1276     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1277     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1278     TRIE_LIST_CUR( state )++;                                   \
1279 } STMT_END
1280
1281 #define TRIE_LIST_NEW(state) STMT_START {                       \
1282     Newxz( trie->states[ state ].trans.list,               \
1283         4, reg_trie_trans_le );                                 \
1284      TRIE_LIST_CUR( state ) = 1;                                \
1285      TRIE_LIST_LEN( state ) = 4;                                \
1286 } STMT_END
1287
1288 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1289     U16 dupe= trie->states[ state ].wordnum;                    \
1290     regnode * const noper_next = regnext( noper );              \
1291                                                                 \
1292     DEBUG_r({                                                   \
1293         /* store the word for dumping */                        \
1294         SV* tmp;                                                \
1295         if (OP(noper) != NOTHING)                               \
1296             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1297         else                                                    \
1298             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1299         av_push( trie_words, tmp );                             \
1300     });                                                         \
1301                                                                 \
1302     curword++;                                                  \
1303     trie->wordinfo[curword].prev   = 0;                         \
1304     trie->wordinfo[curword].len    = wordlen;                   \
1305     trie->wordinfo[curword].accept = state;                     \
1306                                                                 \
1307     if ( noper_next < tail ) {                                  \
1308         if (!trie->jump)                                        \
1309             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1310         trie->jump[curword] = (U16)(noper_next - convert);      \
1311         if (!jumper)                                            \
1312             jumper = noper_next;                                \
1313         if (!nextbranch)                                        \
1314             nextbranch= regnext(cur);                           \
1315     }                                                           \
1316                                                                 \
1317     if ( dupe ) {                                               \
1318         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1319         /* chain, so that when the bits of chain are later    */\
1320         /* linked together, the dups appear in the chain      */\
1321         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1322         trie->wordinfo[dupe].prev = curword;                    \
1323     } else {                                                    \
1324         /* we haven't inserted this word yet.                */ \
1325         trie->states[ state ].wordnum = curword;                \
1326     }                                                           \
1327 } STMT_END
1328
1329
1330 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1331      ( ( base + charid >=  ucharcount                                   \
1332          && base + charid < ubound                                      \
1333          && state == trie->trans[ base - ucharcount + charid ].check    \
1334          && trie->trans[ base - ucharcount + charid ].next )            \
1335            ? trie->trans[ base - ucharcount + charid ].next             \
1336            : ( state==1 ? special : 0 )                                 \
1337       )
1338
1339 #define MADE_TRIE       1
1340 #define MADE_JUMP_TRIE  2
1341 #define MADE_EXACT_TRIE 4
1342
1343 STATIC I32
1344 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1345 {
1346     dVAR;
1347     /* first pass, loop through and scan words */
1348     reg_trie_data *trie;
1349     HV *widecharmap = NULL;
1350     AV *revcharmap = newAV();
1351     regnode *cur;
1352     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1353     STRLEN len = 0;
1354     UV uvc = 0;
1355     U16 curword = 0;
1356     U32 next_alloc = 0;
1357     regnode *jumper = NULL;
1358     regnode *nextbranch = NULL;
1359     regnode *convert = NULL;
1360     U32 *prev_states; /* temp array mapping each state to previous one */
1361     /* we just use folder as a flag in utf8 */
1362     const U8 * folder = NULL;
1363
1364 #ifdef DEBUGGING
1365     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1366     AV *trie_words = NULL;
1367     /* along with revcharmap, this only used during construction but both are
1368      * useful during debugging so we store them in the struct when debugging.
1369      */
1370 #else
1371     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1372     STRLEN trie_charcount=0;
1373 #endif
1374     SV *re_trie_maxbuff;
1375     GET_RE_DEBUG_FLAGS_DECL;
1376
1377     PERL_ARGS_ASSERT_MAKE_TRIE;
1378 #ifndef DEBUGGING
1379     PERL_UNUSED_ARG(depth);
1380 #endif
1381
1382     switch (flags) {
1383         case EXACTFU: folder = PL_fold_latin1; break;
1384         case EXACTF:  folder = PL_fold; break;
1385         case EXACTFL: folder = PL_fold_locale; break;
1386     }
1387
1388     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1389     trie->refcount = 1;
1390     trie->startstate = 1;
1391     trie->wordcount = word_count;
1392     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1393     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1394     if (!(UTF && folder))
1395         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1396     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1397                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1398
1399     DEBUG_r({
1400         trie_words = newAV();
1401     });
1402
1403     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1404     if (!SvIOK(re_trie_maxbuff)) {
1405         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1406     }
1407     DEBUG_OPTIMISE_r({
1408                 PerlIO_printf( Perl_debug_log,
1409                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1410                   (int)depth * 2 + 2, "",
1411                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1412                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1413                   (int)depth);
1414     });
1415
1416    /* Find the node we are going to overwrite */
1417     if ( first == startbranch && OP( last ) != BRANCH ) {
1418         /* whole branch chain */
1419         convert = first;
1420     } else {
1421         /* branch sub-chain */
1422         convert = NEXTOPER( first );
1423     }
1424
1425     /*  -- First loop and Setup --
1426
1427        We first traverse the branches and scan each word to determine if it
1428        contains widechars, and how many unique chars there are, this is
1429        important as we have to build a table with at least as many columns as we
1430        have unique chars.
1431
1432        We use an array of integers to represent the character codes 0..255
1433        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1434        native representation of the character value as the key and IV's for the
1435        coded index.
1436
1437        *TODO* If we keep track of how many times each character is used we can
1438        remap the columns so that the table compression later on is more
1439        efficient in terms of memory by ensuring the most common value is in the
1440        middle and the least common are on the outside.  IMO this would be better
1441        than a most to least common mapping as theres a decent chance the most
1442        common letter will share a node with the least common, meaning the node
1443        will not be compressable. With a middle is most common approach the worst
1444        case is when we have the least common nodes twice.
1445
1446      */
1447
1448     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1449         regnode * const noper = NEXTOPER( cur );
1450         const U8 *uc = (U8*)STRING( noper );
1451         const U8 * const e  = uc + STR_LEN( noper );
1452         STRLEN foldlen = 0;
1453         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1454         const U8 *scan = (U8*)NULL;
1455         U32 wordlen      = 0;         /* required init */
1456         STRLEN chars = 0;
1457         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1458
1459         if (OP(noper) == NOTHING) {
1460             trie->minlen= 0;
1461             continue;
1462         }
1463         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1464             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1465                                           regardless of encoding */
1466
1467         for ( ; uc < e ; uc += len ) {
1468             TRIE_CHARCOUNT(trie)++;
1469             TRIE_READ_CHAR;
1470             chars++;
1471             if ( uvc < 256 ) {
1472                 if ( !trie->charmap[ uvc ] ) {
1473                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1474                     if ( folder )
1475                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1476                     TRIE_STORE_REVCHAR;
1477                 }
1478                 if ( set_bit ) {
1479                     /* store the codepoint in the bitmap, and its folded
1480                      * equivalent. */
1481                     TRIE_BITMAP_SET(trie,uvc);
1482
1483                     /* store the folded codepoint */
1484                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1485
1486                     if ( !UTF ) {
1487                         /* store first byte of utf8 representation of
1488                            variant codepoints */
1489                         if (! UNI_IS_INVARIANT(uvc)) {
1490                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1491                         }
1492                     }
1493                     set_bit = 0; /* We've done our bit :-) */
1494                 }
1495             } else {
1496                 SV** svpp;
1497                 if ( !widecharmap )
1498                     widecharmap = newHV();
1499
1500                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1501
1502                 if ( !svpp )
1503                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1504
1505                 if ( !SvTRUE( *svpp ) ) {
1506                     sv_setiv( *svpp, ++trie->uniquecharcount );
1507                     TRIE_STORE_REVCHAR;
1508                 }
1509             }
1510         }
1511         if( cur == first ) {
1512             trie->minlen=chars;
1513             trie->maxlen=chars;
1514         } else if (chars < trie->minlen) {
1515             trie->minlen=chars;
1516         } else if (chars > trie->maxlen) {
1517             trie->maxlen=chars;
1518         }
1519
1520     } /* end first pass */
1521     DEBUG_TRIE_COMPILE_r(
1522         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1523                 (int)depth * 2 + 2,"",
1524                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1525                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1526                 (int)trie->minlen, (int)trie->maxlen )
1527     );
1528
1529     /*
1530         We now know what we are dealing with in terms of unique chars and
1531         string sizes so we can calculate how much memory a naive
1532         representation using a flat table  will take. If it's over a reasonable
1533         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1534         conservative but potentially much slower representation using an array
1535         of lists.
1536
1537         At the end we convert both representations into the same compressed
1538         form that will be used in regexec.c for matching with. The latter
1539         is a form that cannot be used to construct with but has memory
1540         properties similar to the list form and access properties similar
1541         to the table form making it both suitable for fast searches and
1542         small enough that its feasable to store for the duration of a program.
1543
1544         See the comment in the code where the compressed table is produced
1545         inplace from the flat tabe representation for an explanation of how
1546         the compression works.
1547
1548     */
1549
1550
1551     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1552     prev_states[1] = 0;
1553
1554     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1555         /*
1556             Second Pass -- Array Of Lists Representation
1557
1558             Each state will be represented by a list of charid:state records
1559             (reg_trie_trans_le) the first such element holds the CUR and LEN
1560             points of the allocated array. (See defines above).
1561
1562             We build the initial structure using the lists, and then convert
1563             it into the compressed table form which allows faster lookups
1564             (but cant be modified once converted).
1565         */
1566
1567         STRLEN transcount = 1;
1568
1569         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1570             "%*sCompiling trie using list compiler\n",
1571             (int)depth * 2 + 2, ""));
1572
1573         trie->states = (reg_trie_state *)
1574             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1575                                   sizeof(reg_trie_state) );
1576         TRIE_LIST_NEW(1);
1577         next_alloc = 2;
1578
1579         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1580
1581             regnode * const noper = NEXTOPER( cur );
1582             U8 *uc           = (U8*)STRING( noper );
1583             const U8 * const e = uc + STR_LEN( noper );
1584             U32 state        = 1;         /* required init */
1585             U16 charid       = 0;         /* sanity init */
1586             U8 *scan         = (U8*)NULL; /* sanity init */
1587             STRLEN foldlen   = 0;         /* required init */
1588             U32 wordlen      = 0;         /* required init */
1589             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1590
1591             if (OP(noper) != NOTHING) {
1592                 for ( ; uc < e ; uc += len ) {
1593
1594                     TRIE_READ_CHAR;
1595
1596                     if ( uvc < 256 ) {
1597                         charid = trie->charmap[ uvc ];
1598                     } else {
1599                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1600                         if ( !svpp ) {
1601                             charid = 0;
1602                         } else {
1603                             charid=(U16)SvIV( *svpp );
1604                         }
1605                     }
1606                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1607                     if ( charid ) {
1608
1609                         U16 check;
1610                         U32 newstate = 0;
1611
1612                         charid--;
1613                         if ( !trie->states[ state ].trans.list ) {
1614                             TRIE_LIST_NEW( state );
1615                         }
1616                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1617                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1618                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1619                                 break;
1620                             }
1621                         }
1622                         if ( ! newstate ) {
1623                             newstate = next_alloc++;
1624                             prev_states[newstate] = state;
1625                             TRIE_LIST_PUSH( state, charid, newstate );
1626                             transcount++;
1627                         }
1628                         state = newstate;
1629                     } else {
1630                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1631                     }
1632                 }
1633             }
1634             TRIE_HANDLE_WORD(state);
1635
1636         } /* end second pass */
1637
1638         /* next alloc is the NEXT state to be allocated */
1639         trie->statecount = next_alloc;
1640         trie->states = (reg_trie_state *)
1641             PerlMemShared_realloc( trie->states,
1642                                    next_alloc
1643                                    * sizeof(reg_trie_state) );
1644
1645         /* and now dump it out before we compress it */
1646         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1647                                                          revcharmap, next_alloc,
1648                                                          depth+1)
1649         );
1650
1651         trie->trans = (reg_trie_trans *)
1652             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1653         {
1654             U32 state;
1655             U32 tp = 0;
1656             U32 zp = 0;
1657
1658
1659             for( state=1 ; state < next_alloc ; state ++ ) {
1660                 U32 base=0;
1661
1662                 /*
1663                 DEBUG_TRIE_COMPILE_MORE_r(
1664                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1665                 );
1666                 */
1667
1668                 if (trie->states[state].trans.list) {
1669                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1670                     U16 maxid=minid;
1671                     U16 idx;
1672
1673                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1674                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1675                         if ( forid < minid ) {
1676                             minid=forid;
1677                         } else if ( forid > maxid ) {
1678                             maxid=forid;
1679                         }
1680                     }
1681                     if ( transcount < tp + maxid - minid + 1) {
1682                         transcount *= 2;
1683                         trie->trans = (reg_trie_trans *)
1684                             PerlMemShared_realloc( trie->trans,
1685                                                      transcount
1686                                                      * sizeof(reg_trie_trans) );
1687                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1688                     }
1689                     base = trie->uniquecharcount + tp - minid;
1690                     if ( maxid == minid ) {
1691                         U32 set = 0;
1692                         for ( ; zp < tp ; zp++ ) {
1693                             if ( ! trie->trans[ zp ].next ) {
1694                                 base = trie->uniquecharcount + zp - minid;
1695                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1696                                 trie->trans[ zp ].check = state;
1697                                 set = 1;
1698                                 break;
1699                             }
1700                         }
1701                         if ( !set ) {
1702                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1703                             trie->trans[ tp ].check = state;
1704                             tp++;
1705                             zp = tp;
1706                         }
1707                     } else {
1708                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1709                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1710                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1711                             trie->trans[ tid ].check = state;
1712                         }
1713                         tp += ( maxid - minid + 1 );
1714                     }
1715                     Safefree(trie->states[ state ].trans.list);
1716                 }
1717                 /*
1718                 DEBUG_TRIE_COMPILE_MORE_r(
1719                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1720                 );
1721                 */
1722                 trie->states[ state ].trans.base=base;
1723             }
1724             trie->lasttrans = tp + 1;
1725         }
1726     } else {
1727         /*
1728            Second Pass -- Flat Table Representation.
1729
1730            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1731            We know that we will need Charcount+1 trans at most to store the data
1732            (one row per char at worst case) So we preallocate both structures
1733            assuming worst case.
1734
1735            We then construct the trie using only the .next slots of the entry
1736            structs.
1737
1738            We use the .check field of the first entry of the node temporarily to
1739            make compression both faster and easier by keeping track of how many non
1740            zero fields are in the node.
1741
1742            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1743            transition.
1744
1745            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1746            number representing the first entry of the node, and state as a
1747            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1748            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1749            are 2 entrys per node. eg:
1750
1751              A B       A B
1752           1. 2 4    1. 3 7
1753           2. 0 3    3. 0 5
1754           3. 0 0    5. 0 0
1755           4. 0 0    7. 0 0
1756
1757            The table is internally in the right hand, idx form. However as we also
1758            have to deal with the states array which is indexed by nodenum we have to
1759            use TRIE_NODENUM() to convert.
1760
1761         */
1762         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1763             "%*sCompiling trie using table compiler\n",
1764             (int)depth * 2 + 2, ""));
1765
1766         trie->trans = (reg_trie_trans *)
1767             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1768                                   * trie->uniquecharcount + 1,
1769                                   sizeof(reg_trie_trans) );
1770         trie->states = (reg_trie_state *)
1771             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1772                                   sizeof(reg_trie_state) );
1773         next_alloc = trie->uniquecharcount + 1;
1774
1775
1776         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1777
1778             regnode * const noper   = NEXTOPER( cur );
1779             const U8 *uc     = (U8*)STRING( noper );
1780             const U8 * const e = uc + STR_LEN( noper );
1781
1782             U32 state        = 1;         /* required init */
1783
1784             U16 charid       = 0;         /* sanity init */
1785             U32 accept_state = 0;         /* sanity init */
1786             U8 *scan         = (U8*)NULL; /* sanity init */
1787
1788             STRLEN foldlen   = 0;         /* required init */
1789             U32 wordlen      = 0;         /* required init */
1790             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1791
1792             if ( OP(noper) != NOTHING ) {
1793                 for ( ; uc < e ; uc += len ) {
1794
1795                     TRIE_READ_CHAR;
1796
1797                     if ( uvc < 256 ) {
1798                         charid = trie->charmap[ uvc ];
1799                     } else {
1800                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1801                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1802                     }
1803                     if ( charid ) {
1804                         charid--;
1805                         if ( !trie->trans[ state + charid ].next ) {
1806                             trie->trans[ state + charid ].next = next_alloc;
1807                             trie->trans[ state ].check++;
1808                             prev_states[TRIE_NODENUM(next_alloc)]
1809                                     = TRIE_NODENUM(state);
1810                             next_alloc += trie->uniquecharcount;
1811                         }
1812                         state = trie->trans[ state + charid ].next;
1813                     } else {
1814                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1815                     }
1816                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1817                 }
1818             }
1819             accept_state = TRIE_NODENUM( state );
1820             TRIE_HANDLE_WORD(accept_state);
1821
1822         } /* end second pass */
1823
1824         /* and now dump it out before we compress it */
1825         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1826                                                           revcharmap,
1827                                                           next_alloc, depth+1));
1828
1829         {
1830         /*
1831            * Inplace compress the table.*
1832
1833            For sparse data sets the table constructed by the trie algorithm will
1834            be mostly 0/FAIL transitions or to put it another way mostly empty.
1835            (Note that leaf nodes will not contain any transitions.)
1836
1837            This algorithm compresses the tables by eliminating most such
1838            transitions, at the cost of a modest bit of extra work during lookup:
1839
1840            - Each states[] entry contains a .base field which indicates the
1841            index in the state[] array wheres its transition data is stored.
1842
1843            - If .base is 0 there are no valid transitions from that node.
1844
1845            - If .base is nonzero then charid is added to it to find an entry in
1846            the trans array.
1847
1848            -If trans[states[state].base+charid].check!=state then the
1849            transition is taken to be a 0/Fail transition. Thus if there are fail
1850            transitions at the front of the node then the .base offset will point
1851            somewhere inside the previous nodes data (or maybe even into a node
1852            even earlier), but the .check field determines if the transition is
1853            valid.
1854
1855            XXX - wrong maybe?
1856            The following process inplace converts the table to the compressed
1857            table: We first do not compress the root node 1,and mark all its
1858            .check pointers as 1 and set its .base pointer as 1 as well. This
1859            allows us to do a DFA construction from the compressed table later,
1860            and ensures that any .base pointers we calculate later are greater
1861            than 0.
1862
1863            - We set 'pos' to indicate the first entry of the second node.
1864
1865            - We then iterate over the columns of the node, finding the first and
1866            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1867            and set the .check pointers accordingly, and advance pos
1868            appropriately and repreat for the next node. Note that when we copy
1869            the next pointers we have to convert them from the original
1870            NODEIDX form to NODENUM form as the former is not valid post
1871            compression.
1872
1873            - If a node has no transitions used we mark its base as 0 and do not
1874            advance the pos pointer.
1875
1876            - If a node only has one transition we use a second pointer into the
1877            structure to fill in allocated fail transitions from other states.
1878            This pointer is independent of the main pointer and scans forward
1879            looking for null transitions that are allocated to a state. When it
1880            finds one it writes the single transition into the "hole".  If the
1881            pointer doesnt find one the single transition is appended as normal.
1882
1883            - Once compressed we can Renew/realloc the structures to release the
1884            excess space.
1885
1886            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1887            specifically Fig 3.47 and the associated pseudocode.
1888
1889            demq
1890         */
1891         const U32 laststate = TRIE_NODENUM( next_alloc );
1892         U32 state, charid;
1893         U32 pos = 0, zp=0;
1894         trie->statecount = laststate;
1895
1896         for ( state = 1 ; state < laststate ; state++ ) {
1897             U8 flag = 0;
1898             const U32 stateidx = TRIE_NODEIDX( state );
1899             const U32 o_used = trie->trans[ stateidx ].check;
1900             U32 used = trie->trans[ stateidx ].check;
1901             trie->trans[ stateidx ].check = 0;
1902
1903             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1904                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1905                     if ( trie->trans[ stateidx + charid ].next ) {
1906                         if (o_used == 1) {
1907                             for ( ; zp < pos ; zp++ ) {
1908                                 if ( ! trie->trans[ zp ].next ) {
1909                                     break;
1910                                 }
1911                             }
1912                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1913                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1914                             trie->trans[ zp ].check = state;
1915                             if ( ++zp > pos ) pos = zp;
1916                             break;
1917                         }
1918                         used--;
1919                     }
1920                     if ( !flag ) {
1921                         flag = 1;
1922                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1923                     }
1924                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1925                     trie->trans[ pos ].check = state;
1926                     pos++;
1927                 }
1928             }
1929         }
1930         trie->lasttrans = pos + 1;
1931         trie->states = (reg_trie_state *)
1932             PerlMemShared_realloc( trie->states, laststate
1933                                    * sizeof(reg_trie_state) );
1934         DEBUG_TRIE_COMPILE_MORE_r(
1935                 PerlIO_printf( Perl_debug_log,
1936                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1937                     (int)depth * 2 + 2,"",
1938                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1939                     (IV)next_alloc,
1940                     (IV)pos,
1941                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1942             );
1943
1944         } /* end table compress */
1945     }
1946     DEBUG_TRIE_COMPILE_MORE_r(
1947             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1948                 (int)depth * 2 + 2, "",
1949                 (UV)trie->statecount,
1950                 (UV)trie->lasttrans)
1951     );
1952     /* resize the trans array to remove unused space */
1953     trie->trans = (reg_trie_trans *)
1954         PerlMemShared_realloc( trie->trans, trie->lasttrans
1955                                * sizeof(reg_trie_trans) );
1956
1957     {   /* Modify the program and insert the new TRIE node */
1958         U8 nodetype =(U8)(flags & 0xFF);
1959         char *str=NULL;
1960
1961 #ifdef DEBUGGING
1962         regnode *optimize = NULL;
1963 #ifdef RE_TRACK_PATTERN_OFFSETS
1964
1965         U32 mjd_offset = 0;
1966         U32 mjd_nodelen = 0;
1967 #endif /* RE_TRACK_PATTERN_OFFSETS */
1968 #endif /* DEBUGGING */
1969         /*
1970            This means we convert either the first branch or the first Exact,
1971            depending on whether the thing following (in 'last') is a branch
1972            or not and whther first is the startbranch (ie is it a sub part of
1973            the alternation or is it the whole thing.)
1974            Assuming its a sub part we convert the EXACT otherwise we convert
1975            the whole branch sequence, including the first.
1976          */
1977         /* Find the node we are going to overwrite */
1978         if ( first != startbranch || OP( last ) == BRANCH ) {
1979             /* branch sub-chain */
1980             NEXT_OFF( first ) = (U16)(last - first);
1981 #ifdef RE_TRACK_PATTERN_OFFSETS
1982             DEBUG_r({
1983                 mjd_offset= Node_Offset((convert));
1984                 mjd_nodelen= Node_Length((convert));
1985             });
1986 #endif
1987             /* whole branch chain */
1988         }
1989 #ifdef RE_TRACK_PATTERN_OFFSETS
1990         else {
1991             DEBUG_r({
1992                 const  regnode *nop = NEXTOPER( convert );
1993                 mjd_offset= Node_Offset((nop));
1994                 mjd_nodelen= Node_Length((nop));
1995             });
1996         }
1997         DEBUG_OPTIMISE_r(
1998             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1999                 (int)depth * 2 + 2, "",
2000                 (UV)mjd_offset, (UV)mjd_nodelen)
2001         );
2002 #endif
2003         /* But first we check to see if there is a common prefix we can
2004            split out as an EXACT and put in front of the TRIE node.  */
2005         trie->startstate= 1;
2006         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2007             U32 state;
2008             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2009                 U32 ofs = 0;
2010                 I32 idx = -1;
2011                 U32 count = 0;
2012                 const U32 base = trie->states[ state ].trans.base;
2013
2014                 if ( trie->states[state].wordnum )
2015                         count = 1;
2016
2017                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2018                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2019                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2020                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2021                     {
2022                         if ( ++count > 1 ) {
2023                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2024                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2025                             if ( state == 1 ) break;
2026                             if ( count == 2 ) {
2027                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2028                                 DEBUG_OPTIMISE_r(
2029                                     PerlIO_printf(Perl_debug_log,
2030                                         "%*sNew Start State=%"UVuf" Class: [",
2031                                         (int)depth * 2 + 2, "",
2032                                         (UV)state));
2033                                 if (idx >= 0) {
2034                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2035                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2036
2037                                     TRIE_BITMAP_SET(trie,*ch);
2038                                     if ( folder )
2039                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2040                                     DEBUG_OPTIMISE_r(
2041                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2042                                     );
2043                                 }
2044                             }
2045                             TRIE_BITMAP_SET(trie,*ch);
2046                             if ( folder )
2047                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2048                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2049                         }
2050                         idx = ofs;
2051                     }
2052                 }
2053                 if ( count == 1 ) {
2054                     SV **tmp = av_fetch( revcharmap, idx, 0);
2055                     STRLEN len;
2056                     char *ch = SvPV( *tmp, len );
2057                     DEBUG_OPTIMISE_r({
2058                         SV *sv=sv_newmortal();
2059                         PerlIO_printf( Perl_debug_log,
2060                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2061                             (int)depth * 2 + 2, "",
2062                             (UV)state, (UV)idx,
2063                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2064                                 PL_colors[0], PL_colors[1],
2065                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2066                                 PERL_PV_ESCAPE_FIRSTCHAR
2067                             )
2068                         );
2069                     });
2070                     if ( state==1 ) {
2071                         OP( convert ) = nodetype;
2072                         str=STRING(convert);
2073                         STR_LEN(convert)=0;
2074                     }
2075                     STR_LEN(convert) += len;
2076                     while (len--)
2077                         *str++ = *ch++;
2078                 } else {
2079 #ifdef DEBUGGING
2080                     if (state>1)
2081                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2082 #endif
2083                     break;
2084                 }
2085             }
2086             trie->prefixlen = (state-1);
2087             if (str) {
2088                 regnode *n = convert+NODE_SZ_STR(convert);
2089                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2090                 trie->startstate = state;
2091                 trie->minlen -= (state - 1);
2092                 trie->maxlen -= (state - 1);
2093 #ifdef DEBUGGING
2094                /* At least the UNICOS C compiler choked on this
2095                 * being argument to DEBUG_r(), so let's just have
2096                 * it right here. */
2097                if (
2098 #ifdef PERL_EXT_RE_BUILD
2099                    1
2100 #else
2101                    DEBUG_r_TEST
2102 #endif
2103                    ) {
2104                    regnode *fix = convert;
2105                    U32 word = trie->wordcount;
2106                    mjd_nodelen++;
2107                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2108                    while( ++fix < n ) {
2109                        Set_Node_Offset_Length(fix, 0, 0);
2110                    }
2111                    while (word--) {
2112                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2113                        if (tmp) {
2114                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2115                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2116                            else
2117                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2118                        }
2119                    }
2120                }
2121 #endif
2122                 if (trie->maxlen) {
2123                     convert = n;
2124                 } else {
2125                     NEXT_OFF(convert) = (U16)(tail - convert);
2126                     DEBUG_r(optimize= n);
2127                 }
2128             }
2129         }
2130         if (!jumper)
2131             jumper = last;
2132         if ( trie->maxlen ) {
2133             NEXT_OFF( convert ) = (U16)(tail - convert);
2134             ARG_SET( convert, data_slot );
2135             /* Store the offset to the first unabsorbed branch in
2136                jump[0], which is otherwise unused by the jump logic.
2137                We use this when dumping a trie and during optimisation. */
2138             if (trie->jump)
2139                 trie->jump[0] = (U16)(nextbranch - convert);
2140
2141             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2142              *   and there is a bitmap
2143              *   and the first "jump target" node we found leaves enough room
2144              * then convert the TRIE node into a TRIEC node, with the bitmap
2145              * embedded inline in the opcode - this is hypothetically faster.
2146              */
2147             if ( !trie->states[trie->startstate].wordnum
2148                  && trie->bitmap
2149                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2150             {
2151                 OP( convert ) = TRIEC;
2152                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2153                 PerlMemShared_free(trie->bitmap);
2154                 trie->bitmap= NULL;
2155             } else
2156                 OP( convert ) = TRIE;
2157
2158             /* store the type in the flags */
2159             convert->flags = nodetype;
2160             DEBUG_r({
2161             optimize = convert
2162                       + NODE_STEP_REGNODE
2163                       + regarglen[ OP( convert ) ];
2164             });
2165             /* XXX We really should free up the resource in trie now,
2166                    as we won't use them - (which resources?) dmq */
2167         }
2168         /* needed for dumping*/
2169         DEBUG_r(if (optimize) {
2170             regnode *opt = convert;
2171
2172             while ( ++opt < optimize) {
2173                 Set_Node_Offset_Length(opt,0,0);
2174             }
2175             /*
2176                 Try to clean up some of the debris left after the
2177                 optimisation.
2178              */
2179             while( optimize < jumper ) {
2180                 mjd_nodelen += Node_Length((optimize));
2181                 OP( optimize ) = OPTIMIZED;
2182                 Set_Node_Offset_Length(optimize,0,0);
2183                 optimize++;
2184             }
2185             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2186         });
2187     } /* end node insert */
2188
2189     /*  Finish populating the prev field of the wordinfo array.  Walk back
2190      *  from each accept state until we find another accept state, and if
2191      *  so, point the first word's .prev field at the second word. If the
2192      *  second already has a .prev field set, stop now. This will be the
2193      *  case either if we've already processed that word's accept state,
2194      *  or that state had multiple words, and the overspill words were
2195      *  already linked up earlier.
2196      */
2197     {
2198         U16 word;
2199         U32 state;
2200         U16 prev;
2201
2202         for (word=1; word <= trie->wordcount; word++) {
2203             prev = 0;
2204             if (trie->wordinfo[word].prev)
2205                 continue;
2206             state = trie->wordinfo[word].accept;
2207             while (state) {
2208                 state = prev_states[state];
2209                 if (!state)
2210                     break;
2211                 prev = trie->states[state].wordnum;
2212                 if (prev)
2213                     break;
2214             }
2215             trie->wordinfo[word].prev = prev;
2216         }
2217         Safefree(prev_states);
2218     }
2219
2220
2221     /* and now dump out the compressed format */
2222     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2223
2224     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2225 #ifdef DEBUGGING
2226     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2227     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2228 #else
2229     SvREFCNT_dec(revcharmap);
2230 #endif
2231     return trie->jump
2232            ? MADE_JUMP_TRIE
2233            : trie->startstate>1
2234              ? MADE_EXACT_TRIE
2235              : MADE_TRIE;
2236 }
2237
2238 STATIC void
2239 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2240 {
2241 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2242
2243    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2244    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2245    ISBN 0-201-10088-6
2246
2247    We find the fail state for each state in the trie, this state is the longest proper
2248    suffix of the current state's 'word' that is also a proper prefix of another word in our
2249    trie. State 1 represents the word '' and is thus the default fail state. This allows
2250    the DFA not to have to restart after its tried and failed a word at a given point, it
2251    simply continues as though it had been matching the other word in the first place.
2252    Consider
2253       'abcdgu'=~/abcdefg|cdgu/
2254    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2255    fail, which would bring us to the state representing 'd' in the second word where we would
2256    try 'g' and succeed, proceeding to match 'cdgu'.
2257  */
2258  /* add a fail transition */
2259     const U32 trie_offset = ARG(source);
2260     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2261     U32 *q;
2262     const U32 ucharcount = trie->uniquecharcount;
2263     const U32 numstates = trie->statecount;
2264     const U32 ubound = trie->lasttrans + ucharcount;
2265     U32 q_read = 0;
2266     U32 q_write = 0;
2267     U32 charid;
2268     U32 base = trie->states[ 1 ].trans.base;
2269     U32 *fail;
2270     reg_ac_data *aho;
2271     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2272     GET_RE_DEBUG_FLAGS_DECL;
2273
2274     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2275 #ifndef DEBUGGING
2276     PERL_UNUSED_ARG(depth);
2277 #endif
2278
2279
2280     ARG_SET( stclass, data_slot );
2281     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2282     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2283     aho->trie=trie_offset;
2284     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2285     Copy( trie->states, aho->states, numstates, reg_trie_state );
2286     Newxz( q, numstates, U32);
2287     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2288     aho->refcount = 1;
2289     fail = aho->fail;
2290     /* initialize fail[0..1] to be 1 so that we always have
2291        a valid final fail state */
2292     fail[ 0 ] = fail[ 1 ] = 1;
2293
2294     for ( charid = 0; charid < ucharcount ; charid++ ) {
2295         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2296         if ( newstate ) {
2297             q[ q_write ] = newstate;
2298             /* set to point at the root */
2299             fail[ q[ q_write++ ] ]=1;
2300         }
2301     }
2302     while ( q_read < q_write) {
2303         const U32 cur = q[ q_read++ % numstates ];
2304         base = trie->states[ cur ].trans.base;
2305
2306         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2307             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2308             if (ch_state) {
2309                 U32 fail_state = cur;
2310                 U32 fail_base;
2311                 do {
2312                     fail_state = fail[ fail_state ];
2313                     fail_base = aho->states[ fail_state ].trans.base;
2314                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2315
2316                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2317                 fail[ ch_state ] = fail_state;
2318                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2319                 {
2320                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2321                 }
2322                 q[ q_write++ % numstates] = ch_state;
2323             }
2324         }
2325     }
2326     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2327        when we fail in state 1, this allows us to use the
2328        charclass scan to find a valid start char. This is based on the principle
2329        that theres a good chance the string being searched contains lots of stuff
2330        that cant be a start char.
2331      */
2332     fail[ 0 ] = fail[ 1 ] = 0;
2333     DEBUG_TRIE_COMPILE_r({
2334         PerlIO_printf(Perl_debug_log,
2335                       "%*sStclass Failtable (%"UVuf" states): 0",
2336                       (int)(depth * 2), "", (UV)numstates
2337         );
2338         for( q_read=1; q_read<numstates; q_read++ ) {
2339             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2340         }
2341         PerlIO_printf(Perl_debug_log, "\n");
2342     });
2343     Safefree(q);
2344     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2345 }
2346
2347
2348 /*
2349  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2350  * These need to be revisited when a newer toolchain becomes available.
2351  */
2352 #if defined(__sparc64__) && defined(__GNUC__)
2353 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2354 #       undef  SPARC64_GCC_WORKAROUND
2355 #       define SPARC64_GCC_WORKAROUND 1
2356 #   endif
2357 #endif
2358
2359 #define DEBUG_PEEP(str,scan,depth) \
2360     DEBUG_OPTIMISE_r({if (scan){ \
2361        SV * const mysv=sv_newmortal(); \
2362        regnode *Next = regnext(scan); \
2363        regprop(RExC_rx, mysv, scan); \
2364        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2365        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2366        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2367    }});
2368
2369
2370
2371
2372
2373 #define JOIN_EXACT(scan,min,flags) \
2374     if (PL_regkind[OP(scan)] == EXACT) \
2375         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2376
2377 STATIC U32
2378 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2379     /* Merge several consecutive EXACTish nodes into one. */
2380     regnode *n = regnext(scan);
2381     U32 stringok = 1;
2382     regnode *next = scan + NODE_SZ_STR(scan);
2383     U32 merged = 0;
2384     U32 stopnow = 0;
2385 #ifdef DEBUGGING
2386     regnode *stop = scan;
2387     GET_RE_DEBUG_FLAGS_DECL;
2388 #else
2389     PERL_UNUSED_ARG(depth);
2390 #endif
2391
2392     PERL_ARGS_ASSERT_JOIN_EXACT;
2393 #ifndef EXPERIMENTAL_INPLACESCAN
2394     PERL_UNUSED_ARG(flags);
2395     PERL_UNUSED_ARG(val);
2396 #endif
2397     DEBUG_PEEP("join",scan,depth);
2398
2399     /* Skip NOTHING, merge EXACT*. */
2400     while (n &&
2401            ( PL_regkind[OP(n)] == NOTHING ||
2402              (stringok && (OP(n) == OP(scan))))
2403            && NEXT_OFF(n)
2404            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2405
2406         if (OP(n) == TAIL || n > next)
2407             stringok = 0;
2408         if (PL_regkind[OP(n)] == NOTHING) {
2409             DEBUG_PEEP("skip:",n,depth);
2410             NEXT_OFF(scan) += NEXT_OFF(n);
2411             next = n + NODE_STEP_REGNODE;
2412 #ifdef DEBUGGING
2413             if (stringok)
2414                 stop = n;
2415 #endif
2416             n = regnext(n);
2417         }
2418         else if (stringok) {
2419             const unsigned int oldl = STR_LEN(scan);
2420             regnode * const nnext = regnext(n);
2421
2422             DEBUG_PEEP("merg",n,depth);
2423
2424             merged++;
2425             if (oldl + STR_LEN(n) > U8_MAX)
2426                 break;
2427             NEXT_OFF(scan) += NEXT_OFF(n);
2428             STR_LEN(scan) += STR_LEN(n);
2429             next = n + NODE_SZ_STR(n);
2430             /* Now we can overwrite *n : */
2431             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2432 #ifdef DEBUGGING
2433             stop = next - 1;
2434 #endif
2435             n = nnext;
2436             if (stopnow) break;
2437         }
2438
2439 #ifdef EXPERIMENTAL_INPLACESCAN
2440         if (flags && !NEXT_OFF(n)) {
2441             DEBUG_PEEP("atch", val, depth);
2442             if (reg_off_by_arg[OP(n)]) {
2443                 ARG_SET(n, val - n);
2444             }
2445             else {
2446                 NEXT_OFF(n) = val - n;
2447             }
2448             stopnow = 1;
2449         }
2450 #endif
2451     }
2452 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2453 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2454 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2455 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2456
2457     if (UTF
2458         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2459         && ( STR_LEN(scan) >= 6 ) )
2460     {
2461     /*
2462     Two problematic code points in Unicode casefolding of EXACT nodes:
2463
2464     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2465     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2466
2467     which casefold to
2468
2469     Unicode                      UTF-8
2470
2471     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2472     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2473
2474     This means that in case-insensitive matching (or "loose matching",
2475     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2476     length of the above casefolded versions) can match a target string
2477     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2478     This would rather mess up the minimum length computation.
2479
2480     What we'll do is to look for the tail four bytes, and then peek
2481     at the preceding two bytes to see whether we need to decrease
2482     the minimum length by four (six minus two).
2483
2484     Thanks to the design of UTF-8, there cannot be false matches:
2485     A sequence of valid UTF-8 bytes cannot be a subsequence of
2486     another valid sequence of UTF-8 bytes.
2487
2488     */
2489          char * const s0 = STRING(scan), *s, *t;
2490          char * const s1 = s0 + STR_LEN(scan) - 1;
2491          char * const s2 = s1 - 4;
2492 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2493          const char t0[] = "\xaf\x49\xaf\x42";
2494 #else
2495          const char t0[] = "\xcc\x88\xcc\x81";
2496 #endif
2497          const char * const t1 = t0 + 3;
2498
2499          for (s = s0 + 2;
2500               s < s2 && (t = ninstr(s, s1, t0, t1));
2501               s = t + 4) {
2502 #ifdef EBCDIC
2503               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2504                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2505 #else
2506               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2507                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2508 #endif
2509                    *min -= 4;
2510          }
2511     }
2512
2513 #ifdef DEBUGGING
2514     /* Allow dumping */
2515     n = scan + NODE_SZ_STR(scan);
2516     while (n <= stop) {
2517         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2518             OP(n) = OPTIMIZED;
2519             NEXT_OFF(n) = 0;
2520         }
2521         n++;
2522     }
2523 #endif
2524     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2525     return stopnow;
2526 }
2527
2528 /* REx optimizer.  Converts nodes into quickier variants "in place".
2529    Finds fixed substrings.  */
2530
2531 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2532    to the position after last scanned or to NULL. */
2533
2534 #define INIT_AND_WITHP \
2535     assert(!and_withp); \
2536     Newx(and_withp,1,struct regnode_charclass_class); \
2537     SAVEFREEPV(and_withp)
2538
2539 /* this is a chain of data about sub patterns we are processing that
2540    need to be handled seperately/specially in study_chunk. Its so
2541    we can simulate recursion without losing state.  */
2542 struct scan_frame;
2543 typedef struct scan_frame {
2544     regnode *last;  /* last node to process in this frame */
2545     regnode *next;  /* next node to process when last is reached */
2546     struct scan_frame *prev; /*previous frame*/
2547     I32 stop; /* what stopparen do we use */
2548 } scan_frame;
2549
2550
2551 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2552
2553 #define CASE_SYNST_FNC(nAmE)                                       \
2554 case nAmE:                                                         \
2555     if (flags & SCF_DO_STCLASS_AND) {                              \
2556             for (value = 0; value < 256; value++)                  \
2557                 if (!is_ ## nAmE ## _cp(value))                       \
2558                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2559     }                                                              \
2560     else {                                                         \
2561             for (value = 0; value < 256; value++)                  \
2562                 if (is_ ## nAmE ## _cp(value))                        \
2563                     ANYOF_BITMAP_SET(data->start_class, value);    \
2564     }                                                              \
2565     break;                                                         \
2566 case N ## nAmE:                                                    \
2567     if (flags & SCF_DO_STCLASS_AND) {                              \
2568             for (value = 0; value < 256; value++)                   \
2569                 if (is_ ## nAmE ## _cp(value))                         \
2570                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2571     }                                                               \
2572     else {                                                          \
2573             for (value = 0; value < 256; value++)                   \
2574                 if (!is_ ## nAmE ## _cp(value))                        \
2575                     ANYOF_BITMAP_SET(data->start_class, value);     \
2576     }                                                               \
2577     break
2578
2579
2580
2581 STATIC I32
2582 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2583                         I32 *minlenp, I32 *deltap,
2584                         regnode *last,
2585                         scan_data_t *data,
2586                         I32 stopparen,
2587                         U8* recursed,
2588                         struct regnode_charclass_class *and_withp,
2589                         U32 flags, U32 depth)
2590                         /* scanp: Start here (read-write). */
2591                         /* deltap: Write maxlen-minlen here. */
2592                         /* last: Stop before this one. */
2593                         /* data: string data about the pattern */
2594                         /* stopparen: treat close N as END */
2595                         /* recursed: which subroutines have we recursed into */
2596                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2597 {
2598     dVAR;
2599     I32 min = 0, pars = 0, code;
2600     regnode *scan = *scanp, *next;
2601     I32 delta = 0;
2602     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2603     int is_inf_internal = 0;            /* The studied chunk is infinite */
2604     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2605     scan_data_t data_fake;
2606     SV *re_trie_maxbuff = NULL;
2607     regnode *first_non_open = scan;
2608     I32 stopmin = I32_MAX;
2609     scan_frame *frame = NULL;
2610     GET_RE_DEBUG_FLAGS_DECL;
2611
2612     PERL_ARGS_ASSERT_STUDY_CHUNK;
2613
2614 #ifdef DEBUGGING
2615     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2616 #endif
2617
2618     if ( depth == 0 ) {
2619         while (first_non_open && OP(first_non_open) == OPEN)
2620             first_non_open=regnext(first_non_open);
2621     }
2622
2623
2624   fake_study_recurse:
2625     while ( scan && OP(scan) != END && scan < last ){
2626         /* Peephole optimizer: */
2627         DEBUG_STUDYDATA("Peep:", data,depth);
2628         DEBUG_PEEP("Peep",scan,depth);
2629         JOIN_EXACT(scan,&min,0);
2630
2631         /* Follow the next-chain of the current node and optimize
2632            away all the NOTHINGs from it.  */
2633         if (OP(scan) != CURLYX) {
2634             const int max = (reg_off_by_arg[OP(scan)]
2635                        ? I32_MAX
2636                        /* I32 may be smaller than U16 on CRAYs! */
2637                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2638             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2639             int noff;
2640             regnode *n = scan;
2641
2642             /* Skip NOTHING and LONGJMP. */
2643             while ((n = regnext(n))
2644                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2645                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2646                    && off + noff < max)
2647                 off += noff;
2648             if (reg_off_by_arg[OP(scan)])
2649                 ARG(scan) = off;
2650             else
2651                 NEXT_OFF(scan) = off;
2652         }
2653
2654
2655
2656         /* The principal pseudo-switch.  Cannot be a switch, since we
2657            look into several different things.  */
2658         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2659                    || OP(scan) == IFTHEN) {
2660             next = regnext(scan);
2661             code = OP(scan);
2662             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2663
2664             if (OP(next) == code || code == IFTHEN) {
2665                 /* NOTE - There is similar code to this block below for handling
2666                    TRIE nodes on a re-study.  If you change stuff here check there
2667                    too. */
2668                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2669                 struct regnode_charclass_class accum;
2670                 regnode * const startbranch=scan;
2671
2672                 if (flags & SCF_DO_SUBSTR)
2673                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2674                 if (flags & SCF_DO_STCLASS)
2675                     cl_init_zero(pRExC_state, &accum);
2676
2677                 while (OP(scan) == code) {
2678                     I32 deltanext, minnext, f = 0, fake;
2679                     struct regnode_charclass_class this_class;
2680
2681                     num++;
2682                     data_fake.flags = 0;
2683                     if (data) {
2684                         data_fake.whilem_c = data->whilem_c;
2685                         data_fake.last_closep = data->last_closep;
2686                     }
2687                     else
2688                         data_fake.last_closep = &fake;
2689
2690                     data_fake.pos_delta = delta;
2691                     next = regnext(scan);
2692                     scan = NEXTOPER(scan);
2693                     if (code != BRANCH)
2694                         scan = NEXTOPER(scan);
2695                     if (flags & SCF_DO_STCLASS) {
2696                         cl_init(pRExC_state, &this_class);
2697                         data_fake.start_class = &this_class;
2698                         f = SCF_DO_STCLASS_AND;
2699                     }
2700                     if (flags & SCF_WHILEM_VISITED_POS)
2701                         f |= SCF_WHILEM_VISITED_POS;
2702
2703                     /* we suppose the run is continuous, last=next...*/
2704                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2705                                           next, &data_fake,
2706                                           stopparen, recursed, NULL, f,depth+1);
2707                     if (min1 > minnext)
2708                         min1 = minnext;
2709                     if (max1 < minnext + deltanext)
2710                         max1 = minnext + deltanext;
2711                     if (deltanext == I32_MAX)
2712                         is_inf = is_inf_internal = 1;
2713                     scan = next;
2714                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2715                         pars++;
2716                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2717                         if ( stopmin > minnext)
2718                             stopmin = min + min1;
2719                         flags &= ~SCF_DO_SUBSTR;
2720                         if (data)
2721                             data->flags |= SCF_SEEN_ACCEPT;
2722                     }
2723                     if (data) {
2724                         if (data_fake.flags & SF_HAS_EVAL)
2725                             data->flags |= SF_HAS_EVAL;
2726                         data->whilem_c = data_fake.whilem_c;
2727                     }
2728                     if (flags & SCF_DO_STCLASS)
2729                         cl_or(pRExC_state, &accum, &this_class);
2730                 }
2731                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2732                     min1 = 0;
2733                 if (flags & SCF_DO_SUBSTR) {
2734                     data->pos_min += min1;
2735                     data->pos_delta += max1 - min1;
2736                     if (max1 != min1 || is_inf)
2737                         data->longest = &(data->longest_float);
2738                 }
2739                 min += min1;
2740                 delta += max1 - min1;
2741                 if (flags & SCF_DO_STCLASS_OR) {
2742                     cl_or(pRExC_state, data->start_class, &accum);
2743                     if (min1) {
2744                         cl_and(data->start_class, and_withp);
2745                         flags &= ~SCF_DO_STCLASS;
2746                     }
2747                 }
2748                 else if (flags & SCF_DO_STCLASS_AND) {
2749                     if (min1) {
2750                         cl_and(data->start_class, &accum);
2751                         flags &= ~SCF_DO_STCLASS;
2752                     }
2753                     else {
2754                         /* Switch to OR mode: cache the old value of
2755                          * data->start_class */
2756                         INIT_AND_WITHP;
2757                         StructCopy(data->start_class, and_withp,
2758                                    struct regnode_charclass_class);
2759                         flags &= ~SCF_DO_STCLASS_AND;
2760                         StructCopy(&accum, data->start_class,
2761                                    struct regnode_charclass_class);
2762                         flags |= SCF_DO_STCLASS_OR;
2763                         data->start_class->flags |= ANYOF_EOS;
2764                     }
2765                 }
2766
2767                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2768                 /* demq.
2769
2770                    Assuming this was/is a branch we are dealing with: 'scan' now
2771                    points at the item that follows the branch sequence, whatever
2772                    it is. We now start at the beginning of the sequence and look
2773                    for subsequences of
2774
2775                    BRANCH->EXACT=>x1
2776                    BRANCH->EXACT=>x2
2777                    tail
2778
2779                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2780
2781                    If we can find such a subseqence we need to turn the first
2782                    element into a trie and then add the subsequent branch exact
2783                    strings to the trie.
2784
2785                    We have two cases
2786
2787                      1. patterns where the whole set of branches can be converted.
2788
2789                      2. patterns where only a subset can be converted.
2790
2791                    In case 1 we can replace the whole set with a single regop
2792                    for the trie. In case 2 we need to keep the start and end
2793                    branches so
2794
2795                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2796                      becomes BRANCH TRIE; BRANCH X;
2797
2798                   There is an additional case, that being where there is a
2799                   common prefix, which gets split out into an EXACT like node
2800                   preceding the TRIE node.
2801
2802                   If x(1..n)==tail then we can do a simple trie, if not we make
2803                   a "jump" trie, such that when we match the appropriate word
2804                   we "jump" to the appopriate tail node. Essentailly we turn
2805                   a nested if into a case structure of sorts.
2806
2807                 */
2808
2809                     int made=0;
2810                     if (!re_trie_maxbuff) {
2811                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2812                         if (!SvIOK(re_trie_maxbuff))
2813                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2814                     }
2815                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2816                         regnode *cur;
2817                         regnode *first = (regnode *)NULL;
2818                         regnode *last = (regnode *)NULL;
2819                         regnode *tail = scan;
2820                         U8 optype = 0;
2821                         U32 count=0;
2822
2823 #ifdef DEBUGGING
2824                         SV * const mysv = sv_newmortal();       /* for dumping */
2825 #endif
2826                         /* var tail is used because there may be a TAIL
2827                            regop in the way. Ie, the exacts will point to the
2828                            thing following the TAIL, but the last branch will
2829                            point at the TAIL. So we advance tail. If we
2830                            have nested (?:) we may have to move through several
2831                            tails.
2832                          */
2833
2834                         while ( OP( tail ) == TAIL ) {
2835                             /* this is the TAIL generated by (?:) */
2836                             tail = regnext( tail );
2837                         }
2838
2839
2840                         DEBUG_OPTIMISE_r({
2841                             regprop(RExC_rx, mysv, tail );
2842                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2843                                 (int)depth * 2 + 2, "",
2844                                 "Looking for TRIE'able sequences. Tail node is: ",
2845                                 SvPV_nolen_const( mysv )
2846                             );
2847                         });
2848
2849                         /*
2850
2851                            step through the branches, cur represents each
2852                            branch, noper is the first thing to be matched
2853                            as part of that branch and noper_next is the
2854                            regnext() of that node. if noper is an EXACT
2855                            and noper_next is the same as scan (our current
2856                            position in the regex) then the EXACT branch is
2857                            a possible optimization target. Once we have
2858                            two or more consequetive such branches we can
2859                            create a trie of the EXACT's contents and stich
2860                            it in place. If the sequence represents all of
2861                            the branches we eliminate the whole thing and
2862                            replace it with a single TRIE. If it is a
2863                            subsequence then we need to stitch it in. This
2864                            means the first branch has to remain, and needs
2865                            to be repointed at the item on the branch chain
2866                            following the last branch optimized. This could
2867                            be either a BRANCH, in which case the
2868                            subsequence is internal, or it could be the
2869                            item following the branch sequence in which
2870                            case the subsequence is at the end.
2871
2872                         */
2873
2874                         /* dont use tail as the end marker for this traverse */
2875                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2876                             regnode * const noper = NEXTOPER( cur );
2877 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2878                             regnode * const noper_next = regnext( noper );
2879 #endif
2880
2881                             DEBUG_OPTIMISE_r({
2882                                 regprop(RExC_rx, mysv, cur);
2883                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2884                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2885
2886                                 regprop(RExC_rx, mysv, noper);
2887                                 PerlIO_printf( Perl_debug_log, " -> %s",
2888                                     SvPV_nolen_const(mysv));
2889
2890                                 if ( noper_next ) {
2891                                   regprop(RExC_rx, mysv, noper_next );
2892                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2893                                     SvPV_nolen_const(mysv));
2894                                 }
2895                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2896                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2897                             });
2898                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2899                                          : PL_regkind[ OP( noper ) ] == EXACT )
2900                                   || OP(noper) == NOTHING )
2901 #ifdef NOJUMPTRIE
2902                                   && noper_next == tail
2903 #endif
2904                                   && count < U16_MAX)
2905                             {
2906                                 count++;
2907                                 if ( !first || optype == NOTHING ) {
2908                                     if (!first) first = cur;
2909                                     optype = OP( noper );
2910                                 } else {
2911                                     last = cur;
2912                                 }
2913                             } else {
2914 /*
2915     Currently we do not believe that the trie logic can
2916     handle case insensitive matching properly when the
2917     pattern is not unicode (thus forcing unicode semantics).
2918
2919     If/when this is fixed the following define can be swapped
2920     in below to fully enable trie logic.
2921
2922 #define TRIE_TYPE_IS_SAFE 1
2923
2924 */
2925 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2926
2927                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2928                                     make_trie( pRExC_state,
2929                                             startbranch, first, cur, tail, count,
2930                                             optype, depth+1 );
2931                                 }
2932                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2933 #ifdef NOJUMPTRIE
2934                                      && noper_next == tail
2935 #endif
2936                                 ){
2937                                     count = 1;
2938                                     first = cur;
2939                                     optype = OP( noper );
2940                                 } else {
2941                                     count = 0;
2942                                     first = NULL;
2943                                     optype = 0;
2944                                 }
2945                                 last = NULL;
2946                             }
2947                         }
2948                         DEBUG_OPTIMISE_r({
2949                             regprop(RExC_rx, mysv, cur);
2950                             PerlIO_printf( Perl_debug_log,
2951                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2952                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2953
2954                         });
2955
2956                         if ( last && TRIE_TYPE_IS_SAFE ) {
2957                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2958 #ifdef TRIE_STUDY_OPT
2959                             if ( ((made == MADE_EXACT_TRIE &&
2960                                  startbranch == first)
2961                                  || ( first_non_open == first )) &&
2962                                  depth==0 ) {
2963                                 flags |= SCF_TRIE_RESTUDY;
2964                                 if ( startbranch == first
2965                                      && scan == tail )
2966                                 {
2967                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2968                                 }
2969                             }
2970 #endif
2971                         }
2972                     }
2973
2974                 } /* do trie */
2975
2976             }
2977             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2978                 scan = NEXTOPER(NEXTOPER(scan));
2979             } else                      /* single branch is optimized. */
2980                 scan = NEXTOPER(scan);
2981             continue;
2982         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2983             scan_frame *newframe = NULL;
2984             I32 paren;
2985             regnode *start;
2986             regnode *end;
2987
2988             if (OP(scan) != SUSPEND) {
2989             /* set the pointer */
2990                 if (OP(scan) == GOSUB) {
2991                     paren = ARG(scan);
2992                     RExC_recurse[ARG2L(scan)] = scan;
2993                     start = RExC_open_parens[paren-1];
2994                     end   = RExC_close_parens[paren-1];
2995                 } else {
2996                     paren = 0;
2997                     start = RExC_rxi->program + 1;
2998                     end   = RExC_opend;
2999                 }
3000                 if (!recursed) {
3001                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3002                     SAVEFREEPV(recursed);
3003                 }
3004                 if (!PAREN_TEST(recursed,paren+1)) {
3005                     PAREN_SET(recursed,paren+1);
3006                     Newx(newframe,1,scan_frame);
3007                 } else {
3008                     if (flags & SCF_DO_SUBSTR) {
3009                         SCAN_COMMIT(pRExC_state,data,minlenp);
3010                         data->longest = &(data->longest_float);
3011                     }
3012                     is_inf = is_inf_internal = 1;
3013                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3014                         cl_anything(pRExC_state, data->start_class);
3015                     flags &= ~SCF_DO_STCLASS;
3016                 }
3017             } else {
3018                 Newx(newframe,1,scan_frame);
3019                 paren = stopparen;
3020                 start = scan+2;
3021                 end = regnext(scan);
3022             }
3023             if (newframe) {
3024                 assert(start);
3025                 assert(end);
3026                 SAVEFREEPV(newframe);
3027                 newframe->next = regnext(scan);
3028                 newframe->last = last;
3029                 newframe->stop = stopparen;
3030                 newframe->prev = frame;
3031
3032                 frame = newframe;
3033                 scan =  start;
3034                 stopparen = paren;
3035                 last = end;
3036
3037                 continue;
3038             }
3039         }
3040         else if (OP(scan) == EXACT) {
3041             I32 l = STR_LEN(scan);
3042             UV uc;
3043             if (UTF) {
3044                 const U8 * const s = (U8*)STRING(scan);
3045                 l = utf8_length(s, s + l);
3046                 uc = utf8_to_uvchr(s, NULL);
3047             } else {
3048                 uc = *((U8*)STRING(scan));
3049             }
3050             min += l;
3051             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3052                 /* The code below prefers earlier match for fixed
3053                    offset, later match for variable offset.  */
3054                 if (data->last_end == -1) { /* Update the start info. */
3055                     data->last_start_min = data->pos_min;
3056                     data->last_start_max = is_inf
3057                         ? I32_MAX : data->pos_min + data->pos_delta;
3058                 }
3059                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3060                 if (UTF)
3061                     SvUTF8_on(data->last_found);
3062                 {
3063                     SV * const sv = data->last_found;
3064                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3065                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3066                     if (mg && mg->mg_len >= 0)
3067                         mg->mg_len += utf8_length((U8*)STRING(scan),
3068                                                   (U8*)STRING(scan)+STR_LEN(scan));
3069                 }
3070                 data->last_end = data->pos_min + l;
3071                 data->pos_min += l; /* As in the first entry. */
3072                 data->flags &= ~SF_BEFORE_EOL;
3073             }
3074             if (flags & SCF_DO_STCLASS_AND) {
3075                 /* Check whether it is compatible with what we know already! */
3076                 int compat = 1;
3077
3078
3079                 /* If compatibile, we or it in below.  It is compatible if is
3080                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3081                  * it's for a locale.  Even if there isn't unicode semantics
3082                  * here, at runtime there may be because of matching against a
3083                  * utf8 string, so accept a possible false positive for
3084                  * latin1-range folds */
3085                 if (uc >= 0x100 ||
3086                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3087                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3088                     && (!(data->start_class->flags & ANYOF_FOLD)
3089                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3090                     )
3091                     compat = 0;
3092                 ANYOF_CLASS_ZERO(data->start_class);
3093                 ANYOF_BITMAP_ZERO(data->start_class);
3094                 if (compat)
3095                     ANYOF_BITMAP_SET(data->start_class, uc);
3096                 data->start_class->flags &= ~ANYOF_EOS;
3097                 if (uc < 0x100)
3098                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3099             }
3100             else if (flags & SCF_DO_STCLASS_OR) {
3101                 /* false positive possible if the class is case-folded */
3102                 if (uc < 0x100)
3103                     ANYOF_BITMAP_SET(data->start_class, uc);
3104                 else
3105                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3106                 data->start_class->flags &= ~ANYOF_EOS;
3107                 cl_and(data->start_class, and_withp);
3108             }
3109             flags &= ~SCF_DO_STCLASS;
3110         }
3111         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3112             I32 l = STR_LEN(scan);
3113             UV uc = *((U8*)STRING(scan));
3114
3115             /* Search for fixed substrings supports EXACT only. */
3116             if (flags & SCF_DO_SUBSTR) {
3117                 assert(data);
3118                 SCAN_COMMIT(pRExC_state, data, minlenp);
3119             }
3120             if (UTF) {
3121                 const U8 * const s = (U8 *)STRING(scan);
3122                 l = utf8_length(s, s + l);
3123                 uc = utf8_to_uvchr(s, NULL);
3124             }
3125             min += l;
3126             if (flags & SCF_DO_SUBSTR)
3127                 data->pos_min += l;
3128             if (flags & SCF_DO_STCLASS_AND) {
3129                 /* Check whether it is compatible with what we know already! */
3130                 int compat = 1;
3131                 if (uc >= 0x100 ||
3132                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3133                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3134                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3135                 {
3136                     compat = 0;
3137                 }
3138                 ANYOF_CLASS_ZERO(data->start_class);
3139                 ANYOF_BITMAP_ZERO(data->start_class);
3140                 if (compat) {
3141                     ANYOF_BITMAP_SET(data->start_class, uc);
3142                     data->start_class->flags &= ~ANYOF_EOS;
3143                     data->start_class->flags |= ANYOF_FOLD;
3144                     if (OP(scan) == EXACTFL) {
3145                         data->start_class->flags |= ANYOF_LOCALE;
3146                     }
3147                     else {
3148
3149                         /* Also set the other member of the fold pair.  In case
3150                          * that unicode semantics is called for at runtime, use
3151                          * the full latin1 fold.  (Can't do this for locale,
3152                          * because not known until runtime */
3153                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3154                     }
3155                 }
3156             }
3157             else if (flags & SCF_DO_STCLASS_OR) {
3158                 if (data->start_class->flags & ANYOF_FOLD) {
3159                     /* false positive possible if the class is case-folded.
3160                        Assume that the locale settings are the same... */
3161                     if (uc < 0x100) {
3162                         ANYOF_BITMAP_SET(data->start_class, uc);
3163                         if (OP(scan) != EXACTFL) {
3164
3165                             /* And set the other member of the fold pair, but
3166                              * can't do that in locale because not known until
3167                              * run-time */
3168                             ANYOF_BITMAP_SET(data->start_class,
3169                                              PL_fold_latin1[uc]);
3170                         }
3171                     }
3172                     data->start_class->flags &= ~ANYOF_EOS;
3173                 }
3174                 cl_and(data->start_class, and_withp);
3175             }
3176             flags &= ~SCF_DO_STCLASS;
3177         }
3178         else if (REGNODE_VARIES(OP(scan))) {
3179             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3180             I32 f = flags, pos_before = 0;
3181             regnode * const oscan = scan;
3182             struct regnode_charclass_class this_class;
3183             struct regnode_charclass_class *oclass = NULL;
3184             I32 next_is_eval = 0;
3185
3186             switch (PL_regkind[OP(scan)]) {
3187             case WHILEM:                /* End of (?:...)* . */
3188                 scan = NEXTOPER(scan);
3189                 goto finish;
3190             case PLUS:
3191                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3192                     next = NEXTOPER(scan);
3193                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3194                         mincount = 1;
3195                         maxcount = REG_INFTY;
3196                         next = regnext(scan);
3197                         scan = NEXTOPER(scan);
3198                         goto do_curly;
3199                     }
3200                 }
3201                 if (flags & SCF_DO_SUBSTR)
3202                     data->pos_min++;
3203                 min++;
3204                 /* Fall through. */
3205             case STAR:
3206                 if (flags & SCF_DO_STCLASS) {
3207                     mincount = 0;
3208                     maxcount = REG_INFTY;
3209                     next = regnext(scan);
3210                     scan = NEXTOPER(scan);
3211                     goto do_curly;
3212                 }
3213                 is_inf = is_inf_internal = 1;
3214                 scan = regnext(scan);
3215                 if (flags & SCF_DO_SUBSTR) {
3216                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3217                     data->longest = &(data->longest_float);
3218                 }
3219                 goto optimize_curly_tail;
3220             case CURLY:
3221                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3222                     && (scan->flags == stopparen))
3223                 {
3224                     mincount = 1;
3225                     maxcount = 1;
3226                 } else {
3227                     mincount = ARG1(scan);
3228                     maxcount = ARG2(scan);
3229                 }
3230                 next = regnext(scan);
3231                 if (OP(scan) == CURLYX) {
3232                     I32 lp = (data ? *(data->last_closep) : 0);
3233                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3234                 }
3235                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3236                 next_is_eval = (OP(scan) == EVAL);
3237               do_curly:
3238                 if (flags & SCF_DO_SUBSTR) {
3239                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3240                     pos_before = data->pos_min;
3241                 }
3242                 if (data) {
3243                     fl = data->flags;
3244                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3245                     if (is_inf)
3246                         data->flags |= SF_IS_INF;
3247                 }
3248                 if (flags & SCF_DO_STCLASS) {
3249                     cl_init(pRExC_state, &this_class);
3250                     oclass = data->start_class;
3251                     data->start_class = &this_class;
3252                     f |= SCF_DO_STCLASS_AND;
3253                     f &= ~SCF_DO_STCLASS_OR;
3254                 }
3255                 /* Exclude from super-linear cache processing any {n,m}
3256                    regops for which the combination of input pos and regex
3257                    pos is not enough information to determine if a match
3258                    will be possible.
3259
3260                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3261                    regex pos at the \s*, the prospects for a match depend not
3262                    only on the input position but also on how many (bar\s*)
3263                    repeats into the {4,8} we are. */
3264                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3265                     f &= ~SCF_WHILEM_VISITED_POS;
3266
3267                 /* This will finish on WHILEM, setting scan, or on NULL: */
3268                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3269                                       last, data, stopparen, recursed, NULL,
3270                                       (mincount == 0
3271                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3272
3273                 if (flags & SCF_DO_STCLASS)
3274                     data->start_class = oclass;
3275                 if (mincount == 0 || minnext == 0) {
3276                     if (flags & SCF_DO_STCLASS_OR) {
3277                         cl_or(pRExC_state, data->start_class, &this_class);
3278                     }
3279                     else if (flags & SCF_DO_STCLASS_AND) {
3280                         /* Switch to OR mode: cache the old value of
3281                          * data->start_class */
3282                         INIT_AND_WITHP;
3283                         StructCopy(data->start_class, and_withp,
3284                                    struct regnode_charclass_class);
3285                         flags &= ~SCF_DO_STCLASS_AND;
3286                         StructCopy(&this_class, data->start_class,
3287                                    struct regnode_charclass_class);
3288                         flags |= SCF_DO_STCLASS_OR;
3289                         data->start_class->flags |= ANYOF_EOS;
3290                     }
3291                 } else {                /* Non-zero len */
3292                     if (flags & SCF_DO_STCLASS_OR) {
3293                         cl_or(pRExC_state, data->start_class, &this_class);
3294                         cl_and(data->start_class, and_withp);
3295                     }
3296                     else if (flags & SCF_DO_STCLASS_AND)
3297                         cl_and(data->start_class, &this_class);
3298                     flags &= ~SCF_DO_STCLASS;
3299                 }
3300                 if (!scan)              /* It was not CURLYX, but CURLY. */
3301                     scan = next;
3302                 if ( /* ? quantifier ok, except for (?{ ... }) */
3303                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3304                     && (minnext == 0) && (deltanext == 0)
3305                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3306                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3307                 {
3308                     ckWARNreg(RExC_parse,
3309                               "Quantifier unexpected on zero-length expression");
3310                 }
3311
3312                 min += minnext * mincount;
3313                 is_inf_internal |= ((maxcount == REG_INFTY
3314                                      && (minnext + deltanext) > 0)
3315                                     || deltanext == I32_MAX);
3316                 is_inf |= is_inf_internal;
3317                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3318
3319                 /* Try powerful optimization CURLYX => CURLYN. */
3320                 if (  OP(oscan) == CURLYX && data
3321                       && data->flags & SF_IN_PAR
3322                       && !(data->flags & SF_HAS_EVAL)
3323                       && !deltanext && minnext == 1 ) {
3324                     /* Try to optimize to CURLYN.  */
3325                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3326                     regnode * const nxt1 = nxt;
3327 #ifdef DEBUGGING
3328                     regnode *nxt2;
3329 #endif
3330
3331                     /* Skip open. */
3332                     nxt = regnext(nxt);
3333                     if (!REGNODE_SIMPLE(OP(nxt))
3334                         && !(PL_regkind[OP(nxt)] == EXACT
3335                              && STR_LEN(nxt) == 1))
3336                         goto nogo;
3337 #ifdef DEBUGGING
3338                     nxt2 = nxt;
3339 #endif
3340                     nxt = regnext(nxt);
3341                     if (OP(nxt) != CLOSE)
3342                         goto nogo;
3343                     if (RExC_open_parens) {
3344                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3345                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3346                     }
3347                     /* Now we know that nxt2 is the only contents: */
3348                     oscan->flags = (U8)ARG(nxt);
3349                     OP(oscan) = CURLYN;
3350                     OP(nxt1) = NOTHING; /* was OPEN. */
3351
3352 #ifdef DEBUGGING
3353                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3354                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3355                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3356                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3357                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3358                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3359 #endif
3360                 }
3361               nogo:
3362
3363                 /* Try optimization CURLYX => CURLYM. */
3364                 if (  OP(oscan) == CURLYX && data
3365                       && !(data->flags & SF_HAS_PAR)
3366                       && !(data->flags & SF_HAS_EVAL)
3367                       && !deltanext     /* atom is fixed width */
3368                       && minnext != 0   /* CURLYM can't handle zero width */
3369                 ) {
3370                     /* XXXX How to optimize if data == 0? */
3371                     /* Optimize to a simpler form.  */
3372                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3373                     regnode *nxt2;
3374
3375                     OP(oscan) = CURLYM;
3376                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3377                             && (OP(nxt2) != WHILEM))
3378                         nxt = nxt2;
3379                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3380                     /* Need to optimize away parenths. */
3381                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3382                         /* Set the parenth number.  */
3383                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3384
3385                         oscan->flags = (U8)ARG(nxt);
3386                         if (RExC_open_parens) {
3387                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3388                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3389                         }
3390                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3391                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3392
3393 #ifdef DEBUGGING
3394                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3395                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3396                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3397                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3398 #endif
3399 #if 0
3400                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3401                             regnode *nnxt = regnext(nxt1);
3402                             if (nnxt == nxt) {
3403                                 if (reg_off_by_arg[OP(nxt1)])
3404                                     ARG_SET(nxt1, nxt2 - nxt1);
3405                                 else if (nxt2 - nxt1 < U16_MAX)
3406                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3407                                 else
3408                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3409                             }
3410                             nxt1 = nnxt;
3411                         }
3412 #endif
3413                         /* Optimize again: */
3414                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3415                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3416                     }
3417                     else
3418                         oscan->flags = 0;
3419                 }
3420                 else if ((OP(oscan) == CURLYX)
3421                          && (flags & SCF_WHILEM_VISITED_POS)
3422                          /* See the comment on a similar expression above.
3423                             However, this time it's not a subexpression
3424                             we care about, but the expression itself. */
3425                          && (maxcount == REG_INFTY)
3426                          && data && ++data->whilem_c < 16) {
3427                     /* This stays as CURLYX, we can put the count/of pair. */
3428                     /* Find WHILEM (as in regexec.c) */
3429                     regnode *nxt = oscan + NEXT_OFF(oscan);
3430
3431                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3432                         nxt += ARG(nxt);
3433                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3434                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3435                 }
3436                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3437                     pars++;
3438                 if (flags & SCF_DO_SUBSTR) {
3439                     SV *last_str = NULL;
3440                     int counted = mincount != 0;
3441
3442                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3443 #if defined(SPARC64_GCC_WORKAROUND)
3444                         I32 b = 0;
3445                         STRLEN l = 0;
3446                         const char *s = NULL;
3447                         I32 old = 0;
3448
3449                         if (pos_before >= data->last_start_min)
3450                             b = pos_before;
3451                         else
3452                             b = data->last_start_min;
3453
3454                         l = 0;
3455                         s = SvPV_const(data->last_found, l);
3456                         old = b - data->last_start_min;
3457
3458 #else
3459                         I32 b = pos_before >= data->last_start_min
3460                             ? pos_before : data->last_start_min;
3461                         STRLEN l;
3462                         const char * const s = SvPV_const(data->last_found, l);
3463                         I32 old = b - data->last_start_min;
3464 #endif
3465
3466                         if (UTF)
3467                             old = utf8_hop((U8*)s, old) - (U8*)s;
3468                         l -= old;
3469                         /* Get the added string: */
3470                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3471                         if (deltanext == 0 && pos_before == b) {
3472                             /* What was added is a constant string */
3473                             if (mincount > 1) {
3474                                 SvGROW(last_str, (mincount * l) + 1);
3475                                 repeatcpy(SvPVX(last_str) + l,
3476                                           SvPVX_const(last_str), l, mincount - 1);
3477                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3478                                 /* Add additional parts. */
3479                                 SvCUR_set(data->last_found,
3480                                           SvCUR(data->last_found) - l);
3481                                 sv_catsv(data->last_found, last_str);
3482                                 {
3483                                     SV * sv = data->last_found;
3484                                     MAGIC *mg =
3485                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3486                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3487                                     if (mg && mg->mg_len >= 0)
3488                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3489                                 }
3490                                 data->last_end += l * (mincount - 1);
3491                             }
3492                         } else {
3493                             /* start offset must point into the last copy */
3494                             data->last_start_min += minnext * (mincount - 1);
3495                             data->last_start_max += is_inf ? I32_MAX
3496                                 : (maxcount - 1) * (minnext + data->pos_delta);
3497                         }
3498                     }
3499                     /* It is counted once already... */
3500                     data->pos_min += minnext * (mincount - counted);
3501                     data->pos_delta += - counted * deltanext +
3502                         (minnext + deltanext) * maxcount - minnext * mincount;
3503                     if (mincount != maxcount) {
3504                          /* Cannot extend fixed substrings found inside
3505                             the group.  */
3506                         SCAN_COMMIT(pRExC_state,data,minlenp);
3507                         if (mincount && last_str) {
3508                             SV * const sv = data->last_found;
3509                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3510                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3511
3512                             if (mg)
3513                                 mg->mg_len = -1;
3514                             sv_setsv(sv, last_str);
3515                             data->last_end = data->pos_min;
3516                             data->last_start_min =
3517                                 data->pos_min - CHR_SVLEN(last_str);
3518                             data->last_start_max = is_inf
3519                                 ? I32_MAX
3520                                 : data->pos_min + data->pos_delta
3521                                 - CHR_SVLEN(last_str);
3522                         }
3523                         data->longest = &(data->longest_float);
3524                     }
3525                     SvREFCNT_dec(last_str);
3526                 }
3527                 if (data && (fl & SF_HAS_EVAL))
3528                     data->flags |= SF_HAS_EVAL;
3529               optimize_curly_tail:
3530                 if (OP(oscan) != CURLYX) {
3531                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3532                            && NEXT_OFF(next))
3533                         NEXT_OFF(oscan) += NEXT_OFF(next);
3534                 }
3535                 continue;
3536             default:                    /* REF and CLUMP only? */
3537                 if (flags & SCF_DO_SUBSTR) {
3538                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3539                     data->longest = &(data->longest_float);
3540                 }
3541                 is_inf = is_inf_internal = 1;
3542                 if (flags & SCF_DO_STCLASS_OR)
3543                     cl_anything(pRExC_state, data->start_class);
3544                 flags &= ~SCF_DO_STCLASS;
3545                 break;
3546             }
3547         }
3548         else if (OP(scan) == LNBREAK) {
3549             if (flags & SCF_DO_STCLASS) {
3550                 int value = 0;
3551                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3552                 if (flags & SCF_DO_STCLASS_AND) {
3553                     for (value = 0; value < 256; value++)
3554                         if (!is_VERTWS_cp(value))
3555                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3556                 }
3557                 else {
3558                     for (value = 0; value < 256; value++)
3559                         if (is_VERTWS_cp(value))
3560                             ANYOF_BITMAP_SET(data->start_class, value);
3561                 }
3562                 if (flags & SCF_DO_STCLASS_OR)
3563                     cl_and(data->start_class, and_withp);
3564                 flags &= ~SCF_DO_STCLASS;
3565             }
3566             min += 1;
3567             delta += 1;
3568             if (flags & SCF_DO_SUBSTR) {
3569                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3570                 data->pos_min += 1;
3571                 data->pos_delta += 1;
3572                 data->longest = &(data->longest_float);
3573             }
3574         }
3575         else if (OP(scan) == FOLDCHAR) {
3576             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3577             flags &= ~SCF_DO_STCLASS;
3578             min += 1;
3579             delta += d;
3580             if (flags & SCF_DO_SUBSTR) {
3581                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3582                 data->pos_min += 1;
3583                 data->pos_delta += d;
3584                 data->longest = &(data->longest_float);
3585             }
3586         }
3587         else if (REGNODE_SIMPLE(OP(scan))) {
3588             int value = 0;
3589
3590             if (flags & SCF_DO_SUBSTR) {
3591                 SCAN_COMMIT(pRExC_state,data,minlenp);
3592                 data->pos_min++;
3593             }
3594             min++;
3595             if (flags & SCF_DO_STCLASS) {
3596                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3597
3598                 /* Some of the logic below assumes that switching
3599                    locale on will only add false positives. */
3600                 switch (PL_regkind[OP(scan)]) {
3601                 case SANY:
3602                 default:
3603                   do_default:
3604                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3605                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3606                         cl_anything(pRExC_state, data->start_class);
3607                     break;
3608                 case REG_ANY:
3609                     if (OP(scan) == SANY)
3610                         goto do_default;
3611                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3612                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3613                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3614                         cl_anything(pRExC_state, data->start_class);
3615                     }
3616                     if (flags & SCF_DO_STCLASS_AND || !value)
3617                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3618                     break;
3619                 case ANYOF:
3620                     if (flags & SCF_DO_STCLASS_AND)
3621                         cl_and(data->start_class,
3622                                (struct regnode_charclass_class*)scan);
3623                     else
3624                         cl_or(pRExC_state, data->start_class,
3625                               (struct regnode_charclass_class*)scan);
3626                     break;
3627                 case ALNUM:
3628                     if (flags & SCF_DO_STCLASS_AND) {
3629                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3630                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3631                             if (FLAGS(scan) & USE_UNI) {
3632                                 for (value = 0; value < 256; value++) {
3633                                     if (!isWORDCHAR_L1(value)) {
3634                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3635                                     }
3636                                 }
3637                             } else {
3638                                 for (value = 0; value < 256; value++) {
3639                                     if (!isALNUM(value)) {
3640                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3641                                     }
3642                                 }
3643                             }
3644                         }
3645                     }
3646                     else {
3647                         if (data->start_class->flags & ANYOF_LOCALE)
3648                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3649                         else if (FLAGS(scan) & USE_UNI) {
3650                             for (value = 0; value < 256; value++) {
3651                                 if (isWORDCHAR_L1(value)) {
3652                                     ANYOF_BITMAP_SET(data->start_class, value);
3653                                 }
3654                             }
3655                         } else {
3656                             for (value = 0; value < 256; value++) {
3657                                 if (isALNUM(value)) {
3658                                     ANYOF_BITMAP_SET(data->start_class, value);
3659                                 }
3660                             }
3661                         }
3662                     }
3663                     break;
3664                 case ALNUML:
3665                     if (flags & SCF_DO_STCLASS_AND) {
3666                         if (data->start_class->flags & ANYOF_LOCALE)
3667                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3668                     }
3669                     else {
3670                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3671                         data->start_class->flags |= ANYOF_LOCALE;
3672                     }
3673                     break;
3674                 case NALNUM:
3675                     if (flags & SCF_DO_STCLASS_AND) {
3676                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3677                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3678                             if (FLAGS(scan) & USE_UNI) {
3679                                 for (value = 0; value < 256; value++) {
3680                                     if (isWORDCHAR_L1(value)) {
3681                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3682                                     }
3683                                 }
3684                             } else {
3685                                 for (value = 0; value < 256; value++) {
3686                                     if (isALNUM(value)) {
3687                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3688                                     }
3689                                 }
3690                             }
3691                         }
3692                     }
3693                     else {
3694                         if (data->start_class->flags & ANYOF_LOCALE)
3695                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3696                         else {
3697                             for (value = 0; value < 256; value++)
3698                                 if (!isALNUM(value))
3699                                     ANYOF_BITMAP_SET(data->start_class, value);
3700                         }
3701                     }
3702                     break;
3703                 case NALNUML:
3704                     if (flags & SCF_DO_STCLASS_AND) {
3705                         if (data->start_class->flags & ANYOF_LOCALE)
3706                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3707                     }
3708                     else {
3709                         data->start_class->flags |= ANYOF_LOCALE;
3710                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3711                     }
3712                     break;
3713                 case SPACE:
3714                     if (flags & SCF_DO_STCLASS_AND) {
3715                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3716                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3717                             if (FLAGS(scan) & USE_UNI) {
3718                                 for (value = 0; value < 256; value++) {
3719                                     if (!isSPACE_L1(value)) {
3720                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3721                                     }
3722                                 }
3723                             } else {
3724                                 for (value = 0; value < 256; value++) {
3725                                     if (!isSPACE(value)) {
3726                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3727                                     }
3728                                 }
3729                             }
3730                         }
3731                     }
3732                     else {
3733                         if (data->start_class->flags & ANYOF_LOCALE) {
3734                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3735                         }
3736                         else if (FLAGS(scan) & USE_UNI) {
3737                             for (value = 0; value < 256; value++) {
3738                                 if (isSPACE_L1(value)) {
3739                                     ANYOF_BITMAP_SET(data->start_class, value);
3740                                 }
3741                             }
3742                         } else {
3743                             for (value = 0; value < 256; value++) {
3744                                 if (isSPACE(value)) {
3745                                     ANYOF_BITMAP_SET(data->start_class, value);
3746                                 }
3747                             }
3748                         }
3749                     }
3750                     break;
3751                 case SPACEL:
3752                     if (flags & SCF_DO_STCLASS_AND) {
3753                         if (data->start_class->flags & ANYOF_LOCALE)
3754                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3755                     }
3756                     else {
3757                         data->start_class->flags |= ANYOF_LOCALE;
3758                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3759                     }
3760                     break;
3761                 case NSPACE:
3762                     if (flags & SCF_DO_STCLASS_AND) {
3763                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3764                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3765                             if (FLAGS(scan) & USE_UNI) {
3766                                 for (value = 0; value < 256; value++) {
3767                                     if (isSPACE_L1(value)) {
3768                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3769                                     }
3770                                 }
3771                             } else {
3772                                 for (value = 0; value < 256; value++) {
3773                                     if (isSPACE(value)) {
3774                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3775                                     }
3776                                 }
3777                             }
3778                         }
3779                     }
3780                     else {
3781                         if (data->start_class->flags & ANYOF_LOCALE)
3782                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3783                         else if (FLAGS(scan) & USE_UNI) {
3784                             for (value = 0; value < 256; value++) {
3785                                 if (!isSPACE_L1(value)) {
3786                                     ANYOF_BITMAP_SET(data->start_class, value);
3787                                 }
3788                             }
3789                         }
3790                         else {
3791                             for (value = 0; value < 256; value++) {
3792                                 if (!isSPACE(value)) {
3793                                     ANYOF_BITMAP_SET(data->start_class, value);
3794                                 }
3795                             }
3796                         }
3797                     }
3798                     break;
3799                 case NSPACEL:
3800                     if (flags & SCF_DO_STCLASS_AND) {
3801                         if (data->start_class->flags & ANYOF_LOCALE) {
3802                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3803                             for (value = 0; value < 256; value++)
3804                                 if (!isSPACE(value))
3805                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3806                         }
3807                     }
3808                     else {
3809                         data->start_class->flags |= ANYOF_LOCALE;
3810                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3811                     }
3812                     break;
3813                 case DIGIT:
3814                     if (flags & SCF_DO_STCLASS_AND) {
3815                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3816                         for (value = 0; value < 256; value++)
3817                             if (!isDIGIT(value))
3818                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3819                     }
3820                     else {
3821                         if (data->start_class->flags & ANYOF_LOCALE)
3822                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3823                         else {
3824                             for (value = 0; value < 256; value++)
3825                                 if (isDIGIT(value))
3826                                     ANYOF_BITMAP_SET(data->start_class, value);
3827                         }
3828                     }
3829                     break;
3830                 case NDIGIT:
3831                     if (flags & SCF_DO_STCLASS_AND) {
3832                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3833                         for (value = 0; value < 256; value++)
3834                             if (isDIGIT(value))
3835                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3836                     }
3837                     else {
3838                         if (data->start_class->flags & ANYOF_LOCALE)
3839                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3840                         else {
3841                             for (value = 0; value < 256; value++)
3842                                 if (!isDIGIT(value))
3843                                     ANYOF_BITMAP_SET(data->start_class, value);
3844                         }
3845                     }
3846                     break;
3847                 CASE_SYNST_FNC(VERTWS);
3848                 CASE_SYNST_FNC(HORIZWS);
3849
3850                 }
3851                 if (flags & SCF_DO_STCLASS_OR)
3852                     cl_and(data->start_class, and_withp);
3853                 flags &= ~SCF_DO_STCLASS;
3854             }
3855         }
3856         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3857             data->flags |= (OP(scan) == MEOL
3858                             ? SF_BEFORE_MEOL
3859                             : SF_BEFORE_SEOL);
3860         }
3861         else if (  PL_regkind[OP(scan)] == BRANCHJ
3862                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3863                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3864                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3865             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3866                 || OP(scan) == UNLESSM )
3867             {
3868                 /* Negative Lookahead/lookbehind
3869                    In this case we can't do fixed string optimisation.
3870                 */
3871
3872                 I32 deltanext, minnext, fake = 0;
3873                 regnode *nscan;
3874                 struct regnode_charclass_class intrnl;
3875                 int f = 0;
3876
3877                 data_fake.flags = 0;
3878                 if (data) {
3879                     data_fake.whilem_c = data->whilem_c;
3880                     data_fake.last_closep = data->last_closep;
3881                 }
3882                 else
3883                     data_fake.last_closep = &fake;
3884                 data_fake.pos_delta = delta;
3885                 if ( flags & SCF_DO_STCLASS && !scan->flags
3886                      && OP(scan) == IFMATCH ) { /* Lookahead */
3887                     cl_init(pRExC_state, &intrnl);
3888                     data_fake.start_class = &intrnl;
3889                     f |= SCF_DO_STCLASS_AND;
3890                 }
3891                 if (flags & SCF_WHILEM_VISITED_POS)
3892                     f |= SCF_WHILEM_VISITED_POS;
3893                 next = regnext(scan);
3894                 nscan = NEXTOPER(NEXTOPER(scan));
3895                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3896                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3897                 if (scan->flags) {
3898                     if (deltanext) {
3899                         FAIL("Variable length lookbehind not implemented");
3900                     }
3901                     else if (minnext > (I32)U8_MAX) {
3902                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3903                     }
3904                     scan->flags = (U8)minnext;
3905                 }
3906                 if (data) {
3907                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3908                         pars++;
3909                     if (data_fake.flags & SF_HAS_EVAL)
3910                         data->flags |= SF_HAS_EVAL;
3911                     data->whilem_c = data_fake.whilem_c;
3912                 }
3913                 if (f & SCF_DO_STCLASS_AND) {
3914                     if (flags & SCF_DO_STCLASS_OR) {
3915                         /* OR before, AND after: ideally we would recurse with
3916                          * data_fake to get the AND applied by study of the
3917                          * remainder of the pattern, and then derecurse;
3918                          * *** HACK *** for now just treat as "no information".
3919                          * See [perl #56690].
3920                          */
3921                         cl_init(pRExC_state, data->start_class);
3922                     }  else {
3923                         /* AND before and after: combine and continue */
3924                         const int was = (data->start_class->flags & ANYOF_EOS);
3925
3926                         cl_and(data->start_class, &intrnl);
3927                         if (was)
3928                             data->start_class->flags |= ANYOF_EOS;
3929                     }
3930                 }
3931             }
3932 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3933             else {
3934                 /* Positive Lookahead/lookbehind
3935                    In this case we can do fixed string optimisation,
3936                    but we must be careful about it. Note in the case of
3937                    lookbehind the positions will be offset by the minimum
3938                    length of the pattern, something we won't know about
3939                    until after the recurse.
3940                 */
3941                 I32 deltanext, fake = 0;
3942                 regnode *nscan;
3943                 struct regnode_charclass_class intrnl;
3944                 int f = 0;
3945                 /* We use SAVEFREEPV so that when the full compile
3946                     is finished perl will clean up the allocated
3947                     minlens when it's all done. This way we don't
3948                     have to worry about freeing them when we know
3949                     they wont be used, which would be a pain.
3950                  */
3951                 I32 *minnextp;
3952                 Newx( minnextp, 1, I32 );
3953                 SAVEFREEPV(minnextp);
3954
3955                 if (data) {
3956                     StructCopy(data, &data_fake, scan_data_t);
3957                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3958                         f |= SCF_DO_SUBSTR;
3959                         if (scan->flags)
3960                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3961                         data_fake.last_found=newSVsv(data->last_found);
3962                     }
3963                 }
3964                 else
3965                     data_fake.last_closep = &fake;
3966                 data_fake.flags = 0;
3967                 data_fake.pos_delta = delta;
3968                 if (is_inf)
3969                     data_fake.flags |= SF_IS_INF;
3970                 if ( flags & SCF_DO_STCLASS && !scan->flags
3971                      && OP(scan) == IFMATCH ) { /* Lookahead */
3972                     cl_init(pRExC_state, &intrnl);
3973                     data_fake.start_class = &intrnl;
3974                     f |= SCF_DO_STCLASS_AND;
3975                 }
3976                 if (flags & SCF_WHILEM_VISITED_POS)
3977                     f |= SCF_WHILEM_VISITED_POS;
3978                 next = regnext(scan);
3979                 nscan = NEXTOPER(NEXTOPER(scan));
3980
3981                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3982                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3983                 if (scan->flags) {
3984                     if (deltanext) {
3985                         FAIL("Variable length lookbehind not implemented");
3986                     }
3987                     else if (*minnextp > (I32)U8_MAX) {
3988                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3989                     }
3990                     scan->flags = (U8)*minnextp;
3991                 }
3992
3993                 *minnextp += min;
3994
3995                 if (f & SCF_DO_STCLASS_AND) {
3996                     const int was = (data->start_class->flags & ANYOF_EOS);
3997
3998                     cl_and(data->start_class, &intrnl);
3999                     if (was)
4000                         data->start_class->flags |= ANYOF_EOS;
4001                 }
4002                 if (data) {
4003                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4004                         pars++;
4005                     if (data_fake.flags & SF_HAS_EVAL)
4006                         data->flags |= SF_HAS_EVAL;
4007                     data->whilem_c = data_fake.whilem_c;
4008                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4009                         if (RExC_rx->minlen<*minnextp)
4010                             RExC_rx->minlen=*minnextp;
4011                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4012                         SvREFCNT_dec(data_fake.last_found);
4013
4014                         if ( data_fake.minlen_fixed != minlenp )
4015                         {
4016                             data->offset_fixed= data_fake.offset_fixed;
4017                             data->minlen_fixed= data_fake.minlen_fixed;
4018                             data->lookbehind_fixed+= scan->flags;
4019                         }
4020                         if ( data_fake.minlen_float != minlenp )
4021                         {
4022                             data->minlen_float= data_fake.minlen_float;
4023                             data->offset_float_min=data_fake.offset_float_min;
4024                             data->offset_float_max=data_fake.offset_float_max;
4025                             data->lookbehind_float+= scan->flags;
4026                         }
4027                     }
4028                 }
4029
4030
4031             }
4032 #endif
4033         }
4034         else if (OP(scan) == OPEN) {
4035             if (stopparen != (I32)ARG(scan))
4036                 pars++;
4037         }
4038         else if (OP(scan) == CLOSE) {
4039             if (stopparen == (I32)ARG(scan)) {
4040                 break;
4041             }
4042             if ((I32)ARG(scan) == is_par) {
4043                 next = regnext(scan);
4044
4045                 if ( next && (OP(next) != WHILEM) && next < last)
4046                     is_par = 0;         /* Disable optimization */
4047             }
4048             if (data)
4049                 *(data->last_closep) = ARG(scan);
4050         }
4051         else if (OP(scan) == EVAL) {
4052                 if (data)
4053                     data->flags |= SF_HAS_EVAL;
4054         }
4055         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4056             if (flags & SCF_DO_SUBSTR) {
4057                 SCAN_COMMIT(pRExC_state,data,minlenp);
4058                 flags &= ~SCF_DO_SUBSTR;
4059             }
4060             if (data && OP(scan)==ACCEPT) {
4061                 data->flags |= SCF_SEEN_ACCEPT;
4062                 if (stopmin > min)
4063                     stopmin = min;
4064             }
4065         }
4066         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4067         {
4068                 if (flags & SCF_DO_SUBSTR) {
4069                     SCAN_COMMIT(pRExC_state,data,minlenp);
4070                     data->longest = &(data->longest_float);
4071                 }
4072                 is_inf = is_inf_internal = 1;
4073                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4074                     cl_anything(pRExC_state, data->start_class);
4075                 flags &= ~SCF_DO_STCLASS;
4076         }
4077         else if (OP(scan) == GPOS) {
4078             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4079                 !(delta || is_inf || (data && data->pos_delta)))
4080             {
4081                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4082                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4083                 if (RExC_rx->gofs < (U32)min)
4084                     RExC_rx->gofs = min;
4085             } else {
4086                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4087                 RExC_rx->gofs = 0;
4088             }
4089         }
4090 #ifdef TRIE_STUDY_OPT
4091 #ifdef FULL_TRIE_STUDY
4092         else if (PL_regkind[OP(scan)] == TRIE) {
4093             /* NOTE - There is similar code to this block above for handling
4094                BRANCH nodes on the initial study.  If you change stuff here
4095                check there too. */
4096             regnode *trie_node= scan;
4097             regnode *tail= regnext(scan);
4098             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4099             I32 max1 = 0, min1 = I32_MAX;
4100             struct regnode_charclass_class accum;
4101
4102             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4103                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4104             if (flags & SCF_DO_STCLASS)
4105                 cl_init_zero(pRExC_state, &accum);
4106
4107             if (!trie->jump) {
4108                 min1= trie->minlen;
4109                 max1= trie->maxlen;
4110             } else {
4111                 const regnode *nextbranch= NULL;
4112                 U32 word;
4113
4114                 for ( word=1 ; word <= trie->wordcount ; word++)
4115                 {
4116                     I32 deltanext=0, minnext=0, f = 0, fake;
4117                     struct regnode_charclass_class this_class;
4118
4119                     data_fake.flags = 0;
4120                     if (data) {
4121                         data_fake.whilem_c = data->whilem_c;
4122                         data_fake.last_closep = data->last_closep;
4123                     }
4124                     else
4125                         data_fake.last_closep = &fake;
4126                     data_fake.pos_delta = delta;
4127                     if (flags & SCF_DO_STCLASS) {
4128                         cl_init(pRExC_state, &this_class);
4129                         data_fake.start_class = &this_class;
4130                         f = SCF_DO_STCLASS_AND;
4131                     }
4132                     if (flags & SCF_WHILEM_VISITED_POS)
4133                         f |= SCF_WHILEM_VISITED_POS;
4134
4135                     if (trie->jump[word]) {
4136                         if (!nextbranch)
4137                             nextbranch = trie_node + trie->jump[0];
4138                         scan= trie_node + trie->jump[word];
4139                         /* We go from the jump point to the branch that follows
4140                            it. Note this means we need the vestigal unused branches
4141                            even though they arent otherwise used.
4142                          */
4143                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4144                             &deltanext, (regnode *)nextbranch, &data_fake,
4145                             stopparen, recursed, NULL, f,depth+1);
4146                     }
4147                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4148                         nextbranch= regnext((regnode*)nextbranch);
4149
4150                     if (min1 > (I32)(minnext + trie->minlen))
4151                         min1 = minnext + trie->minlen;
4152                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4153                         max1 = minnext + deltanext + trie->maxlen;
4154                     if (deltanext == I32_MAX)
4155                         is_inf = is_inf_internal = 1;
4156
4157                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4158                         pars++;
4159                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4160                         if ( stopmin > min + min1)
4161                             stopmin = min + min1;
4162                         flags &= ~SCF_DO_SUBSTR;
4163                         if (data)
4164                             data->flags |= SCF_SEEN_ACCEPT;
4165                     }
4166                     if (data) {
4167                         if (data_fake.flags & SF_HAS_EVAL)
4168                             data->flags |= SF_HAS_EVAL;
4169                         data->whilem_c = data_fake.whilem_c;
4170                     }
4171                     if (flags & SCF_DO_STCLASS)
4172                         cl_or(pRExC_state, &accum, &this_class);
4173                 }
4174             }
4175             if (flags & SCF_DO_SUBSTR) {
4176                 data->pos_min += min1;
4177                 data->pos_delta += max1 - min1;
4178                 if (max1 != min1 || is_inf)
4179                     data->longest = &(data->longest_float);
4180             }
4181             min += min1;
4182             delta += max1 - min1;
4183             if (flags & SCF_DO_STCLASS_OR) {
4184                 cl_or(pRExC_state, data->start_class, &accum);
4185                 if (min1) {
4186                     cl_and(data->start_class, and_withp);
4187                     flags &= ~SCF_DO_STCLASS;
4188                 }
4189             }
4190             else if (flags & SCF_DO_STCLASS_AND) {
4191                 if (min1) {
4192                     cl_and(data->start_class, &accum);
4193                     flags &= ~SCF_DO_STCLASS;
4194                 }
4195                 else {
4196                     /* Switch to OR mode: cache the old value of
4197                      * data->start_class */
4198                     INIT_AND_WITHP;
4199                     StructCopy(data->start_class, and_withp,
4200                                struct regnode_charclass_class);
4201                     flags &= ~SCF_DO_STCLASS_AND;
4202                     StructCopy(&accum, data->start_class,
4203                                struct regnode_charclass_class);
4204                     flags |= SCF_DO_STCLASS_OR;
4205                     data->start_class->flags |= ANYOF_EOS;
4206                 }
4207             }
4208             scan= tail;
4209             continue;
4210         }
4211 #else
4212         else if (PL_regkind[OP(scan)] == TRIE) {
4213             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4214             U8*bang=NULL;
4215
4216             min += trie->minlen;
4217             delta += (trie->maxlen - trie->minlen);
4218             flags &= ~SCF_DO_STCLASS; /* xxx */
4219             if (flags & SCF_DO_SUBSTR) {
4220                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4221                 data->pos_min += trie->minlen;
4222                 data->pos_delta += (trie->maxlen - trie->minlen);
4223                 if (trie->maxlen != trie->minlen)
4224                     data->longest = &(data->longest_float);
4225             }
4226             if (trie->jump) /* no more substrings -- for now /grr*/
4227                 flags &= ~SCF_DO_SUBSTR;
4228         }
4229 #endif /* old or new */
4230 #endif /* TRIE_STUDY_OPT */
4231
4232         /* Else: zero-length, ignore. */
4233         scan = regnext(scan);
4234     }
4235     if (frame) {
4236         last = frame->last;
4237         scan = frame->next;
4238         stopparen = frame->stop;
4239         frame = frame->prev;
4240         goto fake_study_recurse;
4241     }
4242
4243   finish:
4244     assert(!frame);
4245     DEBUG_STUDYDATA("pre-fin:",data,depth);
4246
4247     *scanp = scan;
4248     *deltap = is_inf_internal ? I32_MAX : delta;
4249     if (flags & SCF_DO_SUBSTR && is_inf)
4250         data->pos_delta = I32_MAX - data->pos_min;
4251     if (is_par > (I32)U8_MAX)
4252         is_par = 0;
4253     if (is_par && pars==1 && data) {
4254         data->flags |= SF_IN_PAR;
4255         data->flags &= ~SF_HAS_PAR;
4256     }
4257     else if (pars && data) {
4258         data->flags |= SF_HAS_PAR;
4259         data->flags &= ~SF_IN_PAR;
4260     }
4261     if (flags & SCF_DO_STCLASS_OR)
4262         cl_and(data->start_class, and_withp);
4263     if (flags & SCF_TRIE_RESTUDY)
4264         data->flags |=  SCF_TRIE_RESTUDY;
4265
4266     DEBUG_STUDYDATA("post-fin:",data,depth);
4267
4268     return min < stopmin ? min : stopmin;
4269 }
4270
4271 STATIC U32
4272 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4273 {
4274     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4275
4276     PERL_ARGS_ASSERT_ADD_DATA;
4277
4278     Renewc(RExC_rxi->data,
4279            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4280            char, struct reg_data);
4281     if(count)
4282         Renew(RExC_rxi->data->what, count + n, U8);
4283     else
4284         Newx(RExC_rxi->data->what, n, U8);
4285     RExC_rxi->data->count = count + n;
4286     Copy(s, RExC_rxi->data->what + count, n, U8);
4287     return count;
4288 }
4289
4290 /*XXX: todo make this not included in a non debugging perl */
4291 #ifndef PERL_IN_XSUB_RE
4292 void
4293 Perl_reginitcolors(pTHX)
4294 {
4295     dVAR;
4296     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4297     if (s) {
4298         char *t = savepv(s);
4299         int i = 0;
4300         PL_colors[0] = t;
4301         while (++i < 6) {
4302             t = strchr(t, '\t');
4303             if (t) {
4304                 *t = '\0';
4305                 PL_colors[i] = ++t;
4306             }
4307             else
4308                 PL_colors[i] = t = (char *)"";
4309         }
4310     } else {
4311         int i = 0;
4312         while (i < 6)
4313             PL_colors[i++] = (char *)"";
4314     }
4315     PL_colorset = 1;
4316 }
4317 #endif
4318
4319
4320 #ifdef TRIE_STUDY_OPT
4321 #define CHECK_RESTUDY_GOTO                                  \
4322         if (                                                \
4323               (data.flags & SCF_TRIE_RESTUDY)               \
4324               && ! restudied++                              \
4325         )     goto reStudy
4326 #else
4327 #define CHECK_RESTUDY_GOTO
4328 #endif
4329
4330 /*
4331  - pregcomp - compile a regular expression into internal code
4332  *
4333  * We can't allocate space until we know how big the compiled form will be,
4334  * but we can't compile it (and thus know how big it is) until we've got a
4335  * place to put the code.  So we cheat:  we compile it twice, once with code
4336  * generation turned off and size counting turned on, and once "for real".
4337  * This also means that we don't allocate space until we are sure that the
4338  * thing really will compile successfully, and we never have to move the
4339  * code and thus invalidate pointers into it.  (Note that it has to be in
4340  * one piece because free() must be able to free it all.) [NB: not true in perl]
4341  *
4342  * Beware that the optimization-preparation code in here knows about some
4343  * of the structure of the compiled regexp.  [I'll say.]
4344  */
4345
4346
4347
4348 #ifndef PERL_IN_XSUB_RE
4349 #define RE_ENGINE_PTR &PL_core_reg_engine
4350 #else
4351 extern const struct regexp_engine my_reg_engine;
4352 #define RE_ENGINE_PTR &my_reg_engine
4353 #endif
4354
4355 #ifndef PERL_IN_XSUB_RE
4356 REGEXP *
4357 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4358 {
4359     dVAR;
4360     HV * const table = GvHV(PL_hintgv);
4361
4362     PERL_ARGS_ASSERT_PREGCOMP;
4363
4364     /* Dispatch a request to compile a regexp to correct
4365        regexp engine. */
4366     if (table) {
4367         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4368         GET_RE_DEBUG_FLAGS_DECL;
4369         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4370             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4371             DEBUG_COMPILE_r({
4372                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4373                     SvIV(*ptr));
4374             });
4375             return CALLREGCOMP_ENG(eng, pattern, flags);
4376         }
4377     }
4378     return Perl_re_compile(aTHX_ pattern, flags);
4379 }
4380 #endif
4381
4382 REGEXP *
4383 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4384 {
4385     dVAR;
4386     REGEXP *rx;
4387     struct regexp *r;
4388     register regexp_internal *ri;
4389     STRLEN plen;
4390     char  *exp;
4391     char* xend;
4392     regnode *scan;
4393     I32 flags;
4394     I32 minlen = 0;
4395     U32 pm_flags;
4396
4397     /* these are all flags - maybe they should be turned
4398      * into a single int with different bit masks */
4399     I32 sawlookahead = 0;
4400     I32 sawplus = 0;
4401     I32 sawopen = 0;
4402     bool used_setjump = FALSE;
4403
4404     U8 jump_ret = 0;
4405     dJMPENV;
4406     scan_data_t data;
4407     RExC_state_t RExC_state;
4408     RExC_state_t * const pRExC_state = &RExC_state;
4409 #ifdef TRIE_STUDY_OPT
4410     int restudied;
4411     RExC_state_t copyRExC_state;
4412 #endif
4413     GET_RE_DEBUG_FLAGS_DECL;
4414
4415     PERL_ARGS_ASSERT_RE_COMPILE;
4416
4417     DEBUG_r(if (!PL_colorset) reginitcolors());
4418
4419     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4420
4421     /****************** LONG JUMP TARGET HERE***********************/
4422     /* Longjmp back to here if have to switch in midstream to utf8 */
4423     if (! RExC_orig_utf8) {
4424         JMPENV_PUSH(jump_ret);
4425         used_setjump = TRUE;
4426     }
4427
4428     if (jump_ret == 0) {    /* First time through */
4429         exp = SvPV(pattern, plen);
4430         xend = exp + plen;
4431         /* ignore the utf8ness if the pattern is 0 length */
4432         if (plen == 0) {
4433             RExC_utf8 = RExC_orig_utf8 = 0;
4434         }
4435
4436         DEBUG_COMPILE_r({
4437             SV *dsv= sv_newmortal();
4438             RE_PV_QUOTED_DECL(s, RExC_utf8,
4439                 dsv, exp, plen, 60);
4440             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4441                            PL_colors[4],PL_colors[5],s);
4442         });
4443     }
4444     else {  /* longjumped back */
4445         STRLEN len = plen;
4446
4447         /* If the cause for the longjmp was other than changing to utf8, pop
4448          * our own setjmp, and longjmp to the correct handler */
4449         if (jump_ret != UTF8_LONGJMP) {
4450             JMPENV_POP;
4451             JMPENV_JUMP(jump_ret);
4452         }
4453
4454         GET_RE_DEBUG_FLAGS;
4455
4456         /* It's possible to write a regexp in ascii that represents Unicode
4457         codepoints outside of the byte range, such as via \x{100}. If we
4458         detect such a sequence we have to convert the entire pattern to utf8
4459         and then recompile, as our sizing calculation will have been based
4460         on 1 byte == 1 character, but we will need to use utf8 to encode
4461         at least some part of the pattern, and therefore must convert the whole
4462         thing.
4463         -- dmq */
4464         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4465             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4466         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4467         xend = exp + len;
4468         RExC_orig_utf8 = RExC_utf8 = 1;
4469         SAVEFREEPV(exp);
4470     }
4471
4472 #ifdef TRIE_STUDY_OPT
4473     restudied = 0;
4474 #endif
4475
4476     /* Set to use unicode semantics if the pattern is in utf8 and has the
4477      * 'dual' charset specified, as it means unicode when utf8  */
4478     pm_flags = orig_pm_flags;
4479     if (RExC_utf8  && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
4480         pm_flags |= RXf_PMf_UNICODE;
4481     }
4482
4483     RExC_precomp = exp;
4484     RExC_flags = pm_flags;
4485     RExC_sawback = 0;
4486
4487     RExC_seen = 0;
4488     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4489     RExC_seen_evals = 0;
4490     RExC_extralen = 0;
4491
4492     /* First pass: determine size, legality. */
4493     RExC_parse = exp;
4494     RExC_start = exp;
4495     RExC_end = xend;
4496     RExC_naughty = 0;
4497     RExC_npar = 1;
4498     RExC_nestroot = 0;
4499     RExC_size = 0L;
4500     RExC_emit = &PL_regdummy;
4501     RExC_whilem_seen = 0;
4502     RExC_open_parens = NULL;
4503     RExC_close_parens = NULL;
4504     RExC_opend = NULL;
4505     RExC_paren_names = NULL;
4506 #ifdef DEBUGGING
4507     RExC_paren_name_list = NULL;
4508 #endif
4509     RExC_recurse = NULL;
4510     RExC_recurse_count = 0;
4511
4512 #if 0 /* REGC() is (currently) a NOP at the first pass.
4513        * Clever compilers notice this and complain. --jhi */
4514     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4515 #endif
4516     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4517     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4518         RExC_precomp = NULL;
4519         return(NULL);
4520     }
4521
4522     /* Here, finished first pass.  Get rid of any added setjmp */
4523     if (used_setjump) {
4524         JMPENV_POP;
4525     }
4526     DEBUG_PARSE_r({
4527         PerlIO_printf(Perl_debug_log,
4528             "Required size %"IVdf" nodes\n"
4529             "Starting second pass (creation)\n",
4530             (IV)RExC_size);
4531         RExC_lastnum=0;
4532         RExC_lastparse=NULL;
4533     });
4534     /* Small enough for pointer-storage convention?
4535        If extralen==0, this means that we will not need long jumps. */
4536     if (RExC_size >= 0x10000L && RExC_extralen)
4537         RExC_size += RExC_extralen;
4538     else
4539         RExC_extralen = 0;
4540     if (RExC_whilem_seen > 15)
4541         RExC_whilem_seen = 15;
4542
4543     /* Allocate space and zero-initialize. Note, the two step process
4544        of zeroing when in debug mode, thus anything assigned has to
4545        happen after that */
4546     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4547     r = (struct regexp*)SvANY(rx);
4548     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4549          char, regexp_internal);
4550     if ( r == NULL || ri == NULL )
4551         FAIL("Regexp out of space");
4552 #ifdef DEBUGGING
4553     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4554     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4555 #else
4556     /* bulk initialize base fields with 0. */
4557     Zero(ri, sizeof(regexp_internal), char);
4558 #endif
4559
4560     /* non-zero initialization begins here */
4561     RXi_SET( r, ri );
4562     r->engine= RE_ENGINE_PTR;
4563     r->extflags = pm_flags;
4564     {
4565         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4566         bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
4567
4568         /* The caret is output if there are any defaults: if not all the STD
4569          * flags are set, or if no character set specifier is needed */
4570         bool has_default =
4571                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4572                     || ! has_charset);
4573         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4574         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4575                             >> RXf_PMf_STD_PMMOD_SHIFT);
4576         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4577         char *p;
4578         /* Allocate for the worst case, which is all the std flags are turned
4579          * on.  If more precision is desired, we could do a population count of
4580          * the flags set.  This could be done with a small lookup table, or by
4581          * shifting, masking and adding, or even, when available, assembly
4582          * language for a machine-language population count.
4583          * We never output a minus, as all those are defaults, so are
4584          * covered by the caret */
4585         const STRLEN wraplen = plen + has_p + has_runon
4586             + has_default       /* If needs a caret */
4587             + has_charset       /* If needs a character set specifier */
4588             + (sizeof(STD_PAT_MODS) - 1)
4589             + (sizeof("(?:)") - 1);
4590
4591         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4592         SvPOK_on(rx);
4593         SvFLAGS(rx) |= SvUTF8(pattern);
4594         *p++='('; *p++='?';
4595
4596         /* If a default, cover it using the caret */
4597         if (has_default) {
4598             *p++= DEFAULT_PAT_MOD;
4599         }
4600         if (has_charset) {
4601             if (r->extflags & RXf_PMf_LOCALE) {
4602                 *p++ = LOCALE_PAT_MOD;
4603             } else {
4604                 *p++ = UNICODE_PAT_MOD;
4605             }
4606         }
4607         if (has_p)
4608             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4609         {
4610             char ch;
4611             while((ch = *fptr++)) {
4612                 if(reganch & 1)
4613                     *p++ = ch;
4614                 reganch >>= 1;
4615             }
4616         }
4617
4618         *p++ = ':';
4619         Copy(RExC_precomp, p, plen, char);
4620         assert ((RX_WRAPPED(rx) - p) < 16);
4621         r->pre_prefix = p - RX_WRAPPED(rx);
4622         p += plen;
4623         if (has_runon)
4624             *p++ = '\n';
4625         *p++ = ')';
4626         *p = 0;
4627         SvCUR_set(rx, p - SvPVX_const(rx));
4628     }
4629
4630     r->intflags = 0;
4631     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4632
4633     if (RExC_seen & REG_SEEN_RECURSE) {
4634         Newxz(RExC_open_parens, RExC_npar,regnode *);
4635         SAVEFREEPV(RExC_open_parens);
4636         Newxz(RExC_close_parens,RExC_npar,regnode *);
4637         SAVEFREEPV(RExC_close_parens);
4638     }
4639
4640     /* Useful during FAIL. */
4641 #ifdef RE_TRACK_PATTERN_OFFSETS
4642     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4643     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4644                           "%s %"UVuf" bytes for offset annotations.\n",
4645                           ri->u.offsets ? "Got" : "Couldn't get",
4646                           (UV)((2*RExC_size+1) * sizeof(U32))));
4647 #endif
4648     SetProgLen(ri,RExC_size);
4649     RExC_rx_sv = rx;
4650     RExC_rx = r;
4651     RExC_rxi = ri;
4652
4653     /* Second pass: emit code. */
4654     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4655     RExC_parse = exp;
4656     RExC_end = xend;
4657     RExC_naughty = 0;
4658     RExC_npar = 1;
4659     RExC_emit_start = ri->program;
4660     RExC_emit = ri->program;
4661     RExC_emit_bound = ri->program + RExC_size + 1;
4662
4663     /* Store the count of eval-groups for security checks: */
4664     RExC_rx->seen_evals = RExC_seen_evals;
4665     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4666     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4667         ReREFCNT_dec(rx);
4668         return(NULL);
4669     }
4670     /* XXXX To minimize changes to RE engine we always allocate
4671        3-units-long substrs field. */
4672     Newx(r->substrs, 1, struct reg_substr_data);
4673     if (RExC_recurse_count) {
4674         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4675         SAVEFREEPV(RExC_recurse);
4676     }
4677
4678 reStudy:
4679     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4680     Zero(r->substrs, 1, struct reg_substr_data);
4681
4682 #ifdef TRIE_STUDY_OPT
4683     if (!restudied) {
4684         StructCopy(&zero_scan_data, &data, scan_data_t);
4685         copyRExC_state = RExC_state;
4686     } else {
4687         U32 seen=RExC_seen;
4688         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4689
4690         RExC_state = copyRExC_state;
4691         if (seen & REG_TOP_LEVEL_BRANCHES)
4692             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4693         else
4694             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4695         if (data.last_found) {
4696             SvREFCNT_dec(data.longest_fixed);
4697             SvREFCNT_dec(data.longest_float);
4698             SvREFCNT_dec(data.last_found);
4699         }
4700         StructCopy(&zero_scan_data, &data, scan_data_t);
4701     }
4702 #else
4703     StructCopy(&zero_scan_data, &data, scan_data_t);
4704 #endif
4705
4706     /* Dig out information for optimizations. */
4707     r->extflags = RExC_flags; /* was pm_op */
4708     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4709
4710     if (UTF)
4711         SvUTF8_on(rx);  /* Unicode in it? */
4712     ri->regstclass = NULL;
4713     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4714         r->intflags |= PREGf_NAUGHTY;
4715     scan = ri->program + 1;             /* First BRANCH. */
4716
4717     /* testing for BRANCH here tells us whether there is "must appear"
4718        data in the pattern. If there is then we can use it for optimisations */
4719     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4720         I32 fake;
4721         STRLEN longest_float_length, longest_fixed_length;
4722         struct regnode_charclass_class ch_class; /* pointed to by data */
4723         int stclass_flag;
4724         I32 last_close = 0; /* pointed to by data */
4725         regnode *first= scan;
4726         regnode *first_next= regnext(first);
4727         /*
4728          * Skip introductions and multiplicators >= 1
4729          * so that we can extract the 'meat' of the pattern that must
4730          * match in the large if() sequence following.
4731          * NOTE that EXACT is NOT covered here, as it is normally
4732          * picked up by the optimiser separately.
4733          *
4734          * This is unfortunate as the optimiser isnt handling lookahead
4735          * properly currently.
4736          *
4737          */
4738         while ((OP(first) == OPEN && (sawopen = 1)) ||
4739                /* An OR of *one* alternative - should not happen now. */
4740             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4741             /* for now we can't handle lookbehind IFMATCH*/
4742             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4743             (OP(first) == PLUS) ||
4744             (OP(first) == MINMOD) ||
4745                /* An {n,m} with n>0 */
4746             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4747             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4748         {
4749                 /*
4750                  * the only op that could be a regnode is PLUS, all the rest
4751                  * will be regnode_1 or regnode_2.
4752                  *
4753                  */
4754                 if (OP(first) == PLUS)
4755                     sawplus = 1;
4756                 else
4757                     first += regarglen[OP(first)];
4758
4759                 first = NEXTOPER(first);
4760                 first_next= regnext(first);
4761         }
4762
4763         /* Starting-point info. */
4764       again:
4765         DEBUG_PEEP("first:",first,0);
4766         /* Ignore EXACT as we deal with it later. */
4767         if (PL_regkind[OP(first)] == EXACT) {
4768             if (OP(first) == EXACT)
4769                 NOOP;   /* Empty, get anchored substr later. */
4770             else
4771                 ri->regstclass = first;
4772         }
4773 #ifdef TRIE_STCLASS
4774         else if (PL_regkind[OP(first)] == TRIE &&
4775                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4776         {
4777             regnode *trie_op;
4778             /* this can happen only on restudy */
4779             if ( OP(first) == TRIE ) {
4780                 struct regnode_1 *trieop = (struct regnode_1 *)
4781                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4782                 StructCopy(first,trieop,struct regnode_1);
4783                 trie_op=(regnode *)trieop;
4784             } else {
4785                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4786                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4787                 StructCopy(first,trieop,struct regnode_charclass);
4788                 trie_op=(regnode *)trieop;
4789             }
4790             OP(trie_op)+=2;
4791             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4792             ri->regstclass = trie_op;
4793         }
4794 #endif
4795         else if (REGNODE_SIMPLE(OP(first)))
4796             ri->regstclass = first;
4797         else if (PL_regkind[OP(first)] == BOUND ||
4798                  PL_regkind[OP(first)] == NBOUND)
4799             ri->regstclass = first;
4800         else if (PL_regkind[OP(first)] == BOL) {
4801             r->extflags |= (OP(first) == MBOL
4802                            ? RXf_ANCH_MBOL
4803                            : (OP(first) == SBOL
4804                               ? RXf_ANCH_SBOL
4805                               : RXf_ANCH_BOL));
4806             first = NEXTOPER(first);
4807             goto again;
4808         }
4809         else if (OP(first) == GPOS) {
4810             r->extflags |= RXf_ANCH_GPOS;
4811             first = NEXTOPER(first);
4812             goto again;
4813         }
4814         else if ((!sawopen || !RExC_sawback) &&
4815             (OP(first) == STAR &&
4816             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4817             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4818         {
4819             /* turn .* into ^.* with an implied $*=1 */
4820             const int type =
4821                 (OP(NEXTOPER(first)) == REG_ANY)
4822                     ? RXf_ANCH_MBOL
4823                     : RXf_ANCH_SBOL;
4824             r->extflags |= type;
4825             r->intflags |= PREGf_IMPLICIT;
4826             first = NEXTOPER(first);
4827             goto again;
4828         }
4829         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4830             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4831             /* x+ must match at the 1st pos of run of x's */
4832             r->intflags |= PREGf_SKIP;
4833
4834         /* Scan is after the zeroth branch, first is atomic matcher. */
4835 #ifdef TRIE_STUDY_OPT
4836         DEBUG_PARSE_r(
4837             if (!restudied)
4838                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4839                               (IV)(first - scan + 1))
4840         );
4841 #else
4842         DEBUG_PARSE_r(
4843             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4844                 (IV)(first - scan + 1))
4845         );
4846 #endif
4847
4848
4849         /*
4850         * If there's something expensive in the r.e., find the
4851         * longest literal string that must appear and make it the
4852         * regmust.  Resolve ties in favor of later strings, since
4853         * the regstart check works with the beginning of the r.e.
4854         * and avoiding duplication strengthens checking.  Not a
4855         * strong reason, but sufficient in the absence of others.
4856         * [Now we resolve ties in favor of the earlier string if
4857         * it happens that c_offset_min has been invalidated, since the
4858         * earlier string may buy us something the later one won't.]
4859         */
4860
4861         data.longest_fixed = newSVpvs("");
4862         data.longest_float = newSVpvs("");
4863         data.last_found = newSVpvs("");
4864         data.longest = &(data.longest_fixed);
4865         first = scan;
4866         if (!ri->regstclass) {
4867             cl_init(pRExC_state, &ch_class);
4868             data.start_class = &ch_class;
4869             stclass_flag = SCF_DO_STCLASS_AND;
4870         } else                          /* XXXX Check for BOUND? */
4871             stclass_flag = 0;
4872         data.last_closep = &last_close;
4873
4874         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4875             &data, -1, NULL, NULL,
4876             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4877
4878
4879         CHECK_RESTUDY_GOTO;
4880
4881
4882         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4883              && data.last_start_min == 0 && data.last_end > 0
4884              && !RExC_seen_zerolen
4885              && !(RExC_seen & REG_SEEN_VERBARG)
4886              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4887             r->extflags |= RXf_CHECK_ALL;
4888         scan_commit(pRExC_state, &data,&minlen,0);
4889         SvREFCNT_dec(data.last_found);
4890
4891         /* Note that code very similar to this but for anchored string
4892            follows immediately below, changes may need to be made to both.
4893            Be careful.
4894          */
4895         longest_float_length = CHR_SVLEN(data.longest_float);
4896         if (longest_float_length
4897             || (data.flags & SF_FL_BEFORE_EOL
4898                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4899                     || (RExC_flags & RXf_PMf_MULTILINE))))
4900         {
4901             I32 t,ml;
4902
4903             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4904                 && data.offset_fixed == data.offset_float_min
4905                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4906                     goto remove_float;          /* As in (a)+. */
4907
4908             /* copy the information about the longest float from the reg_scan_data
4909                over to the program. */
4910             if (SvUTF8(data.longest_float)) {
4911                 r->float_utf8 = data.longest_float;
4912                 r->float_substr = NULL;
4913             } else {
4914                 r->float_substr = data.longest_float;
4915                 r->float_utf8 = NULL;
4916             }
4917             /* float_end_shift is how many chars that must be matched that
4918                follow this item. We calculate it ahead of time as once the
4919                lookbehind offset is added in we lose the ability to correctly
4920                calculate it.*/
4921             ml = data.minlen_float ? *(data.minlen_float)
4922                                    : (I32)longest_float_length;
4923             r->float_end_shift = ml - data.offset_float_min
4924                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4925                 + data.lookbehind_float;
4926             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4927             r->float_max_offset = data.offset_float_max;
4928             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4929                 r->float_max_offset -= data.lookbehind_float;
4930
4931             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4932                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4933                            || (RExC_flags & RXf_PMf_MULTILINE)));
4934             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4935         }
4936         else {
4937           remove_float:
4938             r->float_substr = r->float_utf8 = NULL;
4939             SvREFCNT_dec(data.longest_float);
4940             longest_float_length = 0;
4941         }
4942
4943         /* Note that code very similar to this but for floating string
4944            is immediately above, changes may need to be made to both.
4945            Be careful.
4946          */
4947         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4948         if (longest_fixed_length
4949             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4950                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4951                     || (RExC_flags & RXf_PMf_MULTILINE))))
4952         {
4953             I32 t,ml;
4954
4955             /* copy the information about the longest fixed
4956                from the reg_scan_data over to the program. */
4957             if (SvUTF8(data.longest_fixed)) {
4958                 r->anchored_utf8 = data.longest_fixed;
4959                 r->anchored_substr = NULL;
4960             } else {
4961                 r->anchored_substr = data.longest_fixed;
4962                 r->anchored_utf8 = NULL;
4963             }
4964             /* fixed_end_shift is how many chars that must be matched that
4965                follow this item. We calculate it ahead of time as once the
4966                lookbehind offset is added in we lose the ability to correctly
4967                calculate it.*/
4968             ml = data.minlen_fixed ? *(data.minlen_fixed)
4969                                    : (I32)longest_fixed_length;
4970             r->anchored_end_shift = ml - data.offset_fixed
4971                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4972                 + data.lookbehind_fixed;
4973             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4974
4975             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4976                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4977                      || (RExC_flags & RXf_PMf_MULTILINE)));
4978             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4979         }
4980         else {
4981             r->anchored_substr = r->anchored_utf8 = NULL;
4982             SvREFCNT_dec(data.longest_fixed);
4983             longest_fixed_length = 0;
4984         }
4985         if (ri->regstclass
4986             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4987             ri->regstclass = NULL;
4988         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4989             && stclass_flag
4990             && !(data.start_class->flags & ANYOF_EOS)
4991             && !cl_is_anything(data.start_class))
4992         {
4993             const U32 n = add_data(pRExC_state, 1, "f");
4994
4995             Newx(RExC_rxi->data->data[n], 1,
4996                 struct regnode_charclass_class);
4997             StructCopy(data.start_class,
4998                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4999                        struct regnode_charclass_class);
5000             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5001             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5002             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5003                       regprop(r, sv, (regnode*)data.start_class);
5004                       PerlIO_printf(Perl_debug_log,
5005                                     "synthetic stclass \"%s\".\n",
5006                                     SvPVX_const(sv));});
5007         }
5008
5009         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5010         if (longest_fixed_length > longest_float_length) {
5011             r->check_end_shift = r->anchored_end_shift;
5012             r->check_substr = r->anchored_substr;
5013             r->check_utf8 = r->anchored_utf8;
5014             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5015             if (r->extflags & RXf_ANCH_SINGLE)
5016                 r->extflags |= RXf_NOSCAN;
5017         }
5018         else {
5019             r->check_end_shift = r->float_end_shift;
5020             r->check_substr = r->float_substr;
5021             r->check_utf8 = r->float_utf8;
5022             r->check_offset_min = r->float_min_offset;
5023             r->check_offset_max = r->float_max_offset;
5024         }
5025         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5026            This should be changed ASAP!  */
5027         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5028             r->extflags |= RXf_USE_INTUIT;
5029             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5030                 r->extflags |= RXf_INTUIT_TAIL;
5031         }
5032         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5033         if ( (STRLEN)minlen < longest_float_length )
5034             minlen= longest_float_length;
5035         if ( (STRLEN)minlen < longest_fixed_length )
5036             minlen= longest_fixed_length;
5037         */
5038     }
5039     else {
5040         /* Several toplevels. Best we can is to set minlen. */
5041         I32 fake;
5042         struct regnode_charclass_class ch_class;
5043         I32 last_close = 0;
5044
5045         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5046
5047         scan = ri->program + 1;
5048         cl_init(pRExC_state, &ch_class);
5049         data.start_class = &ch_class;
5050         data.last_closep = &last_close;
5051
5052
5053         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5054             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5055
5056         CHECK_RESTUDY_GOTO;
5057
5058         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5059                 = r->float_substr = r->float_utf8 = NULL;
5060         if (!(data.start_class->flags & ANYOF_EOS)
5061             && !cl_is_anything(data.start_class))
5062         {
5063             const U32 n = add_data(pRExC_state, 1, "f");
5064
5065             Newx(RExC_rxi->data->data[n], 1,
5066                 struct regnode_charclass_class);
5067             StructCopy(data.start_class,
5068                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5069                        struct regnode_charclass_class);
5070             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5071             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5072             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5073                       regprop(r, sv, (regnode*)data.start_class);
5074                       PerlIO_printf(Perl_debug_log,
5075                                     "synthetic stclass \"%s\".\n",
5076                                     SvPVX_const(sv));});
5077         }
5078     }
5079
5080     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5081        the "real" pattern. */
5082     DEBUG_OPTIMISE_r({
5083         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5084                       (IV)minlen, (IV)r->minlen);
5085     });
5086     r->minlenret = minlen;
5087     if (r->minlen < minlen)
5088         r->minlen = minlen;
5089
5090     if (RExC_seen & REG_SEEN_GPOS)
5091         r->extflags |= RXf_GPOS_SEEN;
5092     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5093         r->extflags |= RXf_LOOKBEHIND_SEEN;
5094     if (RExC_seen & REG_SEEN_EVAL)
5095         r->extflags |= RXf_EVAL_SEEN;
5096     if (RExC_seen & REG_SEEN_CANY)
5097         r->extflags |= RXf_CANY_SEEN;
5098     if (RExC_seen & REG_SEEN_VERBARG)
5099         r->intflags |= PREGf_VERBARG_SEEN;
5100     if (RExC_seen & REG_SEEN_CUTGROUP)
5101         r->intflags |= PREGf_CUTGROUP_SEEN;
5102     if (RExC_paren_names)
5103         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5104     else
5105         RXp_PAREN_NAMES(r) = NULL;
5106
5107 #ifdef STUPID_PATTERN_CHECKS
5108     if (RX_PRELEN(rx) == 0)
5109         r->extflags |= RXf_NULL;
5110     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5111         /* XXX: this should happen BEFORE we compile */
5112         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5113     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5114         r->extflags |= RXf_WHITE;
5115     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5116         r->extflags |= RXf_START_ONLY;
5117 #else
5118     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5119             /* XXX: this should happen BEFORE we compile */
5120             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5121     else {
5122         regnode *first = ri->program + 1;
5123         U8 fop = OP(first);
5124         U8 nop = OP(NEXTOPER(first));
5125
5126         if (PL_regkind[fop] == NOTHING && nop == END)
5127             r->extflags |= RXf_NULL;
5128         else if (PL_regkind[fop] == BOL && nop == END)
5129             r->extflags |= RXf_START_ONLY;
5130         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5131             r->extflags |= RXf_WHITE;
5132     }
5133 #endif
5134 #ifdef DEBUGGING
5135     if (RExC_paren_names) {
5136         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5137         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5138     } else
5139 #endif
5140         ri->name_list_idx = 0;
5141
5142     if (RExC_recurse_count) {
5143         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5144             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5145             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5146         }
5147     }
5148     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5149     /* assume we don't need to swap parens around before we match */
5150
5151     DEBUG_DUMP_r({
5152         PerlIO_printf(Perl_debug_log,"Final program:\n");
5153         regdump(r);
5154     });
5155 #ifdef RE_TRACK_PATTERN_OFFSETS
5156     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5157         const U32 len = ri->u.offsets[0];
5158         U32 i;
5159         GET_RE_DEBUG_FLAGS_DECL;
5160         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5161         for (i = 1; i <= len; i++) {
5162             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5163                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5164                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5165             }
5166         PerlIO_printf(Perl_debug_log, "\n");
5167     });
5168 #endif
5169     return rx;
5170 }
5171
5172 #undef RE_ENGINE_PTR
5173
5174
5175 SV*
5176 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5177                     const U32 flags)
5178 {
5179     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5180
5181     PERL_UNUSED_ARG(value);
5182
5183     if (flags & RXapif_FETCH) {
5184         return reg_named_buff_fetch(rx, key, flags);
5185     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5186         Perl_croak_no_modify(aTHX);
5187         return NULL;
5188     } else if (flags & RXapif_EXISTS) {
5189         return reg_named_buff_exists(rx, key, flags)
5190             ? &PL_sv_yes
5191             : &PL_sv_no;
5192     } else if (flags & RXapif_REGNAMES) {
5193         return reg_named_buff_all(rx, flags);
5194     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5195         return reg_named_buff_scalar(rx, flags);
5196     } else {
5197         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5198         return NULL;
5199     }
5200 }
5201
5202 SV*
5203 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5204                          const U32 flags)
5205 {
5206     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5207     PERL_UNUSED_ARG(lastkey);
5208
5209     if (flags & RXapif_FIRSTKEY)
5210         return reg_named_buff_firstkey(rx, flags);
5211     else if (flags & RXapif_NEXTKEY)
5212         return reg_named_buff_nextkey(rx, flags);
5213     else {
5214         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5215         return NULL;
5216     }
5217 }
5218
5219 SV*
5220 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5221                           const U32 flags)
5222 {
5223     AV *retarray = NULL;
5224     SV *ret;
5225     struct regexp *const rx = (struct regexp *)SvANY(r);
5226
5227     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5228
5229     if (flags & RXapif_ALL)
5230         retarray=newAV();
5231
5232     if (rx && RXp_PAREN_NAMES(rx)) {
5233         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5234         if (he_str) {
5235             IV i;
5236             SV* sv_dat=HeVAL(he_str);
5237             I32 *nums=(I32*)SvPVX(sv_dat);
5238             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5239                 if ((I32)(rx->nparens) >= nums[i]
5240                     && rx->offs[nums[i]].start != -1
5241                     && rx->offs[nums[i]].end != -1)
5242                 {
5243                     ret = newSVpvs("");
5244                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5245                     if (!retarray)
5246                         return ret;
5247                 } else {
5248                     ret = newSVsv(&PL_sv_undef);
5249                 }
5250                 if (retarray)
5251                     av_push(retarray, ret);
5252             }
5253             if (retarray)
5254                 return newRV_noinc(MUTABLE_SV(retarray));
5255         }
5256     }
5257     return NULL;
5258 }
5259
5260 bool
5261 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5262                            const U32 flags)
5263 {
5264     struct regexp *const rx = (struct regexp *)SvANY(r);
5265
5266     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5267
5268     if (rx && RXp_PAREN_NAMES(rx)) {
5269         if (flags & RXapif_ALL) {
5270             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5271         } else {
5272             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5273             if (sv) {
5274                 SvREFCNT_dec(sv);
5275                 return TRUE;
5276             } else {
5277                 return FALSE;
5278             }
5279         }
5280     } else {
5281         return FALSE;
5282     }
5283 }
5284
5285 SV*
5286 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5287 {
5288     struct regexp *const rx = (struct regexp *)SvANY(r);
5289
5290     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5291
5292     if ( rx && RXp_PAREN_NAMES(rx) ) {
5293         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5294
5295         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5296     } else {
5297         return FALSE;
5298     }
5299 }
5300
5301 SV*
5302 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5303 {
5304     struct regexp *const rx = (struct regexp *)SvANY(r);
5305     GET_RE_DEBUG_FLAGS_DECL;
5306
5307     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5308
5309     if (rx && RXp_PAREN_NAMES(rx)) {
5310         HV *hv = RXp_PAREN_NAMES(rx);
5311         HE *temphe;
5312         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5313             IV i;
5314             IV parno = 0;
5315             SV* sv_dat = HeVAL(temphe);
5316             I32 *nums = (I32*)SvPVX(sv_dat);
5317             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5318                 if ((I32)(rx->lastparen) >= nums[i] &&
5319                     rx->offs[nums[i]].start != -1 &&
5320                     rx->offs[nums[i]].end != -1)
5321                 {
5322                     parno = nums[i];
5323                     break;
5324                 }
5325             }
5326             if (parno || flags & RXapif_ALL) {
5327                 return newSVhek(HeKEY_hek(temphe));
5328             }
5329         }
5330     }
5331     return NULL;
5332 }
5333
5334 SV*
5335 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5336 {
5337     SV *ret;
5338     AV *av;
5339     I32 length;
5340     struct regexp *const rx = (struct regexp *)SvANY(r);
5341
5342     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5343
5344     if (rx && RXp_PAREN_NAMES(rx)) {
5345         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5346             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5347         } else if (flags & RXapif_ONE) {
5348             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5349             av = MUTABLE_AV(SvRV(ret));
5350             length = av_len(av);
5351             SvREFCNT_dec(ret);
5352             return newSViv(length + 1);
5353         } else {
5354             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5355             return NULL;
5356         }
5357     }
5358     return &PL_sv_undef;
5359 }
5360
5361 SV*
5362 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5363 {
5364     struct regexp *const rx = (struct regexp *)SvANY(r);
5365     AV *av = newAV();
5366
5367     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5368
5369     if (rx && RXp_PAREN_NAMES(rx)) {
5370         HV *hv= RXp_PAREN_NAMES(rx);
5371         HE *temphe;
5372         (void)hv_iterinit(hv);
5373         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5374             IV i;
5375             IV parno = 0;
5376             SV* sv_dat = HeVAL(temphe);
5377             I32 *nums = (I32*)SvPVX(sv_dat);
5378             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5379                 if ((I32)(rx->lastparen) >= nums[i] &&
5380                     rx->offs[nums[i]].start != -1 &&
5381                     rx->offs[nums[i]].end != -1)
5382                 {
5383                     parno = nums[i];
5384                     break;
5385                 }
5386             }
5387             if (parno || flags & RXapif_ALL) {
5388                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5389             }
5390         }
5391     }
5392
5393     return newRV_noinc(MUTABLE_SV(av));
5394 }
5395
5396 void
5397 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5398                              SV * const sv)
5399 {
5400     struct regexp *const rx = (struct regexp *)SvANY(r);
5401     char *s = NULL;
5402     I32 i = 0;
5403     I32 s1, t1;
5404
5405     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5406
5407     if (!rx->subbeg) {
5408         sv_setsv(sv,&PL_sv_undef);
5409         return;
5410     }
5411     else
5412     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5413         /* $` */
5414         i = rx->offs[0].start;
5415         s = rx->subbeg;
5416     }
5417     else
5418     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5419         /* $' */
5420         s = rx->subbeg + rx->offs[0].end;
5421         i = rx->sublen - rx->offs[0].end;
5422     }
5423     else
5424     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5425         (s1 = rx->offs[paren].start) != -1 &&
5426         (t1 = rx->offs[paren].end) != -1)
5427     {
5428         /* $& $1 ... */
5429         i = t1 - s1;
5430         s = rx->subbeg + s1;
5431     } else {
5432         sv_setsv(sv,&PL_sv_undef);
5433         return;
5434     }
5435     assert(rx->sublen >= (s - rx->subbeg) + i );
5436     if (i >= 0) {
5437         const int oldtainted = PL_tainted;
5438         TAINT_NOT;
5439         sv_setpvn(sv, s, i);
5440         PL_tainted = oldtainted;
5441         if ( (rx->extflags & RXf_CANY_SEEN)
5442             ? (RXp_MATCH_UTF8(rx)
5443                         && (!i || is_utf8_string((U8*)s, i)))
5444             : (RXp_MATCH_UTF8(rx)) )
5445         {
5446             SvUTF8_on(sv);
5447         }
5448         else
5449             SvUTF8_off(sv);
5450         if (PL_tainting) {
5451             if (RXp_MATCH_TAINTED(rx)) {
5452                 if (SvTYPE(sv) >= SVt_PVMG) {
5453                     MAGIC* const mg = SvMAGIC(sv);
5454                     MAGIC* mgt;
5455                     PL_tainted = 1;
5456                     SvMAGIC_set(sv, mg->mg_moremagic);
5457                     SvTAINT(sv);
5458                     if ((mgt = SvMAGIC(sv))) {
5459                         mg->mg_moremagic = mgt;
5460                         SvMAGIC_set(sv, mg);
5461                     }
5462                 } else {
5463                     PL_tainted = 1;
5464                     SvTAINT(sv);
5465                 }
5466             } else
5467                 SvTAINTED_off(sv);
5468         }
5469     } else {
5470         sv_setsv(sv,&PL_sv_undef);
5471         return;
5472     }
5473 }
5474
5475 void
5476 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5477                                                          SV const * const value)
5478 {
5479     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5480
5481     PERL_UNUSED_ARG(rx);
5482     PERL_UNUSED_ARG(paren);
5483     PERL_UNUSED_ARG(value);
5484
5485     if (!PL_localizing)
5486         Perl_croak_no_modify(aTHX);
5487 }
5488
5489 I32
5490 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5491                               const I32 paren)
5492 {
5493     struct regexp *const rx = (struct regexp *)SvANY(r);
5494     I32 i;
5495     I32 s1, t1;
5496
5497     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5498
5499     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5500         switch (paren) {
5501       /* $` / ${^PREMATCH} */
5502       case RX_BUFF_IDX_PREMATCH:
5503         if (rx->offs[0].start != -1) {
5504                         i = rx->offs[0].start;
5505                         if (i > 0) {
5506                                 s1 = 0;
5507                                 t1 = i;
5508                                 goto getlen;
5509                         }
5510             }
5511         return 0;
5512       /* $' / ${^POSTMATCH} */
5513       case RX_BUFF_IDX_POSTMATCH:
5514             if (rx->offs[0].end != -1) {
5515                         i = rx->sublen - rx->offs[0].end;
5516                         if (i > 0) {
5517                                 s1 = rx->offs[0].end;
5518                                 t1 = rx->sublen;
5519                                 goto getlen;
5520                         }
5521             }
5522         return 0;
5523       /* $& / ${^MATCH}, $1, $2, ... */
5524       default:
5525             if (paren <= (I32)rx->nparens &&
5526             (s1 = rx->offs[paren].start) != -1 &&
5527             (t1 = rx->offs[paren].end) != -1)
5528             {
5529             i = t1 - s1;
5530             goto getlen;
5531         } else {
5532             if (ckWARN(WARN_UNINITIALIZED))
5533                 report_uninit((const SV *)sv);
5534             return 0;
5535         }
5536     }
5537   getlen:
5538     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5539         const char * const s = rx->subbeg + s1;
5540         const U8 *ep;
5541         STRLEN el;
5542
5543         i = t1 - s1;
5544         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5545                         i = el;
5546     }
5547     return i;
5548 }
5549
5550 SV*
5551 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5552 {
5553     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5554         PERL_UNUSED_ARG(rx);
5555         if (0)
5556             return NULL;
5557         else
5558             return newSVpvs("Regexp");
5559 }
5560
5561 /* Scans the name of a named buffer from the pattern.
5562  * If flags is REG_RSN_RETURN_NULL returns null.
5563  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5564  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5565  * to the parsed name as looked up in the RExC_paren_names hash.
5566  * If there is an error throws a vFAIL().. type exception.
5567  */
5568
5569 #define REG_RSN_RETURN_NULL    0
5570 #define REG_RSN_RETURN_NAME    1
5571 #define REG_RSN_RETURN_DATA    2
5572
5573 STATIC SV*
5574 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5575 {
5576     char *name_start = RExC_parse;
5577
5578     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5579
5580     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5581          /* skip IDFIRST by using do...while */
5582         if (UTF)
5583             do {
5584                 RExC_parse += UTF8SKIP(RExC_parse);
5585             } while (isALNUM_utf8((U8*)RExC_parse));
5586         else
5587             do {
5588                 RExC_parse++;
5589             } while (isALNUM(*RExC_parse));
5590     }
5591
5592     if ( flags ) {
5593         SV* sv_name
5594             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5595                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5596         if ( flags == REG_RSN_RETURN_NAME)
5597             return sv_name;
5598         else if (flags==REG_RSN_RETURN_DATA) {
5599             HE *he_str = NULL;
5600             SV *sv_dat = NULL;
5601             if ( ! sv_name )      /* should not happen*/
5602                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5603             if (RExC_paren_names)
5604                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5605             if ( he_str )
5606                 sv_dat = HeVAL(he_str);
5607             if ( ! sv_dat )
5608                 vFAIL("Reference to nonexistent named group");
5609             return sv_dat;
5610         }
5611         else {
5612             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5613         }
5614         /* NOT REACHED */
5615     }
5616     return NULL;
5617 }
5618
5619 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5620     int rem=(int)(RExC_end - RExC_parse);                       \
5621     int cut;                                                    \
5622     int num;                                                    \
5623     int iscut=0;                                                \
5624     if (rem>10) {                                               \
5625         rem=10;                                                 \
5626         iscut=1;                                                \
5627     }                                                           \
5628     cut=10-rem;                                                 \
5629     if (RExC_lastparse!=RExC_parse)                             \
5630         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5631             rem, RExC_parse,                                    \
5632             cut + 4,                                            \
5633             iscut ? "..." : "<"                                 \
5634         );                                                      \
5635     else                                                        \
5636         PerlIO_printf(Perl_debug_log,"%16s","");                \
5637                                                                 \
5638     if (SIZE_ONLY)                                              \
5639        num = RExC_size + 1;                                     \
5640     else                                                        \
5641        num=REG_NODE_NUM(RExC_emit);                             \
5642     if (RExC_lastnum!=num)                                      \
5643        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5644     else                                                        \
5645        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5646     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5647         (int)((depth*2)), "",                                   \
5648         (funcname)                                              \
5649     );                                                          \
5650     RExC_lastnum=num;                                           \
5651     RExC_lastparse=RExC_parse;                                  \
5652 })
5653
5654
5655
5656 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5657     DEBUG_PARSE_MSG((funcname));                            \
5658     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5659 })
5660 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5661     DEBUG_PARSE_MSG((funcname));                            \
5662     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5663 })
5664 /*
5665  - reg - regular expression, i.e. main body or parenthesized thing
5666  *
5667  * Caller must absorb opening parenthesis.
5668  *
5669  * Combining parenthesis handling with the base level of regular expression
5670  * is a trifle forced, but the need to tie the tails of the branches to what
5671  * follows makes it hard to avoid.
5672  */
5673 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5674 #ifdef DEBUGGING
5675 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5676 #else
5677 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5678 #endif
5679
5680 STATIC regnode *
5681 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5682     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5683 {
5684     dVAR;
5685     register regnode *ret;              /* Will be the head of the group. */
5686     register regnode *br;
5687     register regnode *lastbr;
5688     register regnode *ender = NULL;
5689     register I32 parno = 0;
5690     I32 flags;
5691     U32 oregflags = RExC_flags;
5692     bool have_branch = 0;
5693     bool is_open = 0;
5694     I32 freeze_paren = 0;
5695     I32 after_freeze = 0;
5696
5697     /* for (?g), (?gc), and (?o) warnings; warning
5698        about (?c) will warn about (?g) -- japhy    */
5699
5700 #define WASTED_O  0x01
5701 #define WASTED_G  0x02
5702 #define WASTED_C  0x04
5703 #define WASTED_GC (0x02|0x04)
5704     I32 wastedflags = 0x00;
5705
5706     char * parse_start = RExC_parse; /* MJD */
5707     char * const oregcomp_parse = RExC_parse;
5708
5709     GET_RE_DEBUG_FLAGS_DECL;
5710
5711     PERL_ARGS_ASSERT_REG;
5712     DEBUG_PARSE("reg ");
5713
5714     *flagp = 0;                         /* Tentatively. */
5715
5716
5717     /* Make an OPEN node, if parenthesized. */
5718     if (paren) {
5719         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5720             char *start_verb = RExC_parse;
5721             STRLEN verb_len = 0;
5722             char *start_arg = NULL;
5723             unsigned char op = 0;
5724             int argok = 1;
5725             int internal_argval = 0; /* internal_argval is only useful if !argok */
5726             while ( *RExC_parse && *RExC_parse != ')' ) {
5727                 if ( *RExC_parse == ':' ) {
5728                     start_arg = RExC_parse + 1;
5729                     break;
5730                 }
5731                 RExC_parse++;
5732             }
5733             ++start_verb;
5734             verb_len = RExC_parse - start_verb;
5735             if ( start_arg ) {
5736                 RExC_parse++;
5737                 while ( *RExC_parse && *RExC_parse != ')' )
5738                     RExC_parse++;
5739                 if ( *RExC_parse != ')' )
5740                     vFAIL("Unterminated verb pattern argument");
5741                 if ( RExC_parse == start_arg )
5742                     start_arg = NULL;
5743             } else {
5744                 if ( *RExC_parse != ')' )
5745                     vFAIL("Unterminated verb pattern");
5746             }
5747
5748             switch ( *start_verb ) {
5749             case 'A':  /* (*ACCEPT) */
5750                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5751                     op = ACCEPT;
5752                     internal_argval = RExC_nestroot;
5753                 }
5754                 break;
5755             case 'C':  /* (*COMMIT) */
5756                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5757                     op = COMMIT;
5758                 break;
5759             case 'F':  /* (*FAIL) */
5760                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5761                     op = OPFAIL;
5762                     argok = 0;
5763                 }
5764                 break;
5765             case ':':  /* (*:NAME) */
5766             case 'M':  /* (*MARK:NAME) */
5767                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5768                     op = MARKPOINT;
5769                     argok = -1;
5770                 }
5771                 break;
5772             case 'P':  /* (*PRUNE) */
5773                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5774                     op = PRUNE;
5775                 break;
5776             case 'S':   /* (*SKIP) */
5777                 if ( memEQs(start_verb,verb_len,"SKIP") )
5778                     op = SKIP;
5779                 break;
5780             case 'T':  /* (*THEN) */
5781                 /* [19:06] <TimToady> :: is then */
5782                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5783                     op = CUTGROUP;
5784                     RExC_seen |= REG_SEEN_CUTGROUP;
5785                 }
5786                 break;
5787             }
5788             if ( ! op ) {
5789                 RExC_parse++;
5790                 vFAIL3("Unknown verb pattern '%.*s'",
5791                     verb_len, start_verb);
5792             }
5793             if ( argok ) {
5794                 if ( start_arg && internal_argval ) {
5795                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5796                         verb_len, start_verb);
5797                 } else if ( argok < 0 && !start_arg ) {
5798                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5799                         verb_len, start_verb);
5800                 } else {
5801                     ret = reganode(pRExC_state, op, internal_argval);
5802                     if ( ! internal_argval && ! SIZE_ONLY ) {
5803                         if (start_arg) {
5804                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5805                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5806                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5807                             ret->flags = 0;
5808                         } else {
5809                             ret->flags = 1;
5810                         }
5811                     }
5812                 }
5813                 if (!internal_argval)
5814                     RExC_seen |= REG_SEEN_VERBARG;
5815             } else if ( start_arg ) {
5816                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5817                         verb_len, start_verb);
5818             } else {
5819                 ret = reg_node(pRExC_state, op);
5820             }
5821             nextchar(pRExC_state);
5822             return ret;
5823         } else
5824         if (*RExC_parse == '?') { /* (?...) */
5825             bool is_logical = 0;
5826             const char * const seqstart = RExC_parse;
5827             bool has_use_defaults = FALSE;
5828
5829             RExC_parse++;
5830             paren = *RExC_parse++;
5831             ret = NULL;                 /* For look-ahead/behind. */
5832             switch (paren) {
5833
5834             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5835                 paren = *RExC_parse++;
5836                 if ( paren == '<')         /* (?P<...>) named capture */
5837                     goto named_capture;
5838                 else if (paren == '>') {   /* (?P>name) named recursion */
5839                     goto named_recursion;
5840                 }
5841                 else if (paren == '=') {   /* (?P=...)  named backref */
5842                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5843                        you change this make sure you change that */
5844                     char* name_start = RExC_parse;
5845                     U32 num = 0;
5846                     SV *sv_dat = reg_scan_name(pRExC_state,
5847                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5848                     if (RExC_parse == name_start || *RExC_parse != ')')
5849                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5850
5851                     if (!SIZE_ONLY) {
5852                         num = add_data( pRExC_state, 1, "S" );
5853                         RExC_rxi->data->data[num]=(void*)sv_dat;
5854                         SvREFCNT_inc_simple_void(sv_dat);
5855                     }
5856                     RExC_sawback = 1;
5857                     ret = reganode(pRExC_state,
5858                                    ((! FOLD)
5859                                      ? NREF
5860                                      : (UNI_SEMANTICS)
5861                                        ? NREFFU
5862                                        : (LOC)
5863                                          ? NREFFL
5864                                          : NREFF),
5865                                     num);
5866                     *flagp |= HASWIDTH;
5867
5868                     Set_Node_Offset(ret, parse_start+1);
5869                     Set_Node_Cur_Length(ret); /* MJD */
5870
5871                     nextchar(pRExC_state);
5872                     return ret;
5873                 }
5874                 RExC_parse++;
5875                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5876                 /*NOTREACHED*/
5877             case '<':           /* (?<...) */
5878                 if (*RExC_parse == '!')
5879                     paren = ',';
5880                 else if (*RExC_parse != '=')
5881               named_capture:
5882                 {               /* (?<...>) */
5883                     char *name_start;
5884                     SV *svname;
5885                     paren= '>';
5886             case '\'':          /* (?'...') */
5887                     name_start= RExC_parse;
5888                     svname = reg_scan_name(pRExC_state,
5889                         SIZE_ONLY ?  /* reverse test from the others */
5890                         REG_RSN_RETURN_NAME :
5891                         REG_RSN_RETURN_NULL);
5892                     if (RExC_parse == name_start) {
5893                         RExC_parse++;
5894                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5895                         /*NOTREACHED*/
5896                     }
5897                     if (*RExC_parse != paren)
5898                         vFAIL2("Sequence (?%c... not terminated",
5899                             paren=='>' ? '<' : paren);
5900                     if (SIZE_ONLY) {
5901                         HE *he_str;
5902                         SV *sv_dat = NULL;
5903                         if (!svname) /* shouldnt happen */
5904                             Perl_croak(aTHX_
5905                                 "panic: reg_scan_name returned NULL");
5906                         if (!RExC_paren_names) {
5907                             RExC_paren_names= newHV();
5908                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5909 #ifdef DEBUGGING
5910                             RExC_paren_name_list= newAV();
5911                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5912 #endif
5913                         }
5914                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5915                         if ( he_str )
5916                             sv_dat = HeVAL(he_str);
5917                         if ( ! sv_dat ) {
5918                             /* croak baby croak */
5919                             Perl_croak(aTHX_
5920                                 "panic: paren_name hash element allocation failed");
5921                         } else if ( SvPOK(sv_dat) ) {
5922                             /* (?|...) can mean we have dupes so scan to check
5923                                its already been stored. Maybe a flag indicating
5924                                we are inside such a construct would be useful,
5925                                but the arrays are likely to be quite small, so
5926                                for now we punt -- dmq */
5927                             IV count = SvIV(sv_dat);
5928                             I32 *pv = (I32*)SvPVX(sv_dat);
5929                             IV i;
5930                             for ( i = 0 ; i < count ; i++ ) {
5931                                 if ( pv[i] == RExC_npar ) {
5932                                     count = 0;
5933                                     break;
5934                                 }
5935                             }
5936                             if ( count ) {
5937                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5938                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5939                                 pv[count] = RExC_npar;
5940                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5941                             }
5942                         } else {
5943                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5944                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5945                             SvIOK_on(sv_dat);
5946                             SvIV_set(sv_dat, 1);
5947                         }
5948 #ifdef DEBUGGING
5949                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5950                             SvREFCNT_dec(svname);
5951 #endif
5952
5953                         /*sv_dump(sv_dat);*/
5954                     }
5955                     nextchar(pRExC_state);
5956                     paren = 1;
5957                     goto capturing_parens;
5958                 }
5959                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5960                 RExC_parse++;
5961             case '=':           /* (?=...) */
5962                 RExC_seen_zerolen++;
5963                 break;
5964             case '!':           /* (?!...) */
5965                 RExC_seen_zerolen++;
5966                 if (*RExC_parse == ')') {
5967                     ret=reg_node(pRExC_state, OPFAIL);
5968                     nextchar(pRExC_state);
5969                     return ret;
5970                 }
5971                 break;
5972             case '|':           /* (?|...) */
5973                 /* branch reset, behave like a (?:...) except that
5974                    buffers in alternations share the same numbers */
5975                 paren = ':';
5976                 after_freeze = freeze_paren = RExC_npar;
5977                 break;
5978             case ':':           /* (?:...) */
5979             case '>':           /* (?>...) */
5980                 break;
5981             case '$':           /* (?$...) */
5982             case '@':           /* (?@...) */
5983                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5984                 break;
5985             case '#':           /* (?#...) */
5986                 while (*RExC_parse && *RExC_parse != ')')
5987                     RExC_parse++;
5988                 if (*RExC_parse != ')')
5989                     FAIL("Sequence (?#... not terminated");
5990                 nextchar(pRExC_state);
5991                 *flagp = TRYAGAIN;
5992                 return NULL;
5993             case '0' :           /* (?0) */
5994             case 'R' :           /* (?R) */
5995                 if (*RExC_parse != ')')
5996                     FAIL("Sequence (?R) not terminated");
5997                 ret = reg_node(pRExC_state, GOSTART);
5998                 *flagp |= POSTPONED;
5999                 nextchar(pRExC_state);
6000                 return ret;
6001                 /*notreached*/
6002             { /* named and numeric backreferences */
6003                 I32 num;
6004             case '&':            /* (?&NAME) */
6005                 parse_start = RExC_parse - 1;
6006               named_recursion:
6007                 {
6008                     SV *sv_dat = reg_scan_name(pRExC_state,
6009                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6010                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6011                 }
6012                 goto gen_recurse_regop;
6013                 /* NOT REACHED */
6014             case '+':
6015                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6016                     RExC_parse++;
6017                     vFAIL("Illegal pattern");
6018                 }
6019                 goto parse_recursion;
6020                 /* NOT REACHED*/
6021             case '-': /* (?-1) */
6022                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6023                     RExC_parse--; /* rewind to let it be handled later */
6024                     goto parse_flags;
6025                 }
6026                 /*FALLTHROUGH */
6027             case '1': case '2': case '3': case '4': /* (?1) */
6028             case '5': case '6': case '7': case '8': case '9':
6029                 RExC_parse--;
6030               parse_recursion:
6031                 num = atoi(RExC_parse);
6032                 parse_start = RExC_parse - 1; /* MJD */
6033                 if (*RExC_parse == '-')
6034                     RExC_parse++;
6035                 while (isDIGIT(*RExC_parse))
6036                         RExC_parse++;
6037                 if (*RExC_parse!=')')
6038                     vFAIL("Expecting close bracket");
6039
6040               gen_recurse_regop:
6041                 if ( paren == '-' ) {
6042                     /*
6043                     Diagram of capture buffer numbering.
6044                     Top line is the normal capture buffer numbers
6045                     Bottom line is the negative indexing as from
6046                     the X (the (?-2))
6047
6048                     +   1 2    3 4 5 X          6 7
6049                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6050                     -   5 4    3 2 1 X          x x
6051
6052                     */
6053                     num = RExC_npar + num;
6054                     if (num < 1)  {
6055                         RExC_parse++;
6056                         vFAIL("Reference to nonexistent group");
6057                     }
6058                 } else if ( paren == '+' ) {
6059                     num = RExC_npar + num - 1;
6060                 }
6061
6062                 ret = reganode(pRExC_state, GOSUB, num);
6063                 if (!SIZE_ONLY) {
6064                     if (num > (I32)RExC_rx->nparens) {
6065                         RExC_parse++;
6066                         vFAIL("Reference to nonexistent group");
6067                     }
6068                     ARG2L_SET( ret, RExC_recurse_count++);
6069                     RExC_emit++;
6070                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6071                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6072                 } else {
6073                     RExC_size++;
6074                 }
6075                 RExC_seen |= REG_SEEN_RECURSE;
6076                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6077                 Set_Node_Offset(ret, parse_start); /* MJD */
6078
6079                 *flagp |= POSTPONED;
6080                 nextchar(pRExC_state);
6081                 return ret;
6082             } /* named and numeric backreferences */
6083             /* NOT REACHED */
6084
6085             case '?':           /* (??...) */
6086                 is_logical = 1;
6087                 if (*RExC_parse != '{') {
6088                     RExC_parse++;
6089                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6090                     /*NOTREACHED*/
6091                 }
6092                 *flagp |= POSTPONED;
6093                 paren = *RExC_parse++;
6094                 /* FALL THROUGH */
6095             case '{':           /* (?{...}) */
6096             {
6097                 I32 count = 1;
6098                 U32 n = 0;
6099                 char c;
6100                 char *s = RExC_parse;
6101
6102                 RExC_seen_zerolen++;
6103                 RExC_seen |= REG_SEEN_EVAL;
6104                 while (count && (c = *RExC_parse)) {
6105                     if (c == '\\') {
6106                         if (RExC_parse[1])
6107                             RExC_parse++;
6108                     }
6109                     else if (c == '{')
6110                         count++;
6111                     else if (c == '}')
6112                         count--;
6113                     RExC_parse++;
6114                 }
6115                 if (*RExC_parse != ')') {
6116                     RExC_parse = s;
6117                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6118                 }
6119                 if (!SIZE_ONLY) {
6120                     PAD *pad;
6121                     OP_4tree *sop, *rop;
6122                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6123
6124                     ENTER;
6125                     Perl_save_re_context(aTHX);
6126                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6127                     sop->op_private |= OPpREFCOUNTED;
6128                     /* re_dup will OpREFCNT_inc */
6129                     OpREFCNT_set(sop, 1);
6130                     LEAVE;
6131
6132                     n = add_data(pRExC_state, 3, "nop");
6133                     RExC_rxi->data->data[n] = (void*)rop;
6134                     RExC_rxi->data->data[n+1] = (void*)sop;
6135                     RExC_rxi->data->data[n+2] = (void*)pad;
6136                     SvREFCNT_dec(sv);
6137                 }
6138                 else {                                          /* First pass */
6139                     if (PL_reginterp_cnt < ++RExC_seen_evals
6140                         && IN_PERL_RUNTIME)
6141                         /* No compiled RE interpolated, has runtime
6142                            components ===> unsafe.  */
6143                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6144                     if (PL_tainting && PL_tainted)
6145                         FAIL("Eval-group in insecure regular expression");
6146 #if PERL_VERSION > 8
6147                     if (IN_PERL_COMPILETIME)
6148                         PL_cv_has_eval = 1;
6149 #endif
6150                 }
6151
6152                 nextchar(pRExC_state);
6153                 if (is_logical) {
6154                     ret = reg_node(pRExC_state, LOGICAL);
6155                     if (!SIZE_ONLY)
6156                         ret->flags = 2;
6157                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6158                     /* deal with the length of this later - MJD */
6159                     return ret;
6160                 }
6161                 ret = reganode(pRExC_state, EVAL, n);
6162                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6163                 Set_Node_Offset(ret, parse_start);
6164                 return ret;
6165             }
6166             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6167             {
6168                 int is_define= 0;
6169                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6170                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6171                         || RExC_parse[1] == '<'
6172                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6173                         I32 flag;
6174
6175                         ret = reg_node(pRExC_state, LOGICAL);
6176                         if (!SIZE_ONLY)
6177                             ret->flags = 1;
6178                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6179                         goto insert_if;
6180                     }
6181                 }
6182                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6183                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6184                 {
6185                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6186                     char *name_start= RExC_parse++;
6187                     U32 num = 0;
6188                     SV *sv_dat=reg_scan_name(pRExC_state,
6189                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6190                     if (RExC_parse == name_start || *RExC_parse != ch)
6191                         vFAIL2("Sequence (?(%c... not terminated",
6192                             (ch == '>' ? '<' : ch));
6193                     RExC_parse++;
6194                     if (!SIZE_ONLY) {
6195                         num = add_data( pRExC_state, 1, "S" );
6196                         RExC_rxi->data->data[num]=(void*)sv_dat;
6197                         SvREFCNT_inc_simple_void(sv_dat);
6198                     }
6199                     ret = reganode(pRExC_state,NGROUPP,num);
6200                     goto insert_if_check_paren;
6201                 }
6202                 else if (RExC_parse[0] == 'D' &&
6203                          RExC_parse[1] == 'E' &&
6204                          RExC_parse[2] == 'F' &&
6205                          RExC_parse[3] == 'I' &&
6206                          RExC_parse[4] == 'N' &&
6207                          RExC_parse[5] == 'E')
6208                 {
6209                     ret = reganode(pRExC_state,DEFINEP,0);
6210                     RExC_parse +=6 ;
6211                     is_define = 1;
6212                     goto insert_if_check_paren;
6213                 }
6214                 else if (RExC_parse[0] == 'R') {
6215                     RExC_parse++;
6216                     parno = 0;
6217                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6218                         parno = atoi(RExC_parse++);
6219                         while (isDIGIT(*RExC_parse))
6220                             RExC_parse++;
6221                     } else if (RExC_parse[0] == '&') {
6222                         SV *sv_dat;
6223                         RExC_parse++;
6224                         sv_dat = reg_scan_name(pRExC_state,
6225                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6226                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6227                     }
6228                     ret = reganode(pRExC_state,INSUBP,parno);
6229                     goto insert_if_check_paren;
6230                 }
6231                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6232                     /* (?(1)...) */
6233                     char c;
6234                     parno = atoi(RExC_parse++);
6235
6236                     while (isDIGIT(*RExC_parse))
6237                         RExC_parse++;
6238                     ret = reganode(pRExC_state, GROUPP, parno);
6239
6240                  insert_if_check_paren:
6241                     if ((c = *nextchar(pRExC_state)) != ')')
6242                         vFAIL("Switch condition not recognized");
6243                   insert_if:
6244                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6245                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6246                     if (br == NULL)
6247                         br = reganode(pRExC_state, LONGJMP, 0);
6248                     else
6249                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6250                     c = *nextchar(pRExC_state);
6251                     if (flags&HASWIDTH)
6252                         *flagp |= HASWIDTH;
6253                     if (c == '|') {
6254                         if (is_define)
6255                             vFAIL("(?(DEFINE)....) does not allow branches");
6256                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6257                         regbranch(pRExC_state, &flags, 1,depth+1);
6258                         REGTAIL(pRExC_state, ret, lastbr);
6259                         if (flags&HASWIDTH)
6260                             *flagp |= HASWIDTH;
6261                         c = *nextchar(pRExC_state);
6262                     }
6263                     else
6264                         lastbr = NULL;
6265                     if (c != ')')
6266                         vFAIL("Switch (?(condition)... contains too many branches");
6267                     ender = reg_node(pRExC_state, TAIL);
6268                     REGTAIL(pRExC_state, br, ender);
6269                     if (lastbr) {
6270                         REGTAIL(pRExC_state, lastbr, ender);
6271                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6272                     }
6273                     else
6274                         REGTAIL(pRExC_state, ret, ender);
6275                     RExC_size++; /* XXX WHY do we need this?!!
6276                                     For large programs it seems to be required
6277                                     but I can't figure out why. -- dmq*/
6278                     return ret;
6279                 }
6280                 else {
6281                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6282                 }
6283             }
6284             case 0:
6285                 RExC_parse--; /* for vFAIL to print correctly */
6286                 vFAIL("Sequence (? incomplete");
6287                 break;
6288             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6289                                        that follow */
6290                 has_use_defaults = TRUE;
6291                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6292                 if (RExC_utf8) {    /* But the default for a utf8 pattern is
6293                                        unicode semantics */
6294                     RExC_flags |= RXf_PMf_UNICODE;
6295                 }
6296                 goto parse_flags;
6297             default:
6298                 --RExC_parse;
6299                 parse_flags:      /* (?i) */
6300             {
6301                 U32 posflags = 0, negflags = 0;
6302                 U32 *flagsp = &posflags;
6303                 bool has_charset_modifier = 0;
6304
6305                 while (*RExC_parse) {
6306                     /* && strchr("iogcmsx", *RExC_parse) */
6307                     /* (?g), (?gc) and (?o) are useless here
6308                        and must be globally applied -- japhy */
6309                     switch (*RExC_parse) {
6310                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6311                     case LOCALE_PAT_MOD:
6312                         if (has_charset_modifier || flagsp == &negflags) {
6313                             goto fail_modifiers;
6314                         }
6315                         posflags |= RXf_PMf_LOCALE;
6316                         negflags |= RXf_PMf_UNICODE;
6317                         has_charset_modifier = 1;
6318                         break;
6319                     case UNICODE_PAT_MOD:
6320                         if (has_charset_modifier || flagsp == &negflags) {
6321                             goto fail_modifiers;
6322                         }
6323                         posflags |= RXf_PMf_UNICODE;
6324                         negflags |= RXf_PMf_LOCALE;
6325                         has_charset_modifier = 1;
6326                         break;
6327                     case DUAL_PAT_MOD:
6328                         if (has_use_defaults
6329                             || has_charset_modifier
6330                             || flagsp == &negflags)
6331                         {
6332                             goto fail_modifiers;
6333                         }
6334
6335                         /* The dual charset means unicode semantics if the
6336                          * pattern (or target, not known until runtime) are
6337                          * utf8 */
6338                         if (RExC_utf8) {
6339                             posflags |= RXf_PMf_UNICODE;
6340                             negflags |= RXf_PMf_LOCALE;
6341                         }
6342                         else {
6343                             negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6344                         }
6345                         has_charset_modifier = 1;
6346                         break;
6347                     case ONCE_PAT_MOD: /* 'o' */
6348                     case GLOBAL_PAT_MOD: /* 'g' */
6349                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6350                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6351                             if (! (wastedflags & wflagbit) ) {
6352                                 wastedflags |= wflagbit;
6353                                 vWARN5(
6354                                     RExC_parse + 1,
6355                                     "Useless (%s%c) - %suse /%c modifier",
6356                                     flagsp == &negflags ? "?-" : "?",
6357                                     *RExC_parse,
6358                                     flagsp == &negflags ? "don't " : "",
6359                                     *RExC_parse
6360                                 );
6361                             }
6362                         }
6363                         break;
6364
6365                     case CONTINUE_PAT_MOD: /* 'c' */
6366                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6367                             if (! (wastedflags & WASTED_C) ) {
6368                                 wastedflags |= WASTED_GC;
6369                                 vWARN3(
6370                                     RExC_parse + 1,
6371                                     "Useless (%sc) - %suse /gc modifier",
6372                                     flagsp == &negflags ? "?-" : "?",
6373                                     flagsp == &negflags ? "don't " : ""
6374                                 );
6375                             }
6376                         }
6377                         break;
6378                     case KEEPCOPY_PAT_MOD: /* 'p' */
6379                         if (flagsp == &negflags) {
6380                             if (SIZE_ONLY)
6381                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6382                         } else {
6383                             *flagsp |= RXf_PMf_KEEPCOPY;
6384                         }
6385                         break;
6386                     case '-':
6387                         /* A flag is a default iff it is following a minus, so
6388                          * if there is a minus, it means will be trying to
6389                          * re-specify a default which is an error */
6390                         if (has_use_defaults || flagsp == &negflags) {
6391             fail_modifiers:
6392                             RExC_parse++;
6393                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6394                             /*NOTREACHED*/
6395                         }
6396                         flagsp = &negflags;
6397                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6398                         break;
6399                     case ':':
6400                         paren = ':';
6401                         /*FALLTHROUGH*/
6402                     case ')':
6403                         RExC_flags |= posflags;
6404                         RExC_flags &= ~negflags;
6405                         if (paren != ':') {
6406                             oregflags |= posflags;
6407                             oregflags &= ~negflags;
6408                         }
6409                         nextchar(pRExC_state);
6410                         if (paren != ':') {
6411                             *flagp = TRYAGAIN;
6412                             return NULL;
6413                         } else {
6414                             ret = NULL;
6415                             goto parse_rest;
6416                         }
6417                         /*NOTREACHED*/
6418                     default:
6419                         RExC_parse++;
6420                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6421                         /*NOTREACHED*/
6422                     }
6423                     ++RExC_parse;
6424                 }
6425             }} /* one for the default block, one for the switch */
6426         }
6427         else {                  /* (...) */
6428           capturing_parens:
6429             parno = RExC_npar;
6430             RExC_npar++;
6431
6432             ret = reganode(pRExC_state, OPEN, parno);
6433             if (!SIZE_ONLY ){
6434                 if (!RExC_nestroot)
6435                     RExC_nestroot = parno;
6436                 if (RExC_seen & REG_SEEN_RECURSE
6437                     && !RExC_open_parens[parno-1])
6438                 {
6439                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6440                         "Setting open paren #%"IVdf" to %d\n",
6441                         (IV)parno, REG_NODE_NUM(ret)));
6442                     RExC_open_parens[parno-1]= ret;
6443                 }
6444             }
6445             Set_Node_Length(ret, 1); /* MJD */
6446             Set_Node_Offset(ret, RExC_parse); /* MJD */
6447             is_open = 1;
6448         }
6449     }
6450     else                        /* ! paren */
6451         ret = NULL;
6452
6453    parse_rest:
6454     /* Pick up the branches, linking them together. */
6455     parse_start = RExC_parse;   /* MJD */
6456     br = regbranch(pRExC_state, &flags, 1,depth+1);
6457
6458     if (freeze_paren) {
6459         if (RExC_npar > after_freeze)
6460             after_freeze = RExC_npar;
6461         RExC_npar = freeze_paren;
6462     }
6463
6464     /*     branch_len = (paren != 0); */
6465
6466     if (br == NULL)
6467         return(NULL);
6468     if (*RExC_parse == '|') {
6469         if (!SIZE_ONLY && RExC_extralen) {
6470             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6471         }
6472         else {                  /* MJD */
6473             reginsert(pRExC_state, BRANCH, br, depth+1);
6474             Set_Node_Length(br, paren != 0);
6475             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6476         }
6477         have_branch = 1;
6478         if (SIZE_ONLY)
6479             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6480     }
6481     else if (paren == ':') {
6482         *flagp |= flags&SIMPLE;
6483     }
6484     if (is_open) {                              /* Starts with OPEN. */
6485         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6486     }
6487     else if (paren != '?')              /* Not Conditional */
6488         ret = br;
6489     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6490     lastbr = br;
6491     while (*RExC_parse == '|') {
6492         if (!SIZE_ONLY && RExC_extralen) {
6493             ender = reganode(pRExC_state, LONGJMP,0);
6494             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6495         }
6496         if (SIZE_ONLY)
6497             RExC_extralen += 2;         /* Account for LONGJMP. */
6498         nextchar(pRExC_state);
6499         if (freeze_paren) {
6500             if (RExC_npar > after_freeze)
6501                 after_freeze = RExC_npar;
6502             RExC_npar = freeze_paren;
6503         }
6504         br = regbranch(pRExC_state, &flags, 0, depth+1);
6505
6506         if (br == NULL)
6507             return(NULL);
6508         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6509         lastbr = br;
6510         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6511     }
6512
6513     if (have_branch || paren != ':') {
6514         /* Make a closing node, and hook it on the end. */
6515         switch (paren) {
6516         case ':':
6517             ender = reg_node(pRExC_state, TAIL);
6518             break;
6519         case 1:
6520             ender = reganode(pRExC_state, CLOSE, parno);
6521             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6522                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6523                         "Setting close paren #%"IVdf" to %d\n",
6524                         (IV)parno, REG_NODE_NUM(ender)));
6525                 RExC_close_parens[parno-1]= ender;
6526                 if (RExC_nestroot == parno)
6527                     RExC_nestroot = 0;
6528             }
6529             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6530             Set_Node_Length(ender,1); /* MJD */
6531             break;
6532         case '<':
6533         case ',':
6534         case '=':
6535         case '!':
6536             *flagp &= ~HASWIDTH;
6537             /* FALL THROUGH */
6538         case '>':
6539             ender = reg_node(pRExC_state, SUCCEED);
6540             break;
6541         case 0:
6542             ender = reg_node(pRExC_state, END);
6543             if (!SIZE_ONLY) {
6544                 assert(!RExC_opend); /* there can only be one! */
6545                 RExC_opend = ender;
6546             }
6547             break;
6548         }
6549         REGTAIL(pRExC_state, lastbr, ender);
6550
6551         if (have_branch && !SIZE_ONLY) {
6552             if (depth==1)
6553                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6554
6555             /* Hook the tails of the branches to the closing node. */
6556             for (br = ret; br; br = regnext(br)) {
6557                 const U8 op = PL_regkind[OP(br)];
6558                 if (op == BRANCH) {
6559                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6560                 }
6561                 else if (op == BRANCHJ) {
6562                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6563                 }
6564             }
6565         }
6566     }
6567
6568     {
6569         const char *p;
6570         static const char parens[] = "=!<,>";
6571
6572         if (paren && (p = strchr(parens, paren))) {
6573             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6574             int flag = (p - parens) > 1;
6575
6576             if (paren == '>')
6577                 node = SUSPEND, flag = 0;
6578             reginsert(pRExC_state, node,ret, depth+1);
6579             Set_Node_Cur_Length(ret);
6580             Set_Node_Offset(ret, parse_start + 1);
6581             ret->flags = flag;
6582             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6583         }
6584     }
6585
6586     /* Check for proper termination. */
6587     if (paren) {
6588         RExC_flags = oregflags;
6589         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6590             RExC_parse = oregcomp_parse;
6591             vFAIL("Unmatched (");
6592         }
6593     }
6594     else if (!paren && RExC_parse < RExC_end) {
6595         if (*RExC_parse == ')') {
6596             RExC_parse++;
6597             vFAIL("Unmatched )");
6598         }
6599         else
6600             FAIL("Junk on end of regexp");      /* "Can't happen". */
6601         /* NOTREACHED */
6602     }
6603     if (after_freeze)
6604         RExC_npar = after_freeze;
6605     return(ret);
6606 }
6607
6608 /*
6609  - regbranch - one alternative of an | operator
6610  *
6611  * Implements the concatenation operator.
6612  */
6613 STATIC regnode *
6614 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6615 {
6616     dVAR;
6617     register regnode *ret;
6618     register regnode *chain = NULL;
6619     register regnode *latest;
6620     I32 flags = 0, c = 0;
6621     GET_RE_DEBUG_FLAGS_DECL;
6622
6623     PERL_ARGS_ASSERT_REGBRANCH;
6624
6625     DEBUG_PARSE("brnc");
6626
6627     if (first)
6628         ret = NULL;
6629     else {
6630         if (!SIZE_ONLY && RExC_extralen)
6631             ret = reganode(pRExC_state, BRANCHJ,0);
6632         else {
6633             ret = reg_node(pRExC_state, BRANCH);
6634             Set_Node_Length(ret, 1);
6635         }
6636     }
6637
6638     if (!first && SIZE_ONLY)
6639         RExC_extralen += 1;                     /* BRANCHJ */
6640
6641     *flagp = WORST;                     /* Tentatively. */
6642
6643     RExC_parse--;
6644     nextchar(pRExC_state);
6645     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6646         flags &= ~TRYAGAIN;
6647         latest = regpiece(pRExC_state, &flags,depth+1);
6648         if (latest == NULL) {
6649             if (flags & TRYAGAIN)
6650                 continue;
6651             return(NULL);
6652         }
6653         else if (ret == NULL)
6654             ret = latest;
6655         *flagp |= flags&(HASWIDTH|POSTPONED);
6656         if (chain == NULL)      /* First piece. */
6657             *flagp |= flags&SPSTART;
6658         else {
6659             RExC_naughty++;
6660             REGTAIL(pRExC_state, chain, latest);
6661         }
6662         chain = latest;
6663         c++;
6664     }
6665     if (chain == NULL) {        /* Loop ran zero times. */
6666         chain = reg_node(pRExC_state, NOTHING);
6667         if (ret == NULL)
6668             ret = chain;
6669     }
6670     if (c == 1) {
6671         *flagp |= flags&SIMPLE;
6672     }
6673
6674     return ret;
6675 }
6676
6677 /*
6678  - regpiece - something followed by possible [*+?]
6679  *
6680  * Note that the branching code sequences used for ? and the general cases
6681  * of * and + are somewhat optimized:  they use the same NOTHING node as
6682  * both the endmarker for their branch list and the body of the last branch.
6683  * It might seem that this node could be dispensed with entirely, but the
6684  * endmarker role is not redundant.
6685  */
6686 STATIC regnode *
6687 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6688 {
6689     dVAR;
6690     register regnode *ret;
6691     register char op;
6692     register char *next;
6693     I32 flags;
6694     const char * const origparse = RExC_parse;
6695     I32 min;
6696     I32 max = REG_INFTY;
6697     char *parse_start;
6698     const char *maxpos = NULL;
6699     GET_RE_DEBUG_FLAGS_DECL;
6700
6701     PERL_ARGS_ASSERT_REGPIECE;
6702
6703     DEBUG_PARSE("piec");
6704
6705     ret = regatom(pRExC_state, &flags,depth+1);
6706     if (ret == NULL) {
6707         if (flags & TRYAGAIN)
6708             *flagp |= TRYAGAIN;
6709         return(NULL);
6710     }
6711
6712     op = *RExC_parse;
6713
6714     if (op == '{' && regcurly(RExC_parse)) {
6715         maxpos = NULL;
6716         parse_start = RExC_parse; /* MJD */
6717         next = RExC_parse + 1;
6718         while (isDIGIT(*next) || *next == ',') {
6719             if (*next == ',') {
6720                 if (maxpos)
6721                     break;
6722                 else
6723                     maxpos = next;
6724             }
6725             next++;
6726         }
6727         if (*next == '}') {             /* got one */
6728             if (!maxpos)
6729                 maxpos = next;
6730             RExC_parse++;
6731             min = atoi(RExC_parse);
6732             if (*maxpos == ',')
6733                 maxpos++;
6734             else
6735                 maxpos = RExC_parse;
6736             max = atoi(maxpos);
6737             if (!max && *maxpos != '0')
6738                 max = REG_INFTY;                /* meaning "infinity" */
6739             else if (max >= REG_INFTY)
6740                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6741             RExC_parse = next;
6742             nextchar(pRExC_state);
6743
6744         do_curly:
6745             if ((flags&SIMPLE)) {
6746                 RExC_naughty += 2 + RExC_naughty / 2;
6747                 reginsert(pRExC_state, CURLY, ret, depth+1);
6748                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6749                 Set_Node_Cur_Length(ret);
6750             }
6751             else {
6752                 regnode * const w = reg_node(pRExC_state, WHILEM);
6753
6754                 w->flags = 0;
6755                 REGTAIL(pRExC_state, ret, w);
6756                 if (!SIZE_ONLY && RExC_extralen) {
6757                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6758                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6759                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6760                 }
6761                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6762                                 /* MJD hk */
6763                 Set_Node_Offset(ret, parse_start+1);
6764                 Set_Node_Length(ret,
6765                                 op == '{' ? (RExC_parse - parse_start) : 1);
6766
6767                 if (!SIZE_ONLY && RExC_extralen)
6768                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6769                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6770                 if (SIZE_ONLY)
6771                     RExC_whilem_seen++, RExC_extralen += 3;
6772                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6773             }
6774             ret->flags = 0;
6775
6776             if (min > 0)
6777                 *flagp = WORST;
6778             if (max > 0)
6779                 *flagp |= HASWIDTH;
6780             if (max < min)
6781                 vFAIL("Can't do {n,m} with n > m");
6782             if (!SIZE_ONLY) {
6783                 ARG1_SET(ret, (U16)min);
6784                 ARG2_SET(ret, (U16)max);
6785             }
6786
6787             goto nest_check;
6788         }
6789     }
6790
6791     if (!ISMULT1(op)) {
6792         *flagp = flags;
6793         return(ret);
6794     }
6795
6796 #if 0                           /* Now runtime fix should be reliable. */
6797
6798     /* if this is reinstated, don't forget to put this back into perldiag:
6799
6800             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6801
6802            (F) The part of the regexp subject to either the * or + quantifier
6803            could match an empty string. The {#} shows in the regular
6804            expression about where the problem was discovered.
6805
6806     */
6807
6808     if (!(flags&HASWIDTH) && op != '?')
6809       vFAIL("Regexp *+ operand could be empty");
6810 #endif
6811
6812     parse_start = RExC_parse;
6813     nextchar(pRExC_state);
6814
6815     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6816
6817     if (op == '*' && (flags&SIMPLE)) {
6818         reginsert(pRExC_state, STAR, ret, depth+1);
6819         ret->flags = 0;
6820         RExC_naughty += 4;
6821     }
6822     else if (op == '*') {
6823         min = 0;
6824         goto do_curly;
6825     }
6826     else if (op == '+' && (flags&SIMPLE)) {
6827         reginsert(pRExC_state, PLUS, ret, depth+1);
6828         ret->flags = 0;
6829         RExC_naughty += 3;
6830     }
6831     else if (op == '+') {
6832         min = 1;
6833         goto do_curly;
6834     }
6835     else if (op == '?') {
6836         min = 0; max = 1;
6837         goto do_curly;
6838     }
6839   nest_check:
6840     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6841         ckWARN3reg(RExC_parse,
6842                    "%.*s matches null string many times",
6843                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6844                    origparse);
6845     }
6846
6847     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6848         nextchar(pRExC_state);
6849         reginsert(pRExC_state, MINMOD, ret, depth+1);
6850         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6851     }
6852 #ifndef REG_ALLOW_MINMOD_SUSPEND
6853     else
6854 #endif
6855     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6856         regnode *ender;
6857         nextchar(pRExC_state);
6858         ender = reg_node(pRExC_state, SUCCEED);
6859         REGTAIL(pRExC_state, ret, ender);
6860         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6861         ret->flags = 0;
6862         ender = reg_node(pRExC_state, TAIL);
6863         REGTAIL(pRExC_state, ret, ender);
6864         /*ret= ender;*/
6865     }
6866
6867     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6868         RExC_parse++;
6869         vFAIL("Nested quantifiers");
6870     }
6871
6872     return(ret);
6873 }
6874
6875
6876 /* reg_namedseq(pRExC_state,UVp)
6877
6878    This is expected to be called by a parser routine that has
6879    recognized '\N' and needs to handle the rest. RExC_parse is
6880    expected to point at the first char following the N at the time
6881    of the call.
6882
6883    The \N may be inside (indicated by valuep not being NULL) or outside a
6884    character class.
6885
6886    \N may begin either a named sequence, or if outside a character class, mean
6887    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6888    attempted to decide which, and in the case of a named sequence converted it
6889    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6890    where c1... are the characters in the sequence.  For single-quoted regexes,
6891    the tokenizer passes the \N sequence through unchanged; this code will not
6892    attempt to determine this nor expand those.  The net effect is that if the
6893    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6894    signals that this \N occurrence means to match a non-newline.
6895
6896    Only the \N{U+...} form should occur in a character class, for the same
6897    reason that '.' inside a character class means to just match a period: it
6898    just doesn't make sense.
6899
6900    If valuep is non-null then it is assumed that we are parsing inside
6901    of a charclass definition and the first codepoint in the resolved
6902    string is returned via *valuep and the routine will return NULL.
6903    In this mode if a multichar string is returned from the charnames
6904    handler, a warning will be issued, and only the first char in the
6905    sequence will be examined. If the string returned is zero length
6906    then the value of *valuep is undefined and NON-NULL will
6907    be returned to indicate failure. (This will NOT be a valid pointer
6908    to a regnode.)
6909
6910    If valuep is null then it is assumed that we are parsing normal text and a
6911    new EXACT node is inserted into the program containing the resolved string,
6912    and a pointer to the new node is returned.  But if the string is zero length
6913    a NOTHING node is emitted instead.
6914
6915    On success RExC_parse is set to the char following the endbrace.
6916    Parsing failures will generate a fatal error via vFAIL(...)
6917  */
6918 STATIC regnode *
6919 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6920 {
6921     char * endbrace;    /* '}' following the name */
6922     regnode *ret = NULL;
6923 #ifdef DEBUGGING
6924     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6925 #endif
6926     char* p;
6927
6928     GET_RE_DEBUG_FLAGS_DECL;
6929
6930     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6931
6932     GET_RE_DEBUG_FLAGS;
6933
6934     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6935      * modifier.  The other meaning does not */
6936     p = (RExC_flags & RXf_PMf_EXTENDED)
6937         ? regwhite( pRExC_state, RExC_parse )
6938         : RExC_parse;
6939
6940     /* Disambiguate between \N meaning a named character versus \N meaning
6941      * [^\n].  The former is assumed when it can't be the latter. */
6942     if (*p != '{' || regcurly(p)) {
6943         RExC_parse = p;
6944         if (valuep) {
6945             /* no bare \N in a charclass */
6946             vFAIL("\\N in a character class must be a named character: \\N{...}");
6947         }
6948         nextchar(pRExC_state);
6949         ret = reg_node(pRExC_state, REG_ANY);
6950         *flagp |= HASWIDTH|SIMPLE;
6951         RExC_naughty++;
6952         RExC_parse--;
6953         Set_Node_Length(ret, 1); /* MJD */
6954         return ret;
6955     }
6956
6957     /* Here, we have decided it should be a named sequence */
6958
6959     /* The test above made sure that the next real character is a '{', but
6960      * under the /x modifier, it could be separated by space (or a comment and
6961      * \n) and this is not allowed (for consistency with \x{...} and the
6962      * tokenizer handling of \N{NAME}). */
6963     if (*RExC_parse != '{') {
6964         vFAIL("Missing braces on \\N{}");
6965     }
6966
6967     RExC_parse++;       /* Skip past the '{' */
6968
6969     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6970         || ! (endbrace == RExC_parse            /* nothing between the {} */
6971               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
6972                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6973     {
6974         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
6975         vFAIL("\\N{NAME} must be resolved by the lexer");
6976     }
6977
6978     if (endbrace == RExC_parse) {   /* empty: \N{} */
6979         if (! valuep) {
6980             RExC_parse = endbrace + 1;
6981             return reg_node(pRExC_state,NOTHING);
6982         }
6983
6984         if (SIZE_ONLY) {
6985             ckWARNreg(RExC_parse,
6986                     "Ignoring zero length \\N{} in character class"
6987             );
6988             RExC_parse = endbrace + 1;
6989         }
6990         *valuep = 0;
6991         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6992     }
6993
6994     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
6995     RExC_parse += 2;    /* Skip past the 'U+' */
6996
6997     if (valuep) {   /* In a bracketed char class */
6998         /* We only pay attention to the first char of
6999         multichar strings being returned. I kinda wonder
7000         if this makes sense as it does change the behaviour
7001         from earlier versions, OTOH that behaviour was broken
7002         as well. XXX Solution is to recharacterize as
7003         [rest-of-class]|multi1|multi2... */
7004
7005         STRLEN length_of_hex;
7006         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7007             | PERL_SCAN_DISALLOW_PREFIX
7008             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7009
7010         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7011         if (endchar < endbrace) {
7012             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7013         }
7014
7015         length_of_hex = (STRLEN)(endchar - RExC_parse);
7016         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7017
7018         /* The tokenizer should have guaranteed validity, but it's possible to
7019          * bypass it by using single quoting, so check */
7020         if (length_of_hex == 0
7021             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7022         {
7023             RExC_parse += length_of_hex;        /* Includes all the valid */
7024             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7025                             ? UTF8SKIP(RExC_parse)
7026                             : 1;
7027             /* Guard against malformed utf8 */
7028             if (RExC_parse >= endchar) RExC_parse = endchar;
7029             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7030         }
7031
7032         RExC_parse = endbrace + 1;
7033         if (endchar == endbrace) return NULL;
7034
7035         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7036     }
7037     else {      /* Not a char class */
7038         char *s;            /* String to put in generated EXACT node */
7039         STRLEN len = 0;     /* Its current byte length */
7040         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7041                                stream */
7042
7043         ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7044                                                    : (LOC)
7045                                                       ? EXACTFL
7046                                                       : UNI_SEMANTICS
7047                                                         ? EXACTFU
7048                                                         : EXACTF));
7049         s= STRING(ret);
7050
7051         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7052          * the input which is of the form now 'c1.c2.c3...}' until find the
7053          * ending brace or exceed length 255.  The characters that exceed this
7054          * limit are dropped.  The limit could be relaxed should it become
7055          * desirable by reparsing this as (?:\N{NAME}), so could generate
7056          * multiple EXACT nodes, as is done for just regular input.  But this
7057          * is primarily a named character, and not intended to be a huge long
7058          * string, so 255 bytes should be good enough */
7059         while (1) {
7060             STRLEN length_of_hex;
7061             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7062                             | PERL_SCAN_DISALLOW_PREFIX
7063                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7064             UV cp;  /* Ord of current character */
7065
7066             /* Code points are separated by dots.  If none, there is only one
7067              * code point, and is terminated by the brace */
7068             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7069
7070             /* The values are Unicode even on EBCDIC machines */
7071             length_of_hex = (STRLEN)(endchar - RExC_parse);
7072             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7073             if ( length_of_hex == 0
7074                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7075             {
7076                 RExC_parse += length_of_hex;        /* Includes all the valid */
7077                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7078                                 ? UTF8SKIP(RExC_parse)
7079                                 : 1;
7080                 /* Guard against malformed utf8 */
7081                 if (RExC_parse >= endchar) RExC_parse = endchar;
7082                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7083             }
7084
7085             if (! FOLD) {       /* Not folding, just append to the string */
7086                 STRLEN unilen;
7087
7088                 /* Quit before adding this character if would exceed limit */
7089                 if (len + UNISKIP(cp) > U8_MAX) break;
7090
7091                 unilen = reguni(pRExC_state, cp, s);
7092                 if (unilen > 0) {
7093                     s   += unilen;
7094                     len += unilen;
7095                 }
7096             } else {    /* Folding, output the folded equivalent */
7097                 STRLEN foldlen,numlen;
7098                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7099                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7100
7101                 /* Quit before exceeding size limit */
7102                 if (len + foldlen > U8_MAX) break;
7103
7104                 for (foldbuf = tmpbuf;
7105                     foldlen;
7106                     foldlen -= numlen)
7107                 {
7108                     cp = utf8_to_uvchr(foldbuf, &numlen);
7109                     if (numlen > 0) {
7110                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7111                         s       += unilen;
7112                         len     += unilen;
7113                         /* In EBCDIC the numlen and unilen can differ. */
7114                         foldbuf += numlen;
7115                         if (numlen >= foldlen)
7116                             break;
7117                     }
7118                     else
7119                         break; /* "Can't happen." */
7120                 }
7121             }
7122
7123             /* Point to the beginning of the next character in the sequence. */
7124             RExC_parse = endchar + 1;
7125
7126             /* Quit if no more characters */
7127             if (RExC_parse >= endbrace) break;
7128         }
7129
7130
7131         if (SIZE_ONLY) {
7132             if (RExC_parse < endbrace) {
7133                 ckWARNreg(RExC_parse - 1,
7134                           "Using just the first characters returned by \\N{}");
7135             }
7136
7137             RExC_size += STR_SZ(len);
7138         } else {
7139             STR_LEN(ret) = len;
7140             RExC_emit += STR_SZ(len);
7141         }
7142
7143         RExC_parse = endbrace + 1;
7144
7145         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7146                                with malformed in t/re/pat_advanced.t */
7147         RExC_parse --;
7148         Set_Node_Cur_Length(ret); /* MJD */
7149         nextchar(pRExC_state);
7150     }
7151
7152     return ret;
7153 }
7154
7155
7156 /*
7157  * reg_recode
7158  *
7159  * It returns the code point in utf8 for the value in *encp.
7160  *    value: a code value in the source encoding
7161  *    encp:  a pointer to an Encode object
7162  *
7163  * If the result from Encode is not a single character,
7164  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7165  */
7166 STATIC UV
7167 S_reg_recode(pTHX_ const char value, SV **encp)
7168 {
7169     STRLEN numlen = 1;
7170     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7171     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7172     const STRLEN newlen = SvCUR(sv);
7173     UV uv = UNICODE_REPLACEMENT;
7174
7175     PERL_ARGS_ASSERT_REG_RECODE;
7176
7177     if (newlen)
7178         uv = SvUTF8(sv)
7179              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7180              : *(U8*)s;
7181
7182     if (!newlen || numlen != newlen) {
7183         uv = UNICODE_REPLACEMENT;
7184         *encp = NULL;
7185     }
7186     return uv;
7187 }
7188
7189
7190 /*
7191  - regatom - the lowest level
7192
7193    Try to identify anything special at the start of the pattern. If there
7194    is, then handle it as required. This may involve generating a single regop,
7195    such as for an assertion; or it may involve recursing, such as to
7196    handle a () structure.
7197
7198    If the string doesn't start with something special then we gobble up
7199    as much literal text as we can.
7200
7201    Once we have been able to handle whatever type of thing started the
7202    sequence, we return.
7203
7204    Note: we have to be careful with escapes, as they can be both literal
7205    and special, and in the case of \10 and friends can either, depending
7206    on context. Specifically there are two seperate switches for handling
7207    escape sequences, with the one for handling literal escapes requiring
7208    a dummy entry for all of the special escapes that are actually handled
7209    by the other.
7210 */
7211
7212 STATIC regnode *
7213 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7214 {
7215     dVAR;
7216     register regnode *ret = NULL;
7217     I32 flags;
7218     char *parse_start = RExC_parse;
7219     GET_RE_DEBUG_FLAGS_DECL;
7220     DEBUG_PARSE("atom");
7221     *flagp = WORST;             /* Tentatively. */
7222
7223     PERL_ARGS_ASSERT_REGATOM;
7224
7225 tryagain:
7226     switch ((U8)*RExC_parse) {
7227     case '^':
7228         RExC_seen_zerolen++;
7229         nextchar(pRExC_state);
7230         if (RExC_flags & RXf_PMf_MULTILINE)
7231             ret = reg_node(pRExC_state, MBOL);
7232         else if (RExC_flags & RXf_PMf_SINGLELINE)
7233             ret = reg_node(pRExC_state, SBOL);
7234         else
7235             ret = reg_node(pRExC_state, BOL);
7236         Set_Node_Length(ret, 1); /* MJD */
7237         break;
7238     case '$':
7239         nextchar(pRExC_state);
7240         if (*RExC_parse)
7241             RExC_seen_zerolen++;
7242         if (RExC_flags & RXf_PMf_MULTILINE)
7243             ret = reg_node(pRExC_state, MEOL);
7244         else if (RExC_flags & RXf_PMf_SINGLELINE)
7245             ret = reg_node(pRExC_state, SEOL);
7246         else
7247             ret = reg_node(pRExC_state, EOL);
7248         Set_Node_Length(ret, 1); /* MJD */
7249         break;
7250     case '.':
7251         nextchar(pRExC_state);
7252         if (RExC_flags & RXf_PMf_SINGLELINE)
7253             ret = reg_node(pRExC_state, SANY);
7254         else
7255             ret = reg_node(pRExC_state, REG_ANY);
7256         *flagp |= HASWIDTH|SIMPLE;
7257         RExC_naughty++;
7258         Set_Node_Length(ret, 1); /* MJD */
7259         break;
7260     case '[':
7261     {
7262         char * const oregcomp_parse = ++RExC_parse;
7263         ret = regclass(pRExC_state,depth+1);
7264         if (*RExC_parse != ']') {
7265             RExC_parse = oregcomp_parse;
7266             vFAIL("Unmatched [");
7267         }
7268         nextchar(pRExC_state);
7269         *flagp |= HASWIDTH|SIMPLE;
7270         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7271         break;
7272     }
7273     case '(':
7274         nextchar(pRExC_state);
7275         ret = reg(pRExC_state, 1, &flags,depth+1);
7276         if (ret == NULL) {
7277                 if (flags & TRYAGAIN) {
7278                     if (RExC_parse == RExC_end) {
7279                          /* Make parent create an empty node if needed. */
7280                         *flagp |= TRYAGAIN;
7281                         return(NULL);
7282                     }
7283                     goto tryagain;
7284                 }
7285                 return(NULL);
7286         }
7287         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7288         break;
7289     case '|':
7290     case ')':
7291         if (flags & TRYAGAIN) {
7292             *flagp |= TRYAGAIN;
7293             return NULL;
7294         }
7295         vFAIL("Internal urp");
7296                                 /* Supposed to be caught earlier. */
7297         break;
7298     case '{':
7299         if (!regcurly(RExC_parse)) {
7300             RExC_parse++;
7301             goto defchar;
7302         }
7303         /* FALL THROUGH */
7304     case '?':
7305     case '+':
7306     case '*':
7307         RExC_parse++;
7308         vFAIL("Quantifier follows nothing");
7309         break;
7310     case LATIN_SMALL_LETTER_SHARP_S:
7311     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7312     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7313 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7314 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
7315     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7316 #endif
7317         do_foldchar:
7318         if (!LOC && FOLD) {
7319             U32 len,cp;
7320             len=0; /* silence a spurious compiler warning */
7321             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7322                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7323                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7324                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7325                 Set_Node_Length(ret, 1); /* MJD */
7326                 nextchar(pRExC_state); /* kill whitespace under /x */
7327                 return ret;
7328             }
7329         }
7330         goto outer_default;
7331     case '\\':
7332         /* Special Escapes
7333
7334            This switch handles escape sequences that resolve to some kind
7335            of special regop and not to literal text. Escape sequnces that
7336            resolve to literal text are handled below in the switch marked
7337            "Literal Escapes".
7338
7339            Every entry in this switch *must* have a corresponding entry
7340            in the literal escape switch. However, the opposite is not
7341            required, as the default for this switch is to jump to the
7342            literal text handling code.
7343         */
7344         switch ((U8)*++RExC_parse) {
7345         case LATIN_SMALL_LETTER_SHARP_S:
7346         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7347         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7348                    goto do_foldchar;
7349         /* Special Escapes */
7350         case 'A':
7351             RExC_seen_zerolen++;
7352             ret = reg_node(pRExC_state, SBOL);
7353             *flagp |= SIMPLE;
7354             goto finish_meta_pat;
7355         case 'G':
7356             ret = reg_node(pRExC_state, GPOS);
7357             RExC_seen |= REG_SEEN_GPOS;
7358             *flagp |= SIMPLE;
7359             goto finish_meta_pat;
7360         case 'K':
7361             RExC_seen_zerolen++;
7362             ret = reg_node(pRExC_state, KEEPS);
7363             *flagp |= SIMPLE;
7364             /* XXX:dmq : disabling in-place substitution seems to
7365              * be necessary here to avoid cases of memory corruption, as
7366              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7367              */
7368             RExC_seen |= REG_SEEN_LOOKBEHIND;
7369             goto finish_meta_pat;
7370         case 'Z':
7371             ret = reg_node(pRExC_state, SEOL);
7372             *flagp |= SIMPLE;
7373             RExC_seen_zerolen++;                /* Do not optimize RE away */
7374             goto finish_meta_pat;
7375         case 'z':
7376             ret = reg_node(pRExC_state, EOS);
7377             *flagp |= SIMPLE;
7378             RExC_seen_zerolen++;                /* Do not optimize RE away */
7379             goto finish_meta_pat;
7380         case 'C':
7381             ret = reg_node(pRExC_state, CANY);
7382             RExC_seen |= REG_SEEN_CANY;
7383             *flagp |= HASWIDTH|SIMPLE;
7384             goto finish_meta_pat;
7385         case 'X':
7386             ret = reg_node(pRExC_state, CLUMP);
7387             *flagp |= HASWIDTH;
7388             goto finish_meta_pat;
7389         case 'w':
7390             if (LOC) {
7391                 ret = reg_node(pRExC_state, (U8)(ALNUML));
7392             } else {
7393                 ret = reg_node(pRExC_state, (U8)(ALNUM));
7394                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7395             }
7396             *flagp |= HASWIDTH|SIMPLE;
7397             goto finish_meta_pat;
7398         case 'W':
7399             if (LOC) {
7400                 ret = reg_node(pRExC_state, (U8)(NALNUML));
7401             } else {
7402                 ret = reg_node(pRExC_state, (U8)(NALNUM));
7403                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7404             }
7405             *flagp |= HASWIDTH|SIMPLE;
7406             goto finish_meta_pat;
7407         case 'b':
7408             RExC_seen_zerolen++;
7409             RExC_seen |= REG_SEEN_LOOKBEHIND;
7410             if (LOC) {
7411                 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7412             } else {
7413                 ret = reg_node(pRExC_state, (U8)(BOUND));
7414                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7415             }
7416             *flagp |= SIMPLE;
7417             goto finish_meta_pat;
7418         case 'B':
7419             RExC_seen_zerolen++;
7420             RExC_seen |= REG_SEEN_LOOKBEHIND;
7421             if (LOC) {
7422                 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7423             } else {
7424                 ret = reg_node(pRExC_state, (U8)(NBOUND));
7425                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7426             }
7427             *flagp |= SIMPLE;
7428             goto finish_meta_pat;
7429         case 's':
7430             if (LOC) {
7431                 ret = reg_node(pRExC_state, (U8)(SPACEL));
7432             } else {
7433                 ret = reg_node(pRExC_state, (U8)(SPACE));
7434                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7435             }
7436             *flagp |= HASWIDTH|SIMPLE;
7437             goto finish_meta_pat;
7438         case 'S':
7439             if (LOC) {
7440                 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7441             } else {
7442                 ret = reg_node(pRExC_state, (U8)(NSPACE));
7443                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7444             }
7445             *flagp |= HASWIDTH|SIMPLE;
7446             goto finish_meta_pat;
7447         case 'd':
7448             if (LOC) {
7449                 ret = reg_node(pRExC_state, (U8)(DIGITL));
7450             } else {
7451                 ret = reg_node(pRExC_state, (U8)(DIGIT));
7452             }
7453             *flagp |= HASWIDTH|SIMPLE;
7454             goto finish_meta_pat;
7455         case 'D':
7456             if (LOC) {
7457                 ret = reg_node(pRExC_state, (U8)(NDIGITL));
7458             } else {
7459                 ret = reg_node(pRExC_state, (U8)(NDIGIT));
7460             }
7461             *flagp |= HASWIDTH|SIMPLE;
7462             goto finish_meta_pat;
7463         case 'R':
7464             ret = reg_node(pRExC_state, LNBREAK);
7465             *flagp |= HASWIDTH|SIMPLE;
7466             goto finish_meta_pat;
7467         case 'h':
7468             ret = reg_node(pRExC_state, HORIZWS);
7469             *flagp |= HASWIDTH|SIMPLE;
7470             goto finish_meta_pat;
7471         case 'H':
7472             ret = reg_node(pRExC_state, NHORIZWS);
7473             *flagp |= HASWIDTH|SIMPLE;
7474             goto finish_meta_pat;
7475         case 'v':
7476             ret = reg_node(pRExC_state, VERTWS);
7477             *flagp |= HASWIDTH|SIMPLE;
7478             goto finish_meta_pat;
7479         case 'V':
7480             ret = reg_node(pRExC_state, NVERTWS);
7481             *flagp |= HASWIDTH|SIMPLE;
7482          finish_meta_pat:
7483             nextchar(pRExC_state);
7484             Set_Node_Length(ret, 2); /* MJD */
7485             break;
7486         case 'p':
7487         case 'P':
7488             {
7489                 char* const oldregxend = RExC_end;
7490 #ifdef DEBUGGING
7491                 char* parse_start = RExC_parse - 2;
7492 #endif
7493
7494                 if (RExC_parse[1] == '{') {
7495                   /* a lovely hack--pretend we saw [\pX] instead */
7496                     RExC_end = strchr(RExC_parse, '}');
7497                     if (!RExC_end) {
7498                         const U8 c = (U8)*RExC_parse;
7499                         RExC_parse += 2;
7500                         RExC_end = oldregxend;
7501                         vFAIL2("Missing right brace on \\%c{}", c);
7502                     }
7503                     RExC_end++;
7504                 }
7505                 else {
7506                     RExC_end = RExC_parse + 2;
7507                     if (RExC_end > oldregxend)
7508                         RExC_end = oldregxend;
7509                 }
7510                 RExC_parse--;
7511
7512                 ret = regclass(pRExC_state,depth+1);
7513
7514                 RExC_end = oldregxend;
7515                 RExC_parse--;
7516
7517                 Set_Node_Offset(ret, parse_start + 2);
7518                 Set_Node_Cur_Length(ret);
7519                 nextchar(pRExC_state);
7520                 *flagp |= HASWIDTH|SIMPLE;
7521             }
7522             break;
7523         case 'N':
7524             /* Handle \N and \N{NAME} here and not below because it can be
7525             multicharacter. join_exact() will join them up later on.
7526             Also this makes sure that things like /\N{BLAH}+/ and
7527             \N{BLAH} being multi char Just Happen. dmq*/
7528             ++RExC_parse;
7529             ret= reg_namedseq(pRExC_state, NULL, flagp);
7530             break;
7531         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7532         parse_named_seq:
7533         {
7534             char ch= RExC_parse[1];
7535             if (ch != '<' && ch != '\'' && ch != '{') {
7536                 RExC_parse++;
7537                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7538             } else {
7539                 /* this pretty much dupes the code for (?P=...) in reg(), if
7540                    you change this make sure you change that */
7541                 char* name_start = (RExC_parse += 2);
7542                 U32 num = 0;
7543                 SV *sv_dat = reg_scan_name(pRExC_state,
7544                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7545                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7546                 if (RExC_parse == name_start || *RExC_parse != ch)
7547                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7548
7549                 if (!SIZE_ONLY) {
7550                     num = add_data( pRExC_state, 1, "S" );
7551                     RExC_rxi->data->data[num]=(void*)sv_dat;
7552                     SvREFCNT_inc_simple_void(sv_dat);
7553                 }
7554
7555                 RExC_sawback = 1;
7556                 ret = reganode(pRExC_state,
7557                                ((! FOLD)
7558                                  ? NREF
7559                                  : (UNI_SEMANTICS)
7560                                    ? NREFFU
7561                                    : (LOC)
7562                                      ? NREFFL
7563                                      : NREFF),
7564                                 num);
7565                 *flagp |= HASWIDTH;
7566
7567                 /* override incorrect value set in reganode MJD */
7568                 Set_Node_Offset(ret, parse_start+1);
7569                 Set_Node_Cur_Length(ret); /* MJD */
7570                 nextchar(pRExC_state);
7571
7572             }
7573             break;
7574         }
7575         case 'g':
7576         case '1': case '2': case '3': case '4':
7577         case '5': case '6': case '7': case '8': case '9':
7578             {
7579                 I32 num;
7580                 bool isg = *RExC_parse == 'g';
7581                 bool isrel = 0;
7582                 bool hasbrace = 0;
7583                 if (isg) {
7584                     RExC_parse++;
7585                     if (*RExC_parse == '{') {
7586                         RExC_parse++;
7587                         hasbrace = 1;
7588                     }
7589                     if (*RExC_parse == '-') {
7590                         RExC_parse++;
7591                         isrel = 1;
7592                     }
7593                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7594                         if (isrel) RExC_parse--;
7595                         RExC_parse -= 2;
7596                         goto parse_named_seq;
7597                 }   }
7598                 num = atoi(RExC_parse);
7599                 if (isg && num == 0)
7600                     vFAIL("Reference to invalid group 0");
7601                 if (isrel) {
7602                     num = RExC_npar - num;
7603                     if (num < 1)
7604                         vFAIL("Reference to nonexistent or unclosed group");
7605                 }
7606                 if (!isg && num > 9 && num >= RExC_npar)
7607                     goto defchar;
7608                 else {
7609                     char * const parse_start = RExC_parse - 1; /* MJD */
7610                     while (isDIGIT(*RExC_parse))
7611                         RExC_parse++;
7612                     if (parse_start == RExC_parse - 1)
7613                         vFAIL("Unterminated \\g... pattern");
7614                     if (hasbrace) {
7615                         if (*RExC_parse != '}')
7616                             vFAIL("Unterminated \\g{...} pattern");
7617                         RExC_parse++;
7618                     }
7619                     if (!SIZE_ONLY) {
7620                         if (num > (I32)RExC_rx->nparens)
7621                             vFAIL("Reference to nonexistent group");
7622                     }
7623                     RExC_sawback = 1;
7624                     ret = reganode(pRExC_state,
7625                                    ((! FOLD)
7626                                      ? REF
7627                                      : (UNI_SEMANTICS)
7628                                        ? REFFU
7629                                        : (LOC)
7630                                          ? REFFL
7631                                          : REFF),
7632                                     num);
7633                     *flagp |= HASWIDTH;
7634
7635                     /* override incorrect value set in reganode MJD */
7636                     Set_Node_Offset(ret, parse_start+1);
7637                     Set_Node_Cur_Length(ret); /* MJD */
7638                     RExC_parse--;
7639                     nextchar(pRExC_state);
7640                 }
7641             }
7642             break;
7643         case '\0':
7644             if (RExC_parse >= RExC_end)
7645                 FAIL("Trailing \\");
7646             /* FALL THROUGH */
7647         default:
7648             /* Do not generate "unrecognized" warnings here, we fall
7649                back into the quick-grab loop below */
7650             parse_start--;
7651             goto defchar;
7652         }
7653         break;
7654
7655     case '#':
7656         if (RExC_flags & RXf_PMf_EXTENDED) {
7657             if ( reg_skipcomment( pRExC_state ) )
7658                 goto tryagain;
7659         }
7660         /* FALL THROUGH */
7661
7662     default:
7663         outer_default:{
7664             register STRLEN len;
7665             register UV ender;
7666             register char *p;
7667             char *s;
7668             STRLEN foldlen;
7669             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7670
7671             parse_start = RExC_parse - 1;
7672
7673             RExC_parse++;
7674
7675         defchar:
7676             ender = 0;
7677             ret = reg_node(pRExC_state,
7678                            (U8) ((! FOLD) ? EXACT
7679                                           : (LOC)
7680                                              ? EXACTFL
7681                                              : (UNI_SEMANTICS)
7682                                                ? EXACTFU
7683                                                : EXACTF)
7684                     );
7685             s = STRING(ret);
7686             for (len = 0, p = RExC_parse - 1;
7687               len < 127 && p < RExC_end;
7688               len++)
7689             {
7690                 char * const oldp = p;
7691
7692                 if (RExC_flags & RXf_PMf_EXTENDED)
7693                     p = regwhite( pRExC_state, p );
7694                 switch ((U8)*p) {
7695                 case LATIN_SMALL_LETTER_SHARP_S:
7696                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7697                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7698                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7699                                 goto normal_default;
7700                 case '^':
7701                 case '$':
7702                 case '.':
7703                 case '[':
7704                 case '(':
7705                 case ')':
7706                 case '|':
7707                     goto loopdone;
7708                 case '\\':
7709                     /* Literal Escapes Switch
7710
7711                        This switch is meant to handle escape sequences that
7712                        resolve to a literal character.
7713
7714                        Every escape sequence that represents something
7715                        else, like an assertion or a char class, is handled
7716                        in the switch marked 'Special Escapes' above in this
7717                        routine, but also has an entry here as anything that
7718                        isn't explicitly mentioned here will be treated as
7719                        an unescaped equivalent literal.
7720                     */
7721
7722                     switch ((U8)*++p) {
7723                     /* These are all the special escapes. */
7724                     case LATIN_SMALL_LETTER_SHARP_S:
7725                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7726                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7727                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7728                                 goto normal_default;
7729                     case 'A':             /* Start assertion */
7730                     case 'b': case 'B':   /* Word-boundary assertion*/
7731                     case 'C':             /* Single char !DANGEROUS! */
7732                     case 'd': case 'D':   /* digit class */
7733                     case 'g': case 'G':   /* generic-backref, pos assertion */
7734                     case 'h': case 'H':   /* HORIZWS */
7735                     case 'k': case 'K':   /* named backref, keep marker */
7736                     case 'N':             /* named char sequence */
7737                     case 'p': case 'P':   /* Unicode property */
7738                               case 'R':   /* LNBREAK */
7739                     case 's': case 'S':   /* space class */
7740                     case 'v': case 'V':   /* VERTWS */
7741                     case 'w': case 'W':   /* word class */
7742                     case 'X':             /* eXtended Unicode "combining character sequence" */
7743                     case 'z': case 'Z':   /* End of line/string assertion */
7744                         --p;
7745                         goto loopdone;
7746
7747                     /* Anything after here is an escape that resolves to a
7748                        literal. (Except digits, which may or may not)
7749                      */
7750                     case 'n':
7751                         ender = '\n';
7752                         p++;
7753                         break;
7754                     case 'r':
7755                         ender = '\r';
7756                         p++;
7757                         break;
7758                     case 't':
7759                         ender = '\t';
7760                         p++;
7761                         break;
7762                     case 'f':
7763                         ender = '\f';
7764                         p++;
7765                         break;
7766                     case 'e':
7767                           ender = ASCII_TO_NATIVE('\033');
7768                         p++;
7769                         break;
7770                     case 'a':
7771                           ender = ASCII_TO_NATIVE('\007');
7772                         p++;
7773                         break;
7774                     case 'o':
7775                         {
7776                             STRLEN brace_len = len;
7777                             UV result;
7778                             const char* error_msg;
7779
7780                             bool valid = grok_bslash_o(p,
7781                                                        &result,
7782                                                        &brace_len,
7783                                                        &error_msg,
7784                                                        1);
7785                             p += brace_len;
7786                             if (! valid) {
7787                                 RExC_parse = p; /* going to die anyway; point
7788                                                    to exact spot of failure */
7789                                 vFAIL(error_msg);
7790                             }
7791                             else
7792                             {
7793                                 ender = result;
7794                             }
7795                             if (PL_encoding && ender < 0x100) {
7796                                 goto recode_encoding;
7797                             }
7798                             if (ender > 0xff) {
7799                                 REQUIRE_UTF8;
7800                             }
7801                             break;
7802                         }
7803                     case 'x':
7804                         if (*++p == '{') {
7805                             char* const e = strchr(p, '}');
7806
7807                             if (!e) {
7808                                 RExC_parse = p + 1;
7809                                 vFAIL("Missing right brace on \\x{}");
7810                             }
7811                             else {
7812                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7813                                     | PERL_SCAN_DISALLOW_PREFIX;
7814                                 STRLEN numlen = e - p - 1;
7815                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7816                                 if (ender > 0xff)
7817                                     REQUIRE_UTF8;
7818                                 p = e + 1;
7819                             }
7820                         }
7821                         else {
7822                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7823                             STRLEN numlen = 2;
7824                             ender = grok_hex(p, &numlen, &flags, NULL);
7825                             p += numlen;
7826                         }
7827                         if (PL_encoding && ender < 0x100)
7828                             goto recode_encoding;
7829                         break;
7830                     case 'c':
7831                         p++;
7832                         ender = grok_bslash_c(*p++, SIZE_ONLY);
7833                         break;
7834                     case '0': case '1': case '2': case '3':case '4':
7835                     case '5': case '6': case '7': case '8':case '9':
7836                         if (*p == '0' ||
7837                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7838                         {
7839                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7840                             STRLEN numlen = 3;
7841                             ender = grok_oct(p, &numlen, &flags, NULL);
7842                             if (ender > 0xff) {
7843                                 REQUIRE_UTF8;
7844                             }
7845                             p += numlen;
7846                         }
7847                         else {
7848                             --p;
7849                             goto loopdone;
7850                         }
7851                         if (PL_encoding && ender < 0x100)
7852                             goto recode_encoding;
7853                         break;
7854                     recode_encoding:
7855                         {
7856                             SV* enc = PL_encoding;
7857                             ender = reg_recode((const char)(U8)ender, &enc);
7858                             if (!enc && SIZE_ONLY)
7859                                 ckWARNreg(p, "Invalid escape in the specified encoding");
7860                             REQUIRE_UTF8;
7861                         }
7862                         break;
7863                     case '\0':
7864                         if (p >= RExC_end)
7865                             FAIL("Trailing \\");
7866                         /* FALL THROUGH */
7867                     default:
7868                         if (!SIZE_ONLY&& isALPHA(*p))
7869                             ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7870                         goto normal_default;
7871                     }
7872                     break;
7873                 default:
7874                   normal_default:
7875                     if (UTF8_IS_START(*p) && UTF) {
7876                         STRLEN numlen;
7877                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7878                                                &numlen, UTF8_ALLOW_DEFAULT);
7879                         p += numlen;
7880                     }
7881                     else
7882                         ender = *p++;
7883                     break;
7884                 }
7885                 if ( RExC_flags & RXf_PMf_EXTENDED)
7886                     p = regwhite( pRExC_state, p );
7887                 if (UTF && FOLD) {
7888                     /* Prime the casefolded buffer. */
7889                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7890                 }
7891                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7892                     if (len)
7893                         p = oldp;
7894                     else if (UTF) {
7895                          if (FOLD) {
7896                               /* Emit all the Unicode characters. */
7897                               STRLEN numlen;
7898                               for (foldbuf = tmpbuf;
7899                                    foldlen;
7900                                    foldlen -= numlen) {
7901                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7902                                    if (numlen > 0) {
7903                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7904                                         s       += unilen;
7905                                         len     += unilen;
7906                                         /* In EBCDIC the numlen
7907                                          * and unilen can differ. */
7908                                         foldbuf += numlen;
7909                                         if (numlen >= foldlen)
7910                                              break;
7911                                    }
7912                                    else
7913                                         break; /* "Can't happen." */
7914                               }
7915                          }
7916                          else {
7917                               const STRLEN unilen = reguni(pRExC_state, ender, s);
7918                               if (unilen > 0) {
7919                                    s   += unilen;
7920                                    len += unilen;
7921                               }
7922                          }
7923                     }
7924                     else {
7925                         len++;
7926                         REGC((char)ender, s++);
7927                     }
7928                     break;
7929                 }
7930                 if (UTF) {
7931                      if (FOLD) {
7932                           /* Emit all the Unicode characters. */
7933                           STRLEN numlen;
7934                           for (foldbuf = tmpbuf;
7935                                foldlen;
7936                                foldlen -= numlen) {
7937                                ender = utf8_to_uvchr(foldbuf, &numlen);
7938                                if (numlen > 0) {
7939                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
7940                                     len     += unilen;
7941                                     s       += unilen;
7942                                     /* In EBCDIC the numlen
7943                                      * and unilen can differ. */
7944                                     foldbuf += numlen;
7945                                     if (numlen >= foldlen)
7946                                          break;
7947                                }
7948                                else
7949                                     break;
7950                           }
7951                      }
7952                      else {
7953                           const STRLEN unilen = reguni(pRExC_state, ender, s);
7954                           if (unilen > 0) {
7955                                s   += unilen;
7956                                len += unilen;
7957                           }
7958                      }
7959                      len--;
7960                 }
7961                 else
7962                     REGC((char)ender, s++);
7963             }
7964         loopdone:
7965             RExC_parse = p - 1;
7966             Set_Node_Cur_Length(ret); /* MJD */
7967             nextchar(pRExC_state);
7968             {
7969                 /* len is STRLEN which is unsigned, need to copy to signed */
7970                 IV iv = len;
7971                 if (iv < 0)
7972                     vFAIL("Internal disaster");
7973             }
7974             if (len > 0)
7975                 *flagp |= HASWIDTH;
7976             if (len == 1 && UNI_IS_INVARIANT(ender))
7977                 *flagp |= SIMPLE;
7978
7979             if (SIZE_ONLY)
7980                 RExC_size += STR_SZ(len);
7981             else {
7982                 STR_LEN(ret) = len;
7983                 RExC_emit += STR_SZ(len);
7984             }
7985         }
7986         break;
7987     }
7988
7989     return(ret);
7990 }
7991
7992 STATIC char *
7993 S_regwhite( RExC_state_t *pRExC_state, char *p )
7994 {
7995     const char *e = RExC_end;
7996
7997     PERL_ARGS_ASSERT_REGWHITE;
7998
7999     while (p < e) {
8000         if (isSPACE(*p))
8001             ++p;
8002         else if (*p == '#') {
8003             bool ended = 0;
8004             do {
8005                 if (*p++ == '\n') {
8006                     ended = 1;
8007                     break;
8008                 }
8009             } while (p < e);
8010             if (!ended)
8011                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8012         }
8013         else
8014             break;
8015     }
8016     return p;
8017 }
8018
8019 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8020    Character classes ([:foo:]) can also be negated ([:^foo:]).
8021    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8022    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
8023    but trigger failures because they are currently unimplemented. */
8024
8025 #define POSIXCC_DONE(c)   ((c) == ':')
8026 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8027 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8028
8029 STATIC I32
8030 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
8031 {
8032     dVAR;
8033     I32 namedclass = OOB_NAMEDCLASS;
8034
8035     PERL_ARGS_ASSERT_REGPPOSIXCC;
8036
8037     if (value == '[' && RExC_parse + 1 < RExC_end &&
8038         /* I smell either [: or [= or [. -- POSIX has been here, right? */
8039         POSIXCC(UCHARAT(RExC_parse))) {
8040         const char c = UCHARAT(RExC_parse);
8041         char* const s = RExC_parse++;
8042
8043         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8044             RExC_parse++;
8045         if (RExC_parse == RExC_end)
8046             /* Grandfather lone [:, [=, [. */
8047             RExC_parse = s;
8048         else {
8049             const char* const t = RExC_parse++; /* skip over the c */
8050             assert(*t == c);
8051
8052             if (UCHARAT(RExC_parse) == ']') {
8053                 const char *posixcc = s + 1;
8054                 RExC_parse++; /* skip over the ending ] */
8055
8056                 if (*s == ':') {
8057                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8058                     const I32 skip = t - posixcc;
8059
8060                     /* Initially switch on the length of the name.  */
8061                     switch (skip) {
8062                     case 4:
8063                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8064                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
8065                         break;
8066                     case 5:
8067                         /* Names all of length 5.  */
8068                         /* alnum alpha ascii blank cntrl digit graph lower
8069                            print punct space upper  */
8070                         /* Offset 4 gives the best switch position.  */
8071                         switch (posixcc[4]) {
8072                         case 'a':
8073                             if (memEQ(posixcc, "alph", 4)) /* alpha */
8074                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
8075                             break;
8076                         case 'e':
8077                             if (memEQ(posixcc, "spac", 4)) /* space */
8078                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
8079                             break;
8080                         case 'h':
8081                             if (memEQ(posixcc, "grap", 4)) /* graph */
8082                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
8083                             break;
8084                         case 'i':
8085                             if (memEQ(posixcc, "asci", 4)) /* ascii */
8086                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
8087                             break;
8088                         case 'k':
8089                             if (memEQ(posixcc, "blan", 4)) /* blank */
8090                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
8091                             break;
8092                         case 'l':
8093                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8094                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
8095                             break;
8096                         case 'm':
8097                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
8098                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
8099                             break;
8100                         case 'r':
8101                             if (memEQ(posixcc, "lowe", 4)) /* lower */
8102                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8103                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
8104                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8105                             break;
8106                         case 't':
8107                             if (memEQ(posixcc, "digi", 4)) /* digit */
8108                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8109                             else if (memEQ(posixcc, "prin", 4)) /* print */
8110                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8111                             else if (memEQ(posixcc, "punc", 4)) /* punct */
8112                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8113                             break;
8114                         }
8115                         break;
8116                     case 6:
8117                         if (memEQ(posixcc, "xdigit", 6))
8118                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8119                         break;
8120                     }
8121
8122                     if (namedclass == OOB_NAMEDCLASS)
8123                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8124                                       t - s - 1, s + 1);
8125                     assert (posixcc[skip] == ':');
8126                     assert (posixcc[skip+1] == ']');
8127                 } else if (!SIZE_ONLY) {
8128                     /* [[=foo=]] and [[.foo.]] are still future. */
8129
8130                     /* adjust RExC_parse so the warning shows after
8131                        the class closes */
8132                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8133                         RExC_parse++;
8134                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8135                 }
8136             } else {
8137                 /* Maternal grandfather:
8138                  * "[:" ending in ":" but not in ":]" */
8139                 RExC_parse = s;
8140             }
8141         }
8142     }
8143
8144     return namedclass;
8145 }
8146
8147 STATIC void
8148 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8149 {
8150     dVAR;
8151
8152     PERL_ARGS_ASSERT_CHECKPOSIXCC;
8153
8154     if (POSIXCC(UCHARAT(RExC_parse))) {
8155         const char *s = RExC_parse;
8156         const char  c = *s++;
8157
8158         while (isALNUM(*s))
8159             s++;
8160         if (*s && c == *s && s[1] == ']') {
8161             ckWARN3reg(s+2,
8162                        "POSIX syntax [%c %c] belongs inside character classes",
8163                        c, c);
8164
8165             /* [[=foo=]] and [[.foo.]] are still future. */
8166             if (POSIXCC_NOTYET(c)) {
8167                 /* adjust RExC_parse so the error shows after
8168                    the class closes */
8169                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8170                     NOOP;
8171                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8172             }
8173         }
8174     }
8175 }
8176
8177 /* No locale test */
8178 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
8179 ANYOF_##NAME:                                           \
8180         for (value = 0; value < 256; value++)           \
8181             if (TEST)                                   \
8182                 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8183     yesno = '+';                                        \
8184     what = WORD;                                        \
8185     break;                                              \
8186 case ANYOF_N##NAME:                                     \
8187         for (value = 0; value < 256; value++)           \
8188             if (!TEST)                                  \
8189                 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8190     yesno = '!';                                        \
8191     what = WORD;                                        \
8192     break
8193
8194 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8195  * there are two tests passed in, to use depending on that. There aren't any
8196  * cases where the label is different from the name, so no need for that
8197  * parameter */
8198 #define _C_C_T_(NAME,TEST_8,TEST_7,WORD)       \
8199 ANYOF_##NAME:                                           \
8200     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
8201     else if (UNI_SEMANTICS) {                           \
8202         for (value = 0; value < 256; value++) {         \
8203             if (TEST_8) stored +=                       \
8204                       S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8205         }                                               \
8206     }                                                   \
8207     else {                                              \
8208         for (value = 0; value < 128; value++) {         \
8209             if (TEST_7) stored +=                       \
8210                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
8211         }                                               \
8212     }                                                   \
8213     yesno = '+';                                        \
8214     what = WORD;                                        \
8215     break;                                              \
8216 case ANYOF_N##NAME:                                     \
8217     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
8218     else if (UNI_SEMANTICS) {                           \
8219         for (value = 0; value < 256; value++) {         \
8220             if (! TEST_8) stored +=                     \
8221                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8222         }                                               \
8223     }                                                   \
8224     else {                                              \
8225         for (value = 0; value < 128; value++) {         \
8226             if (! TEST_7) stored +=                     \
8227                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8228         }                                               \
8229         for (value = 128; value < 256; value++) {         \
8230                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8231         }                                               \
8232     }                                                   \
8233     yesno = '!';                                        \
8234     what = WORD;                                        \
8235     break
8236
8237 /*
8238    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8239    so that it is possible to override the option here without having to
8240    rebuild the entire core. as we are required to do if we change regcomp.h
8241    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8242 */
8243 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8244 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8245 #endif
8246
8247 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8248 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8249 #else
8250 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8251 #endif
8252
8253 STATIC U8
8254 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8255 {
8256
8257     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8258      * Locale folding is done at run-time, so this function should not be
8259      * called for nodes that are for locales.
8260      *
8261      * This function simply sets the bit corresponding to the fold of the input
8262      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
8263      * 'F' is 'f'.
8264      *
8265      * It also sets any necessary flags, and returns the number of bits that
8266      * actually changed from 0 to 1 */
8267
8268     U8 stored = 0;
8269     U8 fold;
8270
8271     fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
8272                            : PL_fold[value];
8273
8274     /* It assumes the bit for 'value' has already been set */
8275     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8276         ANYOF_BITMAP_SET(node, fold);
8277         stored++;
8278     }
8279
8280     /* The fold of the German sharp s is two ASCII characters, so isn't in the
8281      * bitmap and doesn't have to be in utf8, but we only process it if unicode
8282      * semantics are called for */
8283     if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
8284         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
8285     }
8286     else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8287              || (! UNI_SEMANTICS
8288                  && ! isASCII(value)
8289                  && PL_fold_latin1[value] != value))
8290     {   /* A character that has a fold outside of Latin1 matches outside the
8291            bitmap, but only when the target string is utf8.  Similarly when we
8292            don't have unicode semantics for the above ASCII Latin-1 characters,
8293            and they have a fold, they should match if the target is utf8, and
8294            not otherwise */
8295         ANYOF_FLAGS(node) |= ANYOF_UTF8;
8296     }
8297
8298     return stored;
8299 }
8300
8301
8302 PERL_STATIC_INLINE U8
8303 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8304 {
8305     /* This inline function sets a bit in the bitmap if not already set, and if
8306      * appropriate, its fold, returning the number of bits that actually
8307      * changed from 0 to 1 */
8308
8309     U8 stored;
8310
8311     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
8312         return 0;
8313     }
8314
8315     ANYOF_BITMAP_SET(node, value);
8316     stored = 1;
8317
8318     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
8319         stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8320     }
8321
8322     return stored;
8323 }
8324
8325 /*
8326    parse a class specification and produce either an ANYOF node that
8327    matches the pattern or if the pattern matches a single char only and
8328    that char is < 256 and we are case insensitive then we produce an
8329    EXACT node instead.
8330 */
8331
8332 STATIC regnode *
8333 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8334 {
8335     dVAR;
8336     register UV nextvalue;
8337     register IV prevvalue = OOB_UNICODE;
8338     register IV range = 0;
8339     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8340     register regnode *ret;
8341     STRLEN numlen;
8342     IV namedclass;
8343     char *rangebegin = NULL;
8344     bool need_class = 0;
8345     SV *listsv = NULL;
8346     UV n;
8347     AV* unicode_alternate  = NULL;
8348 #ifdef EBCDIC
8349     UV literal_endpoint = 0;
8350 #endif
8351     UV stored = 0;  /* how many chars stored in the bitmap */
8352
8353     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8354         case we need to change the emitted regop to an EXACT. */
8355     const char * orig_parse = RExC_parse;
8356     GET_RE_DEBUG_FLAGS_DECL;
8357
8358     PERL_ARGS_ASSERT_REGCLASS;
8359 #ifndef DEBUGGING
8360     PERL_UNUSED_ARG(depth);
8361 #endif
8362
8363     DEBUG_PARSE("clas");
8364
8365     /* Assume we are going to generate an ANYOF node. */
8366     ret = reganode(pRExC_state, ANYOF, 0);
8367
8368     if (!SIZE_ONLY)
8369         ANYOF_FLAGS(ret) = 0;
8370
8371     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
8372         RExC_naughty++;
8373         RExC_parse++;
8374         if (!SIZE_ONLY)
8375             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8376     }
8377
8378     if (SIZE_ONLY) {
8379         RExC_size += ANYOF_SKIP;
8380 #ifdef ANYOF_ADD_LOC_SKIP
8381         if (LOC) {
8382             RExC_size += ANYOF_ADD_LOC_SKIP;
8383         }
8384 #endif
8385         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8386     }
8387     else {
8388         RExC_emit += ANYOF_SKIP;
8389         if (LOC) {
8390             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8391 #ifdef ANYOF_ADD_LOC_SKIP
8392             RExC_emit += ANYOF_ADD_LOC_SKIP;
8393 #endif
8394         }
8395         ANYOF_BITMAP_ZERO(ret);
8396         listsv = newSVpvs("# comment\n");
8397     }
8398
8399     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8400
8401     if (!SIZE_ONLY && POSIXCC(nextvalue))
8402         checkposixcc(pRExC_state);
8403
8404     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8405     if (UCHARAT(RExC_parse) == ']')
8406         goto charclassloop;
8407
8408 parseit:
8409     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8410
8411     charclassloop:
8412
8413         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8414
8415         if (!range)
8416             rangebegin = RExC_parse;
8417         if (UTF) {
8418             value = utf8n_to_uvchr((U8*)RExC_parse,
8419                                    RExC_end - RExC_parse,
8420                                    &numlen, UTF8_ALLOW_DEFAULT);
8421             RExC_parse += numlen;
8422         }
8423         else
8424             value = UCHARAT(RExC_parse++);
8425
8426         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8427         if (value == '[' && POSIXCC(nextvalue))
8428             namedclass = regpposixcc(pRExC_state, value);
8429         else if (value == '\\') {
8430             if (UTF) {
8431                 value = utf8n_to_uvchr((U8*)RExC_parse,
8432                                    RExC_end - RExC_parse,
8433                                    &numlen, UTF8_ALLOW_DEFAULT);
8434                 RExC_parse += numlen;
8435             }
8436             else
8437                 value = UCHARAT(RExC_parse++);
8438             /* Some compilers cannot handle switching on 64-bit integer
8439              * values, therefore value cannot be an UV.  Yes, this will
8440              * be a problem later if we want switch on Unicode.
8441              * A similar issue a little bit later when switching on
8442              * namedclass. --jhi */
8443             switch ((I32)value) {
8444             case 'w':   namedclass = ANYOF_ALNUM;       break;
8445             case 'W':   namedclass = ANYOF_NALNUM;      break;
8446             case 's':   namedclass = ANYOF_SPACE;       break;
8447             case 'S':   namedclass = ANYOF_NSPACE;      break;
8448             case 'd':   namedclass = ANYOF_DIGIT;       break;
8449             case 'D':   namedclass = ANYOF_NDIGIT;      break;
8450             case 'v':   namedclass = ANYOF_VERTWS;      break;
8451             case 'V':   namedclass = ANYOF_NVERTWS;     break;
8452             case 'h':   namedclass = ANYOF_HORIZWS;     break;
8453             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
8454             case 'N':  /* Handle \N{NAME} in class */
8455                 {
8456                     /* We only pay attention to the first char of
8457                     multichar strings being returned. I kinda wonder
8458                     if this makes sense as it does change the behaviour
8459                     from earlier versions, OTOH that behaviour was broken
8460                     as well. */
8461                     UV v; /* value is register so we cant & it /grrr */
8462                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8463                         goto parseit;
8464                     }
8465                     value= v;
8466                 }
8467                 break;
8468             case 'p':
8469             case 'P':
8470                 {
8471                 char *e;
8472                 if (RExC_parse >= RExC_end)
8473                     vFAIL2("Empty \\%c{}", (U8)value);
8474                 if (*RExC_parse == '{') {
8475                     const U8 c = (U8)value;
8476                     e = strchr(RExC_parse++, '}');
8477                     if (!e)
8478                         vFAIL2("Missing right brace on \\%c{}", c);
8479                     while (isSPACE(UCHARAT(RExC_parse)))
8480                         RExC_parse++;
8481                     if (e == RExC_parse)
8482                         vFAIL2("Empty \\%c{}", c);
8483                     n = e - RExC_parse;
8484                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8485                         n--;
8486                 }
8487                 else {
8488                     e = RExC_parse;
8489                     n = 1;
8490                 }
8491                 if (!SIZE_ONLY) {
8492                     if (UCHARAT(RExC_parse) == '^') {
8493                          RExC_parse++;
8494                          n--;
8495                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8496                          while (isSPACE(UCHARAT(RExC_parse))) {
8497                               RExC_parse++;
8498                               n--;
8499                          }
8500                     }
8501                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8502                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8503                 }
8504                 RExC_parse = e + 1;
8505
8506                 /* The \p could match something in the Latin1 range, hence
8507                  * something that isn't utf8 */
8508                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
8509                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8510                 }
8511                 break;
8512             case 'n':   value = '\n';                   break;
8513             case 'r':   value = '\r';                   break;
8514             case 't':   value = '\t';                   break;
8515             case 'f':   value = '\f';                   break;
8516             case 'b':   value = '\b';                   break;
8517             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8518             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8519             case 'o':
8520                 RExC_parse--;   /* function expects to be pointed at the 'o' */
8521                 {
8522                     const char* error_msg;
8523                     bool valid = grok_bslash_o(RExC_parse,
8524                                                &value,
8525                                                &numlen,
8526                                                &error_msg,
8527                                                SIZE_ONLY);
8528                     RExC_parse += numlen;
8529                     if (! valid) {
8530                         vFAIL(error_msg);
8531                     }
8532                 }
8533                 if (PL_encoding && value < 0x100) {
8534                     goto recode_encoding;
8535                 }
8536                 break;
8537             case 'x':
8538                 if (*RExC_parse == '{') {
8539                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8540                         | PERL_SCAN_DISALLOW_PREFIX;
8541                     char * const e = strchr(RExC_parse++, '}');
8542                     if (!e)
8543                         vFAIL("Missing right brace on \\x{}");
8544
8545                     numlen = e - RExC_parse;
8546                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8547                     RExC_parse = e + 1;
8548                 }
8549                 else {
8550                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8551                     numlen = 2;
8552                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8553                     RExC_parse += numlen;
8554                 }
8555                 if (PL_encoding && value < 0x100)
8556                     goto recode_encoding;
8557                 break;
8558             case 'c':
8559                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8560                 break;
8561             case '0': case '1': case '2': case '3': case '4':
8562             case '5': case '6': case '7':
8563                 {
8564                     /* Take 1-3 octal digits */
8565                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8566                     numlen = 3;
8567                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8568                     RExC_parse += numlen;
8569                     if (PL_encoding && value < 0x100)
8570                         goto recode_encoding;
8571                     break;
8572                 }
8573             recode_encoding:
8574                 {
8575                     SV* enc = PL_encoding;
8576                     value = reg_recode((const char)(U8)value, &enc);
8577                     if (!enc && SIZE_ONLY)
8578                         ckWARNreg(RExC_parse,
8579                                   "Invalid escape in the specified encoding");
8580                     break;
8581                 }
8582             default:
8583                 /* Allow \_ to not give an error */
8584                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8585                     ckWARN2reg(RExC_parse,
8586                                "Unrecognized escape \\%c in character class passed through",
8587                                (int)value);
8588                 }
8589                 break;
8590             }
8591         } /* end of \blah */
8592 #ifdef EBCDIC
8593         else
8594             literal_endpoint++;
8595 #endif
8596
8597         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8598
8599             /* What matches in a locale is not known until runtime, so need to
8600              * (one time per class) allocate extra space to pass to regexec.
8601              * The space will contain a bit for each named class that is to be
8602              * matched against.  This isn't needed for \p{} and pseudo-classes,
8603              * as they are not affected by locale, and hence are dealt with
8604              * separately */
8605             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8606                 need_class = 1;
8607                 if (SIZE_ONLY) {
8608 #ifdef ANYOF_CLASS_ADD_SKIP
8609                     RExC_size += ANYOF_CLASS_ADD_SKIP;
8610 #endif
8611                 }
8612                 else {
8613 #ifdef ANYOF_CLASS_ADD_SKIP
8614                     RExC_emit += ANYOF_CLASS_ADD_SKIP;
8615 #endif
8616                     ANYOF_CLASS_ZERO(ret);
8617                 }
8618                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8619             }
8620
8621             /* a bad range like a-\d, a-[:digit:] ? */
8622             if (range) {
8623                 if (!SIZE_ONLY) {
8624                     const int w =
8625                         RExC_parse >= rangebegin ?
8626                         RExC_parse - rangebegin : 0;
8627                     ckWARN4reg(RExC_parse,
8628                                "False [] range \"%*.*s\"",
8629                                w, w, rangebegin);
8630
8631                     if (prevvalue < 256) {
8632                         stored +=
8633                          S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
8634                         stored +=
8635                          S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8636                     }
8637                     else {
8638                         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8639                         Perl_sv_catpvf(aTHX_ listsv,
8640                            "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8641                     }
8642                 }
8643
8644                 range = 0; /* this was not a true range */
8645             }
8646
8647
8648
8649             if (!SIZE_ONLY) {
8650                 const char *what = NULL;
8651                 char yesno = 0;
8652
8653                 /* Possible truncation here but in some 64-bit environments
8654                  * the compiler gets heartburn about switch on 64-bit values.
8655                  * A similar issue a little earlier when switching on value.
8656                  * --jhi */
8657                 switch ((I32)namedclass) {
8658
8659                 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8660                 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8661                 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8662                 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8663                 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8664                 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8665                 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8666                 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8667                 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8668                 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
8669 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8670                 /* \s, \w match all unicode if utf8. */
8671                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8672                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8673 #else
8674                 /* \s, \w match ascii and locale only */
8675                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8676                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8677 #endif
8678                 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
8679                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8680                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8681                 case ANYOF_ASCII:
8682                     if (LOC)
8683                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8684                     else {
8685                         for (value = 0; value < 128; value++)
8686                             stored +=
8687                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8688                     }
8689                     yesno = '+';
8690                     what = NULL;        /* Doesn't match outside ascii, so
8691                                            don't want to add +utf8:: */
8692                     break;
8693                 case ANYOF_NASCII:
8694                     if (LOC)
8695                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8696                     else {
8697                         for (value = 128; value < 256; value++)
8698                             stored +=
8699                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8700                     }
8701                     yesno = '!';
8702                     what = "ASCII";
8703                     break;
8704                 case ANYOF_DIGIT:
8705                     if (LOC)
8706                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8707                     else {
8708                         /* consecutive digits assumed */
8709                         for (value = '0'; value <= '9'; value++)
8710                             stored +=
8711                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8712                     }
8713                     yesno = '+';
8714                     what = POSIX_CC_UNI_NAME("Digit");
8715                     break;
8716                 case ANYOF_NDIGIT:
8717                     if (LOC)
8718                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8719                     else {
8720                         /* consecutive digits assumed */
8721                         for (value = 0; value < '0'; value++)
8722                             stored +=
8723                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8724                         for (value = '9' + 1; value < 256; value++)
8725                             stored +=
8726                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8727                     }
8728                     yesno = '!';
8729                     what = POSIX_CC_UNI_NAME("Digit");
8730                     break;
8731                 case ANYOF_MAX:
8732                     /* this is to handle \p and \P */
8733                     break;
8734                 default:
8735                     vFAIL("Invalid [::] class");
8736                     break;
8737                 }
8738                 if (what) {
8739                     /* Strings such as "+utf8::isWord\n" */
8740                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8741                     ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8742                 }
8743
8744                 continue;
8745             }
8746         } /* end of namedclass \blah */
8747
8748         if (range) {
8749             if (prevvalue > (IV)value) /* b-a */ {
8750                 const int w = RExC_parse - rangebegin;
8751                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8752                 range = 0; /* not a valid range */
8753             }
8754         }
8755         else {
8756             prevvalue = value; /* save the beginning of the range */
8757             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8758                 RExC_parse[1] != ']') {
8759                 RExC_parse++;
8760
8761                 /* a bad range like \w-, [:word:]- ? */
8762                 if (namedclass > OOB_NAMEDCLASS) {
8763                     if (ckWARN(WARN_REGEXP)) {
8764                         const int w =
8765                             RExC_parse >= rangebegin ?
8766                             RExC_parse - rangebegin : 0;
8767                         vWARN4(RExC_parse,
8768                                "False [] range \"%*.*s\"",
8769                                w, w, rangebegin);
8770                     }
8771                     if (!SIZE_ONLY)
8772                         stored +=
8773                             S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8774                 } else
8775                     range = 1;  /* yeah, it's a range! */
8776                 continue;       /* but do it the next time */
8777             }
8778         }
8779
8780         /* now is the next time */
8781         if (!SIZE_ONLY) {
8782             if (prevvalue < 256) {
8783                 const IV ceilvalue = value < 256 ? value : 255;
8784                 IV i;
8785 #ifdef EBCDIC
8786                 /* In EBCDIC [\x89-\x91] should include
8787                  * the \x8e but [i-j] should not. */
8788                 if (literal_endpoint == 2 &&
8789                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8790                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8791                 {
8792                     if (isLOWER(prevvalue)) {
8793                         for (i = prevvalue; i <= ceilvalue; i++)
8794                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8795                                 stored +=
8796                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8797                             }
8798                     } else {
8799                         for (i = prevvalue; i <= ceilvalue; i++)
8800                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8801                                 stored +=
8802                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8803                             }
8804                     }
8805                 }
8806                 else
8807 #endif
8808                       for (i = prevvalue; i <= ceilvalue; i++) {
8809                         stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8810                       }
8811           }
8812           if (value > 255 || UTF) {
8813                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8814                 const UV natvalue      = NATIVE_TO_UNI(value);
8815
8816                 /* If the code point requires utf8 to represent, and we are not
8817                  * folding, it can't match unless the target is in utf8.  Only
8818                  * a few code points above 255 fold to below it, so XXX an
8819                  * optimization would be to know which ones and set the flag
8820                  * appropriately. */
8821                 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8822                                     ? ANYOF_NONBITMAP
8823                                     : ANYOF_UTF8;
8824                 if (prevnatvalue < natvalue) { /* what about > ? */
8825                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8826                                    prevnatvalue, natvalue);
8827                 }
8828                 else if (prevnatvalue == natvalue) {
8829                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8830                     if (FOLD) {
8831                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8832                          STRLEN foldlen;
8833                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8834
8835 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8836                          if (RExC_precomp[0] == ':' &&
8837                              RExC_precomp[1] == '[' &&
8838                              (f == 0xDF || f == 0x92)) {
8839                              f = NATIVE_TO_UNI(f);
8840                         }
8841 #endif
8842                          /* If folding and foldable and a single
8843                           * character, insert also the folded version
8844                           * to the charclass. */
8845                          if (f != value) {
8846 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8847                              if ((RExC_precomp[0] == ':' &&
8848                                   RExC_precomp[1] == '[' &&
8849                                   (f == 0xA2 &&
8850                                    (value == 0xFB05 || value == 0xFB06))) ?
8851                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8852                                  foldlen == (STRLEN)UNISKIP(f) )
8853 #else
8854                               if (foldlen == (STRLEN)UNISKIP(f))
8855 #endif
8856                                   Perl_sv_catpvf(aTHX_ listsv,
8857                                                  "%04"UVxf"\n", f);
8858                               else {
8859                                   /* Any multicharacter foldings
8860                                    * require the following transform:
8861                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8862                                    * where E folds into "pq" and F folds
8863                                    * into "rst", all other characters
8864                                    * fold to single characters.  We save
8865                                    * away these multicharacter foldings,
8866                                    * to be later saved as part of the
8867                                    * additional "s" data. */
8868                                   SV *sv;
8869
8870                                   if (!unicode_alternate)
8871                                       unicode_alternate = newAV();
8872                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8873                                                      TRUE);
8874                                   av_push(unicode_alternate, sv);
8875                               }
8876                          }
8877
8878                          /* If folding and the value is one of the Greek
8879                           * sigmas insert a few more sigmas to make the
8880                           * folding rules of the sigmas to work right.
8881                           * Note that not all the possible combinations
8882                           * are handled here: some of them are handled
8883                           * by the standard folding rules, and some of
8884                           * them (literal or EXACTF cases) are handled
8885                           * during runtime in regexec.c:S_find_byclass(). */
8886                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8887                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8888                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8889                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8890                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8891                          }
8892                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8893                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8894                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8895                     }
8896                 }
8897             }
8898 #ifdef EBCDIC
8899             literal_endpoint = 0;
8900 #endif
8901         }
8902
8903         range = 0; /* this range (if it was one) is done now */
8904     }
8905
8906
8907
8908     if (SIZE_ONLY)
8909         return ret;
8910     /****** !SIZE_ONLY AFTER HERE *********/
8911
8912     /* Folding in the bitmap is taken care of above, but not for locale, for
8913      * which we have to wait to see what folding is in effect at runtime, and
8914      * for things not in the bitmap */
8915     if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) {
8916         ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8917     }
8918
8919     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that this doesn't
8920      * optimize locale.  Doing so perhaps could be done as long as there is
8921      * nothing like \w in it; some thought also would have to be given to the
8922      * interaction with above 0x100 chars */
8923     if ((ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8924         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8925             ANYOF_BITMAP(ret)[value] ^= 0xFF;
8926         stored = 256 - stored;
8927
8928         /* The inversion means that everything above 255 is matched */
8929         ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
8930     }
8931
8932     /* A single character class can be "optimized" into an EXACTish node.
8933      * Note that since we don't currently count how many characters there are
8934      * outside the bitmap, we are XXX missing optimization possibilities for
8935      * them.  This optimization can't happen unless this is a truly single
8936      * character class, which means that it can't be an inversion into a
8937      * many-character class, and there must be no possibility of there being
8938      * things outside the bitmap.  'stored' (only) for locales doesn't include
8939      * \w, etc, so have to make a special test that they aren't present
8940      *
8941      * Similarly A 2-character class of the very special form like [bB] can be
8942      * optimized into an EXACTFish node, but only for non-locales, and for
8943      * characters which only have the two folds; so things like 'fF' and 'Ii'
8944      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
8945      * FI'. */
8946     if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
8947         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8948                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
8949             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8950                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
8951                                  /* If the latest code point has a fold whose
8952                                   * bit is set, it must be the only other one */
8953                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
8954                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
8955     {
8956         /* Note that the information needed to decide to do this optimization
8957          * is not currently available until the 2nd pass, and that the actually
8958          * used EXACTish node takes less space than the calculated ANYOF node,
8959          * and hence the amount of space calculated in the first pass is larger
8960          * than actually used, so this optimization doesn't gain us any space.
8961          * But an EXACT node is faster than an ANYOF node, and can be combined
8962          * with any adjacent EXACT nodes later by the optimizer for further
8963          * gains.  The speed of executing an EXACTF is similar to an ANYOF
8964          * node, so the optimization advantage comes from the ability to join
8965          * it to adjacent EXACT nodes */
8966
8967         const char * cur_parse= RExC_parse;
8968         U8 op;
8969         RExC_emit = (regnode *)orig_emit;
8970         RExC_parse = (char *)orig_parse;
8971
8972         if (stored == 1) {
8973
8974             /* A locale node with one point can be folded; all the other cases
8975              * with folding will have two points, since we calculate them above
8976              */
8977             if (ANYOF_FLAGS(ret) & ANYOF_FOLD) {
8978                  op = EXACTFL;
8979             }
8980             else {
8981                 op = EXACT;
8982             }
8983         }   /* else 2 chars in the bit map: the folds of each other */
8984         else if (UNI_SEMANTICS || !isASCII(value)) {
8985
8986             /* To join adjacent nodes, they must be the exact EXACTish type.
8987              * Try to use the most likely type, by using EXACTFU if the regex
8988              * calls for them, or is required because the character is
8989              * non-ASCII */
8990             op = EXACTFU;
8991         }
8992         else {    /* Otherwise, more likely to be EXACTF type */
8993             op = EXACTF;
8994         }
8995
8996         ret = reg_node(pRExC_state, op);
8997         RExC_parse = (char *)cur_parse;
8998         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
8999             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
9000             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
9001             STR_LEN(ret)= 2;
9002             RExC_emit += STR_SZ(2);
9003         }
9004         else {
9005             *STRING(ret)= (char)value;
9006             STR_LEN(ret)= 1;
9007             RExC_emit += STR_SZ(1);
9008         }
9009         SvREFCNT_dec(listsv);
9010         return ret;
9011     }
9012
9013     {
9014         AV * const av = newAV();
9015         SV *rv;
9016         /* The 0th element stores the character class description
9017          * in its textual form: used later (regexec.c:Perl_regclass_swash())
9018          * to initialize the appropriate swash (which gets stored in
9019          * the 1st element), and also useful for dumping the regnode.
9020          * The 2nd element stores the multicharacter foldings,
9021          * used later (regexec.c:S_reginclass()). */
9022         av_store(av, 0, listsv);
9023         av_store(av, 1, NULL);
9024         av_store(av, 2, MUTABLE_SV(unicode_alternate));
9025         rv = newRV_noinc(MUTABLE_SV(av));
9026         n = add_data(pRExC_state, 1, "s");
9027         RExC_rxi->data->data[n] = (void*)rv;
9028         ARG_SET(ret, n);
9029     }
9030     return ret;
9031 }
9032 #undef _C_C_T_
9033
9034
9035 /* reg_skipcomment()
9036
9037    Absorbs an /x style # comments from the input stream.
9038    Returns true if there is more text remaining in the stream.
9039    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
9040    terminates the pattern without including a newline.
9041
9042    Note its the callers responsibility to ensure that we are
9043    actually in /x mode
9044
9045 */
9046
9047 STATIC bool
9048 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
9049 {
9050     bool ended = 0;
9051
9052     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
9053
9054     while (RExC_parse < RExC_end)
9055         if (*RExC_parse++ == '\n') {
9056             ended = 1;
9057             break;
9058         }
9059     if (!ended) {
9060         /* we ran off the end of the pattern without ending
9061            the comment, so we have to add an \n when wrapping */
9062         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9063         return 0;
9064     } else
9065         return 1;
9066 }
9067
9068 /* nextchar()
9069
9070    Advances the parse position, and optionally absorbs
9071    "whitespace" from the inputstream.
9072
9073    Without /x "whitespace" means (?#...) style comments only,
9074    with /x this means (?#...) and # comments and whitespace proper.
9075
9076    Returns the RExC_parse point from BEFORE the scan occurs.
9077
9078    This is the /x friendly way of saying RExC_parse++.
9079 */
9080
9081 STATIC char*
9082 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
9083 {
9084     char* const retval = RExC_parse++;
9085
9086     PERL_ARGS_ASSERT_NEXTCHAR;
9087
9088     for (;;) {
9089         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9090                 RExC_parse[2] == '#') {
9091             while (*RExC_parse != ')') {
9092                 if (RExC_parse == RExC_end)
9093                     FAIL("Sequence (?#... not terminated");
9094                 RExC_parse++;
9095             }
9096             RExC_parse++;
9097             continue;
9098         }
9099         if (RExC_flags & RXf_PMf_EXTENDED) {
9100             if (isSPACE(*RExC_parse)) {
9101                 RExC_parse++;
9102                 continue;
9103             }
9104             else if (*RExC_parse == '#') {
9105                 if ( reg_skipcomment( pRExC_state ) )
9106                     continue;
9107             }
9108         }
9109         return retval;
9110     }
9111 }
9112
9113 /*
9114 - reg_node - emit a node
9115 */
9116 STATIC regnode *                        /* Location. */
9117 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
9118 {
9119     dVAR;
9120     register regnode *ptr;
9121     regnode * const ret = RExC_emit;
9122     GET_RE_DEBUG_FLAGS_DECL;
9123
9124     PERL_ARGS_ASSERT_REG_NODE;
9125
9126     if (SIZE_ONLY) {
9127         SIZE_ALIGN(RExC_size);
9128         RExC_size += 1;
9129         return(ret);
9130     }
9131     if (RExC_emit >= RExC_emit_bound)
9132         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9133
9134     NODE_ALIGN_FILL(ret);
9135     ptr = ret;
9136     FILL_ADVANCE_NODE(ptr, op);
9137 #ifdef RE_TRACK_PATTERN_OFFSETS
9138     if (RExC_offsets) {         /* MJD */
9139         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
9140               "reg_node", __LINE__,
9141               PL_reg_name[op],
9142               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9143                 ? "Overwriting end of array!\n" : "OK",
9144               (UV)(RExC_emit - RExC_emit_start),
9145               (UV)(RExC_parse - RExC_start),
9146               (UV)RExC_offsets[0]));
9147         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
9148     }
9149 #endif
9150     RExC_emit = ptr;
9151     return(ret);
9152 }
9153
9154 /*
9155 - reganode - emit a node with an argument
9156 */
9157 STATIC regnode *                        /* Location. */
9158 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
9159 {
9160     dVAR;
9161     register regnode *ptr;
9162     regnode * const ret = RExC_emit;
9163     GET_RE_DEBUG_FLAGS_DECL;
9164
9165     PERL_ARGS_ASSERT_REGANODE;
9166
9167     if (SIZE_ONLY) {
9168         SIZE_ALIGN(RExC_size);
9169         RExC_size += 2;
9170         /*
9171            We can't do this:
9172
9173            assert(2==regarglen[op]+1);
9174
9175            Anything larger than this has to allocate the extra amount.
9176            If we changed this to be:
9177
9178            RExC_size += (1 + regarglen[op]);
9179
9180            then it wouldn't matter. Its not clear what side effect
9181            might come from that so its not done so far.
9182            -- dmq
9183         */
9184         return(ret);
9185     }
9186     if (RExC_emit >= RExC_emit_bound)
9187         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9188
9189     NODE_ALIGN_FILL(ret);
9190     ptr = ret;
9191     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
9192 #ifdef RE_TRACK_PATTERN_OFFSETS
9193     if (RExC_offsets) {         /* MJD */
9194         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9195               "reganode",
9196               __LINE__,
9197               PL_reg_name[op],
9198               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
9199               "Overwriting end of array!\n" : "OK",
9200               (UV)(RExC_emit - RExC_emit_start),
9201               (UV)(RExC_parse - RExC_start),
9202               (UV)RExC_offsets[0]));
9203         Set_Cur_Node_Offset;
9204     }
9205 #endif
9206     RExC_emit = ptr;
9207     return(ret);
9208 }
9209
9210 /*
9211 - reguni - emit (if appropriate) a Unicode character
9212 */
9213 STATIC STRLEN
9214 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
9215 {
9216     dVAR;
9217
9218     PERL_ARGS_ASSERT_REGUNI;
9219
9220     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
9221 }
9222
9223 /*
9224 - reginsert - insert an operator in front of already-emitted operand
9225 *
9226 * Means relocating the operand.
9227 */
9228 STATIC void
9229 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
9230 {
9231     dVAR;
9232     register regnode *src;
9233     register regnode *dst;
9234     register regnode *place;
9235     const int offset = regarglen[(U8)op];
9236     const int size = NODE_STEP_REGNODE + offset;
9237     GET_RE_DEBUG_FLAGS_DECL;
9238
9239     PERL_ARGS_ASSERT_REGINSERT;
9240     PERL_UNUSED_ARG(depth);
9241 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
9242     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9243     if (SIZE_ONLY) {
9244         RExC_size += size;
9245         return;
9246     }
9247
9248     src = RExC_emit;
9249     RExC_emit += size;
9250     dst = RExC_emit;
9251     if (RExC_open_parens) {
9252         int paren;
9253         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9254         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9255             if ( RExC_open_parens[paren] >= opnd ) {
9256                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9257                 RExC_open_parens[paren] += size;
9258             } else {
9259                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9260             }
9261             if ( RExC_close_parens[paren] >= opnd ) {
9262                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9263                 RExC_close_parens[paren] += size;
9264             } else {
9265                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9266             }
9267         }
9268     }
9269
9270     while (src > opnd) {
9271         StructCopy(--src, --dst, regnode);
9272 #ifdef RE_TRACK_PATTERN_OFFSETS
9273         if (RExC_offsets) {     /* MJD 20010112 */
9274             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9275                   "reg_insert",
9276                   __LINE__,
9277                   PL_reg_name[op],
9278                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9279                     ? "Overwriting end of array!\n" : "OK",
9280                   (UV)(src - RExC_emit_start),
9281                   (UV)(dst - RExC_emit_start),
9282                   (UV)RExC_offsets[0]));
9283             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9284             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9285         }
9286 #endif
9287     }
9288
9289
9290     place = opnd;               /* Op node, where operand used to be. */
9291 #ifdef RE_TRACK_PATTERN_OFFSETS
9292     if (RExC_offsets) {         /* MJD */
9293         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9294               "reginsert",
9295               __LINE__,
9296               PL_reg_name[op],
9297               (UV)(place - RExC_emit_start) > RExC_offsets[0]
9298               ? "Overwriting end of array!\n" : "OK",
9299               (UV)(place - RExC_emit_start),
9300               (UV)(RExC_parse - RExC_start),
9301               (UV)RExC_offsets[0]));
9302         Set_Node_Offset(place, RExC_parse);
9303         Set_Node_Length(place, 1);
9304     }
9305 #endif
9306     src = NEXTOPER(place);
9307     FILL_ADVANCE_NODE(place, op);
9308     Zero(src, offset, regnode);
9309 }
9310
9311 /*
9312 - regtail - set the next-pointer at the end of a node chain of p to val.
9313 - SEE ALSO: regtail_study
9314 */
9315 /* TODO: All three parms should be const */
9316 STATIC void
9317 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9318 {
9319     dVAR;
9320     register regnode *scan;
9321     GET_RE_DEBUG_FLAGS_DECL;
9322
9323     PERL_ARGS_ASSERT_REGTAIL;
9324 #ifndef DEBUGGING
9325     PERL_UNUSED_ARG(depth);
9326 #endif
9327
9328     if (SIZE_ONLY)
9329         return;
9330
9331     /* Find last node. */
9332     scan = p;
9333     for (;;) {
9334         regnode * const temp = regnext(scan);
9335         DEBUG_PARSE_r({
9336             SV * const mysv=sv_newmortal();
9337             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9338             regprop(RExC_rx, mysv, scan);
9339             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9340                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9341                     (temp == NULL ? "->" : ""),
9342                     (temp == NULL ? PL_reg_name[OP(val)] : "")
9343             );
9344         });
9345         if (temp == NULL)
9346             break;
9347         scan = temp;
9348     }
9349
9350     if (reg_off_by_arg[OP(scan)]) {
9351         ARG_SET(scan, val - scan);
9352     }
9353     else {
9354         NEXT_OFF(scan) = val - scan;
9355     }
9356 }
9357
9358 #ifdef DEBUGGING
9359 /*
9360 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9361 - Look for optimizable sequences at the same time.
9362 - currently only looks for EXACT chains.
9363
9364 This is expermental code. The idea is to use this routine to perform
9365 in place optimizations on branches and groups as they are constructed,
9366 with the long term intention of removing optimization from study_chunk so
9367 that it is purely analytical.
9368
9369 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9370 to control which is which.
9371
9372 */
9373 /* TODO: All four parms should be const */
9374
9375 STATIC U8
9376 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9377 {
9378     dVAR;
9379     register regnode *scan;
9380     U8 exact = PSEUDO;
9381 #ifdef EXPERIMENTAL_INPLACESCAN
9382     I32 min = 0;
9383 #endif
9384     GET_RE_DEBUG_FLAGS_DECL;
9385
9386     PERL_ARGS_ASSERT_REGTAIL_STUDY;
9387
9388
9389     if (SIZE_ONLY)
9390         return exact;
9391
9392     /* Find last node. */
9393
9394     scan = p;
9395     for (;;) {
9396         regnode * const temp = regnext(scan);
9397 #ifdef EXPERIMENTAL_INPLACESCAN
9398         if (PL_regkind[OP(scan)] == EXACT)
9399             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9400                 return EXACT;
9401 #endif
9402         if ( exact ) {
9403             switch (OP(scan)) {
9404                 case EXACT:
9405                 case EXACTF:
9406                 case EXACTFU:
9407                 case EXACTFL:
9408                         if( exact == PSEUDO )
9409                             exact= OP(scan);
9410                         else if ( exact != OP(scan) )
9411                             exact= 0;
9412                 case NOTHING:
9413                     break;
9414                 default:
9415                     exact= 0;
9416             }
9417         }
9418         DEBUG_PARSE_r({
9419             SV * const mysv=sv_newmortal();
9420             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9421             regprop(RExC_rx, mysv, scan);
9422             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9423                 SvPV_nolen_const(mysv),
9424                 REG_NODE_NUM(scan),
9425                 PL_reg_name[exact]);
9426         });
9427         if (temp == NULL)
9428             break;
9429         scan = temp;
9430     }
9431     DEBUG_PARSE_r({
9432         SV * const mysv_val=sv_newmortal();
9433         DEBUG_PARSE_MSG("");
9434         regprop(RExC_rx, mysv_val, val);
9435         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9436                       SvPV_nolen_const(mysv_val),
9437                       (IV)REG_NODE_NUM(val),
9438                       (IV)(val - scan)
9439         );
9440     });
9441     if (reg_off_by_arg[OP(scan)]) {
9442         ARG_SET(scan, val - scan);
9443     }
9444     else {
9445         NEXT_OFF(scan) = val - scan;
9446     }
9447
9448     return exact;
9449 }
9450 #endif
9451
9452 /*
9453  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9454  */
9455 #ifdef DEBUGGING
9456 static void
9457 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9458 {
9459     int bit;
9460     int set=0;
9461
9462     for (bit=0; bit<32; bit++) {
9463         if (flags & (1<<bit)) {
9464             if (!set++ && lead)
9465                 PerlIO_printf(Perl_debug_log, "%s",lead);
9466             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9467         }
9468     }
9469     if (lead)  {
9470         if (set)
9471             PerlIO_printf(Perl_debug_log, "\n");
9472         else
9473             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9474     }
9475 }
9476 #endif
9477
9478 void
9479 Perl_regdump(pTHX_ const regexp *r)
9480 {
9481 #ifdef DEBUGGING
9482     dVAR;
9483     SV * const sv = sv_newmortal();
9484     SV *dsv= sv_newmortal();
9485     RXi_GET_DECL(r,ri);
9486     GET_RE_DEBUG_FLAGS_DECL;
9487
9488     PERL_ARGS_ASSERT_REGDUMP;
9489
9490     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9491
9492     /* Header fields of interest. */
9493     if (r->anchored_substr) {
9494         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9495             RE_SV_DUMPLEN(r->anchored_substr), 30);
9496         PerlIO_printf(Perl_debug_log,
9497                       "anchored %s%s at %"IVdf" ",
9498                       s, RE_SV_TAIL(r->anchored_substr),
9499                       (IV)r->anchored_offset);
9500     } else if (r->anchored_utf8) {
9501         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9502             RE_SV_DUMPLEN(r->anchored_utf8), 30);
9503         PerlIO_printf(Perl_debug_log,
9504                       "anchored utf8 %s%s at %"IVdf" ",
9505                       s, RE_SV_TAIL(r->anchored_utf8),
9506                       (IV)r->anchored_offset);
9507     }
9508     if (r->float_substr) {
9509         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9510             RE_SV_DUMPLEN(r->float_substr), 30);
9511         PerlIO_printf(Perl_debug_log,
9512                       "floating %s%s at %"IVdf"..%"UVuf" ",
9513                       s, RE_SV_TAIL(r->float_substr),
9514                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9515     } else if (r->float_utf8) {
9516         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9517             RE_SV_DUMPLEN(r->float_utf8), 30);
9518         PerlIO_printf(Perl_debug_log,
9519                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9520                       s, RE_SV_TAIL(r->float_utf8),
9521                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9522     }
9523     if (r->check_substr || r->check_utf8)
9524         PerlIO_printf(Perl_debug_log,
9525                       (const char *)
9526                       (r->check_substr == r->float_substr
9527                        && r->check_utf8 == r->float_utf8
9528                        ? "(checking floating" : "(checking anchored"));
9529     if (r->extflags & RXf_NOSCAN)
9530         PerlIO_printf(Perl_debug_log, " noscan");
9531     if (r->extflags & RXf_CHECK_ALL)
9532         PerlIO_printf(Perl_debug_log, " isall");
9533     if (r->check_substr || r->check_utf8)
9534         PerlIO_printf(Perl_debug_log, ") ");
9535
9536     if (ri->regstclass) {
9537         regprop(r, sv, ri->regstclass);
9538         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9539     }
9540     if (r->extflags & RXf_ANCH) {
9541         PerlIO_printf(Perl_debug_log, "anchored");
9542         if (r->extflags & RXf_ANCH_BOL)
9543             PerlIO_printf(Perl_debug_log, "(BOL)");
9544         if (r->extflags & RXf_ANCH_MBOL)
9545             PerlIO_printf(Perl_debug_log, "(MBOL)");
9546         if (r->extflags & RXf_ANCH_SBOL)
9547             PerlIO_printf(Perl_debug_log, "(SBOL)");
9548         if (r->extflags & RXf_ANCH_GPOS)
9549             PerlIO_printf(Perl_debug_log, "(GPOS)");
9550         PerlIO_putc(Perl_debug_log, ' ');
9551     }
9552     if (r->extflags & RXf_GPOS_SEEN)
9553         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9554     if (r->intflags & PREGf_SKIP)
9555         PerlIO_printf(Perl_debug_log, "plus ");
9556     if (r->intflags & PREGf_IMPLICIT)
9557         PerlIO_printf(Perl_debug_log, "implicit ");
9558     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9559     if (r->extflags & RXf_EVAL_SEEN)
9560         PerlIO_printf(Perl_debug_log, "with eval ");
9561     PerlIO_printf(Perl_debug_log, "\n");
9562     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9563 #else
9564     PERL_ARGS_ASSERT_REGDUMP;
9565     PERL_UNUSED_CONTEXT;
9566     PERL_UNUSED_ARG(r);
9567 #endif  /* DEBUGGING */
9568 }
9569
9570 /*
9571 - regprop - printable representation of opcode
9572 */
9573 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9574 STMT_START { \
9575         if (do_sep) {                           \
9576             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9577             if (flags & ANYOF_INVERT)           \
9578                 /*make sure the invert info is in each */ \
9579                 sv_catpvs(sv, "^");             \
9580             do_sep = 0;                         \
9581         }                                       \
9582 } STMT_END
9583
9584 void
9585 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9586 {
9587 #ifdef DEBUGGING
9588     dVAR;
9589     register int k;
9590     RXi_GET_DECL(prog,progi);
9591     GET_RE_DEBUG_FLAGS_DECL;
9592
9593     PERL_ARGS_ASSERT_REGPROP;
9594
9595     sv_setpvs(sv, "");
9596
9597     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9598         /* It would be nice to FAIL() here, but this may be called from
9599            regexec.c, and it would be hard to supply pRExC_state. */
9600         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9601     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9602
9603     k = PL_regkind[OP(o)];
9604
9605     if (k == EXACT) {
9606         sv_catpvs(sv, " ");
9607         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9608          * is a crude hack but it may be the best for now since
9609          * we have no flag "this EXACTish node was UTF-8"
9610          * --jhi */
9611         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9612                   PERL_PV_ESCAPE_UNI_DETECT |
9613                   PERL_PV_PRETTY_ELLIPSES   |
9614                   PERL_PV_PRETTY_LTGT       |
9615                   PERL_PV_PRETTY_NOCLEAR
9616                   );
9617     } else if (k == TRIE) {
9618         /* print the details of the trie in dumpuntil instead, as
9619          * progi->data isn't available here */
9620         const char op = OP(o);
9621         const U32 n = ARG(o);
9622         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9623                (reg_ac_data *)progi->data->data[n] :
9624                NULL;
9625         const reg_trie_data * const trie
9626             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9627
9628         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9629         DEBUG_TRIE_COMPILE_r(
9630             Perl_sv_catpvf(aTHX_ sv,
9631                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9632                 (UV)trie->startstate,
9633                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9634                 (UV)trie->wordcount,
9635                 (UV)trie->minlen,
9636                 (UV)trie->maxlen,
9637                 (UV)TRIE_CHARCOUNT(trie),
9638                 (UV)trie->uniquecharcount
9639             )
9640         );
9641         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9642             int i;
9643             int rangestart = -1;
9644             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9645             sv_catpvs(sv, "[");
9646             for (i = 0; i <= 256; i++) {
9647                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9648                     if (rangestart == -1)
9649                         rangestart = i;
9650                 } else if (rangestart != -1) {
9651                     if (i <= rangestart + 3)
9652                         for (; rangestart < i; rangestart++)
9653                             put_byte(sv, rangestart);
9654                     else {
9655                         put_byte(sv, rangestart);
9656                         sv_catpvs(sv, "-");
9657                         put_byte(sv, i - 1);
9658                     }
9659                     rangestart = -1;
9660                 }
9661             }
9662             sv_catpvs(sv, "]");
9663         }
9664
9665     } else if (k == CURLY) {
9666         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9667             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9668         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9669     }
9670     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9671         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9672     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9673         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9674         if ( RXp_PAREN_NAMES(prog) ) {
9675             if ( k != REF || (OP(o) < NREF)) {
9676                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9677                 SV **name= av_fetch(list, ARG(o), 0 );
9678                 if (name)
9679                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9680             }
9681             else {
9682                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9683                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9684                 I32 *nums=(I32*)SvPVX(sv_dat);
9685                 SV **name= av_fetch(list, nums[0], 0 );
9686                 I32 n;
9687                 if (name) {
9688                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9689                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9690                                     (n ? "," : ""), (IV)nums[n]);
9691                     }
9692                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9693                 }
9694             }
9695         }
9696     } else if (k == GOSUB)
9697         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9698     else if (k == VERB) {
9699         if (!o->flags)
9700             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9701                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9702     } else if (k == LOGICAL)
9703         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9704     else if (k == FOLDCHAR)
9705         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9706     else if (k == ANYOF) {
9707         int i, rangestart = -1;
9708         const U8 flags = ANYOF_FLAGS(o);
9709         int do_sep = 0;
9710
9711         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9712         static const char * const anyofs[] = {
9713             "\\w",
9714             "\\W",
9715             "\\s",
9716             "\\S",
9717             "\\d",
9718             "\\D",
9719             "[:alnum:]",
9720             "[:^alnum:]",
9721             "[:alpha:]",
9722             "[:^alpha:]",
9723             "[:ascii:]",
9724             "[:^ascii:]",
9725             "[:cntrl:]",
9726             "[:^cntrl:]",
9727             "[:graph:]",
9728             "[:^graph:]",
9729             "[:lower:]",
9730             "[:^lower:]",
9731             "[:print:]",
9732             "[:^print:]",
9733             "[:punct:]",
9734             "[:^punct:]",
9735             "[:upper:]",
9736             "[:^upper:]",
9737             "[:xdigit:]",
9738             "[:^xdigit:]",
9739             "[:space:]",
9740             "[:^space:]",
9741             "[:blank:]",
9742             "[:^blank:]"
9743         };
9744
9745         if (flags & ANYOF_LOCALE)
9746             sv_catpvs(sv, "{loc}");
9747         if (flags & ANYOF_FOLD)
9748             sv_catpvs(sv, "{i}");
9749         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9750         if (flags & ANYOF_INVERT)
9751             sv_catpvs(sv, "^");
9752
9753         /* output what the standard cp 0-255 bitmap matches */
9754         for (i = 0; i <= 256; i++) {
9755             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9756                 if (rangestart == -1)
9757                     rangestart = i;
9758             } else if (rangestart != -1) {
9759                 if (i <= rangestart + 3)
9760                     for (; rangestart < i; rangestart++)
9761                         put_byte(sv, rangestart);
9762                 else {
9763                     put_byte(sv, rangestart);
9764                     sv_catpvs(sv, "-");
9765                     put_byte(sv, i - 1);
9766                 }
9767                 do_sep = 1;
9768                 rangestart = -1;
9769             }
9770         }
9771
9772         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9773         /* output any special charclass tests (used entirely under use locale) */
9774         if (ANYOF_CLASS_TEST_ANY_SET(o))
9775             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9776                 if (ANYOF_CLASS_TEST(o,i)) {
9777                     sv_catpv(sv, anyofs[i]);
9778                     do_sep = 1;
9779                 }
9780
9781         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9782
9783         /* output information about the unicode matching */
9784         if (flags & ANYOF_UNICODE_ALL)
9785             sv_catpvs(sv, "{unicode_all}");
9786         else if (flags & ANYOF_UTF8)
9787             sv_catpvs(sv, "{unicode}");
9788         if (flags & ANYOF_NONBITMAP_NON_UTF8)
9789             sv_catpvs(sv, "{outside bitmap}");
9790
9791         {
9792             SV *lv;
9793             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9794
9795             if (lv) {
9796                 if (sw) {
9797                     U8 s[UTF8_MAXBYTES_CASE+1];
9798
9799                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9800                         uvchr_to_utf8(s, i);
9801
9802                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9803                             if (rangestart == -1)
9804                                 rangestart = i;
9805                         } else if (rangestart != -1) {
9806                             if (i <= rangestart + 3)
9807                                 for (; rangestart < i; rangestart++) {
9808                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9809                                     U8 *p;
9810                                     for(p = s; p < e; p++)
9811                                         put_byte(sv, *p);
9812                                 }
9813                             else {
9814                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9815                                 U8 *p;
9816                                 for (p = s; p < e; p++)
9817                                     put_byte(sv, *p);
9818                                 sv_catpvs(sv, "-");
9819                                 e = uvchr_to_utf8(s, i-1);
9820                                 for (p = s; p < e; p++)
9821                                     put_byte(sv, *p);
9822                                 }
9823                                 rangestart = -1;
9824                             }
9825                         }
9826
9827                     sv_catpvs(sv, "..."); /* et cetera */
9828                 }
9829
9830                 {
9831                     char *s = savesvpv(lv);
9832                     char * const origs = s;
9833
9834                     while (*s && *s != '\n')
9835                         s++;
9836
9837                     if (*s == '\n') {
9838                         const char * const t = ++s;
9839
9840                         while (*s) {
9841                             if (*s == '\n')
9842                                 *s = ' ';
9843                             s++;
9844                         }
9845                         if (s[-1] == ' ')
9846                             s[-1] = 0;
9847
9848                         sv_catpv(sv, t);
9849                     }
9850
9851                     Safefree(origs);
9852                 }
9853             }
9854         }
9855
9856         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9857     }
9858     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9859         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9860 #else
9861     PERL_UNUSED_CONTEXT;
9862     PERL_UNUSED_ARG(sv);
9863     PERL_UNUSED_ARG(o);
9864     PERL_UNUSED_ARG(prog);
9865 #endif  /* DEBUGGING */
9866 }
9867
9868 SV *
9869 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9870 {                               /* Assume that RE_INTUIT is set */
9871     dVAR;
9872     struct regexp *const prog = (struct regexp *)SvANY(r);
9873     GET_RE_DEBUG_FLAGS_DECL;
9874
9875     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9876     PERL_UNUSED_CONTEXT;
9877
9878     DEBUG_COMPILE_r(
9879         {
9880             const char * const s = SvPV_nolen_const(prog->check_substr
9881                       ? prog->check_substr : prog->check_utf8);
9882
9883             if (!PL_colorset) reginitcolors();
9884             PerlIO_printf(Perl_debug_log,
9885                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9886                       PL_colors[4],
9887                       prog->check_substr ? "" : "utf8 ",
9888                       PL_colors[5],PL_colors[0],
9889                       s,
9890                       PL_colors[1],
9891                       (strlen(s) > 60 ? "..." : ""));
9892         } );
9893
9894     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9895 }
9896
9897 /*
9898    pregfree()
9899
9900    handles refcounting and freeing the perl core regexp structure. When
9901    it is necessary to actually free the structure the first thing it
9902    does is call the 'free' method of the regexp_engine associated to
9903    the regexp, allowing the handling of the void *pprivate; member
9904    first. (This routine is not overridable by extensions, which is why
9905    the extensions free is called first.)
9906
9907    See regdupe and regdupe_internal if you change anything here.
9908 */
9909 #ifndef PERL_IN_XSUB_RE
9910 void
9911 Perl_pregfree(pTHX_ REGEXP *r)
9912 {
9913     SvREFCNT_dec(r);
9914 }
9915
9916 void
9917 Perl_pregfree2(pTHX_ REGEXP *rx)
9918 {
9919     dVAR;
9920     struct regexp *const r = (struct regexp *)SvANY(rx);
9921     GET_RE_DEBUG_FLAGS_DECL;
9922
9923     PERL_ARGS_ASSERT_PREGFREE2;
9924
9925     if (r->mother_re) {
9926         ReREFCNT_dec(r->mother_re);
9927     } else {
9928         CALLREGFREE_PVT(rx); /* free the private data */
9929         SvREFCNT_dec(RXp_PAREN_NAMES(r));
9930     }
9931     if (r->substrs) {
9932         SvREFCNT_dec(r->anchored_substr);
9933         SvREFCNT_dec(r->anchored_utf8);
9934         SvREFCNT_dec(r->float_substr);
9935         SvREFCNT_dec(r->float_utf8);
9936         Safefree(r->substrs);
9937     }
9938     RX_MATCH_COPY_FREE(rx);
9939 #ifdef PERL_OLD_COPY_ON_WRITE
9940     SvREFCNT_dec(r->saved_copy);
9941 #endif
9942     Safefree(r->offs);
9943 }
9944
9945 /*  reg_temp_copy()
9946
9947     This is a hacky workaround to the structural issue of match results
9948     being stored in the regexp structure which is in turn stored in
9949     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9950     could be PL_curpm in multiple contexts, and could require multiple
9951     result sets being associated with the pattern simultaneously, such
9952     as when doing a recursive match with (??{$qr})
9953
9954     The solution is to make a lightweight copy of the regexp structure
9955     when a qr// is returned from the code executed by (??{$qr}) this
9956     lightweight copy doesnt actually own any of its data except for
9957     the starp/end and the actual regexp structure itself.
9958
9959 */
9960
9961
9962 REGEXP *
9963 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9964 {
9965     struct regexp *ret;
9966     struct regexp *const r = (struct regexp *)SvANY(rx);
9967     register const I32 npar = r->nparens+1;
9968
9969     PERL_ARGS_ASSERT_REG_TEMP_COPY;
9970
9971     if (!ret_x)
9972         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9973     ret = (struct regexp *)SvANY(ret_x);
9974
9975     (void)ReREFCNT_inc(rx);
9976     /* We can take advantage of the existing "copied buffer" mechanism in SVs
9977        by pointing directly at the buffer, but flagging that the allocated
9978        space in the copy is zero. As we've just done a struct copy, it's now
9979        a case of zero-ing that, rather than copying the current length.  */
9980     SvPV_set(ret_x, RX_WRAPPED(rx));
9981     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9982     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9983            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9984     SvLEN_set(ret_x, 0);
9985     SvSTASH_set(ret_x, NULL);
9986     SvMAGIC_set(ret_x, NULL);
9987     Newx(ret->offs, npar, regexp_paren_pair);
9988     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9989     if (r->substrs) {
9990         Newx(ret->substrs, 1, struct reg_substr_data);
9991         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9992
9993         SvREFCNT_inc_void(ret->anchored_substr);
9994         SvREFCNT_inc_void(ret->anchored_utf8);
9995         SvREFCNT_inc_void(ret->float_substr);
9996         SvREFCNT_inc_void(ret->float_utf8);
9997
9998         /* check_substr and check_utf8, if non-NULL, point to either their
9999            anchored or float namesakes, and don't hold a second reference.  */
10000     }
10001     RX_MATCH_COPIED_off(ret_x);
10002 #ifdef PERL_OLD_COPY_ON_WRITE
10003     ret->saved_copy = NULL;
10004 #endif
10005     ret->mother_re = rx;
10006
10007     return ret_x;
10008 }
10009 #endif
10010
10011 /* regfree_internal()
10012
10013    Free the private data in a regexp. This is overloadable by
10014    extensions. Perl takes care of the regexp structure in pregfree(),
10015    this covers the *pprivate pointer which technically perl doesn't
10016    know about, however of course we have to handle the
10017    regexp_internal structure when no extension is in use.
10018
10019    Note this is called before freeing anything in the regexp
10020    structure.
10021  */
10022
10023 void
10024 Perl_regfree_internal(pTHX_ REGEXP * const rx)
10025 {
10026     dVAR;
10027     struct regexp *const r = (struct regexp *)SvANY(rx);
10028     RXi_GET_DECL(r,ri);
10029     GET_RE_DEBUG_FLAGS_DECL;
10030
10031     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
10032
10033     DEBUG_COMPILE_r({
10034         if (!PL_colorset)
10035             reginitcolors();
10036         {
10037             SV *dsv= sv_newmortal();
10038             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
10039                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
10040             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
10041                 PL_colors[4],PL_colors[5],s);
10042         }
10043     });
10044 #ifdef RE_TRACK_PATTERN_OFFSETS
10045     if (ri->u.offsets)
10046         Safefree(ri->u.offsets);             /* 20010421 MJD */
10047 #endif
10048     if (ri->data) {
10049         int n = ri->data->count;
10050         PAD* new_comppad = NULL;
10051         PAD* old_comppad;
10052         PADOFFSET refcnt;
10053
10054         while (--n >= 0) {
10055           /* If you add a ->what type here, update the comment in regcomp.h */
10056             switch (ri->data->what[n]) {
10057             case 'a':
10058             case 's':
10059             case 'S':
10060             case 'u':
10061                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
10062                 break;
10063             case 'f':
10064                 Safefree(ri->data->data[n]);
10065                 break;
10066             case 'p':
10067                 new_comppad = MUTABLE_AV(ri->data->data[n]);
10068                 break;
10069             case 'o':
10070                 if (new_comppad == NULL)
10071                     Perl_croak(aTHX_ "panic: pregfree comppad");
10072                 PAD_SAVE_LOCAL(old_comppad,
10073                     /* Watch out for global destruction's random ordering. */
10074                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
10075                 );
10076                 OP_REFCNT_LOCK;
10077                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
10078                 OP_REFCNT_UNLOCK;
10079                 if (!refcnt)
10080                     op_free((OP_4tree*)ri->data->data[n]);
10081
10082                 PAD_RESTORE_LOCAL(old_comppad);
10083                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
10084                 new_comppad = NULL;
10085                 break;
10086             case 'n':
10087                 break;
10088             case 'T':
10089                 { /* Aho Corasick add-on structure for a trie node.
10090                      Used in stclass optimization only */
10091                     U32 refcount;
10092                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
10093                     OP_REFCNT_LOCK;
10094                     refcount = --aho->refcount;
10095                     OP_REFCNT_UNLOCK;
10096                     if ( !refcount ) {
10097                         PerlMemShared_free(aho->states);
10098                         PerlMemShared_free(aho->fail);
10099                          /* do this last!!!! */
10100                         PerlMemShared_free(ri->data->data[n]);
10101                         PerlMemShared_free(ri->regstclass);
10102                     }
10103                 }
10104                 break;
10105             case 't':
10106                 {
10107                     /* trie structure. */
10108                     U32 refcount;
10109                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
10110                     OP_REFCNT_LOCK;
10111                     refcount = --trie->refcount;
10112                     OP_REFCNT_UNLOCK;
10113                     if ( !refcount ) {
10114                         PerlMemShared_free(trie->charmap);
10115                         PerlMemShared_free(trie->states);
10116                         PerlMemShared_free(trie->trans);
10117                         if (trie->bitmap)
10118                             PerlMemShared_free(trie->bitmap);
10119                         if (trie->jump)
10120                             PerlMemShared_free(trie->jump);
10121                         PerlMemShared_free(trie->wordinfo);
10122                         /* do this last!!!! */
10123                         PerlMemShared_free(ri->data->data[n]);
10124                     }
10125                 }
10126                 break;
10127             default:
10128                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
10129             }
10130         }
10131         Safefree(ri->data->what);
10132         Safefree(ri->data);
10133     }
10134
10135     Safefree(ri);
10136 }
10137
10138 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10139 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
10140 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
10141
10142 /*
10143    re_dup - duplicate a regexp.
10144
10145    This routine is expected to clone a given regexp structure. It is only
10146    compiled under USE_ITHREADS.
10147
10148    After all of the core data stored in struct regexp is duplicated
10149    the regexp_engine.dupe method is used to copy any private data
10150    stored in the *pprivate pointer. This allows extensions to handle
10151    any duplication it needs to do.
10152
10153    See pregfree() and regfree_internal() if you change anything here.
10154 */
10155 #if defined(USE_ITHREADS)
10156 #ifndef PERL_IN_XSUB_RE
10157 void
10158 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
10159 {
10160     dVAR;
10161     I32 npar;
10162     const struct regexp *r = (const struct regexp *)SvANY(sstr);
10163     struct regexp *ret = (struct regexp *)SvANY(dstr);
10164
10165     PERL_ARGS_ASSERT_RE_DUP_GUTS;
10166
10167     npar = r->nparens+1;
10168     Newx(ret->offs, npar, regexp_paren_pair);
10169     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10170     if(ret->swap) {
10171         /* no need to copy these */
10172         Newx(ret->swap, npar, regexp_paren_pair);
10173     }
10174
10175     if (ret->substrs) {
10176         /* Do it this way to avoid reading from *r after the StructCopy().
10177            That way, if any of the sv_dup_inc()s dislodge *r from the L1
10178            cache, it doesn't matter.  */
10179         const bool anchored = r->check_substr
10180             ? r->check_substr == r->anchored_substr
10181             : r->check_utf8 == r->anchored_utf8;
10182         Newx(ret->substrs, 1, struct reg_substr_data);
10183         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10184
10185         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10186         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10187         ret->float_substr = sv_dup_inc(ret->float_substr, param);
10188         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
10189
10190         /* check_substr and check_utf8, if non-NULL, point to either their
10191            anchored or float namesakes, and don't hold a second reference.  */
10192
10193         if (ret->check_substr) {
10194             if (anchored) {
10195                 assert(r->check_utf8 == r->anchored_utf8);
10196                 ret->check_substr = ret->anchored_substr;
10197                 ret->check_utf8 = ret->anchored_utf8;
10198             } else {
10199                 assert(r->check_substr == r->float_substr);
10200                 assert(r->check_utf8 == r->float_utf8);
10201                 ret->check_substr = ret->float_substr;
10202                 ret->check_utf8 = ret->float_utf8;
10203             }
10204         } else if (ret->check_utf8) {
10205             if (anchored) {
10206                 ret->check_utf8 = ret->anchored_utf8;
10207             } else {
10208                 ret->check_utf8 = ret->float_utf8;
10209             }
10210         }
10211     }
10212
10213     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
10214
10215     if (ret->pprivate)
10216         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
10217
10218     if (RX_MATCH_COPIED(dstr))
10219         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
10220     else
10221         ret->subbeg = NULL;
10222 #ifdef PERL_OLD_COPY_ON_WRITE
10223     ret->saved_copy = NULL;
10224 #endif
10225
10226     if (ret->mother_re) {
10227         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10228             /* Our storage points directly to our mother regexp, but that's
10229                1: a buffer in a different thread
10230                2: something we no longer hold a reference on
10231                so we need to copy it locally.  */
10232             /* Note we need to sue SvCUR() on our mother_re, because it, in
10233                turn, may well be pointing to its own mother_re.  */
10234             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10235                                    SvCUR(ret->mother_re)+1));
10236             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10237         }
10238         ret->mother_re      = NULL;
10239     }
10240     ret->gofs = 0;
10241 }
10242 #endif /* PERL_IN_XSUB_RE */
10243
10244 /*
10245    regdupe_internal()
10246
10247    This is the internal complement to regdupe() which is used to copy
10248    the structure pointed to by the *pprivate pointer in the regexp.
10249    This is the core version of the extension overridable cloning hook.
10250    The regexp structure being duplicated will be copied by perl prior
10251    to this and will be provided as the regexp *r argument, however
10252    with the /old/ structures pprivate pointer value. Thus this routine
10253    may override any copying normally done by perl.
10254
10255    It returns a pointer to the new regexp_internal structure.
10256 */
10257
10258 void *
10259 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10260 {
10261     dVAR;
10262     struct regexp *const r = (struct regexp *)SvANY(rx);
10263     regexp_internal *reti;
10264     int len, npar;
10265     RXi_GET_DECL(r,ri);
10266
10267     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10268
10269     npar = r->nparens+1;
10270     len = ProgLen(ri);
10271
10272     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10273     Copy(ri->program, reti->program, len+1, regnode);
10274
10275
10276     reti->regstclass = NULL;
10277
10278     if (ri->data) {
10279         struct reg_data *d;
10280         const int count = ri->data->count;
10281         int i;
10282
10283         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10284                 char, struct reg_data);
10285         Newx(d->what, count, U8);
10286
10287         d->count = count;
10288         for (i = 0; i < count; i++) {
10289             d->what[i] = ri->data->what[i];
10290             switch (d->what[i]) {
10291                 /* legal options are one of: sSfpontTua
10292                    see also regcomp.h and pregfree() */
10293             case 'a': /* actually an AV, but the dup function is identical.  */
10294             case 's':
10295             case 'S':
10296             case 'p': /* actually an AV, but the dup function is identical.  */
10297             case 'u': /* actually an HV, but the dup function is identical.  */
10298                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10299                 break;
10300             case 'f':
10301                 /* This is cheating. */
10302                 Newx(d->data[i], 1, struct regnode_charclass_class);
10303                 StructCopy(ri->data->data[i], d->data[i],
10304                             struct regnode_charclass_class);
10305                 reti->regstclass = (regnode*)d->data[i];
10306                 break;
10307             case 'o':
10308                 /* Compiled op trees are readonly and in shared memory,
10309                    and can thus be shared without duplication. */
10310                 OP_REFCNT_LOCK;
10311                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10312                 OP_REFCNT_UNLOCK;
10313                 break;
10314             case 'T':
10315                 /* Trie stclasses are readonly and can thus be shared
10316                  * without duplication. We free the stclass in pregfree
10317                  * when the corresponding reg_ac_data struct is freed.
10318                  */
10319                 reti->regstclass= ri->regstclass;
10320                 /* Fall through */
10321             case 't':
10322                 OP_REFCNT_LOCK;
10323                 ((reg_trie_data*)ri->data->data[i])->refcount++;
10324                 OP_REFCNT_UNLOCK;
10325                 /* Fall through */
10326             case 'n':
10327                 d->data[i] = ri->data->data[i];
10328                 break;
10329             default:
10330                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10331             }
10332         }
10333
10334         reti->data = d;
10335     }
10336     else
10337         reti->data = NULL;
10338
10339     reti->name_list_idx = ri->name_list_idx;
10340
10341 #ifdef RE_TRACK_PATTERN_OFFSETS
10342     if (ri->u.offsets) {
10343         Newx(reti->u.offsets, 2*len+1, U32);
10344         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10345     }
10346 #else
10347     SetProgLen(reti,len);
10348 #endif
10349
10350     return (void*)reti;
10351 }
10352
10353 #endif    /* USE_ITHREADS */
10354
10355 #ifndef PERL_IN_XSUB_RE
10356
10357 /*
10358  - regnext - dig the "next" pointer out of a node
10359  */
10360 regnode *
10361 Perl_regnext(pTHX_ register regnode *p)
10362 {
10363     dVAR;
10364     register I32 offset;
10365
10366     if (!p)
10367         return(NULL);
10368
10369     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
10370         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10371     }
10372
10373     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10374     if (offset == 0)
10375         return(NULL);
10376
10377     return(p+offset);
10378 }
10379 #endif
10380
10381 STATIC void
10382 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10383 {
10384     va_list args;
10385     STRLEN l1 = strlen(pat1);
10386     STRLEN l2 = strlen(pat2);
10387     char buf[512];
10388     SV *msv;
10389     const char *message;
10390
10391     PERL_ARGS_ASSERT_RE_CROAK2;
10392
10393     if (l1 > 510)
10394         l1 = 510;
10395     if (l1 + l2 > 510)
10396         l2 = 510 - l1;
10397     Copy(pat1, buf, l1 , char);
10398     Copy(pat2, buf + l1, l2 , char);
10399     buf[l1 + l2] = '\n';
10400     buf[l1 + l2 + 1] = '\0';
10401 #ifdef I_STDARG
10402     /* ANSI variant takes additional second argument */
10403     va_start(args, pat2);
10404 #else
10405     va_start(args);
10406 #endif
10407     msv = vmess(buf, &args);
10408     va_end(args);
10409     message = SvPV_const(msv,l1);
10410     if (l1 > 512)
10411         l1 = 512;
10412     Copy(message, buf, l1 , char);
10413     buf[l1-1] = '\0';                   /* Overwrite \n */
10414     Perl_croak(aTHX_ "%s", buf);
10415 }
10416
10417 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
10418
10419 #ifndef PERL_IN_XSUB_RE
10420 void
10421 Perl_save_re_context(pTHX)
10422 {
10423     dVAR;
10424
10425     struct re_save_state *state;
10426
10427     SAVEVPTR(PL_curcop);
10428     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10429
10430     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10431     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10432     SSPUSHUV(SAVEt_RE_STATE);
10433
10434     Copy(&PL_reg_state, state, 1, struct re_save_state);
10435
10436     PL_reg_start_tmp = 0;
10437     PL_reg_start_tmpl = 0;
10438     PL_reg_oldsaved = NULL;
10439     PL_reg_oldsavedlen = 0;
10440     PL_reg_maxiter = 0;
10441     PL_reg_leftiter = 0;
10442     PL_reg_poscache = NULL;
10443     PL_reg_poscache_size = 0;
10444 #ifdef PERL_OLD_COPY_ON_WRITE
10445     PL_nrs = NULL;
10446 #endif
10447
10448     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10449     if (PL_curpm) {
10450         const REGEXP * const rx = PM_GETRE(PL_curpm);
10451         if (rx) {
10452             U32 i;
10453             for (i = 1; i <= RX_NPARENS(rx); i++) {
10454                 char digits[TYPE_CHARS(long)];
10455                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10456                 GV *const *const gvp
10457                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10458
10459                 if (gvp) {
10460                     GV * const gv = *gvp;
10461                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10462                         save_scalar(gv);
10463                 }
10464             }
10465         }
10466     }
10467 }
10468 #endif
10469
10470 static void
10471 clear_re(pTHX_ void *r)
10472 {
10473     dVAR;
10474     ReREFCNT_dec((REGEXP *)r);
10475 }
10476
10477 #ifdef DEBUGGING
10478
10479 STATIC void
10480 S_put_byte(pTHX_ SV *sv, int c)
10481 {
10482     PERL_ARGS_ASSERT_PUT_BYTE;
10483
10484     /* Our definition of isPRINT() ignores locales, so only bytes that are
10485        not part of UTF-8 are considered printable. I assume that the same
10486        holds for UTF-EBCDIC.
10487        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10488        which Wikipedia says:
10489
10490        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10491        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10492        identical, to the ASCII delete (DEL) or rubout control character.
10493        ) So the old condition can be simplified to !isPRINT(c)  */
10494     if (!isPRINT(c)) {
10495         if (c < 256) {
10496             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10497         }
10498         else {
10499             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10500         }
10501     }
10502     else {
10503         const char string = c;
10504         if (c == '-' || c == ']' || c == '\\' || c == '^')
10505             sv_catpvs(sv, "\\");
10506         sv_catpvn(sv, &string, 1);
10507     }
10508 }
10509
10510
10511 #define CLEAR_OPTSTART \
10512     if (optstart) STMT_START { \
10513             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10514             optstart=NULL; \
10515     } STMT_END
10516
10517 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10518
10519 STATIC const regnode *
10520 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10521             const regnode *last, const regnode *plast,
10522             SV* sv, I32 indent, U32 depth)
10523 {
10524     dVAR;
10525     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
10526     register const regnode *next;
10527     const regnode *optstart= NULL;
10528
10529     RXi_GET_DECL(r,ri);
10530     GET_RE_DEBUG_FLAGS_DECL;
10531
10532     PERL_ARGS_ASSERT_DUMPUNTIL;
10533
10534 #ifdef DEBUG_DUMPUNTIL
10535     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10536         last ? last-start : 0,plast ? plast-start : 0);
10537 #endif
10538
10539     if (plast && plast < last)
10540         last= plast;
10541
10542     while (PL_regkind[op] != END && (!last || node < last)) {
10543         /* While that wasn't END last time... */
10544         NODE_ALIGN(node);
10545         op = OP(node);
10546         if (op == CLOSE || op == WHILEM)
10547             indent--;
10548         next = regnext((regnode *)node);
10549
10550         /* Where, what. */
10551         if (OP(node) == OPTIMIZED) {
10552             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10553                 optstart = node;
10554             else
10555                 goto after_print;
10556         } else
10557             CLEAR_OPTSTART;
10558
10559         regprop(r, sv, node);
10560         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10561                       (int)(2*indent + 1), "", SvPVX_const(sv));
10562
10563         if (OP(node) != OPTIMIZED) {
10564             if (next == NULL)           /* Next ptr. */
10565                 PerlIO_printf(Perl_debug_log, " (0)");
10566             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10567                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10568             else
10569                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10570             (void)PerlIO_putc(Perl_debug_log, '\n');
10571         }
10572
10573       after_print:
10574         if (PL_regkind[(U8)op] == BRANCHJ) {
10575             assert(next);
10576             {
10577                 register const regnode *nnode = (OP(next) == LONGJMP
10578                                              ? regnext((regnode *)next)
10579                                              : next);
10580                 if (last && nnode > last)
10581                     nnode = last;
10582                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10583             }
10584         }
10585         else if (PL_regkind[(U8)op] == BRANCH) {
10586             assert(next);
10587             DUMPUNTIL(NEXTOPER(node), next);
10588         }
10589         else if ( PL_regkind[(U8)op]  == TRIE ) {
10590             const regnode *this_trie = node;
10591             const char op = OP(node);
10592             const U32 n = ARG(node);
10593             const reg_ac_data * const ac = op>=AHOCORASICK ?
10594                (reg_ac_data *)ri->data->data[n] :
10595                NULL;
10596             const reg_trie_data * const trie =
10597                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10598 #ifdef DEBUGGING
10599             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10600 #endif
10601             const regnode *nextbranch= NULL;
10602             I32 word_idx;
10603             sv_setpvs(sv, "");
10604             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10605                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10606
10607                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10608                    (int)(2*(indent+3)), "",
10609                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10610                             PL_colors[0], PL_colors[1],
10611                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10612                             PERL_PV_PRETTY_ELLIPSES    |
10613                             PERL_PV_PRETTY_LTGT
10614                             )
10615                             : "???"
10616                 );
10617                 if (trie->jump) {
10618                     U16 dist= trie->jump[word_idx+1];
10619                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10620                                   (UV)((dist ? this_trie + dist : next) - start));
10621                     if (dist) {
10622                         if (!nextbranch)
10623                             nextbranch= this_trie + trie->jump[0];
10624                         DUMPUNTIL(this_trie + dist, nextbranch);
10625                     }
10626                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10627                         nextbranch= regnext((regnode *)nextbranch);
10628                 } else {
10629                     PerlIO_printf(Perl_debug_log, "\n");
10630                 }
10631             }
10632             if (last && next > last)
10633                 node= last;
10634             else
10635                 node= next;
10636         }
10637         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10638             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10639                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10640         }
10641         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10642             assert(next);
10643             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10644         }
10645         else if ( op == PLUS || op == STAR) {
10646             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10647         }
10648         else if (op == ANYOF) {
10649             /* arglen 1 + class block */
10650             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
10651                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10652             node = NEXTOPER(node);
10653         }
10654         else if (PL_regkind[(U8)op] == EXACT) {
10655             /* Literal string, where present. */
10656             node += NODE_SZ_STR(node) - 1;
10657             node = NEXTOPER(node);
10658         }
10659         else {
10660             node = NEXTOPER(node);
10661             node += regarglen[(U8)op];
10662         }
10663         if (op == CURLYX || op == OPEN)
10664             indent++;
10665     }
10666     CLEAR_OPTSTART;
10667 #ifdef DEBUG_DUMPUNTIL
10668     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10669 #endif
10670     return node;
10671 }
10672
10673 #endif  /* DEBUGGING */
10674
10675 /*
10676  * Local variables:
10677  * c-indentation-style: bsd
10678  * c-basic-offset: 4
10679  * indent-tabs-mode: t
10680  * End:
10681  *
10682  * ex: set ts=8 sts=4 sw=4 noet:
10683  */