src/5013007/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     HV          *paren_names;           /* Paren names */
 138
 139     regnode     **recurse;              /* Recurse regops */
 140     I32         recurse_count;          /* Number of recurse regops */
 141 #if ADD_TO_REGEXEC
 142     char        *starttry;              /* -Dr: where regtry was called. */
 143 #define RExC_starttry   (pRExC_state->starttry)
 144 #endif
 145 #ifdef DEBUGGING
 146     const char  *lastparse;
 147     I32         lastnum;
 148     AV          *paren_name_list;       /* idx -> name */
 149 #define RExC_lastparse  (pRExC_state->lastparse)
 150 #define RExC_lastnum    (pRExC_state->lastnum)
 151 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 152 #endif
 153 } RExC_state_t;
 154
 155 #define RExC_flags      (pRExC_state->flags)
 156 #define RExC_precomp    (pRExC_state->precomp)
 157 #define RExC_rx_sv      (pRExC_state->rx_sv)
 158 #define RExC_rx         (pRExC_state->rx)
 159 #define RExC_rxi        (pRExC_state->rxi)
 160 #define RExC_start      (pRExC_state->start)
 161 #define RExC_end        (pRExC_state->end)
 162 #define RExC_parse      (pRExC_state->parse)
 163 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 164 #ifdef RE_TRACK_PATTERN_OFFSETS
 165 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 166 #endif
 167 #define RExC_emit       (pRExC_state->emit)
 168 #define RExC_emit_start (pRExC_state->emit_start)
 169 #define RExC_emit_bound (pRExC_state->emit_bound)
 170 #define RExC_naughty    (pRExC_state->naughty)
 171 #define RExC_sawback    (pRExC_state->sawback)
 172 #define RExC_seen       (pRExC_state->seen)
 173 #define RExC_size       (pRExC_state->size)
 174 #define RExC_npar       (pRExC_state->npar)
 175 #define RExC_nestroot   (pRExC_state->nestroot)
 176 #define RExC_extralen   (pRExC_state->extralen)
 177 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 178 #define RExC_seen_evals (pRExC_state->seen_evals)
 179 #define RExC_utf8       (pRExC_state->utf8)
 180 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 181 #define RExC_open_parens        (pRExC_state->open_parens)
 182 #define RExC_close_parens       (pRExC_state->close_parens)
 183 #define RExC_opend      (pRExC_state->opend)
 184 #define RExC_paren_names        (pRExC_state->paren_names)
 185 #define RExC_recurse    (pRExC_state->recurse)
 186 #define RExC_recurse_count      (pRExC_state->recurse_count)
 187
 188
 189 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 190 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 191         ((*s) == '{' && regcurly(s)))
 192
 193 #ifdef SPSTART
 194 #undef SPSTART          /* dratted cpp namespace... */
 195 #endif
 196 /*
 197  * Flags to be passed up and down.
 198  */
 199 #define WORST           0       /* Worst case. */
 200 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 201
 202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 203  * character, and if utf8, must be invariant. */
 204 #define SIMPLE          0x02
 205 #define SPSTART         0x04    /* Starts with * or +. */
 206 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 207 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 208
 209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 210
 211 /* whether trie related optimizations are enabled */
 212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 213 #define TRIE_STUDY_OPT
 214 #define FULL_TRIE_STUDY
 215 #define TRIE_STCLASS
 216 #endif
 217
 218
 219
 220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 221 #define PBITVAL(paren) (1 << ((paren) & 7))
 222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 225
 226 /* If not already in utf8, do a longjmp back to the beginning */
 227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 228 #define REQUIRE_UTF8    STMT_START {                                       \
 229                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 230                         } STMT_END
 231
 232 /* About scan_data_t.
 233
 234   During optimisation we recurse through the regexp program performing
 235   various inplace (keyhole style) optimisations. In addition study_chunk
 236   and scan_commit populate this data structure with information about
 237   what strings MUST appear in the pattern. We look for the longest
 238   string that must appear at a fixed location, and we look for the
 239   longest string that may appear at a floating location. So for instance
 240   in the pattern:
 241
 242     /FOO[xX]A.*B[xX]BAR/
 243
 244   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 245   strings (because they follow a .* construct). study_chunk will identify
 246   both FOO and BAR as being the longest fixed and floating strings respectively.
 247
 248   The strings can be composites, for instance
 249
 250      /(f)(o)(o)/
 251
 252   will result in a composite fixed substring 'foo'.
 253
 254   For each string some basic information is maintained:
 255
 256   - offset or min_offset
 257     This is the position the string must appear at, or not before.
 258     It also implicitly (when combined with minlenp) tells us how many
 259     characters must match before the string we are searching for.
 260     Likewise when combined with minlenp and the length of the string it
 261     tells us how many characters must appear after the string we have
 262     found.
 263
 264   - max_offset
 265     Only used for floating strings. This is the rightmost point that
 266     the string can appear at. If set to I32 max it indicates that the
 267     string can occur infinitely far to the right.
 268
 269   - minlenp
 270     A pointer to the minimum length of the pattern that the string
 271     was found inside. This is important as in the case of positive
 272     lookahead or positive lookbehind we can have multiple patterns
 273     involved. Consider
 274
 275     /(?=FOO).*F/
 276
 277     The minimum length of the pattern overall is 3, the minimum length
 278     of the lookahead part is 3, but the minimum length of the part that
 279     will actually match is 1. So 'FOO's minimum length is 3, but the
 280     minimum length for the F is 1. This is important as the minimum length
 281     is used to determine offsets in front of and behind the string being
 282     looked for.  Since strings can be composites this is the length of the
 283     pattern at the time it was commited with a scan_commit. Note that
 284     the length is calculated by study_chunk, so that the minimum lengths
 285     are not known until the full pattern has been compiled, thus the
 286     pointer to the value.
 287
 288   - lookbehind
 289
 290     In the case of lookbehind the string being searched for can be
 291     offset past the start point of the final matching string.
 292     If this value was just blithely removed from the min_offset it would
 293     invalidate some of the calculations for how many chars must match
 294     before or after (as they are derived from min_offset and minlen and
 295     the length of the string being searched for).
 296     When the final pattern is compiled and the data is moved from the
 297     scan_data_t structure into the regexp structure the information
 298     about lookbehind is factored in, with the information that would
 299     have been lost precalculated in the end_shift field for the
 300     associated string.
 301
 302   The fields pos_min and pos_delta are used to store the minimum offset
 303   and the delta to the maximum offset at the current point in the pattern.
 304
 305 */
 306
 307 typedef struct scan_data_t {
 308     /*I32 len_min;      unused */
 309     /*I32 len_delta;    unused */
 310     I32 pos_min;
 311     I32 pos_delta;
 312     SV *last_found;
 313     I32 last_end;           /* min value, <0 unless valid. */
 314     I32 last_start_min;
 315     I32 last_start_max;
 316     SV **longest;           /* Either &l_fixed, or &l_float. */
 317     SV *longest_fixed;      /* longest fixed string found in pattern */
 318     I32 offset_fixed;       /* offset where it starts */
 319     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 320     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 321     SV *longest_float;      /* longest floating string found in pattern */
 322     I32 offset_float_min;   /* earliest point in string it can appear */
 323     I32 offset_float_max;   /* latest point in string it can appear */
 324     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 325     I32 lookbehind_float;   /* is the position of the string modified by LB */
 326     I32 flags;
 327     I32 whilem_c;
 328     I32 *last_closep;
 329     struct regnode_charclass_class *start_class;
 330 } scan_data_t;
 331
 332 /*
 333  * Forward declarations for pregcomp()'s friends.
 334  */
 335
 336 static const scan_data_t zero_scan_data =
 337   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 338
 339 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 340 #define SF_BEFORE_SEOL          0x0001
 341 #define SF_BEFORE_MEOL          0x0002
 342 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 343 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 344
 345 #ifdef NO_UNARY_PLUS
 346 #  define SF_FIX_SHIFT_EOL      (0+2)
 347 #  define SF_FL_SHIFT_EOL               (0+4)
 348 #else
 349 #  define SF_FIX_SHIFT_EOL      (+2)
 350 #  define SF_FL_SHIFT_EOL               (+4)
 351 #endif
 352
 353 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 354 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 355
 356 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 357 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 358 #define SF_IS_INF               0x0040
 359 #define SF_HAS_PAR              0x0080
 360 #define SF_IN_PAR               0x0100
 361 #define SF_HAS_EVAL             0x0200
 362 #define SCF_DO_SUBSTR           0x0400
 363 #define SCF_DO_STCLASS_AND      0x0800
 364 #define SCF_DO_STCLASS_OR       0x1000
 365 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 366 #define SCF_WHILEM_VISITED_POS  0x2000
 367
 368 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 369 #define SCF_SEEN_ACCEPT         0x8000
 370
 371 #define UTF cBOOL(RExC_utf8)
 372 #define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
 373 #define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
 374 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 375
 376 #define OOB_UNICODE             12345678
 377 #define OOB_NAMEDCLASS          -1
 378
 379 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 380 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 381
 382
 383 /* length of regex to show in messages that don't mark a position within */
 384 #define RegexLengthToShowInErrorMessages 127
 385
 386 /*
 387  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 388  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 389  * op/pragma/warn/regcomp.
 390  */
 391 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 392 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 393
 394 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 395
 396 /*
 397  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 398  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 399  * "...".
 400  */
 401 #define _FAIL(code) STMT_START {                                        \
 402     const char *ellipses = "";                                          \
 403     IV len = RExC_end - RExC_precomp;                                   \
 404                                                                         \
 405     if (!SIZE_ONLY)                                                     \
 406         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 407     if (len > RegexLengthToShowInErrorMessages) {                       \
 408         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 409         len = RegexLengthToShowInErrorMessages - 10;                    \
 410         ellipses = "...";                                               \
 411     }                                                                   \
 412     code;                                                               \
 413 } STMT_END
 414
 415 #define FAIL(msg) _FAIL(                            \
 416     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 417             msg, (int)len, RExC_precomp, ellipses))
 418
 419 #define FAIL2(msg,arg) _FAIL(                       \
 420     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 421             arg, (int)len, RExC_precomp, ellipses))
 422
 423 /*
 424  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 425  */
 426 #define Simple_vFAIL(m) STMT_START {                                    \
 427     const IV offset = RExC_parse - RExC_precomp;                        \
 428     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 429             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 430 } STMT_END
 431
 432 /*
 433  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 434  */
 435 #define vFAIL(m) STMT_START {                           \
 436     if (!SIZE_ONLY)                                     \
 437         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 438     Simple_vFAIL(m);                                    \
 439 } STMT_END
 440
 441 /*
 442  * Like Simple_vFAIL(), but accepts two arguments.
 443  */
 444 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 445     const IV offset = RExC_parse - RExC_precomp;                        \
 446     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 447             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 448 } STMT_END
 449
 450 /*
 451  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 452  */
 453 #define vFAIL2(m,a1) STMT_START {                       \
 454     if (!SIZE_ONLY)                                     \
 455         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 456     Simple_vFAIL2(m, a1);                               \
 457 } STMT_END
 458
 459
 460 /*
 461  * Like Simple_vFAIL(), but accepts three arguments.
 462  */
 463 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 464     const IV offset = RExC_parse - RExC_precomp;                \
 465     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 466             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 467 } STMT_END
 468
 469 /*
 470  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 471  */
 472 #define vFAIL3(m,a1,a2) STMT_START {                    \
 473     if (!SIZE_ONLY)                                     \
 474         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 475     Simple_vFAIL3(m, a1, a2);                           \
 476 } STMT_END
 477
 478 /*
 479  * Like Simple_vFAIL(), but accepts four arguments.
 480  */
 481 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 482     const IV offset = RExC_parse - RExC_precomp;                \
 483     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 484             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 485 } STMT_END
 486
 487 #define ckWARNreg(loc,m) STMT_START {                                   \
 488     const IV offset = loc - RExC_precomp;                               \
 489     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 490             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 491 } STMT_END
 492
 493 #define ckWARNregdep(loc,m) STMT_START {                                \
 494     const IV offset = loc - RExC_precomp;                               \
 495     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 496             m REPORT_LOCATION,                                          \
 497             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 498 } STMT_END
 499
 500 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 501     const IV offset = loc - RExC_precomp;                               \
 502     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 503             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 504 } STMT_END
 505
 506 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 507     const IV offset = loc - RExC_precomp;                               \
 508     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 509             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 510 } STMT_END
 511
 512 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 515             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 516 } STMT_END
 517
 518 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 519     const IV offset = loc - RExC_precomp;                               \
 520     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 521             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 522 } STMT_END
 523
 524 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 525     const IV offset = loc - RExC_precomp;                               \
 526     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 527             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 528 } STMT_END
 529
 530 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 531     const IV offset = loc - RExC_precomp;                               \
 532     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 533             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 534 } STMT_END
 535
 536
 537 /* Allow for side effects in s */
 538 #define REGC(c,s) STMT_START {                  \
 539     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 540 } STMT_END
 541
 542 /* Macros for recording node offsets.   20001227 mjd@plover.com
 543  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 544  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 545  * Element 0 holds the number n.
 546  * Position is 1 indexed.
 547  */
 548 #ifndef RE_TRACK_PATTERN_OFFSETS
 549 #define Set_Node_Offset_To_R(node,byte)
 550 #define Set_Node_Offset(node,byte)
 551 #define Set_Cur_Node_Offset
 552 #define Set_Node_Length_To_R(node,len)
 553 #define Set_Node_Length(node,len)
 554 #define Set_Node_Cur_Length(node)
 555 #define Node_Offset(n)
 556 #define Node_Length(n)
 557 #define Set_Node_Offset_Length(node,offset,len)
 558 #define ProgLen(ri) ri->u.proglen
 559 #define SetProgLen(ri,x) ri->u.proglen = x
 560 #else
 561 #define ProgLen(ri) ri->u.offsets[0]
 562 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 563 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 564     if (! SIZE_ONLY) {                                                  \
 565         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 566                     __LINE__, (int)(node), (int)(byte)));               \
 567         if((node) < 0) {                                                \
 568             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 569         } else {                                                        \
 570             RExC_offsets[2*(node)-1] = (byte);                          \
 571         }                                                               \
 572     }                                                                   \
 573 } STMT_END
 574
 575 #define Set_Node_Offset(node,byte) \
 576     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 577 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 578
 579 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 580     if (! SIZE_ONLY) {                                                  \
 581         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 582                 __LINE__, (int)(node), (int)(len)));                    \
 583         if((node) < 0) {                                                \
 584             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 585         } else {                                                        \
 586             RExC_offsets[2*(node)] = (len);                             \
 587         }                                                               \
 588     }                                                                   \
 589 } STMT_END
 590
 591 #define Set_Node_Length(node,len) \
 592     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 593 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 594 #define Set_Node_Cur_Length(node) \
 595     Set_Node_Length(node, RExC_parse - parse_start)
 596
 597 /* Get offsets and lengths */
 598 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 599 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 600
 601 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 602     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 603     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 604 } STMT_END
 605 #endif
 606
 607 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 608 #define EXPERIMENTAL_INPLACESCAN
 609 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 610
 611 #define DEBUG_STUDYDATA(str,data,depth)                              \
 612 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 613     PerlIO_printf(Perl_debug_log,                                    \
 614         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 615         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 616         (int)(depth)*2, "",                                          \
 617         (IV)((data)->pos_min),                                       \
 618         (IV)((data)->pos_delta),                                     \
 619         (UV)((data)->flags),                                         \
 620         (IV)((data)->whilem_c),                                      \
 621         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 622         is_inf ? "INF " : ""                                         \
 623     );                                                               \
 624     if ((data)->last_found)                                          \
 625         PerlIO_printf(Perl_debug_log,                                \
 626             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 627             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 628             SvPVX_const((data)->last_found),                         \
 629             (IV)((data)->last_end),                                  \
 630             (IV)((data)->last_start_min),                            \
 631             (IV)((data)->last_start_max),                            \
 632             ((data)->longest &&                                      \
 633              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 634             SvPVX_const((data)->longest_fixed),                      \
 635             (IV)((data)->offset_fixed),                              \
 636             ((data)->longest &&                                      \
 637              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 638             SvPVX_const((data)->longest_float),                      \
 639             (IV)((data)->offset_float_min),                          \
 640             (IV)((data)->offset_float_max)                           \
 641         );                                                           \
 642     PerlIO_printf(Perl_debug_log,"\n");                              \
 643 });
 644
 645 static void clear_re(pTHX_ void *r);
 646
 647 /* Mark that we cannot extend a found fixed substring at this point.
 648    Update the longest found anchored substring and the longest found
 649    floating substrings if needed. */
 650
 651 STATIC void
 652 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 653 {
 654     const STRLEN l = CHR_SVLEN(data->last_found);
 655     const STRLEN old_l = CHR_SVLEN(*data->longest);
 656     GET_RE_DEBUG_FLAGS_DECL;
 657
 658     PERL_ARGS_ASSERT_SCAN_COMMIT;
 659
 660     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 661         SvSetMagicSV(*data->longest, data->last_found);
 662         if (*data->longest == data->longest_fixed) {
 663             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 664             if (data->flags & SF_BEFORE_EOL)
 665                 data->flags
 666                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 667             else
 668                 data->flags &= ~SF_FIX_BEFORE_EOL;
 669             data->minlen_fixed=minlenp;
 670             data->lookbehind_fixed=0;
 671         }
 672         else { /* *data->longest == data->longest_float */
 673             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 674             data->offset_float_max = (l
 675                                       ? data->last_start_max
 676                                       : data->pos_min + data->pos_delta);
 677             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 678                 data->offset_float_max = I32_MAX;
 679             if (data->flags & SF_BEFORE_EOL)
 680                 data->flags
 681                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 682             else
 683                 data->flags &= ~SF_FL_BEFORE_EOL;
 684             data->minlen_float=minlenp;
 685             data->lookbehind_float=0;
 686         }
 687     }
 688     SvCUR_set(data->last_found, 0);
 689     {
 690         SV * const sv = data->last_found;
 691         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 692             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 693             if (mg)
 694                 mg->mg_len = 0;
 695         }
 696     }
 697     data->last_end = -1;
 698     data->flags &= ~SF_BEFORE_EOL;
 699     DEBUG_STUDYDATA("commit: ",data,0);
 700 }
 701
 702 /* Can match anything (initialization) */
 703 STATIC void
 704 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 705 {
 706     PERL_ARGS_ASSERT_CL_ANYTHING;
 707
 708     ANYOF_CLASS_ZERO(cl);
 709     ANYOF_BITMAP_SETALL(cl);
 710     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 711     if (LOC)
 712         cl->flags |= ANYOF_LOCALE;
 713     cl->flags |= ANYOF_FOLD;
 714 }
 715
 716 /* Can match anything (initialization) */
 717 STATIC int
 718 S_cl_is_anything(const struct regnode_charclass_class *cl)
 719 {
 720     int value;
 721
 722     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 723
 724     for (value = 0; value <= ANYOF_MAX; value += 2)
 725         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 726             return 1;
 727     if (!(cl->flags & ANYOF_UNICODE_ALL))
 728         return 0;
 729     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 730         return 0;
 731     return 1;
 732 }
 733
 734 /* Can match anything (initialization) */
 735 STATIC void
 736 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 737 {
 738     PERL_ARGS_ASSERT_CL_INIT;
 739
 740     Zero(cl, 1, struct regnode_charclass_class);
 741     cl->type = ANYOF;
 742     cl_anything(pRExC_state, cl);
 743 }
 744
 745 STATIC void
 746 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 747 {
 748     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 749
 750     Zero(cl, 1, struct regnode_charclass_class);
 751     cl->type = ANYOF;
 752     cl_anything(pRExC_state, cl);
 753     if (LOC)
 754         cl->flags |= ANYOF_LOCALE;
 755 }
 756
 757 /* 'And' a given class with another one.  Can create false positives */
 758 /* We assume that cl is not inverted */
 759 STATIC void
 760 S_cl_and(struct regnode_charclass_class *cl,
 761         const struct regnode_charclass_class *and_with)
 762 {
 763     PERL_ARGS_ASSERT_CL_AND;
 764
 765     assert(and_with->type == ANYOF);
 766     if (!(and_with->flags & ANYOF_CLASS)
 767         && !(cl->flags & ANYOF_CLASS)
 768         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 769         && !(and_with->flags & ANYOF_FOLD)
 770         && !(cl->flags & ANYOF_FOLD)) {
 771         int i;
 772
 773         if (and_with->flags & ANYOF_INVERT)
 774             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 775                 cl->bitmap[i] &= ~and_with->bitmap[i];
 776         else
 777             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 778                 cl->bitmap[i] &= and_with->bitmap[i];
 779     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 780     if (!(and_with->flags & ANYOF_EOS))
 781         cl->flags &= ~ANYOF_EOS;
 782
 783     if (!(and_with->flags & ANYOF_FOLD))
 784         cl->flags &= ~ANYOF_FOLD;
 785
 786     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
 787         !(and_with->flags & ANYOF_INVERT)) {
 788         cl->flags &= ~ANYOF_UNICODE_ALL;
 789         cl->flags |= ANYOF_UNICODE;
 790         ARG_SET(cl, ARG(and_with));
 791     }
 792     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 793         !(and_with->flags & ANYOF_INVERT))
 794         cl->flags &= ~ANYOF_UNICODE_ALL;
 795     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
 796         !(and_with->flags & ANYOF_INVERT))
 797         cl->flags &= ~ANYOF_UNICODE;
 798 }
 799
 800 /* 'OR' a given class with another one.  Can create false positives */
 801 /* We assume that cl is not inverted */
 802 STATIC void
 803 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 804 {
 805     PERL_ARGS_ASSERT_CL_OR;
 806
 807     if (or_with->flags & ANYOF_INVERT) {
 808         /* We do not use
 809          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 810          *   <= (B1 | !B2) | (CL1 | !CL2)
 811          * which is wasteful if CL2 is small, but we ignore CL2:
 812          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 813          * XXXX Can we handle case-fold?  Unclear:
 814          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 815          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 816          */
 817         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 818              && !(or_with->flags & ANYOF_FOLD)
 819              && !(cl->flags & ANYOF_FOLD) ) {
 820             int i;
 821
 822             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 823                 cl->bitmap[i] |= ~or_with->bitmap[i];
 824         } /* XXXX: logic is complicated otherwise */
 825         else {
 826             cl_anything(pRExC_state, cl);
 827         }
 828     } else {
 829         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 830         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 831              && (!(or_with->flags & ANYOF_FOLD)
 832                  || (cl->flags & ANYOF_FOLD)) ) {
 833             int i;
 834
 835             /* OR char bitmap and class bitmap separately */
 836             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 837                 cl->bitmap[i] |= or_with->bitmap[i];
 838             if (or_with->flags & ANYOF_CLASS) {
 839                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 840                     cl->classflags[i] |= or_with->classflags[i];
 841                 cl->flags |= ANYOF_CLASS;
 842             }
 843         }
 844         else { /* XXXX: logic is complicated, leave it along for a moment. */
 845             cl_anything(pRExC_state, cl);
 846         }
 847     }
 848     if (or_with->flags & ANYOF_EOS)
 849         cl->flags |= ANYOF_EOS;
 850
 851     if (or_with->flags & ANYOF_FOLD)
 852         cl->flags |= ANYOF_FOLD;
 853
 854     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
 855         ARG(cl) != ARG(or_with)) {
 856         cl->flags |= ANYOF_UNICODE_ALL;
 857         cl->flags &= ~ANYOF_UNICODE;
 858     }
 859     if (or_with->flags & ANYOF_UNICODE_ALL) {
 860         cl->flags |= ANYOF_UNICODE_ALL;
 861         cl->flags &= ~ANYOF_UNICODE;
 862     }
 863 }
 864
 865 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 866 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 867 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 868 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 869
 870
 871 #ifdef DEBUGGING
 872 /*
 873    dump_trie(trie,widecharmap,revcharmap)
 874    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 875    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 876
 877    These routines dump out a trie in a somewhat readable format.
 878    The _interim_ variants are used for debugging the interim
 879    tables that are used to generate the final compressed
 880    representation which is what dump_trie expects.
 881
 882    Part of the reason for their existance is to provide a form
 883    of documentation as to how the different representations function.
 884
 885 */
 886
 887 /*
 888   Dumps the final compressed table form of the trie to Perl_debug_log.
 889   Used for debugging make_trie().
 890 */
 891
 892 STATIC void
 893 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 894             AV *revcharmap, U32 depth)
 895 {
 896     U32 state;
 897     SV *sv=sv_newmortal();
 898     int colwidth= widecharmap ? 6 : 4;
 899     U16 word;
 900     GET_RE_DEBUG_FLAGS_DECL;
 901
 902     PERL_ARGS_ASSERT_DUMP_TRIE;
 903
 904     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 905         (int)depth * 2 + 2,"",
 906         "Match","Base","Ofs" );
 907
 908     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 909         SV ** const tmp = av_fetch( revcharmap, state, 0);
 910         if ( tmp ) {
 911             PerlIO_printf( Perl_debug_log, "%*s",
 912                 colwidth,
 913                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 914                             PL_colors[0], PL_colors[1],
 915                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 916                             PERL_PV_ESCAPE_FIRSTCHAR
 917                 )
 918             );
 919         }
 920     }
 921     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 922         (int)depth * 2 + 2,"");
 923
 924     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 925         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 926     PerlIO_printf( Perl_debug_log, "\n");
 927
 928     for( state = 1 ; state < trie->statecount ; state++ ) {
 929         const U32 base = trie->states[ state ].trans.base;
 930
 931         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 932
 933         if ( trie->states[ state ].wordnum ) {
 934             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 935         } else {
 936             PerlIO_printf( Perl_debug_log, "%6s", "" );
 937         }
 938
 939         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 940
 941         if ( base ) {
 942             U32 ofs = 0;
 943
 944             while( ( base + ofs  < trie->uniquecharcount ) ||
 945                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 946                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 947                     ofs++;
 948
 949             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 950
 951             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 952                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 953                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 954                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 955                 {
 956                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 957                     colwidth,
 958                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 959                 } else {
 960                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 961                 }
 962             }
 963
 964             PerlIO_printf( Perl_debug_log, "]");
 965
 966         }
 967         PerlIO_printf( Perl_debug_log, "\n" );
 968     }
 969     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 970     for (word=1; word <= trie->wordcount; word++) {
 971         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 972             (int)word, (int)(trie->wordinfo[word].prev),
 973             (int)(trie->wordinfo[word].len));
 974     }
 975     PerlIO_printf(Perl_debug_log, "\n" );
 976 }
 977 /*
 978   Dumps a fully constructed but uncompressed trie in list form.
 979   List tries normally only are used for construction when the number of
 980   possible chars (trie->uniquecharcount) is very high.
 981   Used for debugging make_trie().
 982 */
 983 STATIC void
 984 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 985                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
 986                          U32 depth)
 987 {
 988     U32 state;
 989     SV *sv=sv_newmortal();
 990     int colwidth= widecharmap ? 6 : 4;
 991     GET_RE_DEBUG_FLAGS_DECL;
 992
 993     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 994
 995     /* print out the table precompression.  */
 996     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
 997         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
 998         "------:-----+-----------------\n" );
 999
1000     for( state=1 ; state < next_alloc ; state ++ ) {
1001         U16 charid;
1002
1003         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1004             (int)depth * 2 + 2,"", (UV)state  );
1005         if ( ! trie->states[ state ].wordnum ) {
1006             PerlIO_printf( Perl_debug_log, "%5s| ","");
1007         } else {
1008             PerlIO_printf( Perl_debug_log, "W%4x| ",
1009                 trie->states[ state ].wordnum
1010             );
1011         }
1012         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1013             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1014             if ( tmp ) {
1015                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1016                     colwidth,
1017                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1018                             PL_colors[0], PL_colors[1],
1019                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1020                             PERL_PV_ESCAPE_FIRSTCHAR
1021                     ) ,
1022                     TRIE_LIST_ITEM(state,charid).forid,
1023                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1024                 );
1025                 if (!(charid % 10))
1026                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1027                         (int)((depth * 2) + 14), "");
1028             }
1029         }
1030         PerlIO_printf( Perl_debug_log, "\n");
1031     }
1032 }
1033
1034 /*
1035   Dumps a fully constructed but uncompressed trie in table form.
1036   This is the normal DFA style state transition table, with a few
1037   twists to facilitate compression later.
1038   Used for debugging make_trie().
1039 */
1040 STATIC void
1041 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1042                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1043                           U32 depth)
1044 {
1045     U32 state;
1046     U16 charid;
1047     SV *sv=sv_newmortal();
1048     int colwidth= widecharmap ? 6 : 4;
1049     GET_RE_DEBUG_FLAGS_DECL;
1050
1051     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1052
1053     /*
1054        print out the table precompression so that we can do a visual check
1055        that they are identical.
1056      */
1057
1058     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1059
1060     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1061         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1062         if ( tmp ) {
1063             PerlIO_printf( Perl_debug_log, "%*s",
1064                 colwidth,
1065                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1066                             PL_colors[0], PL_colors[1],
1067                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1068                             PERL_PV_ESCAPE_FIRSTCHAR
1069                 )
1070             );
1071         }
1072     }
1073
1074     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1075
1076     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1077         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1078     }
1079
1080     PerlIO_printf( Perl_debug_log, "\n" );
1081
1082     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1083
1084         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1085             (int)depth * 2 + 2,"",
1086             (UV)TRIE_NODENUM( state ) );
1087
1088         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1089             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1090             if (v)
1091                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1092             else
1093                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1094         }
1095         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1096             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1097         } else {
1098             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1099             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1100         }
1101     }
1102 }
1103
1104 #endif
1105
1106
1107 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1108   startbranch: the first branch in the whole branch sequence
1109   first      : start branch of sequence of branch-exact nodes.
1110                May be the same as startbranch
1111   last       : Thing following the last branch.
1112                May be the same as tail.
1113   tail       : item following the branch sequence
1114   count      : words in the sequence
1115   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1116   depth      : indent depth
1117
1118 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1119
1120 A trie is an N'ary tree where the branches are determined by digital
1121 decomposition of the key. IE, at the root node you look up the 1st character and
1122 follow that branch repeat until you find the end of the branches. Nodes can be
1123 marked as "accepting" meaning they represent a complete word. Eg:
1124
1125   /he|she|his|hers/
1126
1127 would convert into the following structure. Numbers represent states, letters
1128 following numbers represent valid transitions on the letter from that state, if
1129 the number is in square brackets it represents an accepting state, otherwise it
1130 will be in parenthesis.
1131
1132       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1133       |    |
1134       |   (2)
1135       |    |
1136      (1)   +-i->(6)-+-s->[7]
1137       |
1138       +-s->(3)-+-h->(4)-+-e->[5]
1139
1140       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1141
1142 This shows that when matching against the string 'hers' we will begin at state 1
1143 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1144 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1145 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1146 single traverse. We store a mapping from accepting to state to which word was
1147 matched, and then when we have multiple possibilities we try to complete the
1148 rest of the regex in the order in which they occured in the alternation.
1149
1150 The only prior NFA like behaviour that would be changed by the TRIE support is
1151 the silent ignoring of duplicate alternations which are of the form:
1152
1153  / (DUPE|DUPE) X? (?{ ... }) Y /x
1154
1155 Thus EVAL blocks follwing a trie may be called a different number of times with
1156 and without the optimisation. With the optimisations dupes will be silently
1157 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1158 the following demonstrates:
1159
1160  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1161
1162 which prints out 'word' three times, but
1163
1164  'words'=~/(word|word|word)(?{ print $1 })S/
1165
1166 which doesnt print it out at all. This is due to other optimisations kicking in.
1167
1168 Example of what happens on a structural level:
1169
1170 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1171
1172    1: CURLYM[1] {1,32767}(18)
1173    5:   BRANCH(8)
1174    6:     EXACT <ac>(16)
1175    8:   BRANCH(11)
1176    9:     EXACT <ad>(16)
1177   11:   BRANCH(14)
1178   12:     EXACT <ab>(16)
1179   16:   SUCCEED(0)
1180   17:   NOTHING(18)
1181   18: END(0)
1182
1183 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1184 and should turn into:
1185
1186    1: CURLYM[1] {1,32767}(18)
1187    5:   TRIE(16)
1188         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1189           <ac>
1190           <ad>
1191           <ab>
1192   16:   SUCCEED(0)
1193   17:   NOTHING(18)
1194   18: END(0)
1195
1196 Cases where tail != last would be like /(?foo|bar)baz/:
1197
1198    1: BRANCH(4)
1199    2:   EXACT <foo>(8)
1200    4: BRANCH(7)
1201    5:   EXACT <bar>(8)
1202    7: TAIL(8)
1203    8: EXACT <baz>(10)
1204   10: END(0)
1205
1206 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1207 and would end up looking like:
1208
1209     1: TRIE(8)
1210       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1211         <foo>
1212         <bar>
1213    7: TAIL(8)
1214    8: EXACT <baz>(10)
1215   10: END(0)
1216
1217     d = uvuni_to_utf8_flags(d, uv, 0);
1218
1219 is the recommended Unicode-aware way of saying
1220
1221     *(d++) = uv;
1222 */
1223
1224 #define TRIE_STORE_REVCHAR                                                 \
1225     STMT_START {                                                           \
1226         if (UTF) {                                                         \
1227             SV *zlopp = newSV(2);                                          \
1228             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1229             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1230             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1231             SvPOK_on(zlopp);                                               \
1232             SvUTF8_on(zlopp);                                              \
1233             av_push(revcharmap, zlopp);                                    \
1234         } else {                                                           \
1235             char ooooff = (char)uvc;                                               \
1236             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1237         }                                                                  \
1238         } STMT_END
1239
1240 #define TRIE_READ_CHAR STMT_START {                                           \
1241     wordlen++;                                                                \
1242     if ( UTF ) {                                                              \
1243         if ( folder ) {                                                       \
1244             if ( foldlen > 0 ) {                                              \
1245                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1246                foldlen -= len;                                                \
1247                scan += len;                                                   \
1248                len = 0;                                                       \
1249             } else {                                                          \
1250                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1251                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1252                 foldlen -= UNISKIP( uvc );                                    \
1253                 scan = foldbuf + UNISKIP( uvc );                              \
1254             }                                                                 \
1255         } else {                                                              \
1256             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1257         }                                                                     \
1258     } else {                                                                  \
1259         uvc = (U32)*uc;                                                       \
1260         len = 1;                                                              \
1261     }                                                                         \
1262 } STMT_END
1263
1264
1265
1266 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1267     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1268         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1269         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1270     }                                                           \
1271     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1272     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1273     TRIE_LIST_CUR( state )++;                                   \
1274 } STMT_END
1275
1276 #define TRIE_LIST_NEW(state) STMT_START {                       \
1277     Newxz( trie->states[ state ].trans.list,               \
1278         4, reg_trie_trans_le );                                 \
1279      TRIE_LIST_CUR( state ) = 1;                                \
1280      TRIE_LIST_LEN( state ) = 4;                                \
1281 } STMT_END
1282
1283 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1284     U16 dupe= trie->states[ state ].wordnum;                    \
1285     regnode * const noper_next = regnext( noper );              \
1286                                                                 \
1287     DEBUG_r({                                                   \
1288         /* store the word for dumping */                        \
1289         SV* tmp;                                                \
1290         if (OP(noper) != NOTHING)                               \
1291             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1292         else                                                    \
1293             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1294         av_push( trie_words, tmp );                             \
1295     });                                                         \
1296                                                                 \
1297     curword++;                                                  \
1298     trie->wordinfo[curword].prev   = 0;                         \
1299     trie->wordinfo[curword].len    = wordlen;                   \
1300     trie->wordinfo[curword].accept = state;                     \
1301                                                                 \
1302     if ( noper_next < tail ) {                                  \
1303         if (!trie->jump)                                        \
1304             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1305         trie->jump[curword] = (U16)(noper_next - convert);      \
1306         if (!jumper)                                            \
1307             jumper = noper_next;                                \
1308         if (!nextbranch)                                        \
1309             nextbranch= regnext(cur);                           \
1310     }                                                           \
1311                                                                 \
1312     if ( dupe ) {                                               \
1313         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1314         /* chain, so that when the bits of chain are later    */\
1315         /* linked together, the dups appear in the chain      */\
1316         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1317         trie->wordinfo[dupe].prev = curword;                    \
1318     } else {                                                    \
1319         /* we haven't inserted this word yet.                */ \
1320         trie->states[ state ].wordnum = curword;                \
1321     }                                                           \
1322 } STMT_END
1323
1324
1325 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1326      ( ( base + charid >=  ucharcount                                   \
1327          && base + charid < ubound                                      \
1328          && state == trie->trans[ base - ucharcount + charid ].check    \
1329          && trie->trans[ base - ucharcount + charid ].next )            \
1330            ? trie->trans[ base - ucharcount + charid ].next             \
1331            : ( state==1 ? special : 0 )                                 \
1332       )
1333
1334 #define MADE_TRIE       1
1335 #define MADE_JUMP_TRIE  2
1336 #define MADE_EXACT_TRIE 4
1337
1338 STATIC I32
1339 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1340 {
1341     dVAR;
1342     /* first pass, loop through and scan words */
1343     reg_trie_data *trie;
1344     HV *widecharmap = NULL;
1345     AV *revcharmap = newAV();
1346     regnode *cur;
1347     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1348     STRLEN len = 0;
1349     UV uvc = 0;
1350     U16 curword = 0;
1351     U32 next_alloc = 0;
1352     regnode *jumper = NULL;
1353     regnode *nextbranch = NULL;
1354     regnode *convert = NULL;
1355     U32 *prev_states; /* temp array mapping each state to previous one */
1356     /* we just use folder as a flag in utf8 */
1357     const U8 * const folder = ( flags == EXACTF
1358                        ? PL_fold
1359                        : ( flags == EXACTFL
1360                            ? PL_fold_locale
1361                            : NULL
1362                          )
1363                      );
1364
1365 #ifdef DEBUGGING
1366     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1367     AV *trie_words = NULL;
1368     /* along with revcharmap, this only used during construction but both are
1369      * useful during debugging so we store them in the struct when debugging.
1370      */
1371 #else
1372     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1373     STRLEN trie_charcount=0;
1374 #endif
1375     SV *re_trie_maxbuff;
1376     GET_RE_DEBUG_FLAGS_DECL;
1377
1378     PERL_ARGS_ASSERT_MAKE_TRIE;
1379 #ifndef DEBUGGING
1380     PERL_UNUSED_ARG(depth);
1381 #endif
1382
1383     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1384     trie->refcount = 1;
1385     trie->startstate = 1;
1386     trie->wordcount = word_count;
1387     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1388     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1389     if (!(UTF && folder))
1390         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1391     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1392                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1393
1394     DEBUG_r({
1395         trie_words = newAV();
1396     });
1397
1398     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1399     if (!SvIOK(re_trie_maxbuff)) {
1400         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1401     }
1402     DEBUG_OPTIMISE_r({
1403                 PerlIO_printf( Perl_debug_log,
1404                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1405                   (int)depth * 2 + 2, "",
1406                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1407                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1408                   (int)depth);
1409     });
1410
1411    /* Find the node we are going to overwrite */
1412     if ( first == startbranch && OP( last ) != BRANCH ) {
1413         /* whole branch chain */
1414         convert = first;
1415     } else {
1416         /* branch sub-chain */
1417         convert = NEXTOPER( first );
1418     }
1419
1420     /*  -- First loop and Setup --
1421
1422        We first traverse the branches and scan each word to determine if it
1423        contains widechars, and how many unique chars there are, this is
1424        important as we have to build a table with at least as many columns as we
1425        have unique chars.
1426
1427        We use an array of integers to represent the character codes 0..255
1428        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1429        native representation of the character value as the key and IV's for the
1430        coded index.
1431
1432        *TODO* If we keep track of how many times each character is used we can
1433        remap the columns so that the table compression later on is more
1434        efficient in terms of memory by ensuring the most common value is in the
1435        middle and the least common are on the outside.  IMO this would be better
1436        than a most to least common mapping as theres a decent chance the most
1437        common letter will share a node with the least common, meaning the node
1438        will not be compressable. With a middle is most common approach the worst
1439        case is when we have the least common nodes twice.
1440
1441      */
1442
1443     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1444         regnode * const noper = NEXTOPER( cur );
1445         const U8 *uc = (U8*)STRING( noper );
1446         const U8 * const e  = uc + STR_LEN( noper );
1447         STRLEN foldlen = 0;
1448         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1449         const U8 *scan = (U8*)NULL;
1450         U32 wordlen      = 0;         /* required init */
1451         STRLEN chars = 0;
1452         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1453
1454         if (OP(noper) == NOTHING) {
1455             trie->minlen= 0;
1456             continue;
1457         }
1458         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1459             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1460                                           regardless of encoding */
1461
1462         for ( ; uc < e ; uc += len ) {
1463             TRIE_CHARCOUNT(trie)++;
1464             TRIE_READ_CHAR;
1465             chars++;
1466             if ( uvc < 256 ) {
1467                 if ( !trie->charmap[ uvc ] ) {
1468                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1469                     if ( folder )
1470                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1471                     TRIE_STORE_REVCHAR;
1472                 }
1473                 if ( set_bit ) {
1474                     /* store the codepoint in the bitmap, and if its ascii
1475                        also store its folded equivelent. */
1476                     TRIE_BITMAP_SET(trie,uvc);
1477
1478                     /* store the folded codepoint */
1479                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1480
1481                     if ( !UTF ) {
1482                         /* store first byte of utf8 representation of
1483                            codepoints in the 127 < uvc < 256 range */
1484                         if (127 < uvc && uvc < 192) {
1485                             TRIE_BITMAP_SET(trie,194);
1486                         } else if (191 < uvc ) {
1487                             TRIE_BITMAP_SET(trie,195);
1488                         /* && uvc < 256 -- we know uvc is < 256 already */
1489                         }
1490                     }
1491                     set_bit = 0; /* We've done our bit :-) */
1492                 }
1493             } else {
1494                 SV** svpp;
1495                 if ( !widecharmap )
1496                     widecharmap = newHV();
1497
1498                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1499
1500                 if ( !svpp )
1501                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1502
1503                 if ( !SvTRUE( *svpp ) ) {
1504                     sv_setiv( *svpp, ++trie->uniquecharcount );
1505                     TRIE_STORE_REVCHAR;
1506                 }
1507             }
1508         }
1509         if( cur == first ) {
1510             trie->minlen=chars;
1511             trie->maxlen=chars;
1512         } else if (chars < trie->minlen) {
1513             trie->minlen=chars;
1514         } else if (chars > trie->maxlen) {
1515             trie->maxlen=chars;
1516         }
1517
1518     } /* end first pass */
1519     DEBUG_TRIE_COMPILE_r(
1520         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1521                 (int)depth * 2 + 2,"",
1522                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1523                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1524                 (int)trie->minlen, (int)trie->maxlen )
1525     );
1526
1527     /*
1528         We now know what we are dealing with in terms of unique chars and
1529         string sizes so we can calculate how much memory a naive
1530         representation using a flat table  will take. If it's over a reasonable
1531         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1532         conservative but potentially much slower representation using an array
1533         of lists.
1534
1535         At the end we convert both representations into the same compressed
1536         form that will be used in regexec.c for matching with. The latter
1537         is a form that cannot be used to construct with but has memory
1538         properties similar to the list form and access properties similar
1539         to the table form making it both suitable for fast searches and
1540         small enough that its feasable to store for the duration of a program.
1541
1542         See the comment in the code where the compressed table is produced
1543         inplace from the flat tabe representation for an explanation of how
1544         the compression works.
1545
1546     */
1547
1548
1549     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1550     prev_states[1] = 0;
1551
1552     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1553         /*
1554             Second Pass -- Array Of Lists Representation
1555
1556             Each state will be represented by a list of charid:state records
1557             (reg_trie_trans_le) the first such element holds the CUR and LEN
1558             points of the allocated array. (See defines above).
1559
1560             We build the initial structure using the lists, and then convert
1561             it into the compressed table form which allows faster lookups
1562             (but cant be modified once converted).
1563         */
1564
1565         STRLEN transcount = 1;
1566
1567         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1568             "%*sCompiling trie using list compiler\n",
1569             (int)depth * 2 + 2, ""));
1570
1571         trie->states = (reg_trie_state *)
1572             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1573                                   sizeof(reg_trie_state) );
1574         TRIE_LIST_NEW(1);
1575         next_alloc = 2;
1576
1577         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1578
1579             regnode * const noper = NEXTOPER( cur );
1580             U8 *uc           = (U8*)STRING( noper );
1581             const U8 * const e = uc + STR_LEN( noper );
1582             U32 state        = 1;         /* required init */
1583             U16 charid       = 0;         /* sanity init */
1584             U8 *scan         = (U8*)NULL; /* sanity init */
1585             STRLEN foldlen   = 0;         /* required init */
1586             U32 wordlen      = 0;         /* required init */
1587             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1588
1589             if (OP(noper) != NOTHING) {
1590                 for ( ; uc < e ; uc += len ) {
1591
1592                     TRIE_READ_CHAR;
1593
1594                     if ( uvc < 256 ) {
1595                         charid = trie->charmap[ uvc ];
1596                     } else {
1597                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1598                         if ( !svpp ) {
1599                             charid = 0;
1600                         } else {
1601                             charid=(U16)SvIV( *svpp );
1602                         }
1603                     }
1604                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1605                     if ( charid ) {
1606
1607                         U16 check;
1608                         U32 newstate = 0;
1609
1610                         charid--;
1611                         if ( !trie->states[ state ].trans.list ) {
1612                             TRIE_LIST_NEW( state );
1613                         }
1614                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1615                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1616                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1617                                 break;
1618                             }
1619                         }
1620                         if ( ! newstate ) {
1621                             newstate = next_alloc++;
1622                             prev_states[newstate] = state;
1623                             TRIE_LIST_PUSH( state, charid, newstate );
1624                             transcount++;
1625                         }
1626                         state = newstate;
1627                     } else {
1628                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1629                     }
1630                 }
1631             }
1632             TRIE_HANDLE_WORD(state);
1633
1634         } /* end second pass */
1635
1636         /* next alloc is the NEXT state to be allocated */
1637         trie->statecount = next_alloc;
1638         trie->states = (reg_trie_state *)
1639             PerlMemShared_realloc( trie->states,
1640                                    next_alloc
1641                                    * sizeof(reg_trie_state) );
1642
1643         /* and now dump it out before we compress it */
1644         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1645                                                          revcharmap, next_alloc,
1646                                                          depth+1)
1647         );
1648
1649         trie->trans = (reg_trie_trans *)
1650             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1651         {
1652             U32 state;
1653             U32 tp = 0;
1654             U32 zp = 0;
1655
1656
1657             for( state=1 ; state < next_alloc ; state ++ ) {
1658                 U32 base=0;
1659
1660                 /*
1661                 DEBUG_TRIE_COMPILE_MORE_r(
1662                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1663                 );
1664                 */
1665
1666                 if (trie->states[state].trans.list) {
1667                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1668                     U16 maxid=minid;
1669                     U16 idx;
1670
1671                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1672                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1673                         if ( forid < minid ) {
1674                             minid=forid;
1675                         } else if ( forid > maxid ) {
1676                             maxid=forid;
1677                         }
1678                     }
1679                     if ( transcount < tp + maxid - minid + 1) {
1680                         transcount *= 2;
1681                         trie->trans = (reg_trie_trans *)
1682                             PerlMemShared_realloc( trie->trans,
1683                                                      transcount
1684                                                      * sizeof(reg_trie_trans) );
1685                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1686                     }
1687                     base = trie->uniquecharcount + tp - minid;
1688                     if ( maxid == minid ) {
1689                         U32 set = 0;
1690                         for ( ; zp < tp ; zp++ ) {
1691                             if ( ! trie->trans[ zp ].next ) {
1692                                 base = trie->uniquecharcount + zp - minid;
1693                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1694                                 trie->trans[ zp ].check = state;
1695                                 set = 1;
1696                                 break;
1697                             }
1698                         }
1699                         if ( !set ) {
1700                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1701                             trie->trans[ tp ].check = state;
1702                             tp++;
1703                             zp = tp;
1704                         }
1705                     } else {
1706                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1707                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1708                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1709                             trie->trans[ tid ].check = state;
1710                         }
1711                         tp += ( maxid - minid + 1 );
1712                     }
1713                     Safefree(trie->states[ state ].trans.list);
1714                 }
1715                 /*
1716                 DEBUG_TRIE_COMPILE_MORE_r(
1717                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1718                 );
1719                 */
1720                 trie->states[ state ].trans.base=base;
1721             }
1722             trie->lasttrans = tp + 1;
1723         }
1724     } else {
1725         /*
1726            Second Pass -- Flat Table Representation.
1727
1728            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1729            We know that we will need Charcount+1 trans at most to store the data
1730            (one row per char at worst case) So we preallocate both structures
1731            assuming worst case.
1732
1733            We then construct the trie using only the .next slots of the entry
1734            structs.
1735
1736            We use the .check field of the first entry of the node temporarily to
1737            make compression both faster and easier by keeping track of how many non
1738            zero fields are in the node.
1739
1740            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1741            transition.
1742
1743            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1744            number representing the first entry of the node, and state as a
1745            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1746            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1747            are 2 entrys per node. eg:
1748
1749              A B       A B
1750           1. 2 4    1. 3 7
1751           2. 0 3    3. 0 5
1752           3. 0 0    5. 0 0
1753           4. 0 0    7. 0 0
1754
1755            The table is internally in the right hand, idx form. However as we also
1756            have to deal with the states array which is indexed by nodenum we have to
1757            use TRIE_NODENUM() to convert.
1758
1759         */
1760         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1761             "%*sCompiling trie using table compiler\n",
1762             (int)depth * 2 + 2, ""));
1763
1764         trie->trans = (reg_trie_trans *)
1765             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1766                                   * trie->uniquecharcount + 1,
1767                                   sizeof(reg_trie_trans) );
1768         trie->states = (reg_trie_state *)
1769             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1770                                   sizeof(reg_trie_state) );
1771         next_alloc = trie->uniquecharcount + 1;
1772
1773
1774         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1775
1776             regnode * const noper   = NEXTOPER( cur );
1777             const U8 *uc     = (U8*)STRING( noper );
1778             const U8 * const e = uc + STR_LEN( noper );
1779
1780             U32 state        = 1;         /* required init */
1781
1782             U16 charid       = 0;         /* sanity init */
1783             U32 accept_state = 0;         /* sanity init */
1784             U8 *scan         = (U8*)NULL; /* sanity init */
1785
1786             STRLEN foldlen   = 0;         /* required init */
1787             U32 wordlen      = 0;         /* required init */
1788             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1789
1790             if ( OP(noper) != NOTHING ) {
1791                 for ( ; uc < e ; uc += len ) {
1792
1793                     TRIE_READ_CHAR;
1794
1795                     if ( uvc < 256 ) {
1796                         charid = trie->charmap[ uvc ];
1797                     } else {
1798                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1799                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1800                     }
1801                     if ( charid ) {
1802                         charid--;
1803                         if ( !trie->trans[ state + charid ].next ) {
1804                             trie->trans[ state + charid ].next = next_alloc;
1805                             trie->trans[ state ].check++;
1806                             prev_states[TRIE_NODENUM(next_alloc)]
1807                                     = TRIE_NODENUM(state);
1808                             next_alloc += trie->uniquecharcount;
1809                         }
1810                         state = trie->trans[ state + charid ].next;
1811                     } else {
1812                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1813                     }
1814                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1815                 }
1816             }
1817             accept_state = TRIE_NODENUM( state );
1818             TRIE_HANDLE_WORD(accept_state);
1819
1820         } /* end second pass */
1821
1822         /* and now dump it out before we compress it */
1823         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1824                                                           revcharmap,
1825                                                           next_alloc, depth+1));
1826
1827         {
1828         /*
1829            * Inplace compress the table.*
1830
1831            For sparse data sets the table constructed by the trie algorithm will
1832            be mostly 0/FAIL transitions or to put it another way mostly empty.
1833            (Note that leaf nodes will not contain any transitions.)
1834
1835            This algorithm compresses the tables by eliminating most such
1836            transitions, at the cost of a modest bit of extra work during lookup:
1837
1838            - Each states[] entry contains a .base field which indicates the
1839            index in the state[] array wheres its transition data is stored.
1840
1841            - If .base is 0 there are no valid transitions from that node.
1842
1843            - If .base is nonzero then charid is added to it to find an entry in
1844            the trans array.
1845
1846            -If trans[states[state].base+charid].check!=state then the
1847            transition is taken to be a 0/Fail transition. Thus if there are fail
1848            transitions at the front of the node then the .base offset will point
1849            somewhere inside the previous nodes data (or maybe even into a node
1850            even earlier), but the .check field determines if the transition is
1851            valid.
1852
1853            XXX - wrong maybe?
1854            The following process inplace converts the table to the compressed
1855            table: We first do not compress the root node 1,and mark all its
1856            .check pointers as 1 and set its .base pointer as 1 as well. This
1857            allows us to do a DFA construction from the compressed table later,
1858            and ensures that any .base pointers we calculate later are greater
1859            than 0.
1860
1861            - We set 'pos' to indicate the first entry of the second node.
1862
1863            - We then iterate over the columns of the node, finding the first and
1864            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1865            and set the .check pointers accordingly, and advance pos
1866            appropriately and repreat for the next node. Note that when we copy
1867            the next pointers we have to convert them from the original
1868            NODEIDX form to NODENUM form as the former is not valid post
1869            compression.
1870
1871            - If a node has no transitions used we mark its base as 0 and do not
1872            advance the pos pointer.
1873
1874            - If a node only has one transition we use a second pointer into the
1875            structure to fill in allocated fail transitions from other states.
1876            This pointer is independent of the main pointer and scans forward
1877            looking for null transitions that are allocated to a state. When it
1878            finds one it writes the single transition into the "hole".  If the
1879            pointer doesnt find one the single transition is appended as normal.
1880
1881            - Once compressed we can Renew/realloc the structures to release the
1882            excess space.
1883
1884            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1885            specifically Fig 3.47 and the associated pseudocode.
1886
1887            demq
1888         */
1889         const U32 laststate = TRIE_NODENUM( next_alloc );
1890         U32 state, charid;
1891         U32 pos = 0, zp=0;
1892         trie->statecount = laststate;
1893
1894         for ( state = 1 ; state < laststate ; state++ ) {
1895             U8 flag = 0;
1896             const U32 stateidx = TRIE_NODEIDX( state );
1897             const U32 o_used = trie->trans[ stateidx ].check;
1898             U32 used = trie->trans[ stateidx ].check;
1899             trie->trans[ stateidx ].check = 0;
1900
1901             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1902                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1903                     if ( trie->trans[ stateidx + charid ].next ) {
1904                         if (o_used == 1) {
1905                             for ( ; zp < pos ; zp++ ) {
1906                                 if ( ! trie->trans[ zp ].next ) {
1907                                     break;
1908                                 }
1909                             }
1910                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1911                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1912                             trie->trans[ zp ].check = state;
1913                             if ( ++zp > pos ) pos = zp;
1914                             break;
1915                         }
1916                         used--;
1917                     }
1918                     if ( !flag ) {
1919                         flag = 1;
1920                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1921                     }
1922                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1923                     trie->trans[ pos ].check = state;
1924                     pos++;
1925                 }
1926             }
1927         }
1928         trie->lasttrans = pos + 1;
1929         trie->states = (reg_trie_state *)
1930             PerlMemShared_realloc( trie->states, laststate
1931                                    * sizeof(reg_trie_state) );
1932         DEBUG_TRIE_COMPILE_MORE_r(
1933                 PerlIO_printf( Perl_debug_log,
1934                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1935                     (int)depth * 2 + 2,"",
1936                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1937                     (IV)next_alloc,
1938                     (IV)pos,
1939                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1940             );
1941
1942         } /* end table compress */
1943     }
1944     DEBUG_TRIE_COMPILE_MORE_r(
1945             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1946                 (int)depth * 2 + 2, "",
1947                 (UV)trie->statecount,
1948                 (UV)trie->lasttrans)
1949     );
1950     /* resize the trans array to remove unused space */
1951     trie->trans = (reg_trie_trans *)
1952         PerlMemShared_realloc( trie->trans, trie->lasttrans
1953                                * sizeof(reg_trie_trans) );
1954
1955     {   /* Modify the program and insert the new TRIE node */
1956         U8 nodetype =(U8)(flags & 0xFF);
1957         char *str=NULL;
1958
1959 #ifdef DEBUGGING
1960         regnode *optimize = NULL;
1961 #ifdef RE_TRACK_PATTERN_OFFSETS
1962
1963         U32 mjd_offset = 0;
1964         U32 mjd_nodelen = 0;
1965 #endif /* RE_TRACK_PATTERN_OFFSETS */
1966 #endif /* DEBUGGING */
1967         /*
1968            This means we convert either the first branch or the first Exact,
1969            depending on whether the thing following (in 'last') is a branch
1970            or not and whther first is the startbranch (ie is it a sub part of
1971            the alternation or is it the whole thing.)
1972            Assuming its a sub part we convert the EXACT otherwise we convert
1973            the whole branch sequence, including the first.
1974          */
1975         /* Find the node we are going to overwrite */
1976         if ( first != startbranch || OP( last ) == BRANCH ) {
1977             /* branch sub-chain */
1978             NEXT_OFF( first ) = (U16)(last - first);
1979 #ifdef RE_TRACK_PATTERN_OFFSETS
1980             DEBUG_r({
1981                 mjd_offset= Node_Offset((convert));
1982                 mjd_nodelen= Node_Length((convert));
1983             });
1984 #endif
1985             /* whole branch chain */
1986         }
1987 #ifdef RE_TRACK_PATTERN_OFFSETS
1988         else {
1989             DEBUG_r({
1990                 const  regnode *nop = NEXTOPER( convert );
1991                 mjd_offset= Node_Offset((nop));
1992                 mjd_nodelen= Node_Length((nop));
1993             });
1994         }
1995         DEBUG_OPTIMISE_r(
1996             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1997                 (int)depth * 2 + 2, "",
1998                 (UV)mjd_offset, (UV)mjd_nodelen)
1999         );
2000 #endif
2001         /* But first we check to see if there is a common prefix we can
2002            split out as an EXACT and put in front of the TRIE node.  */
2003         trie->startstate= 1;
2004         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2005             U32 state;
2006             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2007                 U32 ofs = 0;
2008                 I32 idx = -1;
2009                 U32 count = 0;
2010                 const U32 base = trie->states[ state ].trans.base;
2011
2012                 if ( trie->states[state].wordnum )
2013                         count = 1;
2014
2015                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2016                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2017                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2018                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2019                     {
2020                         if ( ++count > 1 ) {
2021                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2022                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2023                             if ( state == 1 ) break;
2024                             if ( count == 2 ) {
2025                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2026                                 DEBUG_OPTIMISE_r(
2027                                     PerlIO_printf(Perl_debug_log,
2028                                         "%*sNew Start State=%"UVuf" Class: [",
2029                                         (int)depth * 2 + 2, "",
2030                                         (UV)state));
2031                                 if (idx >= 0) {
2032                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2033                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2034
2035                                     TRIE_BITMAP_SET(trie,*ch);
2036                                     if ( folder )
2037                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2038                                     DEBUG_OPTIMISE_r(
2039                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2040                                     );
2041                                 }
2042                             }
2043                             TRIE_BITMAP_SET(trie,*ch);
2044                             if ( folder )
2045                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2046                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2047                         }
2048                         idx = ofs;
2049                     }
2050                 }
2051                 if ( count == 1 ) {
2052                     SV **tmp = av_fetch( revcharmap, idx, 0);
2053                     STRLEN len;
2054                     char *ch = SvPV( *tmp, len );
2055                     DEBUG_OPTIMISE_r({
2056                         SV *sv=sv_newmortal();
2057                         PerlIO_printf( Perl_debug_log,
2058                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2059                             (int)depth * 2 + 2, "",
2060                             (UV)state, (UV)idx,
2061                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2062                                 PL_colors[0], PL_colors[1],
2063                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2064                                 PERL_PV_ESCAPE_FIRSTCHAR
2065                             )
2066                         );
2067                     });
2068                     if ( state==1 ) {
2069                         OP( convert ) = nodetype;
2070                         str=STRING(convert);
2071                         STR_LEN(convert)=0;
2072                     }
2073                     STR_LEN(convert) += len;
2074                     while (len--)
2075                         *str++ = *ch++;
2076                 } else {
2077 #ifdef DEBUGGING
2078                     if (state>1)
2079                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2080 #endif
2081                     break;
2082                 }
2083             }
2084             trie->prefixlen = (state-1);
2085             if (str) {
2086                 regnode *n = convert+NODE_SZ_STR(convert);
2087                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2088                 trie->startstate = state;
2089                 trie->minlen -= (state - 1);
2090                 trie->maxlen -= (state - 1);
2091 #ifdef DEBUGGING
2092                /* At least the UNICOS C compiler choked on this
2093                 * being argument to DEBUG_r(), so let's just have
2094                 * it right here. */
2095                if (
2096 #ifdef PERL_EXT_RE_BUILD
2097                    1
2098 #else
2099                    DEBUG_r_TEST
2100 #endif
2101                    ) {
2102                    regnode *fix = convert;
2103                    U32 word = trie->wordcount;
2104                    mjd_nodelen++;
2105                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2106                    while( ++fix < n ) {
2107                        Set_Node_Offset_Length(fix, 0, 0);
2108                    }
2109                    while (word--) {
2110                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2111                        if (tmp) {
2112                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2113                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2114                            else
2115                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2116                        }
2117                    }
2118                }
2119 #endif
2120                 if (trie->maxlen) {
2121                     convert = n;
2122                 } else {
2123                     NEXT_OFF(convert) = (U16)(tail - convert);
2124                     DEBUG_r(optimize= n);
2125                 }
2126             }
2127         }
2128         if (!jumper)
2129             jumper = last;
2130         if ( trie->maxlen ) {
2131             NEXT_OFF( convert ) = (U16)(tail - convert);
2132             ARG_SET( convert, data_slot );
2133             /* Store the offset to the first unabsorbed branch in
2134                jump[0], which is otherwise unused by the jump logic.
2135                We use this when dumping a trie and during optimisation. */
2136             if (trie->jump)
2137                 trie->jump[0] = (U16)(nextbranch - convert);
2138
2139             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2140              *   and there is a bitmap
2141              *   and the first "jump target" node we found leaves enough room
2142              * then convert the TRIE node into a TRIEC node, with the bitmap
2143              * embedded inline in the opcode - this is hypothetically faster.
2144              */
2145             if ( !trie->states[trie->startstate].wordnum
2146                  && trie->bitmap
2147                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2148             {
2149                 OP( convert ) = TRIEC;
2150                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2151                 PerlMemShared_free(trie->bitmap);
2152                 trie->bitmap= NULL;
2153             } else
2154                 OP( convert ) = TRIE;
2155
2156             /* store the type in the flags */
2157             convert->flags = nodetype;
2158             DEBUG_r({
2159             optimize = convert
2160                       + NODE_STEP_REGNODE
2161                       + regarglen[ OP( convert ) ];
2162             });
2163             /* XXX We really should free up the resource in trie now,
2164                    as we won't use them - (which resources?) dmq */
2165         }
2166         /* needed for dumping*/
2167         DEBUG_r(if (optimize) {
2168             regnode *opt = convert;
2169
2170             while ( ++opt < optimize) {
2171                 Set_Node_Offset_Length(opt,0,0);
2172             }
2173             /*
2174                 Try to clean up some of the debris left after the
2175                 optimisation.
2176              */
2177             while( optimize < jumper ) {
2178                 mjd_nodelen += Node_Length((optimize));
2179                 OP( optimize ) = OPTIMIZED;
2180                 Set_Node_Offset_Length(optimize,0,0);
2181                 optimize++;
2182             }
2183             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2184         });
2185     } /* end node insert */
2186     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2187
2188     /*  Finish populating the prev field of the wordinfo array.  Walk back
2189      *  from each accept state until we find another accept state, and if
2190      *  so, point the first word's .prev field at the second word. If the
2191      *  second already has a .prev field set, stop now. This will be the
2192      *  case either if we've already processed that word's accept state,
2193      *  or that state had multiple words, and the overspill words were
2194      *  already linked up earlier.
2195      */
2196     {
2197         U16 word;
2198         U32 state;
2199         U16 prev;
2200
2201         for (word=1; word <= trie->wordcount; word++) {
2202             prev = 0;
2203             if (trie->wordinfo[word].prev)
2204                 continue;
2205             state = trie->wordinfo[word].accept;
2206             while (state) {
2207                 state = prev_states[state];
2208                 if (!state)
2209                     break;
2210                 prev = trie->states[state].wordnum;
2211                 if (prev)
2212                     break;
2213             }
2214             trie->wordinfo[word].prev = prev;
2215         }
2216         Safefree(prev_states);
2217     }
2218
2219
2220     /* and now dump out the compressed format */
2221     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2222
2223     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2224 #ifdef DEBUGGING
2225     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2226     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2227 #else
2228     SvREFCNT_dec(revcharmap);
2229 #endif
2230     return trie->jump
2231            ? MADE_JUMP_TRIE
2232            : trie->startstate>1
2233              ? MADE_EXACT_TRIE
2234              : MADE_TRIE;
2235 }
2236
2237 STATIC void
2238 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2239 {
2240 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2241
2242    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2243    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2244    ISBN 0-201-10088-6
2245
2246    We find the fail state for each state in the trie, this state is the longest proper
2247    suffix of the current state's 'word' that is also a proper prefix of another word in our
2248    trie. State 1 represents the word '' and is thus the default fail state. This allows
2249    the DFA not to have to restart after its tried and failed a word at a given point, it
2250    simply continues as though it had been matching the other word in the first place.
2251    Consider
2252       'abcdgu'=~/abcdefg|cdgu/
2253    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2254    fail, which would bring us to the state representing 'd' in the second word where we would
2255    try 'g' and succeed, proceeding to match 'cdgu'.
2256  */
2257  /* add a fail transition */
2258     const U32 trie_offset = ARG(source);
2259     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2260     U32 *q;
2261     const U32 ucharcount = trie->uniquecharcount;
2262     const U32 numstates = trie->statecount;
2263     const U32 ubound = trie->lasttrans + ucharcount;
2264     U32 q_read = 0;
2265     U32 q_write = 0;
2266     U32 charid;
2267     U32 base = trie->states[ 1 ].trans.base;
2268     U32 *fail;
2269     reg_ac_data *aho;
2270     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2271     GET_RE_DEBUG_FLAGS_DECL;
2272
2273     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2274 #ifndef DEBUGGING
2275     PERL_UNUSED_ARG(depth);
2276 #endif
2277
2278
2279     ARG_SET( stclass, data_slot );
2280     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2281     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2282     aho->trie=trie_offset;
2283     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2284     Copy( trie->states, aho->states, numstates, reg_trie_state );
2285     Newxz( q, numstates, U32);
2286     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2287     aho->refcount = 1;
2288     fail = aho->fail;
2289     /* initialize fail[0..1] to be 1 so that we always have
2290        a valid final fail state */
2291     fail[ 0 ] = fail[ 1 ] = 1;
2292
2293     for ( charid = 0; charid < ucharcount ; charid++ ) {
2294         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2295         if ( newstate ) {
2296             q[ q_write ] = newstate;
2297             /* set to point at the root */
2298             fail[ q[ q_write++ ] ]=1;
2299         }
2300     }
2301     while ( q_read < q_write) {
2302         const U32 cur = q[ q_read++ % numstates ];
2303         base = trie->states[ cur ].trans.base;
2304
2305         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2306             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2307             if (ch_state) {
2308                 U32 fail_state = cur;
2309                 U32 fail_base;
2310                 do {
2311                     fail_state = fail[ fail_state ];
2312                     fail_base = aho->states[ fail_state ].trans.base;
2313                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2314
2315                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2316                 fail[ ch_state ] = fail_state;
2317                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2318                 {
2319                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2320                 }
2321                 q[ q_write++ % numstates] = ch_state;
2322             }
2323         }
2324     }
2325     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2326        when we fail in state 1, this allows us to use the
2327        charclass scan to find a valid start char. This is based on the principle
2328        that theres a good chance the string being searched contains lots of stuff
2329        that cant be a start char.
2330      */
2331     fail[ 0 ] = fail[ 1 ] = 0;
2332     DEBUG_TRIE_COMPILE_r({
2333         PerlIO_printf(Perl_debug_log,
2334                       "%*sStclass Failtable (%"UVuf" states): 0",
2335                       (int)(depth * 2), "", (UV)numstates
2336         );
2337         for( q_read=1; q_read<numstates; q_read++ ) {
2338             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2339         }
2340         PerlIO_printf(Perl_debug_log, "\n");
2341     });
2342     Safefree(q);
2343     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2344 }
2345
2346
2347 /*
2348  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2349  * These need to be revisited when a newer toolchain becomes available.
2350  */
2351 #if defined(__sparc64__) && defined(__GNUC__)
2352 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2353 #       undef  SPARC64_GCC_WORKAROUND
2354 #       define SPARC64_GCC_WORKAROUND 1
2355 #   endif
2356 #endif
2357
2358 #define DEBUG_PEEP(str,scan,depth) \
2359     DEBUG_OPTIMISE_r({if (scan){ \
2360        SV * const mysv=sv_newmortal(); \
2361        regnode *Next = regnext(scan); \
2362        regprop(RExC_rx, mysv, scan); \
2363        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2364        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2365        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2366    }});
2367
2368
2369
2370
2371
2372 #define JOIN_EXACT(scan,min,flags) \
2373     if (PL_regkind[OP(scan)] == EXACT) \
2374         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2375
2376 STATIC U32
2377 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2378     /* Merge several consecutive EXACTish nodes into one. */
2379     regnode *n = regnext(scan);
2380     U32 stringok = 1;
2381     regnode *next = scan + NODE_SZ_STR(scan);
2382     U32 merged = 0;
2383     U32 stopnow = 0;
2384 #ifdef DEBUGGING
2385     regnode *stop = scan;
2386     GET_RE_DEBUG_FLAGS_DECL;
2387 #else
2388     PERL_UNUSED_ARG(depth);
2389 #endif
2390
2391     PERL_ARGS_ASSERT_JOIN_EXACT;
2392 #ifndef EXPERIMENTAL_INPLACESCAN
2393     PERL_UNUSED_ARG(flags);
2394     PERL_UNUSED_ARG(val);
2395 #endif
2396     DEBUG_PEEP("join",scan,depth);
2397
2398     /* Skip NOTHING, merge EXACT*. */
2399     while (n &&
2400            ( PL_regkind[OP(n)] == NOTHING ||
2401              (stringok && (OP(n) == OP(scan))))
2402            && NEXT_OFF(n)
2403            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2404
2405         if (OP(n) == TAIL || n > next)
2406             stringok = 0;
2407         if (PL_regkind[OP(n)] == NOTHING) {
2408             DEBUG_PEEP("skip:",n,depth);
2409             NEXT_OFF(scan) += NEXT_OFF(n);
2410             next = n + NODE_STEP_REGNODE;
2411 #ifdef DEBUGGING
2412             if (stringok)
2413                 stop = n;
2414 #endif
2415             n = regnext(n);
2416         }
2417         else if (stringok) {
2418             const unsigned int oldl = STR_LEN(scan);
2419             regnode * const nnext = regnext(n);
2420
2421             DEBUG_PEEP("merg",n,depth);
2422
2423             merged++;
2424             if (oldl + STR_LEN(n) > U8_MAX)
2425                 break;
2426             NEXT_OFF(scan) += NEXT_OFF(n);
2427             STR_LEN(scan) += STR_LEN(n);
2428             next = n + NODE_SZ_STR(n);
2429             /* Now we can overwrite *n : */
2430             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2431 #ifdef DEBUGGING
2432             stop = next - 1;
2433 #endif
2434             n = nnext;
2435             if (stopnow) break;
2436         }
2437
2438 #ifdef EXPERIMENTAL_INPLACESCAN
2439         if (flags && !NEXT_OFF(n)) {
2440             DEBUG_PEEP("atch", val, depth);
2441             if (reg_off_by_arg[OP(n)]) {
2442                 ARG_SET(n, val - n);
2443             }
2444             else {
2445                 NEXT_OFF(n) = val - n;
2446             }
2447             stopnow = 1;
2448         }
2449 #endif
2450     }
2451
2452     if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2453     /*
2454     Two problematic code points in Unicode casefolding of EXACT nodes:
2455
2456     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2457     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2458
2459     which casefold to
2460
2461     Unicode                      UTF-8
2462
2463     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2464     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2465
2466     This means that in case-insensitive matching (or "loose matching",
2467     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2468     length of the above casefolded versions) can match a target string
2469     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2470     This would rather mess up the minimum length computation.
2471
2472     What we'll do is to look for the tail four bytes, and then peek
2473     at the preceding two bytes to see whether we need to decrease
2474     the minimum length by four (six minus two).
2475
2476     Thanks to the design of UTF-8, there cannot be false matches:
2477     A sequence of valid UTF-8 bytes cannot be a subsequence of
2478     another valid sequence of UTF-8 bytes.
2479
2480     */
2481          char * const s0 = STRING(scan), *s, *t;
2482          char * const s1 = s0 + STR_LEN(scan) - 1;
2483          char * const s2 = s1 - 4;
2484 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2485          const char t0[] = "\xaf\x49\xaf\x42";
2486 #else
2487          const char t0[] = "\xcc\x88\xcc\x81";
2488 #endif
2489          const char * const t1 = t0 + 3;
2490
2491          for (s = s0 + 2;
2492               s < s2 && (t = ninstr(s, s1, t0, t1));
2493               s = t + 4) {
2494 #ifdef EBCDIC
2495               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2496                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2497 #else
2498               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2499                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2500 #endif
2501                    *min -= 4;
2502          }
2503     }
2504
2505 #ifdef DEBUGGING
2506     /* Allow dumping */
2507     n = scan + NODE_SZ_STR(scan);
2508     while (n <= stop) {
2509         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2510             OP(n) = OPTIMIZED;
2511             NEXT_OFF(n) = 0;
2512         }
2513         n++;
2514     }
2515 #endif
2516     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2517     return stopnow;
2518 }
2519
2520 /* REx optimizer.  Converts nodes into quickier variants "in place".
2521    Finds fixed substrings.  */
2522
2523 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2524    to the position after last scanned or to NULL. */
2525
2526 #define INIT_AND_WITHP \
2527     assert(!and_withp); \
2528     Newx(and_withp,1,struct regnode_charclass_class); \
2529     SAVEFREEPV(and_withp)
2530
2531 /* this is a chain of data about sub patterns we are processing that
2532    need to be handled seperately/specially in study_chunk. Its so
2533    we can simulate recursion without losing state.  */
2534 struct scan_frame;
2535 typedef struct scan_frame {
2536     regnode *last;  /* last node to process in this frame */
2537     regnode *next;  /* next node to process when last is reached */
2538     struct scan_frame *prev; /*previous frame*/
2539     I32 stop; /* what stopparen do we use */
2540 } scan_frame;
2541
2542
2543 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2544
2545 #define CASE_SYNST_FNC(nAmE)                                       \
2546 case nAmE:                                                         \
2547     if (flags & SCF_DO_STCLASS_AND) {                              \
2548             for (value = 0; value < 256; value++)                  \
2549                 if (!is_ ## nAmE ## _cp(value))                       \
2550                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2551     }                                                              \
2552     else {                                                         \
2553             for (value = 0; value < 256; value++)                  \
2554                 if (is_ ## nAmE ## _cp(value))                        \
2555                     ANYOF_BITMAP_SET(data->start_class, value);    \
2556     }                                                              \
2557     break;                                                         \
2558 case N ## nAmE:                                                    \
2559     if (flags & SCF_DO_STCLASS_AND) {                              \
2560             for (value = 0; value < 256; value++)                   \
2561                 if (is_ ## nAmE ## _cp(value))                         \
2562                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2563     }                                                               \
2564     else {                                                          \
2565             for (value = 0; value < 256; value++)                   \
2566                 if (!is_ ## nAmE ## _cp(value))                        \
2567                     ANYOF_BITMAP_SET(data->start_class, value);     \
2568     }                                                               \
2569     break
2570
2571
2572
2573 STATIC I32
2574 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2575                         I32 *minlenp, I32 *deltap,
2576                         regnode *last,
2577                         scan_data_t *data,
2578                         I32 stopparen,
2579                         U8* recursed,
2580                         struct regnode_charclass_class *and_withp,
2581                         U32 flags, U32 depth)
2582                         /* scanp: Start here (read-write). */
2583                         /* deltap: Write maxlen-minlen here. */
2584                         /* last: Stop before this one. */
2585                         /* data: string data about the pattern */
2586                         /* stopparen: treat close N as END */
2587                         /* recursed: which subroutines have we recursed into */
2588                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2589 {
2590     dVAR;
2591     I32 min = 0, pars = 0, code;
2592     regnode *scan = *scanp, *next;
2593     I32 delta = 0;
2594     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2595     int is_inf_internal = 0;            /* The studied chunk is infinite */
2596     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2597     scan_data_t data_fake;
2598     SV *re_trie_maxbuff = NULL;
2599     regnode *first_non_open = scan;
2600     I32 stopmin = I32_MAX;
2601     scan_frame *frame = NULL;
2602     GET_RE_DEBUG_FLAGS_DECL;
2603
2604     PERL_ARGS_ASSERT_STUDY_CHUNK;
2605
2606 #ifdef DEBUGGING
2607     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2608 #endif
2609
2610     if ( depth == 0 ) {
2611         while (first_non_open && OP(first_non_open) == OPEN)
2612             first_non_open=regnext(first_non_open);
2613     }
2614
2615
2616   fake_study_recurse:
2617     while ( scan && OP(scan) != END && scan < last ){
2618         /* Peephole optimizer: */
2619         DEBUG_STUDYDATA("Peep:", data,depth);
2620         DEBUG_PEEP("Peep",scan,depth);
2621         JOIN_EXACT(scan,&min,0);
2622
2623         /* Follow the next-chain of the current node and optimize
2624            away all the NOTHINGs from it.  */
2625         if (OP(scan) != CURLYX) {
2626             const int max = (reg_off_by_arg[OP(scan)]
2627                        ? I32_MAX
2628                        /* I32 may be smaller than U16 on CRAYs! */
2629                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2630             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2631             int noff;
2632             regnode *n = scan;
2633
2634             /* Skip NOTHING and LONGJMP. */
2635             while ((n = regnext(n))
2636                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2637                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2638                    && off + noff < max)
2639                 off += noff;
2640             if (reg_off_by_arg[OP(scan)])
2641                 ARG(scan) = off;
2642             else
2643                 NEXT_OFF(scan) = off;
2644         }
2645
2646
2647
2648         /* The principal pseudo-switch.  Cannot be a switch, since we
2649            look into several different things.  */
2650         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2651                    || OP(scan) == IFTHEN) {
2652             next = regnext(scan);
2653             code = OP(scan);
2654             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2655
2656             if (OP(next) == code || code == IFTHEN) {
2657                 /* NOTE - There is similar code to this block below for handling
2658                    TRIE nodes on a re-study.  If you change stuff here check there
2659                    too. */
2660                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2661                 struct regnode_charclass_class accum;
2662                 regnode * const startbranch=scan;
2663
2664                 if (flags & SCF_DO_SUBSTR)
2665                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2666                 if (flags & SCF_DO_STCLASS)
2667                     cl_init_zero(pRExC_state, &accum);
2668
2669                 while (OP(scan) == code) {
2670                     I32 deltanext, minnext, f = 0, fake;
2671                     struct regnode_charclass_class this_class;
2672
2673                     num++;
2674                     data_fake.flags = 0;
2675                     if (data) {
2676                         data_fake.whilem_c = data->whilem_c;
2677                         data_fake.last_closep = data->last_closep;
2678                     }
2679                     else
2680                         data_fake.last_closep = &fake;
2681
2682                     data_fake.pos_delta = delta;
2683                     next = regnext(scan);
2684                     scan = NEXTOPER(scan);
2685                     if (code != BRANCH)
2686                         scan = NEXTOPER(scan);
2687                     if (flags & SCF_DO_STCLASS) {
2688                         cl_init(pRExC_state, &this_class);
2689                         data_fake.start_class = &this_class;
2690                         f = SCF_DO_STCLASS_AND;
2691                     }
2692                     if (flags & SCF_WHILEM_VISITED_POS)
2693                         f |= SCF_WHILEM_VISITED_POS;
2694
2695                     /* we suppose the run is continuous, last=next...*/
2696                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2697                                           next, &data_fake,
2698                                           stopparen, recursed, NULL, f,depth+1);
2699                     if (min1 > minnext)
2700                         min1 = minnext;
2701                     if (max1 < minnext + deltanext)
2702                         max1 = minnext + deltanext;
2703                     if (deltanext == I32_MAX)
2704                         is_inf = is_inf_internal = 1;
2705                     scan = next;
2706                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2707                         pars++;
2708                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2709                         if ( stopmin > minnext)
2710                             stopmin = min + min1;
2711                         flags &= ~SCF_DO_SUBSTR;
2712                         if (data)
2713                             data->flags |= SCF_SEEN_ACCEPT;
2714                     }
2715                     if (data) {
2716                         if (data_fake.flags & SF_HAS_EVAL)
2717                             data->flags |= SF_HAS_EVAL;
2718                         data->whilem_c = data_fake.whilem_c;
2719                     }
2720                     if (flags & SCF_DO_STCLASS)
2721                         cl_or(pRExC_state, &accum, &this_class);
2722                 }
2723                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2724                     min1 = 0;
2725                 if (flags & SCF_DO_SUBSTR) {
2726                     data->pos_min += min1;
2727                     data->pos_delta += max1 - min1;
2728                     if (max1 != min1 || is_inf)
2729                         data->longest = &(data->longest_float);
2730                 }
2731                 min += min1;
2732                 delta += max1 - min1;
2733                 if (flags & SCF_DO_STCLASS_OR) {
2734                     cl_or(pRExC_state, data->start_class, &accum);
2735                     if (min1) {
2736                         cl_and(data->start_class, and_withp);
2737                         flags &= ~SCF_DO_STCLASS;
2738                     }
2739                 }
2740                 else if (flags & SCF_DO_STCLASS_AND) {
2741                     if (min1) {
2742                         cl_and(data->start_class, &accum);
2743                         flags &= ~SCF_DO_STCLASS;
2744                     }
2745                     else {
2746                         /* Switch to OR mode: cache the old value of
2747                          * data->start_class */
2748                         INIT_AND_WITHP;
2749                         StructCopy(data->start_class, and_withp,
2750                                    struct regnode_charclass_class);
2751                         flags &= ~SCF_DO_STCLASS_AND;
2752                         StructCopy(&accum, data->start_class,
2753                                    struct regnode_charclass_class);
2754                         flags |= SCF_DO_STCLASS_OR;
2755                         data->start_class->flags |= ANYOF_EOS;
2756                     }
2757                 }
2758
2759                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2760                 /* demq.
2761
2762                    Assuming this was/is a branch we are dealing with: 'scan' now
2763                    points at the item that follows the branch sequence, whatever
2764                    it is. We now start at the beginning of the sequence and look
2765                    for subsequences of
2766
2767                    BRANCH->EXACT=>x1
2768                    BRANCH->EXACT=>x2
2769                    tail
2770
2771                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2772
2773                    If we can find such a subseqence we need to turn the first
2774                    element into a trie and then add the subsequent branch exact
2775                    strings to the trie.
2776
2777                    We have two cases
2778
2779                      1. patterns where the whole set of branches can be converted.
2780
2781                      2. patterns where only a subset can be converted.
2782
2783                    In case 1 we can replace the whole set with a single regop
2784                    for the trie. In case 2 we need to keep the start and end
2785                    branches so
2786
2787                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2788                      becomes BRANCH TRIE; BRANCH X;
2789
2790                   There is an additional case, that being where there is a
2791                   common prefix, which gets split out into an EXACT like node
2792                   preceding the TRIE node.
2793
2794                   If x(1..n)==tail then we can do a simple trie, if not we make
2795                   a "jump" trie, such that when we match the appropriate word
2796                   we "jump" to the appopriate tail node. Essentailly we turn
2797                   a nested if into a case structure of sorts.
2798
2799                 */
2800
2801                     int made=0;
2802                     if (!re_trie_maxbuff) {
2803                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2804                         if (!SvIOK(re_trie_maxbuff))
2805                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2806                     }
2807                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2808                         regnode *cur;
2809                         regnode *first = (regnode *)NULL;
2810                         regnode *last = (regnode *)NULL;
2811                         regnode *tail = scan;
2812                         U8 optype = 0;
2813                         U32 count=0;
2814
2815 #ifdef DEBUGGING
2816                         SV * const mysv = sv_newmortal();       /* for dumping */
2817 #endif
2818                         /* var tail is used because there may be a TAIL
2819                            regop in the way. Ie, the exacts will point to the
2820                            thing following the TAIL, but the last branch will
2821                            point at the TAIL. So we advance tail. If we
2822                            have nested (?:) we may have to move through several
2823                            tails.
2824                          */
2825
2826                         while ( OP( tail ) == TAIL ) {
2827                             /* this is the TAIL generated by (?:) */
2828                             tail = regnext( tail );
2829                         }
2830
2831
2832                         DEBUG_OPTIMISE_r({
2833                             regprop(RExC_rx, mysv, tail );
2834                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2835                                 (int)depth * 2 + 2, "",
2836                                 "Looking for TRIE'able sequences. Tail node is: ",
2837                                 SvPV_nolen_const( mysv )
2838                             );
2839                         });
2840
2841                         /*
2842
2843                            step through the branches, cur represents each
2844                            branch, noper is the first thing to be matched
2845                            as part of that branch and noper_next is the
2846                            regnext() of that node. if noper is an EXACT
2847                            and noper_next is the same as scan (our current
2848                            position in the regex) then the EXACT branch is
2849                            a possible optimization target. Once we have
2850                            two or more consequetive such branches we can
2851                            create a trie of the EXACT's contents and stich
2852                            it in place. If the sequence represents all of
2853                            the branches we eliminate the whole thing and
2854                            replace it with a single TRIE. If it is a
2855                            subsequence then we need to stitch it in. This
2856                            means the first branch has to remain, and needs
2857                            to be repointed at the item on the branch chain
2858                            following the last branch optimized. This could
2859                            be either a BRANCH, in which case the
2860                            subsequence is internal, or it could be the
2861                            item following the branch sequence in which
2862                            case the subsequence is at the end.
2863
2864                         */
2865
2866                         /* dont use tail as the end marker for this traverse */
2867                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2868                             regnode * const noper = NEXTOPER( cur );
2869 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2870                             regnode * const noper_next = regnext( noper );
2871 #endif
2872
2873                             DEBUG_OPTIMISE_r({
2874                                 regprop(RExC_rx, mysv, cur);
2875                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2876                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2877
2878                                 regprop(RExC_rx, mysv, noper);
2879                                 PerlIO_printf( Perl_debug_log, " -> %s",
2880                                     SvPV_nolen_const(mysv));
2881
2882                                 if ( noper_next ) {
2883                                   regprop(RExC_rx, mysv, noper_next );
2884                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2885                                     SvPV_nolen_const(mysv));
2886                                 }
2887                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2888                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2889                             });
2890                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2891                                          : PL_regkind[ OP( noper ) ] == EXACT )
2892                                   || OP(noper) == NOTHING )
2893 #ifdef NOJUMPTRIE
2894                                   && noper_next == tail
2895 #endif
2896                                   && count < U16_MAX)
2897                             {
2898                                 count++;
2899                                 if ( !first || optype == NOTHING ) {
2900                                     if (!first) first = cur;
2901                                     optype = OP( noper );
2902                                 } else {
2903                                     last = cur;
2904                                 }
2905                             } else {
2906 /*
2907     Currently we do not believe that the trie logic can
2908     handle case insensitive matching properly when the
2909     pattern is not unicode (thus forcing unicode semantics).
2910
2911     If/when this is fixed the following define can be swapped
2912     in below to fully enable trie logic.
2913
2914 #define TRIE_TYPE_IS_SAFE 1
2915
2916 */
2917 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2918
2919                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2920                                     make_trie( pRExC_state,
2921                                             startbranch, first, cur, tail, count,
2922                                             optype, depth+1 );
2923                                 }
2924                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2925 #ifdef NOJUMPTRIE
2926                                      && noper_next == tail
2927 #endif
2928                                 ){
2929                                     count = 1;
2930                                     first = cur;
2931                                     optype = OP( noper );
2932                                 } else {
2933                                     count = 0;
2934                                     first = NULL;
2935                                     optype = 0;
2936                                 }
2937                                 last = NULL;
2938                             }
2939                         }
2940                         DEBUG_OPTIMISE_r({
2941                             regprop(RExC_rx, mysv, cur);
2942                             PerlIO_printf( Perl_debug_log,
2943                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2944                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2945
2946                         });
2947
2948                         if ( last && TRIE_TYPE_IS_SAFE ) {
2949                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2950 #ifdef TRIE_STUDY_OPT
2951                             if ( ((made == MADE_EXACT_TRIE &&
2952                                  startbranch == first)
2953                                  || ( first_non_open == first )) &&
2954                                  depth==0 ) {
2955                                 flags |= SCF_TRIE_RESTUDY;
2956                                 if ( startbranch == first
2957                                      && scan == tail )
2958                                 {
2959                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2960                                 }
2961                             }
2962 #endif
2963                         }
2964                     }
2965
2966                 } /* do trie */
2967
2968             }
2969             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2970                 scan = NEXTOPER(NEXTOPER(scan));
2971             } else                      /* single branch is optimized. */
2972                 scan = NEXTOPER(scan);
2973             continue;
2974         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2975             scan_frame *newframe = NULL;
2976             I32 paren;
2977             regnode *start;
2978             regnode *end;
2979
2980             if (OP(scan) != SUSPEND) {
2981             /* set the pointer */
2982                 if (OP(scan) == GOSUB) {
2983                     paren = ARG(scan);
2984                     RExC_recurse[ARG2L(scan)] = scan;
2985                     start = RExC_open_parens[paren-1];
2986                     end   = RExC_close_parens[paren-1];
2987                 } else {
2988                     paren = 0;
2989                     start = RExC_rxi->program + 1;
2990                     end   = RExC_opend;
2991                 }
2992                 if (!recursed) {
2993                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2994                     SAVEFREEPV(recursed);
2995                 }
2996                 if (!PAREN_TEST(recursed,paren+1)) {
2997                     PAREN_SET(recursed,paren+1);
2998                     Newx(newframe,1,scan_frame);
2999                 } else {
3000                     if (flags & SCF_DO_SUBSTR) {
3001                         SCAN_COMMIT(pRExC_state,data,minlenp);
3002                         data->longest = &(data->longest_float);
3003                     }
3004                     is_inf = is_inf_internal = 1;
3005                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3006                         cl_anything(pRExC_state, data->start_class);
3007                     flags &= ~SCF_DO_STCLASS;
3008                 }
3009             } else {
3010                 Newx(newframe,1,scan_frame);
3011                 paren = stopparen;
3012                 start = scan+2;
3013                 end = regnext(scan);
3014             }
3015             if (newframe) {
3016                 assert(start);
3017                 assert(end);
3018                 SAVEFREEPV(newframe);
3019                 newframe->next = regnext(scan);
3020                 newframe->last = last;
3021                 newframe->stop = stopparen;
3022                 newframe->prev = frame;
3023
3024                 frame = newframe;
3025                 scan =  start;
3026                 stopparen = paren;
3027                 last = end;
3028
3029                 continue;
3030             }
3031         }
3032         else if (OP(scan) == EXACT) {
3033             I32 l = STR_LEN(scan);
3034             UV uc;
3035             if (UTF) {
3036                 const U8 * const s = (U8*)STRING(scan);
3037                 l = utf8_length(s, s + l);
3038                 uc = utf8_to_uvchr(s, NULL);
3039             } else {
3040                 uc = *((U8*)STRING(scan));
3041             }
3042             min += l;
3043             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3044                 /* The code below prefers earlier match for fixed
3045                    offset, later match for variable offset.  */
3046                 if (data->last_end == -1) { /* Update the start info. */
3047                     data->last_start_min = data->pos_min;
3048                     data->last_start_max = is_inf
3049                         ? I32_MAX : data->pos_min + data->pos_delta;
3050                 }
3051                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3052                 if (UTF)
3053                     SvUTF8_on(data->last_found);
3054                 {
3055                     SV * const sv = data->last_found;
3056                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3057                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3058                     if (mg && mg->mg_len >= 0)
3059                         mg->mg_len += utf8_length((U8*)STRING(scan),
3060                                                   (U8*)STRING(scan)+STR_LEN(scan));
3061                 }
3062                 data->last_end = data->pos_min + l;
3063                 data->pos_min += l; /* As in the first entry. */
3064                 data->flags &= ~SF_BEFORE_EOL;
3065             }
3066             if (flags & SCF_DO_STCLASS_AND) {
3067                 /* Check whether it is compatible with what we know already! */
3068                 int compat = 1;
3069
3070                 if (uc >= 0x100 ||
3071                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3072                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3073                     && (!(data->start_class->flags & ANYOF_FOLD)
3074                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3075                     )
3076                     compat = 0;
3077                 ANYOF_CLASS_ZERO(data->start_class);
3078                 ANYOF_BITMAP_ZERO(data->start_class);
3079                 if (compat)
3080                     ANYOF_BITMAP_SET(data->start_class, uc);
3081                 data->start_class->flags &= ~ANYOF_EOS;
3082                 if (uc < 0x100)
3083                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3084             }
3085             else if (flags & SCF_DO_STCLASS_OR) {
3086                 /* false positive possible if the class is case-folded */
3087                 if (uc < 0x100)
3088                     ANYOF_BITMAP_SET(data->start_class, uc);
3089                 else
3090                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3091                 data->start_class->flags &= ~ANYOF_EOS;
3092                 cl_and(data->start_class, and_withp);
3093             }
3094             flags &= ~SCF_DO_STCLASS;
3095         }
3096         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3097             I32 l = STR_LEN(scan);
3098             UV uc = *((U8*)STRING(scan));
3099
3100             /* Search for fixed substrings supports EXACT only. */
3101             if (flags & SCF_DO_SUBSTR) {
3102                 assert(data);
3103                 SCAN_COMMIT(pRExC_state, data, minlenp);
3104             }
3105             if (UTF) {
3106                 const U8 * const s = (U8 *)STRING(scan);
3107                 l = utf8_length(s, s + l);
3108                 uc = utf8_to_uvchr(s, NULL);
3109             }
3110             min += l;
3111             if (flags & SCF_DO_SUBSTR)
3112                 data->pos_min += l;
3113             if (flags & SCF_DO_STCLASS_AND) {
3114                 /* Check whether it is compatible with what we know already! */
3115                 int compat = 1;
3116
3117                 if (uc >= 0x100 ||
3118                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3119                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3120                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3121                     compat = 0;
3122                 ANYOF_CLASS_ZERO(data->start_class);
3123                 ANYOF_BITMAP_ZERO(data->start_class);
3124                 if (compat) {
3125                     ANYOF_BITMAP_SET(data->start_class, uc);
3126                     data->start_class->flags &= ~ANYOF_EOS;
3127                     data->start_class->flags |= ANYOF_FOLD;
3128                     if (OP(scan) == EXACTFL)
3129                         data->start_class->flags |= ANYOF_LOCALE;
3130                 }
3131             }
3132             else if (flags & SCF_DO_STCLASS_OR) {
3133                 if (data->start_class->flags & ANYOF_FOLD) {
3134                     /* false positive possible if the class is case-folded.
3135                        Assume that the locale settings are the same... */
3136                     if (uc < 0x100)
3137                         ANYOF_BITMAP_SET(data->start_class, uc);
3138                     data->start_class->flags &= ~ANYOF_EOS;
3139                 }
3140                 cl_and(data->start_class, and_withp);
3141             }
3142             flags &= ~SCF_DO_STCLASS;
3143         }
3144         else if (REGNODE_VARIES(OP(scan))) {
3145             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3146             I32 f = flags, pos_before = 0;
3147             regnode * const oscan = scan;
3148             struct regnode_charclass_class this_class;
3149             struct regnode_charclass_class *oclass = NULL;
3150             I32 next_is_eval = 0;
3151
3152             switch (PL_regkind[OP(scan)]) {
3153             case WHILEM:                /* End of (?:...)* . */
3154                 scan = NEXTOPER(scan);
3155                 goto finish;
3156             case PLUS:
3157                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3158                     next = NEXTOPER(scan);
3159                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3160                         mincount = 1;
3161                         maxcount = REG_INFTY;
3162                         next = regnext(scan);
3163                         scan = NEXTOPER(scan);
3164                         goto do_curly;
3165                     }
3166                 }
3167                 if (flags & SCF_DO_SUBSTR)
3168                     data->pos_min++;
3169                 min++;
3170                 /* Fall through. */
3171             case STAR:
3172                 if (flags & SCF_DO_STCLASS) {
3173                     mincount = 0;
3174                     maxcount = REG_INFTY;
3175                     next = regnext(scan);
3176                     scan = NEXTOPER(scan);
3177                     goto do_curly;
3178                 }
3179                 is_inf = is_inf_internal = 1;
3180                 scan = regnext(scan);
3181                 if (flags & SCF_DO_SUBSTR) {
3182                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3183                     data->longest = &(data->longest_float);
3184                 }
3185                 goto optimize_curly_tail;
3186             case CURLY:
3187                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3188                     && (scan->flags == stopparen))
3189                 {
3190                     mincount = 1;
3191                     maxcount = 1;
3192                 } else {
3193                     mincount = ARG1(scan);
3194                     maxcount = ARG2(scan);
3195                 }
3196                 next = regnext(scan);
3197                 if (OP(scan) == CURLYX) {
3198                     I32 lp = (data ? *(data->last_closep) : 0);
3199                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3200                 }
3201                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3202                 next_is_eval = (OP(scan) == EVAL);
3203               do_curly:
3204                 if (flags & SCF_DO_SUBSTR) {
3205                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3206                     pos_before = data->pos_min;
3207                 }
3208                 if (data) {
3209                     fl = data->flags;
3210                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3211                     if (is_inf)
3212                         data->flags |= SF_IS_INF;
3213                 }
3214                 if (flags & SCF_DO_STCLASS) {
3215                     cl_init(pRExC_state, &this_class);
3216                     oclass = data->start_class;
3217                     data->start_class = &this_class;
3218                     f |= SCF_DO_STCLASS_AND;
3219                     f &= ~SCF_DO_STCLASS_OR;
3220                 }
3221                 /* These are the cases when once a subexpression
3222                    fails at a particular position, it cannot succeed
3223                    even after backtracking at the enclosing scope.
3224
3225                    XXXX what if minimal match and we are at the
3226                         initial run of {n,m}? */
3227                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3228                     f &= ~SCF_WHILEM_VISITED_POS;
3229
3230                 /* This will finish on WHILEM, setting scan, or on NULL: */
3231                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3232                                       last, data, stopparen, recursed, NULL,
3233                                       (mincount == 0
3234                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3235
3236                 if (flags & SCF_DO_STCLASS)
3237                     data->start_class = oclass;
3238                 if (mincount == 0 || minnext == 0) {
3239                     if (flags & SCF_DO_STCLASS_OR) {
3240                         cl_or(pRExC_state, data->start_class, &this_class);
3241                     }
3242                     else if (flags & SCF_DO_STCLASS_AND) {
3243                         /* Switch to OR mode: cache the old value of
3244                          * data->start_class */
3245                         INIT_AND_WITHP;
3246                         StructCopy(data->start_class, and_withp,
3247                                    struct regnode_charclass_class);
3248                         flags &= ~SCF_DO_STCLASS_AND;
3249                         StructCopy(&this_class, data->start_class,
3250                                    struct regnode_charclass_class);
3251                         flags |= SCF_DO_STCLASS_OR;
3252                         data->start_class->flags |= ANYOF_EOS;
3253                     }
3254                 } else {                /* Non-zero len */
3255                     if (flags & SCF_DO_STCLASS_OR) {
3256                         cl_or(pRExC_state, data->start_class, &this_class);
3257                         cl_and(data->start_class, and_withp);
3258                     }
3259                     else if (flags & SCF_DO_STCLASS_AND)
3260                         cl_and(data->start_class, &this_class);
3261                     flags &= ~SCF_DO_STCLASS;
3262                 }
3263                 if (!scan)              /* It was not CURLYX, but CURLY. */
3264                     scan = next;
3265                 if ( /* ? quantifier ok, except for (?{ ... }) */
3266                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3267                     && (minnext == 0) && (deltanext == 0)
3268                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3269                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3270                 {
3271                     ckWARNreg(RExC_parse,
3272                               "Quantifier unexpected on zero-length expression");
3273                 }
3274
3275                 min += minnext * mincount;
3276                 is_inf_internal |= ((maxcount == REG_INFTY
3277                                      && (minnext + deltanext) > 0)
3278                                     || deltanext == I32_MAX);
3279                 is_inf |= is_inf_internal;
3280                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3281
3282                 /* Try powerful optimization CURLYX => CURLYN. */
3283                 if (  OP(oscan) == CURLYX && data
3284                       && data->flags & SF_IN_PAR
3285                       && !(data->flags & SF_HAS_EVAL)
3286                       && !deltanext && minnext == 1 ) {
3287                     /* Try to optimize to CURLYN.  */
3288                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3289                     regnode * const nxt1 = nxt;
3290 #ifdef DEBUGGING
3291                     regnode *nxt2;
3292 #endif
3293
3294                     /* Skip open. */
3295                     nxt = regnext(nxt);
3296                     if (!REGNODE_SIMPLE(OP(nxt))
3297                         && !(PL_regkind[OP(nxt)] == EXACT
3298                              && STR_LEN(nxt) == 1))
3299                         goto nogo;
3300 #ifdef DEBUGGING
3301                     nxt2 = nxt;
3302 #endif
3303                     nxt = regnext(nxt);
3304                     if (OP(nxt) != CLOSE)
3305                         goto nogo;
3306                     if (RExC_open_parens) {
3307                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3308                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3309                     }
3310                     /* Now we know that nxt2 is the only contents: */
3311                     oscan->flags = (U8)ARG(nxt);
3312                     OP(oscan) = CURLYN;
3313                     OP(nxt1) = NOTHING; /* was OPEN. */
3314
3315 #ifdef DEBUGGING
3316                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3317                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3318                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3319                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3320                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3321                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3322 #endif
3323                 }
3324               nogo:
3325
3326                 /* Try optimization CURLYX => CURLYM. */
3327                 if (  OP(oscan) == CURLYX && data
3328                       && !(data->flags & SF_HAS_PAR)
3329                       && !(data->flags & SF_HAS_EVAL)
3330                       && !deltanext     /* atom is fixed width */
3331                       && minnext != 0   /* CURLYM can't handle zero width */
3332                 ) {
3333                     /* XXXX How to optimize if data == 0? */
3334                     /* Optimize to a simpler form.  */
3335                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3336                     regnode *nxt2;
3337
3338                     OP(oscan) = CURLYM;
3339                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3340                             && (OP(nxt2) != WHILEM))
3341                         nxt = nxt2;
3342                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3343                     /* Need to optimize away parenths. */
3344                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3345                         /* Set the parenth number.  */
3346                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3347
3348                         oscan->flags = (U8)ARG(nxt);
3349                         if (RExC_open_parens) {
3350                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3351                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3352                         }
3353                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3354                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3355
3356 #ifdef DEBUGGING
3357                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3358                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3359                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3360                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3361 #endif
3362 #if 0
3363                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3364                             regnode *nnxt = regnext(nxt1);
3365                             if (nnxt == nxt) {
3366                                 if (reg_off_by_arg[OP(nxt1)])
3367                                     ARG_SET(nxt1, nxt2 - nxt1);
3368                                 else if (nxt2 - nxt1 < U16_MAX)
3369                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3370                                 else
3371                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3372                             }
3373                             nxt1 = nnxt;
3374                         }
3375 #endif
3376                         /* Optimize again: */
3377                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3378                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3379                     }
3380                     else
3381                         oscan->flags = 0;
3382                 }
3383                 else if ((OP(oscan) == CURLYX)
3384                          && (flags & SCF_WHILEM_VISITED_POS)
3385                          /* See the comment on a similar expression above.
3386                             However, this time it's not a subexpression
3387                             we care about, but the expression itself. */
3388                          && (maxcount == REG_INFTY)
3389                          && data && ++data->whilem_c < 16) {
3390                     /* This stays as CURLYX, we can put the count/of pair. */
3391                     /* Find WHILEM (as in regexec.c) */
3392                     regnode *nxt = oscan + NEXT_OFF(oscan);
3393
3394                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3395                         nxt += ARG(nxt);
3396                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3397                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3398                 }
3399                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3400                     pars++;
3401                 if (flags & SCF_DO_SUBSTR) {
3402                     SV *last_str = NULL;
3403                     int counted = mincount != 0;
3404
3405                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3406 #if defined(SPARC64_GCC_WORKAROUND)
3407                         I32 b = 0;
3408                         STRLEN l = 0;
3409                         const char *s = NULL;
3410                         I32 old = 0;
3411
3412                         if (pos_before >= data->last_start_min)
3413                             b = pos_before;
3414                         else
3415                             b = data->last_start_min;
3416
3417                         l = 0;
3418                         s = SvPV_const(data->last_found, l);
3419                         old = b - data->last_start_min;
3420
3421 #else
3422                         I32 b = pos_before >= data->last_start_min
3423                             ? pos_before : data->last_start_min;
3424                         STRLEN l;
3425                         const char * const s = SvPV_const(data->last_found, l);
3426                         I32 old = b - data->last_start_min;
3427 #endif
3428
3429                         if (UTF)
3430                             old = utf8_hop((U8*)s, old) - (U8*)s;
3431                         l -= old;
3432                         /* Get the added string: */
3433                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3434                         if (deltanext == 0 && pos_before == b) {
3435                             /* What was added is a constant string */
3436                             if (mincount > 1) {
3437                                 SvGROW(last_str, (mincount * l) + 1);
3438                                 repeatcpy(SvPVX(last_str) + l,
3439                                           SvPVX_const(last_str), l, mincount - 1);
3440                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3441                                 /* Add additional parts. */
3442                                 SvCUR_set(data->last_found,
3443                                           SvCUR(data->last_found) - l);
3444                                 sv_catsv(data->last_found, last_str);
3445                                 {
3446                                     SV * sv = data->last_found;
3447                                     MAGIC *mg =
3448                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3449                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3450                                     if (mg && mg->mg_len >= 0)
3451                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3452                                 }
3453                                 data->last_end += l * (mincount - 1);
3454                             }
3455                         } else {
3456                             /* start offset must point into the last copy */
3457                             data->last_start_min += minnext * (mincount - 1);
3458                             data->last_start_max += is_inf ? I32_MAX
3459                                 : (maxcount - 1) * (minnext + data->pos_delta);
3460                         }
3461                     }
3462                     /* It is counted once already... */
3463                     data->pos_min += minnext * (mincount - counted);
3464                     data->pos_delta += - counted * deltanext +
3465                         (minnext + deltanext) * maxcount - minnext * mincount;
3466                     if (mincount != maxcount) {
3467                          /* Cannot extend fixed substrings found inside
3468                             the group.  */
3469                         SCAN_COMMIT(pRExC_state,data,minlenp);
3470                         if (mincount && last_str) {
3471                             SV * const sv = data->last_found;
3472                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3473                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3474
3475                             if (mg)
3476                                 mg->mg_len = -1;
3477                             sv_setsv(sv, last_str);
3478                             data->last_end = data->pos_min;
3479                             data->last_start_min =
3480                                 data->pos_min - CHR_SVLEN(last_str);
3481                             data->last_start_max = is_inf
3482                                 ? I32_MAX
3483                                 : data->pos_min + data->pos_delta
3484                                 - CHR_SVLEN(last_str);
3485                         }
3486                         data->longest = &(data->longest_float);
3487                     }
3488                     SvREFCNT_dec(last_str);
3489                 }
3490                 if (data && (fl & SF_HAS_EVAL))
3491                     data->flags |= SF_HAS_EVAL;
3492               optimize_curly_tail:
3493                 if (OP(oscan) != CURLYX) {
3494                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3495                            && NEXT_OFF(next))
3496                         NEXT_OFF(oscan) += NEXT_OFF(next);
3497                 }
3498                 continue;
3499             default:                    /* REF and CLUMP only? */
3500                 if (flags & SCF_DO_SUBSTR) {
3501                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3502                     data->longest = &(data->longest_float);
3503                 }
3504                 is_inf = is_inf_internal = 1;
3505                 if (flags & SCF_DO_STCLASS_OR)
3506                     cl_anything(pRExC_state, data->start_class);
3507                 flags &= ~SCF_DO_STCLASS;
3508                 break;
3509             }
3510         }
3511         else if (OP(scan) == LNBREAK) {
3512             if (flags & SCF_DO_STCLASS) {
3513                 int value = 0;
3514                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3515                 if (flags & SCF_DO_STCLASS_AND) {
3516                     for (value = 0; value < 256; value++)
3517                         if (!is_VERTWS_cp(value))
3518                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3519                 }
3520                 else {
3521                     for (value = 0; value < 256; value++)
3522                         if (is_VERTWS_cp(value))
3523                             ANYOF_BITMAP_SET(data->start_class, value);
3524                 }
3525                 if (flags & SCF_DO_STCLASS_OR)
3526                     cl_and(data->start_class, and_withp);
3527                 flags &= ~SCF_DO_STCLASS;
3528             }
3529             min += 1;
3530             delta += 1;
3531             if (flags & SCF_DO_SUBSTR) {
3532                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3533                 data->pos_min += 1;
3534                 data->pos_delta += 1;
3535                 data->longest = &(data->longest_float);
3536             }
3537         }
3538         else if (OP(scan) == FOLDCHAR) {
3539             int d = ARG(scan)==0xDF ? 1 : 2;
3540             flags &= ~SCF_DO_STCLASS;
3541             min += 1;
3542             delta += d;
3543             if (flags & SCF_DO_SUBSTR) {
3544                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3545                 data->pos_min += 1;
3546                 data->pos_delta += d;
3547                 data->longest = &(data->longest_float);
3548             }
3549         }
3550         else if (REGNODE_SIMPLE(OP(scan))) {
3551             int value = 0;
3552
3553             if (flags & SCF_DO_SUBSTR) {
3554                 SCAN_COMMIT(pRExC_state,data,minlenp);
3555                 data->pos_min++;
3556             }
3557             min++;
3558             if (flags & SCF_DO_STCLASS) {
3559                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3560
3561                 /* Some of the logic below assumes that switching
3562                    locale on will only add false positives. */
3563                 switch (PL_regkind[OP(scan)]) {
3564                 case SANY:
3565                 default:
3566                   do_default:
3567                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3568                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3569                         cl_anything(pRExC_state, data->start_class);
3570                     break;
3571                 case REG_ANY:
3572                     if (OP(scan) == SANY)
3573                         goto do_default;
3574                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3575                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3576                                  || (data->start_class->flags & ANYOF_CLASS));
3577                         cl_anything(pRExC_state, data->start_class);
3578                     }
3579                     if (flags & SCF_DO_STCLASS_AND || !value)
3580                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3581                     break;
3582                 case ANYOF:
3583                     if (flags & SCF_DO_STCLASS_AND)
3584                         cl_and(data->start_class,
3585                                (struct regnode_charclass_class*)scan);
3586                     else
3587                         cl_or(pRExC_state, data->start_class,
3588                               (struct regnode_charclass_class*)scan);
3589                     break;
3590                 case ALNUM:
3591                     if (flags & SCF_DO_STCLASS_AND) {
3592                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3593                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3594                             if (FLAGS(scan) & USE_UNI) {
3595                                 for (value = 0; value < 256; value++) {
3596                                     if (!isWORDCHAR_L1(value)) {
3597                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3598                                     }
3599                                 }
3600                             } else {
3601                                 for (value = 0; value < 256; value++) {
3602                                     if (!isALNUM(value)) {
3603                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3604                                     }
3605                                 }
3606                             }
3607                         }
3608                     }
3609                     else {
3610                         if (data->start_class->flags & ANYOF_LOCALE)
3611                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3612                         else if (FLAGS(scan) & USE_UNI) {
3613                             for (value = 0; value < 256; value++) {
3614                                 if (isWORDCHAR_L1(value)) {
3615                                     ANYOF_BITMAP_SET(data->start_class, value);
3616                                 }
3617                             }
3618                         } else {
3619                             for (value = 0; value < 256; value++) {
3620                                 if (isALNUM(value)) {
3621                                     ANYOF_BITMAP_SET(data->start_class, value);
3622                                 }
3623                             }
3624                         }
3625                     }
3626                     break;
3627                 case ALNUML:
3628                     if (flags & SCF_DO_STCLASS_AND) {
3629                         if (data->start_class->flags & ANYOF_LOCALE)
3630                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3631                     }
3632                     else {
3633                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3634                         data->start_class->flags |= ANYOF_LOCALE;
3635                     }
3636                     break;
3637                 case NALNUM:
3638                     if (flags & SCF_DO_STCLASS_AND) {
3639                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3640                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3641                             if (FLAGS(scan) & USE_UNI) {
3642                                 for (value = 0; value < 256; value++) {
3643                                     if (isWORDCHAR_L1(value)) {
3644                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3645                                     }
3646                                 }
3647                             } else {
3648                                 for (value = 0; value < 256; value++) {
3649                                     if (isALNUM(value)) {
3650                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3651                                     }
3652                                 }
3653                             }
3654                         }
3655                     }
3656                     else {
3657                         if (data->start_class->flags & ANYOF_LOCALE)
3658                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3659                         else {
3660                             for (value = 0; value < 256; value++)
3661                                 if (!isALNUM(value))
3662                                     ANYOF_BITMAP_SET(data->start_class, value);
3663                         }
3664                     }
3665                     break;
3666                 case NALNUML:
3667                     if (flags & SCF_DO_STCLASS_AND) {
3668                         if (data->start_class->flags & ANYOF_LOCALE)
3669                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3670                     }
3671                     else {
3672                         data->start_class->flags |= ANYOF_LOCALE;
3673                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3674                     }
3675                     break;
3676                 case SPACE:
3677                     if (flags & SCF_DO_STCLASS_AND) {
3678                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3679                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3680                             if (FLAGS(scan) & USE_UNI) {
3681                                 for (value = 0; value < 256; value++) {
3682                                     if (!isSPACE_L1(value)) {
3683                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3684                                     }
3685                                 }
3686                             } else {
3687                                 for (value = 0; value < 256; value++) {
3688                                     if (!isSPACE(value)) {
3689                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3690                                     }
3691                                 }
3692                             }
3693                         }
3694                     }
3695                     else {
3696                         if (data->start_class->flags & ANYOF_LOCALE) {
3697                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3698                         }
3699                         else if (FLAGS(scan) & USE_UNI) {
3700                             for (value = 0; value < 256; value++) {
3701                                 if (isSPACE_L1(value)) {
3702                                     ANYOF_BITMAP_SET(data->start_class, value);
3703                                 }
3704                             }
3705                         } else {
3706                             for (value = 0; value < 256; value++) {
3707                                 if (isSPACE(value)) {
3708                                     ANYOF_BITMAP_SET(data->start_class, value);
3709                                 }
3710                             }
3711                         }
3712                     }
3713                     break;
3714                 case SPACEL:
3715                     if (flags & SCF_DO_STCLASS_AND) {
3716                         if (data->start_class->flags & ANYOF_LOCALE)
3717                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3718                     }
3719                     else {
3720                         data->start_class->flags |= ANYOF_LOCALE;
3721                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3722                     }
3723                     break;
3724                 case NSPACE:
3725                     if (flags & SCF_DO_STCLASS_AND) {
3726                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3727                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3728                             if (FLAGS(scan) & USE_UNI) {
3729                                 for (value = 0; value < 256; value++) {
3730                                     if (isSPACE_L1(value)) {
3731                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3732                                     }
3733                                 }
3734                             } else {
3735                                 for (value = 0; value < 256; value++) {
3736                                     if (isSPACE(value)) {
3737                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3738                                     }
3739                                 }
3740                             }
3741                         }
3742                     }
3743                     else {
3744                         if (data->start_class->flags & ANYOF_LOCALE)
3745                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3746                         else if (FLAGS(scan) & USE_UNI) {
3747                             for (value = 0; value < 256; value++) {
3748                                 if (!isSPACE_L1(value)) {
3749                                     ANYOF_BITMAP_SET(data->start_class, value);
3750                                 }
3751                             }
3752                         }
3753                         else {
3754                             for (value = 0; value < 256; value++) {
3755                                 if (!isSPACE(value)) {
3756                                     ANYOF_BITMAP_SET(data->start_class, value);
3757                                 }
3758                             }
3759                         }
3760                     }
3761                     break;
3762                 case NSPACEL:
3763                     if (flags & SCF_DO_STCLASS_AND) {
3764                         if (data->start_class->flags & ANYOF_LOCALE) {
3765                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3766                             for (value = 0; value < 256; value++)
3767                                 if (!isSPACE(value))
3768                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3769                         }
3770                     }
3771                     else {
3772                         data->start_class->flags |= ANYOF_LOCALE;
3773                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3774                     }
3775                     break;
3776                 case DIGIT:
3777                     if (flags & SCF_DO_STCLASS_AND) {
3778                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3779                         for (value = 0; value < 256; value++)
3780                             if (!isDIGIT(value))
3781                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3782                     }
3783                     else {
3784                         if (data->start_class->flags & ANYOF_LOCALE)
3785                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3786                         else {
3787                             for (value = 0; value < 256; value++)
3788                                 if (isDIGIT(value))
3789                                     ANYOF_BITMAP_SET(data->start_class, value);
3790                         }
3791                     }
3792                     break;
3793                 case NDIGIT:
3794                     if (flags & SCF_DO_STCLASS_AND) {
3795                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3796                         for (value = 0; value < 256; value++)
3797                             if (isDIGIT(value))
3798                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3799                     }
3800                     else {
3801                         if (data->start_class->flags & ANYOF_LOCALE)
3802                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3803                         else {
3804                             for (value = 0; value < 256; value++)
3805                                 if (!isDIGIT(value))
3806                                     ANYOF_BITMAP_SET(data->start_class, value);
3807                         }
3808                     }
3809                     break;
3810                 CASE_SYNST_FNC(VERTWS);
3811                 CASE_SYNST_FNC(HORIZWS);
3812
3813                 }
3814                 if (flags & SCF_DO_STCLASS_OR)
3815                     cl_and(data->start_class, and_withp);
3816                 flags &= ~SCF_DO_STCLASS;
3817             }
3818         }
3819         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3820             data->flags |= (OP(scan) == MEOL
3821                             ? SF_BEFORE_MEOL
3822                             : SF_BEFORE_SEOL);
3823         }
3824         else if (  PL_regkind[OP(scan)] == BRANCHJ
3825                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3826                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3827                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3828             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3829                 || OP(scan) == UNLESSM )
3830             {
3831                 /* Negative Lookahead/lookbehind
3832                    In this case we can't do fixed string optimisation.
3833                 */
3834
3835                 I32 deltanext, minnext, fake = 0;
3836                 regnode *nscan;
3837                 struct regnode_charclass_class intrnl;
3838                 int f = 0;
3839
3840                 data_fake.flags = 0;
3841                 if (data) {
3842                     data_fake.whilem_c = data->whilem_c;
3843                     data_fake.last_closep = data->last_closep;
3844                 }
3845                 else
3846                     data_fake.last_closep = &fake;
3847                 data_fake.pos_delta = delta;
3848                 if ( flags & SCF_DO_STCLASS && !scan->flags
3849                      && OP(scan) == IFMATCH ) { /* Lookahead */
3850                     cl_init(pRExC_state, &intrnl);
3851                     data_fake.start_class = &intrnl;
3852                     f |= SCF_DO_STCLASS_AND;
3853                 }
3854                 if (flags & SCF_WHILEM_VISITED_POS)
3855                     f |= SCF_WHILEM_VISITED_POS;
3856                 next = regnext(scan);
3857                 nscan = NEXTOPER(NEXTOPER(scan));
3858                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3859                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3860                 if (scan->flags) {
3861                     if (deltanext) {
3862                         FAIL("Variable length lookbehind not implemented");
3863                     }
3864                     else if (minnext > (I32)U8_MAX) {
3865                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3866                     }
3867                     scan->flags = (U8)minnext;
3868                 }
3869                 if (data) {
3870                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3871                         pars++;
3872                     if (data_fake.flags & SF_HAS_EVAL)
3873                         data->flags |= SF_HAS_EVAL;
3874                     data->whilem_c = data_fake.whilem_c;
3875                 }
3876                 if (f & SCF_DO_STCLASS_AND) {
3877                     if (flags & SCF_DO_STCLASS_OR) {
3878                         /* OR before, AND after: ideally we would recurse with
3879                          * data_fake to get the AND applied by study of the
3880                          * remainder of the pattern, and then derecurse;
3881                          * *** HACK *** for now just treat as "no information".
3882                          * See [perl #56690].
3883                          */
3884                         cl_init(pRExC_state, data->start_class);
3885                     }  else {
3886                         /* AND before and after: combine and continue */
3887                         const int was = (data->start_class->flags & ANYOF_EOS);
3888
3889                         cl_and(data->start_class, &intrnl);
3890                         if (was)
3891                             data->start_class->flags |= ANYOF_EOS;
3892                     }
3893                 }
3894             }
3895 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3896             else {
3897                 /* Positive Lookahead/lookbehind
3898                    In this case we can do fixed string optimisation,
3899                    but we must be careful about it. Note in the case of
3900                    lookbehind the positions will be offset by the minimum
3901                    length of the pattern, something we won't know about
3902                    until after the recurse.
3903                 */
3904                 I32 deltanext, fake = 0;
3905                 regnode *nscan;
3906                 struct regnode_charclass_class intrnl;
3907                 int f = 0;
3908                 /* We use SAVEFREEPV so that when the full compile
3909                     is finished perl will clean up the allocated
3910                     minlens when it's all done. This way we don't
3911                     have to worry about freeing them when we know
3912                     they wont be used, which would be a pain.
3913                  */
3914                 I32 *minnextp;
3915                 Newx( minnextp, 1, I32 );
3916                 SAVEFREEPV(minnextp);
3917
3918                 if (data) {
3919                     StructCopy(data, &data_fake, scan_data_t);
3920                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3921                         f |= SCF_DO_SUBSTR;
3922                         if (scan->flags)
3923                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3924                         data_fake.last_found=newSVsv(data->last_found);
3925                     }
3926                 }
3927                 else
3928                     data_fake.last_closep = &fake;
3929                 data_fake.flags = 0;
3930                 data_fake.pos_delta = delta;
3931                 if (is_inf)
3932                     data_fake.flags |= SF_IS_INF;
3933                 if ( flags & SCF_DO_STCLASS && !scan->flags
3934                      && OP(scan) == IFMATCH ) { /* Lookahead */
3935                     cl_init(pRExC_state, &intrnl);
3936                     data_fake.start_class = &intrnl;
3937                     f |= SCF_DO_STCLASS_AND;
3938                 }
3939                 if (flags & SCF_WHILEM_VISITED_POS)
3940                     f |= SCF_WHILEM_VISITED_POS;
3941                 next = regnext(scan);
3942                 nscan = NEXTOPER(NEXTOPER(scan));
3943
3944                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3945                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3946                 if (scan->flags) {
3947                     if (deltanext) {
3948                         FAIL("Variable length lookbehind not implemented");
3949                     }
3950                     else if (*minnextp > (I32)U8_MAX) {
3951                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3952                     }
3953                     scan->flags = (U8)*minnextp;
3954                 }
3955
3956                 *minnextp += min;
3957
3958                 if (f & SCF_DO_STCLASS_AND) {
3959                     const int was = (data->start_class->flags & ANYOF_EOS);
3960
3961                     cl_and(data->start_class, &intrnl);
3962                     if (was)
3963                         data->start_class->flags |= ANYOF_EOS;
3964                 }
3965                 if (data) {
3966                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3967                         pars++;
3968                     if (data_fake.flags & SF_HAS_EVAL)
3969                         data->flags |= SF_HAS_EVAL;
3970                     data->whilem_c = data_fake.whilem_c;
3971                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3972                         if (RExC_rx->minlen<*minnextp)
3973                             RExC_rx->minlen=*minnextp;
3974                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3975                         SvREFCNT_dec(data_fake.last_found);
3976
3977                         if ( data_fake.minlen_fixed != minlenp )
3978                         {
3979                             data->offset_fixed= data_fake.offset_fixed;
3980                             data->minlen_fixed= data_fake.minlen_fixed;
3981                             data->lookbehind_fixed+= scan->flags;
3982                         }
3983                         if ( data_fake.minlen_float != minlenp )
3984                         {
3985                             data->minlen_float= data_fake.minlen_float;
3986                             data->offset_float_min=data_fake.offset_float_min;
3987                             data->offset_float_max=data_fake.offset_float_max;
3988                             data->lookbehind_float+= scan->flags;
3989                         }
3990                     }
3991                 }
3992
3993
3994             }
3995 #endif
3996         }
3997         else if (OP(scan) == OPEN) {
3998             if (stopparen != (I32)ARG(scan))
3999                 pars++;
4000         }
4001         else if (OP(scan) == CLOSE) {
4002             if (stopparen == (I32)ARG(scan)) {
4003                 break;
4004             }
4005             if ((I32)ARG(scan) == is_par) {
4006                 next = regnext(scan);
4007
4008                 if ( next && (OP(next) != WHILEM) && next < last)
4009                     is_par = 0;         /* Disable optimization */
4010             }
4011             if (data)
4012                 *(data->last_closep) = ARG(scan);
4013         }
4014         else if (OP(scan) == EVAL) {
4015                 if (data)
4016                     data->flags |= SF_HAS_EVAL;
4017         }
4018         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4019             if (flags & SCF_DO_SUBSTR) {
4020                 SCAN_COMMIT(pRExC_state,data,minlenp);
4021                 flags &= ~SCF_DO_SUBSTR;
4022             }
4023             if (data && OP(scan)==ACCEPT) {
4024                 data->flags |= SCF_SEEN_ACCEPT;
4025                 if (stopmin > min)
4026                     stopmin = min;
4027             }
4028         }
4029         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4030         {
4031                 if (flags & SCF_DO_SUBSTR) {
4032                     SCAN_COMMIT(pRExC_state,data,minlenp);
4033                     data->longest = &(data->longest_float);
4034                 }
4035                 is_inf = is_inf_internal = 1;
4036                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4037                     cl_anything(pRExC_state, data->start_class);
4038                 flags &= ~SCF_DO_STCLASS;
4039         }
4040         else if (OP(scan) == GPOS) {
4041             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4042                 !(delta || is_inf || (data && data->pos_delta)))
4043             {
4044                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4045                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4046                 if (RExC_rx->gofs < (U32)min)
4047                     RExC_rx->gofs = min;
4048             } else {
4049                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4050                 RExC_rx->gofs = 0;
4051             }
4052         }
4053 #ifdef TRIE_STUDY_OPT
4054 #ifdef FULL_TRIE_STUDY
4055         else if (PL_regkind[OP(scan)] == TRIE) {
4056             /* NOTE - There is similar code to this block above for handling
4057                BRANCH nodes on the initial study.  If you change stuff here
4058                check there too. */
4059             regnode *trie_node= scan;
4060             regnode *tail= regnext(scan);
4061             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4062             I32 max1 = 0, min1 = I32_MAX;
4063             struct regnode_charclass_class accum;
4064
4065             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4066                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4067             if (flags & SCF_DO_STCLASS)
4068                 cl_init_zero(pRExC_state, &accum);
4069
4070             if (!trie->jump) {
4071                 min1= trie->minlen;
4072                 max1= trie->maxlen;
4073             } else {
4074                 const regnode *nextbranch= NULL;
4075                 U32 word;
4076
4077                 for ( word=1 ; word <= trie->wordcount ; word++)
4078                 {
4079                     I32 deltanext=0, minnext=0, f = 0, fake;
4080                     struct regnode_charclass_class this_class;
4081
4082                     data_fake.flags = 0;
4083                     if (data) {
4084                         data_fake.whilem_c = data->whilem_c;
4085                         data_fake.last_closep = data->last_closep;
4086                     }
4087                     else
4088                         data_fake.last_closep = &fake;
4089                     data_fake.pos_delta = delta;
4090                     if (flags & SCF_DO_STCLASS) {
4091                         cl_init(pRExC_state, &this_class);
4092                         data_fake.start_class = &this_class;
4093                         f = SCF_DO_STCLASS_AND;
4094                     }
4095                     if (flags & SCF_WHILEM_VISITED_POS)
4096                         f |= SCF_WHILEM_VISITED_POS;
4097
4098                     if (trie->jump[word]) {
4099                         if (!nextbranch)
4100                             nextbranch = trie_node + trie->jump[0];
4101                         scan= trie_node + trie->jump[word];
4102                         /* We go from the jump point to the branch that follows
4103                            it. Note this means we need the vestigal unused branches
4104                            even though they arent otherwise used.
4105                          */
4106                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4107                             &deltanext, (regnode *)nextbranch, &data_fake,
4108                             stopparen, recursed, NULL, f,depth+1);
4109                     }
4110                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4111                         nextbranch= regnext((regnode*)nextbranch);
4112
4113                     if (min1 > (I32)(minnext + trie->minlen))
4114                         min1 = minnext + trie->minlen;
4115                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4116                         max1 = minnext + deltanext + trie->maxlen;
4117                     if (deltanext == I32_MAX)
4118                         is_inf = is_inf_internal = 1;
4119
4120                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4121                         pars++;
4122                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4123                         if ( stopmin > min + min1)
4124                             stopmin = min + min1;
4125                         flags &= ~SCF_DO_SUBSTR;
4126                         if (data)
4127                             data->flags |= SCF_SEEN_ACCEPT;
4128                     }
4129                     if (data) {
4130                         if (data_fake.flags & SF_HAS_EVAL)
4131                             data->flags |= SF_HAS_EVAL;
4132                         data->whilem_c = data_fake.whilem_c;
4133                     }
4134                     if (flags & SCF_DO_STCLASS)
4135                         cl_or(pRExC_state, &accum, &this_class);
4136                 }
4137             }
4138             if (flags & SCF_DO_SUBSTR) {
4139                 data->pos_min += min1;
4140                 data->pos_delta += max1 - min1;
4141                 if (max1 != min1 || is_inf)
4142                     data->longest = &(data->longest_float);
4143             }
4144             min += min1;
4145             delta += max1 - min1;
4146             if (flags & SCF_DO_STCLASS_OR) {
4147                 cl_or(pRExC_state, data->start_class, &accum);
4148                 if (min1) {
4149                     cl_and(data->start_class, and_withp);
4150                     flags &= ~SCF_DO_STCLASS;
4151                 }
4152             }
4153             else if (flags & SCF_DO_STCLASS_AND) {
4154                 if (min1) {
4155                     cl_and(data->start_class, &accum);
4156                     flags &= ~SCF_DO_STCLASS;
4157                 }
4158                 else {
4159                     /* Switch to OR mode: cache the old value of
4160                      * data->start_class */
4161                     INIT_AND_WITHP;
4162                     StructCopy(data->start_class, and_withp,
4163                                struct regnode_charclass_class);
4164                     flags &= ~SCF_DO_STCLASS_AND;
4165                     StructCopy(&accum, data->start_class,
4166                                struct regnode_charclass_class);
4167                     flags |= SCF_DO_STCLASS_OR;
4168                     data->start_class->flags |= ANYOF_EOS;
4169                 }
4170             }
4171             scan= tail;
4172             continue;
4173         }
4174 #else
4175         else if (PL_regkind[OP(scan)] == TRIE) {
4176             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4177             U8*bang=NULL;
4178
4179             min += trie->minlen;
4180             delta += (trie->maxlen - trie->minlen);
4181             flags &= ~SCF_DO_STCLASS; /* xxx */
4182             if (flags & SCF_DO_SUBSTR) {
4183                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4184                 data->pos_min += trie->minlen;
4185                 data->pos_delta += (trie->maxlen - trie->minlen);
4186                 if (trie->maxlen != trie->minlen)
4187                     data->longest = &(data->longest_float);
4188             }
4189             if (trie->jump) /* no more substrings -- for now /grr*/
4190                 flags &= ~SCF_DO_SUBSTR;
4191         }
4192 #endif /* old or new */
4193 #endif /* TRIE_STUDY_OPT */
4194
4195         /* Else: zero-length, ignore. */
4196         scan = regnext(scan);
4197     }
4198     if (frame) {
4199         last = frame->last;
4200         scan = frame->next;
4201         stopparen = frame->stop;
4202         frame = frame->prev;
4203         goto fake_study_recurse;
4204     }
4205
4206   finish:
4207     assert(!frame);
4208     DEBUG_STUDYDATA("pre-fin:",data,depth);
4209
4210     *scanp = scan;
4211     *deltap = is_inf_internal ? I32_MAX : delta;
4212     if (flags & SCF_DO_SUBSTR && is_inf)
4213         data->pos_delta = I32_MAX - data->pos_min;
4214     if (is_par > (I32)U8_MAX)
4215         is_par = 0;
4216     if (is_par && pars==1 && data) {
4217         data->flags |= SF_IN_PAR;
4218         data->flags &= ~SF_HAS_PAR;
4219     }
4220     else if (pars && data) {
4221         data->flags |= SF_HAS_PAR;
4222         data->flags &= ~SF_IN_PAR;
4223     }
4224     if (flags & SCF_DO_STCLASS_OR)
4225         cl_and(data->start_class, and_withp);
4226     if (flags & SCF_TRIE_RESTUDY)
4227         data->flags |=  SCF_TRIE_RESTUDY;
4228
4229     DEBUG_STUDYDATA("post-fin:",data,depth);
4230
4231     return min < stopmin ? min : stopmin;
4232 }
4233
4234 STATIC U32
4235 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4236 {
4237     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4238
4239     PERL_ARGS_ASSERT_ADD_DATA;
4240
4241     Renewc(RExC_rxi->data,
4242            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4243            char, struct reg_data);
4244     if(count)
4245         Renew(RExC_rxi->data->what, count + n, U8);
4246     else
4247         Newx(RExC_rxi->data->what, n, U8);
4248     RExC_rxi->data->count = count + n;
4249     Copy(s, RExC_rxi->data->what + count, n, U8);
4250     return count;
4251 }
4252
4253 /*XXX: todo make this not included in a non debugging perl */
4254 #ifndef PERL_IN_XSUB_RE
4255 void
4256 Perl_reginitcolors(pTHX)
4257 {
4258     dVAR;
4259     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4260     if (s) {
4261         char *t = savepv(s);
4262         int i = 0;
4263         PL_colors[0] = t;
4264         while (++i < 6) {
4265             t = strchr(t, '\t');
4266             if (t) {
4267                 *t = '\0';
4268                 PL_colors[i] = ++t;
4269             }
4270             else
4271                 PL_colors[i] = t = (char *)"";
4272         }
4273     } else {
4274         int i = 0;
4275         while (i < 6)
4276             PL_colors[i++] = (char *)"";
4277     }
4278     PL_colorset = 1;
4279 }
4280 #endif
4281
4282
4283 #ifdef TRIE_STUDY_OPT
4284 #define CHECK_RESTUDY_GOTO                                  \
4285         if (                                                \
4286               (data.flags & SCF_TRIE_RESTUDY)               \
4287               && ! restudied++                              \
4288         )     goto reStudy
4289 #else
4290 #define CHECK_RESTUDY_GOTO
4291 #endif
4292
4293 /*
4294  - pregcomp - compile a regular expression into internal code
4295  *
4296  * We can't allocate space until we know how big the compiled form will be,
4297  * but we can't compile it (and thus know how big it is) until we've got a
4298  * place to put the code.  So we cheat:  we compile it twice, once with code
4299  * generation turned off and size counting turned on, and once "for real".
4300  * This also means that we don't allocate space until we are sure that the
4301  * thing really will compile successfully, and we never have to move the
4302  * code and thus invalidate pointers into it.  (Note that it has to be in
4303  * one piece because free() must be able to free it all.) [NB: not true in perl]
4304  *
4305  * Beware that the optimization-preparation code in here knows about some
4306  * of the structure of the compiled regexp.  [I'll say.]
4307  */
4308
4309
4310
4311 #ifndef PERL_IN_XSUB_RE
4312 #define RE_ENGINE_PTR &reh_regexp_engine
4313 #else
4314 extern const struct regexp_engine my_reg_engine;
4315 #define RE_ENGINE_PTR &my_reg_engine
4316 #endif
4317
4318 #ifndef PERL_IN_XSUB_RE
4319 REGEXP *
4320 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4321 {
4322     dVAR;
4323     HV * const table = GvHV(PL_hintgv);
4324
4325     PERL_ARGS_ASSERT_PREGCOMP;
4326
4327     /* Dispatch a request to compile a regexp to correct
4328        regexp engine. */
4329     if (table) {
4330         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4331         GET_RE_DEBUG_FLAGS_DECL;
4332         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4333             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4334             DEBUG_COMPILE_r({
4335                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4336                     SvIV(*ptr));
4337             });
4338             return CALLREGCOMP_ENG(eng, pattern, flags);
4339         }
4340     }
4341     return Perl_re_compile(aTHX_ pattern, flags);
4342 }
4343 #endif
4344
4345 REGEXP *
4346 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4347 {
4348     dVAR;
4349     REGEXP *rx;
4350     struct regexp *r;
4351     register regexp_internal *ri;
4352     STRLEN plen;
4353     char  *exp;
4354     char* xend;
4355     regnode *scan;
4356     I32 flags;
4357     I32 minlen = 0;
4358
4359     /* these are all flags - maybe they should be turned
4360      * into a single int with different bit masks */
4361     I32 sawlookahead = 0;
4362     I32 sawplus = 0;
4363     I32 sawopen = 0;
4364
4365     U8 jump_ret = 0;
4366     dJMPENV;
4367     scan_data_t data;
4368     RExC_state_t RExC_state;
4369     RExC_state_t * const pRExC_state = &RExC_state;
4370 #ifdef TRIE_STUDY_OPT
4371     int restudied;
4372     RExC_state_t copyRExC_state;
4373 #endif
4374     GET_RE_DEBUG_FLAGS_DECL;
4375
4376     PERL_ARGS_ASSERT_RE_COMPILE;
4377
4378     DEBUG_r(if (!PL_colorset) reginitcolors());
4379
4380     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4381
4382
4383     /* Longjmp back to here if have to switch in midstream to utf8 */
4384     if (! RExC_orig_utf8) {
4385         JMPENV_PUSH(jump_ret);
4386     }
4387
4388     if (jump_ret == 0) {    /* First time through */
4389         exp = SvPV(pattern, plen);
4390         xend = exp + plen;
4391
4392         DEBUG_COMPILE_r({
4393             SV *dsv= sv_newmortal();
4394             RE_PV_QUOTED_DECL(s, RExC_utf8,
4395                 dsv, exp, plen, 60);
4396             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4397                            PL_colors[4],PL_colors[5],s);
4398         });
4399     }
4400     else {  /* longjumped back */
4401         STRLEN len = plen;
4402
4403         /* If the cause for the longjmp was other than changing to utf8, pop
4404          * our own setjmp, and longjmp to the correct handler */
4405         if (jump_ret != UTF8_LONGJMP) {
4406             JMPENV_POP;
4407             JMPENV_JUMP(jump_ret);
4408         }
4409
4410         GET_RE_DEBUG_FLAGS;
4411
4412         /* It's possible to write a regexp in ascii that represents Unicode
4413         codepoints outside of the byte range, such as via \x{100}. If we
4414         detect such a sequence we have to convert the entire pattern to utf8
4415         and then recompile, as our sizing calculation will have been based
4416         on 1 byte == 1 character, but we will need to use utf8 to encode
4417         at least some part of the pattern, and therefore must convert the whole
4418         thing.
4419         -- dmq */
4420         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4421             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4422         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4423         xend = exp + len;
4424         RExC_orig_utf8 = RExC_utf8 = 1;
4425         SAVEFREEPV(exp);
4426     }
4427
4428 #ifdef TRIE_STUDY_OPT
4429     restudied = 0;
4430 #endif
4431
4432     RExC_precomp = exp;
4433     RExC_flags = pm_flags;
4434     RExC_sawback = 0;
4435
4436     RExC_seen = 0;
4437     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4438     RExC_seen_evals = 0;
4439     RExC_extralen = 0;
4440
4441     /* First pass: determine size, legality. */
4442     RExC_parse = exp;
4443     RExC_start = exp;
4444     RExC_end = xend;
4445     RExC_naughty = 0;
4446     RExC_npar = 1;
4447     RExC_nestroot = 0;
4448     RExC_size = 0L;
4449     RExC_emit = &PL_regdummy;
4450     RExC_whilem_seen = 0;
4451     RExC_open_parens = NULL;
4452     RExC_close_parens = NULL;
4453     RExC_opend = NULL;
4454     RExC_paren_names = NULL;
4455 #ifdef DEBUGGING
4456     RExC_paren_name_list = NULL;
4457 #endif
4458     RExC_recurse = NULL;
4459     RExC_recurse_count = 0;
4460
4461 #if 0 /* REGC() is (currently) a NOP at the first pass.
4462        * Clever compilers notice this and complain. --jhi */
4463     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4464 #endif
4465     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4466     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4467         RExC_precomp = NULL;
4468         return(NULL);
4469     }
4470
4471     /* Here, finished first pass.  Get rid of our setjmp, which we added for
4472      * efficiency only if the passed-in string wasn't in utf8, as shown by
4473      * RExC_orig_utf8.  But if the first pass was redone, that variable will be
4474      * 1 here even though the original string wasn't utf8, but in this case
4475      * there will have been a long jump */
4476     if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4477         JMPENV_POP;
4478     }
4479     DEBUG_PARSE_r({
4480         PerlIO_printf(Perl_debug_log,
4481             "Required size %"IVdf" nodes\n"
4482             "Starting second pass (creation)\n",
4483             (IV)RExC_size);
4484         RExC_lastnum=0;
4485         RExC_lastparse=NULL;
4486     });
4487     /* Small enough for pointer-storage convention?
4488        If extralen==0, this means that we will not need long jumps. */
4489     if (RExC_size >= 0x10000L && RExC_extralen)
4490         RExC_size += RExC_extralen;
4491     else
4492         RExC_extralen = 0;
4493     if (RExC_whilem_seen > 15)
4494         RExC_whilem_seen = 15;
4495
4496     /* Allocate space and zero-initialize. Note, the two step process
4497        of zeroing when in debug mode, thus anything assigned has to
4498        happen after that */
4499     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4500     r = (struct regexp*)SvANY(rx);
4501     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4502          char, regexp_internal);
4503     if ( r == NULL || ri == NULL )
4504         FAIL("Regexp out of space");
4505 #ifdef DEBUGGING
4506     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4507     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4508 #else
4509     /* bulk initialize base fields with 0. */
4510     Zero(ri, sizeof(regexp_internal), char);
4511 #endif
4512
4513     /* non-zero initialization begins here */
4514     RXi_SET( r, ri );
4515     r->engine= RE_ENGINE_PTR;
4516     r->extflags = pm_flags;
4517     {
4518         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4519         bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
4520
4521         /* The caret is output if there are any defaults: if not all the STD
4522          * flags are set, or if no character set specifier is needed */
4523         bool has_default =
4524                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4525                     || ! has_charset);
4526         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4527         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4528                             >> RXf_PMf_STD_PMMOD_SHIFT);
4529         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4530         char *p;
4531         /* Allocate for the worst case, which is all the std flags are turned
4532          * on.  If more precision is desired, we could do a population count of
4533          * the flags set.  This could be done with a small lookup table, or by
4534          * shifting, masking and adding, or even, when available, assembly
4535          * language for a machine-language population count.
4536          * We never output a minus, as all those are defaults, so are
4537          * covered by the caret */
4538         const STRLEN wraplen = plen + has_p + has_runon
4539             + has_default       /* If needs a caret */
4540             + has_charset       /* If needs a character set specifier */
4541             + (sizeof(STD_PAT_MODS) - 1)
4542             + (sizeof("(?:)") - 1);
4543
4544         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4545         SvPOK_on(rx);
4546         SvFLAGS(rx) |= SvUTF8(pattern);
4547         *p++='('; *p++='?';
4548
4549         /* If a default, cover it using the caret */
4550         if (has_default) {
4551             *p++= DEFAULT_PAT_MOD;
4552         }
4553         if (has_charset) {
4554             if (r->extflags & RXf_PMf_LOCALE) {
4555                 *p++ = LOCALE_PAT_MOD;
4556             } else {
4557                 *p++ = UNICODE_PAT_MOD;
4558             }
4559         }
4560         if (has_p)
4561             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4562         {
4563             char ch;
4564             while((ch = *fptr++)) {
4565                 if(reganch & 1)
4566                     *p++ = ch;
4567                 reganch >>= 1;
4568             }
4569         }
4570
4571         *p++ = ':';
4572         Copy(RExC_precomp, p, plen, char);
4573         assert ((RX_WRAPPED(rx) - p) < 16);
4574         r->pre_prefix = p - RX_WRAPPED(rx);
4575         p += plen;
4576         if (has_runon)
4577             *p++ = '\n';
4578         *p++ = ')';
4579         *p = 0;
4580         SvCUR_set(rx, p - SvPVX_const(rx));
4581     }
4582
4583     r->intflags = 0;
4584     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4585
4586     if (RExC_seen & REG_SEEN_RECURSE) {
4587         Newxz(RExC_open_parens, RExC_npar,regnode *);
4588         SAVEFREEPV(RExC_open_parens);
4589         Newxz(RExC_close_parens,RExC_npar,regnode *);
4590         SAVEFREEPV(RExC_close_parens);
4591     }
4592
4593     /* Useful during FAIL. */
4594 #ifdef RE_TRACK_PATTERN_OFFSETS
4595     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4596     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4597                           "%s %"UVuf" bytes for offset annotations.\n",
4598                           ri->u.offsets ? "Got" : "Couldn't get",
4599                           (UV)((2*RExC_size+1) * sizeof(U32))));
4600 #endif
4601     SetProgLen(ri,RExC_size);
4602     RExC_rx_sv = rx;
4603     RExC_rx = r;
4604     RExC_rxi = ri;
4605     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4606
4607     /* Second pass: emit code. */
4608     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4609     RExC_parse = exp;
4610     RExC_end = xend;
4611     RExC_naughty = 0;
4612     RExC_npar = 1;
4613     RExC_emit_start = ri->program;
4614     RExC_emit = ri->program;
4615     RExC_emit_bound = ri->program + RExC_size + 1;
4616
4617     /* Store the count of eval-groups for security checks: */
4618     RExC_rx->seen_evals = RExC_seen_evals;
4619     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4620     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4621         ReREFCNT_dec(rx);
4622         return(NULL);
4623     }
4624     /* XXXX To minimize changes to RE engine we always allocate
4625        3-units-long substrs field. */
4626     Newx(r->substrs, 1, struct reg_substr_data);
4627     if (RExC_recurse_count) {
4628         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4629         SAVEFREEPV(RExC_recurse);
4630     }
4631
4632 reStudy:
4633     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4634     Zero(r->substrs, 1, struct reg_substr_data);
4635
4636 #ifdef TRIE_STUDY_OPT
4637     if (!restudied) {
4638         StructCopy(&zero_scan_data, &data, scan_data_t);
4639         copyRExC_state = RExC_state;
4640     } else {
4641         U32 seen=RExC_seen;
4642         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4643
4644         RExC_state = copyRExC_state;
4645         if (seen & REG_TOP_LEVEL_BRANCHES)
4646             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4647         else
4648             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4649         if (data.last_found) {
4650             SvREFCNT_dec(data.longest_fixed);
4651             SvREFCNT_dec(data.longest_float);
4652             SvREFCNT_dec(data.last_found);
4653         }
4654         StructCopy(&zero_scan_data, &data, scan_data_t);
4655     }
4656 #else
4657     StructCopy(&zero_scan_data, &data, scan_data_t);
4658 #endif
4659
4660     /* Dig out information for optimizations. */
4661     r->extflags = RExC_flags; /* was pm_op */
4662     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4663
4664     if (UTF)
4665         SvUTF8_on(rx);  /* Unicode in it? */
4666     ri->regstclass = NULL;
4667     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4668         r->intflags |= PREGf_NAUGHTY;
4669     scan = ri->program + 1;             /* First BRANCH. */
4670
4671     /* testing for BRANCH here tells us whether there is "must appear"
4672        data in the pattern. If there is then we can use it for optimisations */
4673     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4674         I32 fake;
4675         STRLEN longest_float_length, longest_fixed_length;
4676         struct regnode_charclass_class ch_class; /* pointed to by data */
4677         int stclass_flag;
4678         I32 last_close = 0; /* pointed to by data */
4679         regnode *first= scan;
4680         regnode *first_next= regnext(first);
4681         /*
4682          * Skip introductions and multiplicators >= 1
4683          * so that we can extract the 'meat' of the pattern that must
4684          * match in the large if() sequence following.
4685          * NOTE that EXACT is NOT covered here, as it is normally
4686          * picked up by the optimiser separately.
4687          *
4688          * This is unfortunate as the optimiser isnt handling lookahead
4689          * properly currently.
4690          *
4691          */
4692         while ((OP(first) == OPEN && (sawopen = 1)) ||
4693                /* An OR of *one* alternative - should not happen now. */
4694             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4695             /* for now we can't handle lookbehind IFMATCH*/
4696             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4697             (OP(first) == PLUS) ||
4698             (OP(first) == MINMOD) ||
4699                /* An {n,m} with n>0 */
4700             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4701             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4702         {
4703                 /*
4704                  * the only op that could be a regnode is PLUS, all the rest
4705                  * will be regnode_1 or regnode_2.
4706                  *
4707                  */
4708                 if (OP(first) == PLUS)
4709                     sawplus = 1;
4710                 else
4711                     first += regarglen[OP(first)];
4712
4713                 first = NEXTOPER(first);
4714                 first_next= regnext(first);
4715         }
4716
4717         /* Starting-point info. */
4718       again:
4719         DEBUG_PEEP("first:",first,0);
4720         /* Ignore EXACT as we deal with it later. */
4721         if (PL_regkind[OP(first)] == EXACT) {
4722             if (OP(first) == EXACT)
4723                 NOOP;   /* Empty, get anchored substr later. */
4724             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4725                 ri->regstclass = first;
4726         }
4727 #ifdef TRIE_STCLASS
4728         else if (PL_regkind[OP(first)] == TRIE &&
4729                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4730         {
4731             regnode *trie_op;
4732             /* this can happen only on restudy */
4733             if ( OP(first) == TRIE ) {
4734                 struct regnode_1 *trieop = (struct regnode_1 *)
4735                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4736                 StructCopy(first,trieop,struct regnode_1);
4737                 trie_op=(regnode *)trieop;
4738             } else {
4739                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4740                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4741                 StructCopy(first,trieop,struct regnode_charclass);
4742                 trie_op=(regnode *)trieop;
4743             }
4744             OP(trie_op)+=2;
4745             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4746             ri->regstclass = trie_op;
4747         }
4748 #endif
4749         else if (REGNODE_SIMPLE(OP(first)))
4750             ri->regstclass = first;
4751         else if (PL_regkind[OP(first)] == BOUND ||
4752                  PL_regkind[OP(first)] == NBOUND)
4753             ri->regstclass = first;
4754         else if (PL_regkind[OP(first)] == BOL) {
4755             r->extflags |= (OP(first) == MBOL
4756                            ? RXf_ANCH_MBOL
4757                            : (OP(first) == SBOL
4758                               ? RXf_ANCH_SBOL
4759                               : RXf_ANCH_BOL));
4760             first = NEXTOPER(first);
4761             goto again;
4762         }
4763         else if (OP(first) == GPOS) {
4764             r->extflags |= RXf_ANCH_GPOS;
4765             first = NEXTOPER(first);
4766             goto again;
4767         }
4768         else if ((!sawopen || !RExC_sawback) &&
4769             (OP(first) == STAR &&
4770             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4771             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4772         {
4773             /* turn .* into ^.* with an implied $*=1 */
4774             const int type =
4775                 (OP(NEXTOPER(first)) == REG_ANY)
4776                     ? RXf_ANCH_MBOL
4777                     : RXf_ANCH_SBOL;
4778             r->extflags |= type;
4779             r->intflags |= PREGf_IMPLICIT;
4780             first = NEXTOPER(first);
4781             goto again;
4782         }
4783         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4784             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4785             /* x+ must match at the 1st pos of run of x's */
4786             r->intflags |= PREGf_SKIP;
4787
4788         /* Scan is after the zeroth branch, first is atomic matcher. */
4789 #ifdef TRIE_STUDY_OPT
4790         DEBUG_PARSE_r(
4791             if (!restudied)
4792                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4793                               (IV)(first - scan + 1))
4794         );
4795 #else
4796         DEBUG_PARSE_r(
4797             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4798                 (IV)(first - scan + 1))
4799         );
4800 #endif
4801
4802
4803         /*
4804         * If there's something expensive in the r.e., find the
4805         * longest literal string that must appear and make it the
4806         * regmust.  Resolve ties in favor of later strings, since
4807         * the regstart check works with the beginning of the r.e.
4808         * and avoiding duplication strengthens checking.  Not a
4809         * strong reason, but sufficient in the absence of others.
4810         * [Now we resolve ties in favor of the earlier string if
4811         * it happens that c_offset_min has been invalidated, since the
4812         * earlier string may buy us something the later one won't.]
4813         */
4814
4815         data.longest_fixed = newSVpvs("");
4816         data.longest_float = newSVpvs("");
4817         data.last_found = newSVpvs("");
4818         data.longest = &(data.longest_fixed);
4819         first = scan;
4820         if (!ri->regstclass) {
4821             cl_init(pRExC_state, &ch_class);
4822             data.start_class = &ch_class;
4823             stclass_flag = SCF_DO_STCLASS_AND;
4824         } else                          /* XXXX Check for BOUND? */
4825             stclass_flag = 0;
4826         data.last_closep = &last_close;
4827
4828         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4829             &data, -1, NULL, NULL,
4830             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4831
4832
4833         CHECK_RESTUDY_GOTO;
4834
4835
4836         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4837              && data.last_start_min == 0 && data.last_end > 0
4838              && !RExC_seen_zerolen
4839              && !(RExC_seen & REG_SEEN_VERBARG)
4840              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4841             r->extflags |= RXf_CHECK_ALL;
4842         scan_commit(pRExC_state, &data,&minlen,0);
4843         SvREFCNT_dec(data.last_found);
4844
4845         /* Note that code very similar to this but for anchored string
4846            follows immediately below, changes may need to be made to both.
4847            Be careful.
4848          */
4849         longest_float_length = CHR_SVLEN(data.longest_float);
4850         if (longest_float_length
4851             || (data.flags & SF_FL_BEFORE_EOL
4852                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4853                     || (RExC_flags & RXf_PMf_MULTILINE))))
4854         {
4855             I32 t,ml;
4856
4857             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4858                 && data.offset_fixed == data.offset_float_min
4859                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4860                     goto remove_float;          /* As in (a)+. */
4861
4862             /* copy the information about the longest float from the reg_scan_data
4863                over to the program. */
4864             if (SvUTF8(data.longest_float)) {
4865                 r->float_utf8 = data.longest_float;
4866                 r->float_substr = NULL;
4867             } else {
4868                 r->float_substr = data.longest_float;
4869                 r->float_utf8 = NULL;
4870             }
4871             /* float_end_shift is how many chars that must be matched that
4872                follow this item. We calculate it ahead of time as once the
4873                lookbehind offset is added in we lose the ability to correctly
4874                calculate it.*/
4875             ml = data.minlen_float ? *(data.minlen_float)
4876                                    : (I32)longest_float_length;
4877             r->float_end_shift = ml - data.offset_float_min
4878                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4879                 + data.lookbehind_float;
4880             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4881             r->float_max_offset = data.offset_float_max;
4882             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4883                 r->float_max_offset -= data.lookbehind_float;
4884
4885             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4886                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4887                            || (RExC_flags & RXf_PMf_MULTILINE)));
4888             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4889         }
4890         else {
4891           remove_float:
4892             r->float_substr = r->float_utf8 = NULL;
4893             SvREFCNT_dec(data.longest_float);
4894             longest_float_length = 0;
4895         }
4896
4897         /* Note that code very similar to this but for floating string
4898            is immediately above, changes may need to be made to both.
4899            Be careful.
4900          */
4901         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4902         if (longest_fixed_length
4903             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4904                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4905                     || (RExC_flags & RXf_PMf_MULTILINE))))
4906         {
4907             I32 t,ml;
4908
4909             /* copy the information about the longest fixed
4910                from the reg_scan_data over to the program. */
4911             if (SvUTF8(data.longest_fixed)) {
4912                 r->anchored_utf8 = data.longest_fixed;
4913                 r->anchored_substr = NULL;
4914             } else {
4915                 r->anchored_substr = data.longest_fixed;
4916                 r->anchored_utf8 = NULL;
4917             }
4918             /* fixed_end_shift is how many chars that must be matched that
4919                follow this item. We calculate it ahead of time as once the
4920                lookbehind offset is added in we lose the ability to correctly
4921                calculate it.*/
4922             ml = data.minlen_fixed ? *(data.minlen_fixed)
4923                                    : (I32)longest_fixed_length;
4924             r->anchored_end_shift = ml - data.offset_fixed
4925                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4926                 + data.lookbehind_fixed;
4927             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4928
4929             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4930                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4931                      || (RExC_flags & RXf_PMf_MULTILINE)));
4932             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4933         }
4934         else {
4935             r->anchored_substr = r->anchored_utf8 = NULL;
4936             SvREFCNT_dec(data.longest_fixed);
4937             longest_fixed_length = 0;
4938         }
4939         if (ri->regstclass
4940             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4941             ri->regstclass = NULL;
4942         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4943             && stclass_flag
4944             && !(data.start_class->flags & ANYOF_EOS)
4945             && !cl_is_anything(data.start_class))
4946         {
4947             const U32 n = add_data(pRExC_state, 1, "f");
4948
4949             Newx(RExC_rxi->data->data[n], 1,
4950                 struct regnode_charclass_class);
4951             StructCopy(data.start_class,
4952                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4953                        struct regnode_charclass_class);
4954             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4955             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4956             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4957                       regprop(r, sv, (regnode*)data.start_class);
4958                       PerlIO_printf(Perl_debug_log,
4959                                     "synthetic stclass \"%s\".\n",
4960                                     SvPVX_const(sv));});
4961         }
4962
4963         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4964         if (longest_fixed_length > longest_float_length) {
4965             r->check_end_shift = r->anchored_end_shift;
4966             r->check_substr = r->anchored_substr;
4967             r->check_utf8 = r->anchored_utf8;
4968             r->check_offset_min = r->check_offset_max = r->anchored_offset;
4969             if (r->extflags & RXf_ANCH_SINGLE)
4970                 r->extflags |= RXf_NOSCAN;
4971         }
4972         else {
4973             r->check_end_shift = r->float_end_shift;
4974             r->check_substr = r->float_substr;
4975             r->check_utf8 = r->float_utf8;
4976             r->check_offset_min = r->float_min_offset;
4977             r->check_offset_max = r->float_max_offset;
4978         }
4979         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4980            This should be changed ASAP!  */
4981         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4982             r->extflags |= RXf_USE_INTUIT;
4983             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4984                 r->extflags |= RXf_INTUIT_TAIL;
4985         }
4986         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4987         if ( (STRLEN)minlen < longest_float_length )
4988             minlen= longest_float_length;
4989         if ( (STRLEN)minlen < longest_fixed_length )
4990             minlen= longest_fixed_length;
4991         */
4992     }
4993     else {
4994         /* Several toplevels. Best we can is to set minlen. */
4995         I32 fake;
4996         struct regnode_charclass_class ch_class;
4997         I32 last_close = 0;
4998
4999         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5000
5001         scan = ri->program + 1;
5002         cl_init(pRExC_state, &ch_class);
5003         data.start_class = &ch_class;
5004         data.last_closep = &last_close;
5005
5006
5007         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5008             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5009
5010         CHECK_RESTUDY_GOTO;
5011
5012         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5013                 = r->float_substr = r->float_utf8 = NULL;
5014         if (!(data.start_class->flags & ANYOF_EOS)
5015             && !cl_is_anything(data.start_class))
5016         {
5017             const U32 n = add_data(pRExC_state, 1, "f");
5018
5019             Newx(RExC_rxi->data->data[n], 1,
5020                 struct regnode_charclass_class);
5021             StructCopy(data.start_class,
5022                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5023                        struct regnode_charclass_class);
5024             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5025             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5026             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5027                       regprop(r, sv, (regnode*)data.start_class);
5028                       PerlIO_printf(Perl_debug_log,
5029                                     "synthetic stclass \"%s\".\n",
5030                                     SvPVX_const(sv));});
5031         }
5032     }
5033
5034     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5035        the "real" pattern. */
5036     DEBUG_OPTIMISE_r({
5037         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5038                       (IV)minlen, (IV)r->minlen);
5039     });
5040     r->minlenret = minlen;
5041     if (r->minlen < minlen)
5042         r->minlen = minlen;
5043
5044     if (RExC_seen & REG_SEEN_GPOS)
5045         r->extflags |= RXf_GPOS_SEEN;
5046     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5047         r->extflags |= RXf_LOOKBEHIND_SEEN;
5048     if (RExC_seen & REG_SEEN_EVAL)
5049         r->extflags |= RXf_EVAL_SEEN;
5050     if (RExC_seen & REG_SEEN_CANY)
5051         r->extflags |= RXf_CANY_SEEN;
5052     if (RExC_seen & REG_SEEN_VERBARG)
5053         r->intflags |= PREGf_VERBARG_SEEN;
5054     if (RExC_seen & REG_SEEN_CUTGROUP)
5055         r->intflags |= PREGf_CUTGROUP_SEEN;
5056     if (RExC_paren_names)
5057         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5058     else
5059         RXp_PAREN_NAMES(r) = NULL;
5060
5061 #ifdef STUPID_PATTERN_CHECKS
5062     if (RX_PRELEN(rx) == 0)
5063         r->extflags |= RXf_NULL;
5064     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5065         /* XXX: this should happen BEFORE we compile */
5066         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5067     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5068         r->extflags |= RXf_WHITE;
5069     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5070         r->extflags |= RXf_START_ONLY;
5071 #else
5072     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5073             /* XXX: this should happen BEFORE we compile */
5074             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5075     else {
5076         regnode *first = ri->program + 1;
5077         U8 fop = OP(first);
5078         U8 nop = OP(NEXTOPER(first));
5079
5080         if (PL_regkind[fop] == NOTHING && nop == END)
5081             r->extflags |= RXf_NULL;
5082         else if (PL_regkind[fop] == BOL && nop == END)
5083             r->extflags |= RXf_START_ONLY;
5084         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5085             r->extflags |= RXf_WHITE;
5086     }
5087 #endif
5088 #ifdef DEBUGGING
5089     if (RExC_paren_names) {
5090         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5091         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5092     } else
5093 #endif
5094         ri->name_list_idx = 0;
5095
5096     if (RExC_recurse_count) {
5097         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5098             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5099             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5100         }
5101     }
5102     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5103     /* assume we don't need to swap parens around before we match */
5104
5105     DEBUG_DUMP_r({
5106         PerlIO_printf(Perl_debug_log,"Final program:\n");
5107         regdump(r);
5108     });
5109 #ifdef RE_TRACK_PATTERN_OFFSETS
5110     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5111         const U32 len = ri->u.offsets[0];
5112         U32 i;
5113         GET_RE_DEBUG_FLAGS_DECL;
5114         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5115         for (i = 1; i <= len; i++) {
5116             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5117                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5118                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5119             }
5120         PerlIO_printf(Perl_debug_log, "\n");
5121     });
5122 #endif
5123     return rx;
5124 }
5125
5126 #undef RE_ENGINE_PTR
5127
5128
5129 SV*
5130 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5131                     const U32 flags)
5132 {
5133     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5134
5135     PERL_UNUSED_ARG(value);
5136
5137     if (flags & RXapif_FETCH) {
5138         return reg_named_buff_fetch(rx, key, flags);
5139     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5140         Perl_croak_no_modify(aTHX);
5141         return NULL;
5142     } else if (flags & RXapif_EXISTS) {
5143         return reg_named_buff_exists(rx, key, flags)
5144             ? &PL_sv_yes
5145             : &PL_sv_no;
5146     } else if (flags & RXapif_REGNAMES) {
5147         return reg_named_buff_all(rx, flags);
5148     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5149         return reg_named_buff_scalar(rx, flags);
5150     } else {
5151         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5152         return NULL;
5153     }
5154 }
5155
5156 SV*
5157 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5158                          const U32 flags)
5159 {
5160     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5161     PERL_UNUSED_ARG(lastkey);
5162
5163     if (flags & RXapif_FIRSTKEY)
5164         return reg_named_buff_firstkey(rx, flags);
5165     else if (flags & RXapif_NEXTKEY)
5166         return reg_named_buff_nextkey(rx, flags);
5167     else {
5168         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5169         return NULL;
5170     }
5171 }
5172
5173 SV*
5174 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5175                           const U32 flags)
5176 {
5177     AV *retarray = NULL;
5178     SV *ret;
5179     struct regexp *const rx = (struct regexp *)SvANY(r);
5180
5181     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5182
5183     if (flags & RXapif_ALL)
5184         retarray=newAV();
5185
5186     if (rx && RXp_PAREN_NAMES(rx)) {
5187         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5188         if (he_str) {
5189             IV i;
5190             SV* sv_dat=HeVAL(he_str);
5191             I32 *nums=(I32*)SvPVX(sv_dat);
5192             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5193                 if ((I32)(rx->nparens) >= nums[i]
5194                     && rx->offs[nums[i]].start != -1
5195                     && rx->offs[nums[i]].end != -1)
5196                 {
5197                     ret = newSVpvs("");
5198                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5199                     if (!retarray)
5200                         return ret;
5201                 } else {
5202                     ret = newSVsv(&PL_sv_undef);
5203                 }
5204                 if (retarray)
5205                     av_push(retarray, ret);
5206             }
5207             if (retarray)
5208                 return newRV_noinc(MUTABLE_SV(retarray));
5209         }
5210     }
5211     return NULL;
5212 }
5213
5214 bool
5215 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5216                            const U32 flags)
5217 {
5218     struct regexp *const rx = (struct regexp *)SvANY(r);
5219
5220     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5221
5222     if (rx && RXp_PAREN_NAMES(rx)) {
5223         if (flags & RXapif_ALL) {
5224             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5225         } else {
5226             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5227             if (sv) {
5228                 SvREFCNT_dec(sv);
5229                 return TRUE;
5230             } else {
5231                 return FALSE;
5232             }
5233         }
5234     } else {
5235         return FALSE;
5236     }
5237 }
5238
5239 SV*
5240 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5241 {
5242     struct regexp *const rx = (struct regexp *)SvANY(r);
5243
5244     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5245
5246     if ( rx && RXp_PAREN_NAMES(rx) ) {
5247         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5248
5249         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5250     } else {
5251         return FALSE;
5252     }
5253 }
5254
5255 SV*
5256 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5257 {
5258     struct regexp *const rx = (struct regexp *)SvANY(r);
5259     GET_RE_DEBUG_FLAGS_DECL;
5260
5261     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5262
5263     if (rx && RXp_PAREN_NAMES(rx)) {
5264         HV *hv = RXp_PAREN_NAMES(rx);
5265         HE *temphe;
5266         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5267             IV i;
5268             IV parno = 0;
5269             SV* sv_dat = HeVAL(temphe);
5270             I32 *nums = (I32*)SvPVX(sv_dat);
5271             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5272                 if ((I32)(rx->lastparen) >= nums[i] &&
5273                     rx->offs[nums[i]].start != -1 &&
5274                     rx->offs[nums[i]].end != -1)
5275                 {
5276                     parno = nums[i];
5277                     break;
5278                 }
5279             }
5280             if (parno || flags & RXapif_ALL) {
5281                 return newSVhek(HeKEY_hek(temphe));
5282             }
5283         }
5284     }
5285     return NULL;
5286 }
5287
5288 SV*
5289 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5290 {
5291     SV *ret;
5292     AV *av;
5293     I32 length;
5294     struct regexp *const rx = (struct regexp *)SvANY(r);
5295
5296     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5297
5298     if (rx && RXp_PAREN_NAMES(rx)) {
5299         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5300             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5301         } else if (flags & RXapif_ONE) {
5302             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5303             av = MUTABLE_AV(SvRV(ret));
5304             length = av_len(av);
5305             SvREFCNT_dec(ret);
5306             return newSViv(length + 1);
5307         } else {
5308             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5309             return NULL;
5310         }
5311     }
5312     return &PL_sv_undef;
5313 }
5314
5315 SV*
5316 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5317 {
5318     struct regexp *const rx = (struct regexp *)SvANY(r);
5319     AV *av = newAV();
5320
5321     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5322
5323     if (rx && RXp_PAREN_NAMES(rx)) {
5324         HV *hv= RXp_PAREN_NAMES(rx);
5325         HE *temphe;
5326         (void)hv_iterinit(hv);
5327         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5328             IV i;
5329             IV parno = 0;
5330             SV* sv_dat = HeVAL(temphe);
5331             I32 *nums = (I32*)SvPVX(sv_dat);
5332             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5333                 if ((I32)(rx->lastparen) >= nums[i] &&
5334                     rx->offs[nums[i]].start != -1 &&
5335                     rx->offs[nums[i]].end != -1)
5336                 {
5337                     parno = nums[i];
5338                     break;
5339                 }
5340             }
5341             if (parno || flags & RXapif_ALL) {
5342                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5343             }
5344         }
5345     }
5346
5347     return newRV_noinc(MUTABLE_SV(av));
5348 }
5349
5350 void
5351 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5352                              SV * const sv)
5353 {
5354     struct regexp *const rx = (struct regexp *)SvANY(r);
5355     char *s = NULL;
5356     I32 i = 0;
5357     I32 s1, t1;
5358
5359     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5360
5361     if (!rx->subbeg) {
5362         sv_setsv(sv,&PL_sv_undef);
5363         return;
5364     }
5365     else
5366     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5367         /* $` */
5368         i = rx->offs[0].start;
5369         s = rx->subbeg;
5370     }
5371     else
5372     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5373         /* $' */
5374         s = rx->subbeg + rx->offs[0].end;
5375         i = rx->sublen - rx->offs[0].end;
5376     }
5377     else
5378     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5379         (s1 = rx->offs[paren].start) != -1 &&
5380         (t1 = rx->offs[paren].end) != -1)
5381     {
5382         /* $& $1 ... */
5383         i = t1 - s1;
5384         s = rx->subbeg + s1;
5385     } else {
5386         sv_setsv(sv,&PL_sv_undef);
5387         return;
5388     }
5389     assert(rx->sublen >= (s - rx->subbeg) + i );
5390     if (i >= 0) {
5391         const int oldtainted = PL_tainted;
5392         TAINT_NOT;
5393         sv_setpvn(sv, s, i);
5394         PL_tainted = oldtainted;
5395         if ( (rx->extflags & RXf_CANY_SEEN)
5396             ? (RXp_MATCH_UTF8(rx)
5397                         && (!i || is_utf8_string((U8*)s, i)))
5398             : (RXp_MATCH_UTF8(rx)) )
5399         {
5400             SvUTF8_on(sv);
5401         }
5402         else
5403             SvUTF8_off(sv);
5404         if (PL_tainting) {
5405             if (RXp_MATCH_TAINTED(rx)) {
5406                 if (SvTYPE(sv) >= SVt_PVMG) {
5407                     MAGIC* const mg = SvMAGIC(sv);
5408                     MAGIC* mgt;
5409                     PL_tainted = 1;
5410                     SvMAGIC_set(sv, mg->mg_moremagic);
5411                     SvTAINT(sv);
5412                     if ((mgt = SvMAGIC(sv))) {
5413                         mg->mg_moremagic = mgt;
5414                         SvMAGIC_set(sv, mg);
5415                     }
5416                 } else {
5417                     PL_tainted = 1;
5418                     SvTAINT(sv);
5419                 }
5420             } else
5421                 SvTAINTED_off(sv);
5422         }
5423     } else {
5424         sv_setsv(sv,&PL_sv_undef);
5425         return;
5426     }
5427 }
5428
5429 void
5430 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5431                                                          SV const * const value)
5432 {
5433     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5434
5435     PERL_UNUSED_ARG(rx);
5436     PERL_UNUSED_ARG(paren);
5437     PERL_UNUSED_ARG(value);
5438
5439     if (!PL_localizing)
5440         Perl_croak_no_modify(aTHX);
5441 }
5442
5443 I32
5444 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5445                               const I32 paren)
5446 {
5447     struct regexp *const rx = (struct regexp *)SvANY(r);
5448     I32 i;
5449     I32 s1, t1;
5450
5451     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5452
5453     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5454         switch (paren) {
5455       /* $` / ${^PREMATCH} */
5456       case RX_BUFF_IDX_PREMATCH:
5457         if (rx->offs[0].start != -1) {
5458                         i = rx->offs[0].start;
5459                         if (i > 0) {
5460                                 s1 = 0;
5461                                 t1 = i;
5462                                 goto getlen;
5463                         }
5464             }
5465         return 0;
5466       /* $' / ${^POSTMATCH} */
5467       case RX_BUFF_IDX_POSTMATCH:
5468             if (rx->offs[0].end != -1) {
5469                         i = rx->sublen - rx->offs[0].end;
5470                         if (i > 0) {
5471                                 s1 = rx->offs[0].end;
5472                                 t1 = rx->sublen;
5473                                 goto getlen;
5474                         }
5475             }
5476         return 0;
5477       /* $& / ${^MATCH}, $1, $2, ... */
5478       default:
5479             if (paren <= (I32)rx->nparens &&
5480             (s1 = rx->offs[paren].start) != -1 &&
5481             (t1 = rx->offs[paren].end) != -1)
5482             {
5483             i = t1 - s1;
5484             goto getlen;
5485         } else {
5486             if (ckWARN(WARN_UNINITIALIZED))
5487                 report_uninit((const SV *)sv);
5488             return 0;
5489         }
5490     }
5491   getlen:
5492     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5493         const char * const s = rx->subbeg + s1;
5494         const U8 *ep;
5495         STRLEN el;
5496
5497         i = t1 - s1;
5498         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5499                         i = el;
5500     }
5501     return i;
5502 }
5503
5504 SV*
5505 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5506 {
5507     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5508         PERL_UNUSED_ARG(rx);
5509         if (0)
5510             return NULL;
5511         else
5512             return newSVpvs("Regexp");
5513 }
5514
5515 /* Scans the name of a named buffer from the pattern.
5516  * If flags is REG_RSN_RETURN_NULL returns null.
5517  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5518  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5519  * to the parsed name as looked up in the RExC_paren_names hash.
5520  * If there is an error throws a vFAIL().. type exception.
5521  */
5522
5523 #define REG_RSN_RETURN_NULL    0
5524 #define REG_RSN_RETURN_NAME    1
5525 #define REG_RSN_RETURN_DATA    2
5526
5527 STATIC SV*
5528 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5529 {
5530     char *name_start = RExC_parse;
5531
5532     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5533
5534     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5535          /* skip IDFIRST by using do...while */
5536         if (UTF)
5537             do {
5538                 RExC_parse += UTF8SKIP(RExC_parse);
5539             } while (isALNUM_utf8((U8*)RExC_parse));
5540         else
5541             do {
5542                 RExC_parse++;
5543             } while (isALNUM(*RExC_parse));
5544     }
5545
5546     if ( flags ) {
5547         SV* sv_name
5548             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5549                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5550         if ( flags == REG_RSN_RETURN_NAME)
5551             return sv_name;
5552         else if (flags==REG_RSN_RETURN_DATA) {
5553             HE *he_str = NULL;
5554             SV *sv_dat = NULL;
5555             if ( ! sv_name )      /* should not happen*/
5556                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5557             if (RExC_paren_names)
5558                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5559             if ( he_str )
5560                 sv_dat = HeVAL(he_str);
5561             if ( ! sv_dat )
5562                 vFAIL("Reference to nonexistent named group");
5563             return sv_dat;
5564         }
5565         else {
5566             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5567         }
5568         /* NOT REACHED */
5569     }
5570     return NULL;
5571 }
5572
5573 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5574     int rem=(int)(RExC_end - RExC_parse);                       \
5575     int cut;                                                    \
5576     int num;                                                    \
5577     int iscut=0;                                                \
5578     if (rem>10) {                                               \
5579         rem=10;                                                 \
5580         iscut=1;                                                \
5581     }                                                           \
5582     cut=10-rem;                                                 \
5583     if (RExC_lastparse!=RExC_parse)                             \
5584         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5585             rem, RExC_parse,                                    \
5586             cut + 4,                                            \
5587             iscut ? "..." : "<"                                 \
5588         );                                                      \
5589     else                                                        \
5590         PerlIO_printf(Perl_debug_log,"%16s","");                \
5591                                                                 \
5592     if (SIZE_ONLY)                                              \
5593        num = RExC_size + 1;                                     \
5594     else                                                        \
5595        num=REG_NODE_NUM(RExC_emit);                             \
5596     if (RExC_lastnum!=num)                                      \
5597        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5598     else                                                        \
5599        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5600     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5601         (int)((depth*2)), "",                                   \
5602         (funcname)                                              \
5603     );                                                          \
5604     RExC_lastnum=num;                                           \
5605     RExC_lastparse=RExC_parse;                                  \
5606 })
5607
5608
5609
5610 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5611     DEBUG_PARSE_MSG((funcname));                            \
5612     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5613 })
5614 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5615     DEBUG_PARSE_MSG((funcname));                            \
5616     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5617 })
5618 /*
5619  - reg - regular expression, i.e. main body or parenthesized thing
5620  *
5621  * Caller must absorb opening parenthesis.
5622  *
5623  * Combining parenthesis handling with the base level of regular expression
5624  * is a trifle forced, but the need to tie the tails of the branches to what
5625  * follows makes it hard to avoid.
5626  */
5627 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5628 #ifdef DEBUGGING
5629 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5630 #else
5631 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5632 #endif
5633
5634 STATIC regnode *
5635 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5636     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5637 {
5638     dVAR;
5639     register regnode *ret;              /* Will be the head of the group. */
5640     register regnode *br;
5641     register regnode *lastbr;
5642     register regnode *ender = NULL;
5643     register I32 parno = 0;
5644     I32 flags;
5645     U32 oregflags = RExC_flags;
5646     bool have_branch = 0;
5647     bool is_open = 0;
5648     I32 freeze_paren = 0;
5649     I32 after_freeze = 0;
5650
5651     /* for (?g), (?gc), and (?o) warnings; warning
5652        about (?c) will warn about (?g) -- japhy    */
5653
5654 #define WASTED_O  0x01
5655 #define WASTED_G  0x02
5656 #define WASTED_C  0x04
5657 #define WASTED_GC (0x02|0x04)
5658     I32 wastedflags = 0x00;
5659
5660     char * parse_start = RExC_parse; /* MJD */
5661     char * const oregcomp_parse = RExC_parse;
5662
5663     GET_RE_DEBUG_FLAGS_DECL;
5664
5665     PERL_ARGS_ASSERT_REG;
5666     DEBUG_PARSE("reg ");
5667
5668     *flagp = 0;                         /* Tentatively. */
5669
5670
5671     /* Make an OPEN node, if parenthesized. */
5672     if (paren) {
5673         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5674             char *start_verb = RExC_parse;
5675             STRLEN verb_len = 0;
5676             char *start_arg = NULL;
5677             unsigned char op = 0;
5678             int argok = 1;
5679             int internal_argval = 0; /* internal_argval is only useful if !argok */
5680             while ( *RExC_parse && *RExC_parse != ')' ) {
5681                 if ( *RExC_parse == ':' ) {
5682                     start_arg = RExC_parse + 1;
5683                     break;
5684                 }
5685                 RExC_parse++;
5686             }
5687             ++start_verb;
5688             verb_len = RExC_parse - start_verb;
5689             if ( start_arg ) {
5690                 RExC_parse++;
5691                 while ( *RExC_parse && *RExC_parse != ')' )
5692                     RExC_parse++;
5693                 if ( *RExC_parse != ')' )
5694                     vFAIL("Unterminated verb pattern argument");
5695                 if ( RExC_parse == start_arg )
5696                     start_arg = NULL;
5697             } else {
5698                 if ( *RExC_parse != ')' )
5699                     vFAIL("Unterminated verb pattern");
5700             }
5701
5702             switch ( *start_verb ) {
5703             case 'A':  /* (*ACCEPT) */
5704                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5705                     op = ACCEPT;
5706                     internal_argval = RExC_nestroot;
5707                 }
5708                 break;
5709             case 'C':  /* (*COMMIT) */
5710                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5711                     op = COMMIT;
5712                 break;
5713             case 'F':  /* (*FAIL) */
5714                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5715                     op = OPFAIL;
5716                     argok = 0;
5717                 }
5718                 break;
5719             case ':':  /* (*:NAME) */
5720             case 'M':  /* (*MARK:NAME) */
5721                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5722                     op = MARKPOINT;
5723                     argok = -1;
5724                 }
5725                 break;
5726             case 'P':  /* (*PRUNE) */
5727                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5728                     op = PRUNE;
5729                 break;
5730             case 'S':   /* (*SKIP) */
5731                 if ( memEQs(start_verb,verb_len,"SKIP") )
5732                     op = SKIP;
5733                 break;
5734             case 'T':  /* (*THEN) */
5735                 /* [19:06] <TimToady> :: is then */
5736                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5737                     op = CUTGROUP;
5738                     RExC_seen |= REG_SEEN_CUTGROUP;
5739                 }
5740                 break;
5741             }
5742             if ( ! op ) {
5743                 RExC_parse++;
5744                 vFAIL3("Unknown verb pattern '%.*s'",
5745                     verb_len, start_verb);
5746             }
5747             if ( argok ) {
5748                 if ( start_arg && internal_argval ) {
5749                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5750                         verb_len, start_verb);
5751                 } else if ( argok < 0 && !start_arg ) {
5752                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5753                         verb_len, start_verb);
5754                 } else {
5755                     ret = reganode(pRExC_state, op, internal_argval);
5756                     if ( ! internal_argval && ! SIZE_ONLY ) {
5757                         if (start_arg) {
5758                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5759                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5760                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5761                             ret->flags = 0;
5762                         } else {
5763                             ret->flags = 1;
5764                         }
5765                     }
5766                 }
5767                 if (!internal_argval)
5768                     RExC_seen |= REG_SEEN_VERBARG;
5769             } else if ( start_arg ) {
5770                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5771                         verb_len, start_verb);
5772             } else {
5773                 ret = reg_node(pRExC_state, op);
5774             }
5775             nextchar(pRExC_state);
5776             return ret;
5777         } else
5778         if (*RExC_parse == '?') { /* (?...) */
5779             bool is_logical = 0;
5780             const char * const seqstart = RExC_parse;
5781             bool has_use_defaults = FALSE;
5782
5783             RExC_parse++;
5784             paren = *RExC_parse++;
5785             ret = NULL;                 /* For look-ahead/behind. */
5786             switch (paren) {
5787
5788             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5789                 paren = *RExC_parse++;
5790                 if ( paren == '<')         /* (?P<...>) named capture */
5791                     goto named_capture;
5792                 else if (paren == '>') {   /* (?P>name) named recursion */
5793                     goto named_recursion;
5794                 }
5795                 else if (paren == '=') {   /* (?P=...)  named backref */
5796                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5797                        you change this make sure you change that */
5798                     char* name_start = RExC_parse;
5799                     U32 num = 0;
5800                     SV *sv_dat = reg_scan_name(pRExC_state,
5801                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5802                     if (RExC_parse == name_start || *RExC_parse != ')')
5803                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5804
5805                     if (!SIZE_ONLY) {
5806                         num = add_data( pRExC_state, 1, "S" );
5807                         RExC_rxi->data->data[num]=(void*)sv_dat;
5808                         SvREFCNT_inc_simple_void(sv_dat);
5809                     }
5810                     RExC_sawback = 1;
5811                     ret = reganode(pRExC_state,
5812                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5813                            num);
5814                     *flagp |= HASWIDTH;
5815
5816                     Set_Node_Offset(ret, parse_start+1);
5817                     Set_Node_Cur_Length(ret); /* MJD */
5818
5819                     nextchar(pRExC_state);
5820                     return ret;
5821                 }
5822                 RExC_parse++;
5823                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5824                 /*NOTREACHED*/
5825             case '<':           /* (?<...) */
5826                 if (*RExC_parse == '!')
5827                     paren = ',';
5828                 else if (*RExC_parse != '=')
5829               named_capture:
5830                 {               /* (?<...>) */
5831                     char *name_start;
5832                     SV *svname;
5833                     paren= '>';
5834             case '\'':          /* (?'...') */
5835                     name_start= RExC_parse;
5836                     svname = reg_scan_name(pRExC_state,
5837                         SIZE_ONLY ?  /* reverse test from the others */
5838                         REG_RSN_RETURN_NAME :
5839                         REG_RSN_RETURN_NULL);
5840                     if (RExC_parse == name_start) {
5841                         RExC_parse++;
5842                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5843                         /*NOTREACHED*/
5844                     }
5845                     if (*RExC_parse != paren)
5846                         vFAIL2("Sequence (?%c... not terminated",
5847                             paren=='>' ? '<' : paren);
5848                     if (SIZE_ONLY) {
5849                         HE *he_str;
5850                         SV *sv_dat = NULL;
5851                         if (!svname) /* shouldnt happen */
5852                             Perl_croak(aTHX_
5853                                 "panic: reg_scan_name returned NULL");
5854                         if (!RExC_paren_names) {
5855                             RExC_paren_names= newHV();
5856                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5857 #ifdef DEBUGGING
5858                             RExC_paren_name_list= newAV();
5859                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5860 #endif
5861                         }
5862                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5863                         if ( he_str )
5864                             sv_dat = HeVAL(he_str);
5865                         if ( ! sv_dat ) {
5866                             /* croak baby croak */
5867                             Perl_croak(aTHX_
5868                                 "panic: paren_name hash element allocation failed");
5869                         } else if ( SvPOK(sv_dat) ) {
5870                             /* (?|...) can mean we have dupes so scan to check
5871                                its already been stored. Maybe a flag indicating
5872                                we are inside such a construct would be useful,
5873                                but the arrays are likely to be quite small, so
5874                                for now we punt -- dmq */
5875                             IV count = SvIV(sv_dat);
5876                             I32 *pv = (I32*)SvPVX(sv_dat);
5877                             IV i;
5878                             for ( i = 0 ; i < count ; i++ ) {
5879                                 if ( pv[i] == RExC_npar ) {
5880                                     count = 0;
5881                                     break;
5882                                 }
5883                             }
5884                             if ( count ) {
5885                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5886                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5887                                 pv[count] = RExC_npar;
5888                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5889                             }
5890                         } else {
5891                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5892                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5893                             SvIOK_on(sv_dat);
5894                             SvIV_set(sv_dat, 1);
5895                         }
5896 #ifdef DEBUGGING
5897                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5898                             SvREFCNT_dec(svname);
5899 #endif
5900
5901                         /*sv_dump(sv_dat);*/
5902                     }
5903                     nextchar(pRExC_state);
5904                     paren = 1;
5905                     goto capturing_parens;
5906                 }
5907                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5908                 RExC_parse++;
5909             case '=':           /* (?=...) */
5910                 RExC_seen_zerolen++;
5911                 break;
5912             case '!':           /* (?!...) */
5913                 RExC_seen_zerolen++;
5914                 if (*RExC_parse == ')') {
5915                     ret=reg_node(pRExC_state, OPFAIL);
5916                     nextchar(pRExC_state);
5917                     return ret;
5918                 }
5919                 break;
5920             case '|':           /* (?|...) */
5921                 /* branch reset, behave like a (?:...) except that
5922                    buffers in alternations share the same numbers */
5923                 paren = ':';
5924                 after_freeze = freeze_paren = RExC_npar;
5925                 break;
5926             case ':':           /* (?:...) */
5927             case '>':           /* (?>...) */
5928                 break;
5929             case '$':           /* (?$...) */
5930             case '@':           /* (?@...) */
5931                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5932                 break;
5933             case '#':           /* (?#...) */
5934                 while (*RExC_parse && *RExC_parse != ')')
5935                     RExC_parse++;
5936                 if (*RExC_parse != ')')
5937                     FAIL("Sequence (?#... not terminated");
5938                 nextchar(pRExC_state);
5939                 *flagp = TRYAGAIN;
5940                 return NULL;
5941             case '0' :           /* (?0) */
5942             case 'R' :           /* (?R) */
5943                 if (*RExC_parse != ')')
5944                     FAIL("Sequence (?R) not terminated");
5945                 ret = reg_node(pRExC_state, GOSTART);
5946                 *flagp |= POSTPONED;
5947                 nextchar(pRExC_state);
5948                 return ret;
5949                 /*notreached*/
5950             { /* named and numeric backreferences */
5951                 I32 num;
5952             case '&':            /* (?&NAME) */
5953                 parse_start = RExC_parse - 1;
5954               named_recursion:
5955                 {
5956                     SV *sv_dat = reg_scan_name(pRExC_state,
5957                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5958                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5959                 }
5960                 goto gen_recurse_regop;
5961                 /* NOT REACHED */
5962             case '+':
5963                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5964                     RExC_parse++;
5965                     vFAIL("Illegal pattern");
5966                 }
5967                 goto parse_recursion;
5968                 /* NOT REACHED*/
5969             case '-': /* (?-1) */
5970                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5971                     RExC_parse--; /* rewind to let it be handled later */
5972                     goto parse_flags;
5973                 }
5974                 /*FALLTHROUGH */
5975             case '1': case '2': case '3': case '4': /* (?1) */
5976             case '5': case '6': case '7': case '8': case '9':
5977                 RExC_parse--;
5978               parse_recursion:
5979                 num = atoi(RExC_parse);
5980                 parse_start = RExC_parse - 1; /* MJD */
5981                 if (*RExC_parse == '-')
5982                     RExC_parse++;
5983                 while (isDIGIT(*RExC_parse))
5984                         RExC_parse++;
5985                 if (*RExC_parse!=')')
5986                     vFAIL("Expecting close bracket");
5987
5988               gen_recurse_regop:
5989                 if ( paren == '-' ) {
5990                     /*
5991                     Diagram of capture buffer numbering.
5992                     Top line is the normal capture buffer numbers
5993                     Bottom line is the negative indexing as from
5994                     the X (the (?-2))
5995
5996                     +   1 2    3 4 5 X          6 7
5997                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5998                     -   5 4    3 2 1 X          x x
5999
6000                     */
6001                     num = RExC_npar + num;
6002                     if (num < 1)  {
6003                         RExC_parse++;
6004                         vFAIL("Reference to nonexistent group");
6005                     }
6006                 } else if ( paren == '+' ) {
6007                     num = RExC_npar + num - 1;
6008                 }
6009
6010                 ret = reganode(pRExC_state, GOSUB, num);
6011                 if (!SIZE_ONLY) {
6012                     if (num > (I32)RExC_rx->nparens) {
6013                         RExC_parse++;
6014                         vFAIL("Reference to nonexistent group");
6015                     }
6016                     ARG2L_SET( ret, RExC_recurse_count++);
6017                     RExC_emit++;
6018                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6019                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6020                 } else {
6021                     RExC_size++;
6022                 }
6023                 RExC_seen |= REG_SEEN_RECURSE;
6024                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6025                 Set_Node_Offset(ret, parse_start); /* MJD */
6026
6027                 *flagp |= POSTPONED;
6028                 nextchar(pRExC_state);
6029                 return ret;
6030             } /* named and numeric backreferences */
6031             /* NOT REACHED */
6032
6033             case '?':           /* (??...) */
6034                 is_logical = 1;
6035                 if (*RExC_parse != '{') {
6036                     RExC_parse++;
6037                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6038                     /*NOTREACHED*/
6039                 }
6040                 *flagp |= POSTPONED;
6041                 paren = *RExC_parse++;
6042                 /* FALL THROUGH */
6043             case '{':           /* (?{...}) */
6044             {
6045                 I32 count = 1;
6046                 U32 n = 0;
6047                 char c;
6048                 char *s = RExC_parse;
6049
6050                 RExC_seen_zerolen++;
6051                 RExC_seen |= REG_SEEN_EVAL;
6052                 while (count && (c = *RExC_parse)) {
6053                     if (c == '\\') {
6054                         if (RExC_parse[1])
6055                             RExC_parse++;
6056                     }
6057                     else if (c == '{')
6058                         count++;
6059                     else if (c == '}')
6060                         count--;
6061                     RExC_parse++;
6062                 }
6063                 if (*RExC_parse != ')') {
6064                     RExC_parse = s;
6065                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6066                 }
6067                 if (!SIZE_ONLY) {
6068                     PAD *pad;
6069                     OP_4tree *sop, *rop;
6070                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6071
6072                     ENTER;
6073                     Perl_save_re_context(aTHX);
6074                     rop = sv_compile_2op(sv, &sop, "re", &pad);
6075                     sop->op_private |= OPpREFCOUNTED;
6076                     /* re_dup will OpREFCNT_inc */
6077                     OpREFCNT_set(sop, 1);
6078                     LEAVE;
6079
6080                     n = add_data(pRExC_state, 3, "nop");
6081                     RExC_rxi->data->data[n] = (void*)rop;
6082                     RExC_rxi->data->data[n+1] = (void*)sop;
6083                     RExC_rxi->data->data[n+2] = (void*)pad;
6084                     SvREFCNT_dec(sv);
6085                 }
6086                 else {                                          /* First pass */
6087                     if (PL_reginterp_cnt < ++RExC_seen_evals
6088                         && IN_PERL_RUNTIME)
6089                         /* No compiled RE interpolated, has runtime
6090                            components ===> unsafe.  */
6091                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6092                     if (PL_tainting && PL_tainted)
6093                         FAIL("Eval-group in insecure regular expression");
6094 #if PERL_VERSION > 8
6095                     if (IN_PERL_COMPILETIME)
6096                         PL_cv_has_eval = 1;
6097 #endif
6098                 }
6099
6100                 nextchar(pRExC_state);
6101                 if (is_logical) {
6102                     ret = reg_node(pRExC_state, LOGICAL);
6103                     if (!SIZE_ONLY)
6104                         ret->flags = 2;
6105                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6106                     /* deal with the length of this later - MJD */
6107                     return ret;
6108                 }
6109                 ret = reganode(pRExC_state, EVAL, n);
6110                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6111                 Set_Node_Offset(ret, parse_start);
6112                 return ret;
6113             }
6114             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6115             {
6116                 int is_define= 0;
6117                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6118                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6119                         || RExC_parse[1] == '<'
6120                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6121                         I32 flag;
6122
6123                         ret = reg_node(pRExC_state, LOGICAL);
6124                         if (!SIZE_ONLY)
6125                             ret->flags = 1;
6126                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6127                         goto insert_if;
6128                     }
6129                 }
6130                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6131                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6132                 {
6133                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6134                     char *name_start= RExC_parse++;
6135                     U32 num = 0;
6136                     SV *sv_dat=reg_scan_name(pRExC_state,
6137                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6138                     if (RExC_parse == name_start || *RExC_parse != ch)
6139                         vFAIL2("Sequence (?(%c... not terminated",
6140                             (ch == '>' ? '<' : ch));
6141                     RExC_parse++;
6142                     if (!SIZE_ONLY) {
6143                         num = add_data( pRExC_state, 1, "S" );
6144                         RExC_rxi->data->data[num]=(void*)sv_dat;
6145                         SvREFCNT_inc_simple_void(sv_dat);
6146                     }
6147                     ret = reganode(pRExC_state,NGROUPP,num);
6148                     goto insert_if_check_paren;
6149                 }
6150                 else if (RExC_parse[0] == 'D' &&
6151                          RExC_parse[1] == 'E' &&
6152                          RExC_parse[2] == 'F' &&
6153                          RExC_parse[3] == 'I' &&
6154                          RExC_parse[4] == 'N' &&
6155                          RExC_parse[5] == 'E')
6156                 {
6157                     ret = reganode(pRExC_state,DEFINEP,0);
6158                     RExC_parse +=6 ;
6159                     is_define = 1;
6160                     goto insert_if_check_paren;
6161                 }
6162                 else if (RExC_parse[0] == 'R') {
6163                     RExC_parse++;
6164                     parno = 0;
6165                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6166                         parno = atoi(RExC_parse++);
6167                         while (isDIGIT(*RExC_parse))
6168                             RExC_parse++;
6169                     } else if (RExC_parse[0] == '&') {
6170                         SV *sv_dat;
6171                         RExC_parse++;
6172                         sv_dat = reg_scan_name(pRExC_state,
6173                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6174                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6175                     }
6176                     ret = reganode(pRExC_state,INSUBP,parno);
6177                     goto insert_if_check_paren;
6178                 }
6179                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6180                     /* (?(1)...) */
6181                     char c;
6182                     parno = atoi(RExC_parse++);
6183
6184                     while (isDIGIT(*RExC_parse))
6185                         RExC_parse++;
6186                     ret = reganode(pRExC_state, GROUPP, parno);
6187
6188                  insert_if_check_paren:
6189                     if ((c = *nextchar(pRExC_state)) != ')')
6190                         vFAIL("Switch condition not recognized");
6191                   insert_if:
6192                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6193                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6194                     if (br == NULL)
6195                         br = reganode(pRExC_state, LONGJMP, 0);
6196                     else
6197                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6198                     c = *nextchar(pRExC_state);
6199                     if (flags&HASWIDTH)
6200                         *flagp |= HASWIDTH;
6201                     if (c == '|') {
6202                         if (is_define)
6203                             vFAIL("(?(DEFINE)....) does not allow branches");
6204                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6205                         regbranch(pRExC_state, &flags, 1,depth+1);
6206                         REGTAIL(pRExC_state, ret, lastbr);
6207                         if (flags&HASWIDTH)
6208                             *flagp |= HASWIDTH;
6209                         c = *nextchar(pRExC_state);
6210                     }
6211                     else
6212                         lastbr = NULL;
6213                     if (c != ')')
6214                         vFAIL("Switch (?(condition)... contains too many branches");
6215                     ender = reg_node(pRExC_state, TAIL);
6216                     REGTAIL(pRExC_state, br, ender);
6217                     if (lastbr) {
6218                         REGTAIL(pRExC_state, lastbr, ender);
6219                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6220                     }
6221                     else
6222                         REGTAIL(pRExC_state, ret, ender);
6223                     RExC_size++; /* XXX WHY do we need this?!!
6224                                     For large programs it seems to be required
6225                                     but I can't figure out why. -- dmq*/
6226                     return ret;
6227                 }
6228                 else {
6229                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6230                 }
6231             }
6232             case 0:
6233                 RExC_parse--; /* for vFAIL to print correctly */
6234                 vFAIL("Sequence (? incomplete");
6235                 break;
6236             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6237                                        that follow */
6238                 has_use_defaults = TRUE;
6239                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6240                 RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6241                 goto parse_flags;
6242             default:
6243                 --RExC_parse;
6244                 parse_flags:      /* (?i) */
6245             {
6246                 U32 posflags = 0, negflags = 0;
6247                 U32 *flagsp = &posflags;
6248                 bool has_charset_modifier = 0;
6249
6250                 while (*RExC_parse) {
6251                     /* && strchr("iogcmsx", *RExC_parse) */
6252                     /* (?g), (?gc) and (?o) are useless here
6253                        and must be globally applied -- japhy */
6254                     switch (*RExC_parse) {
6255                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6256                     case LOCALE_PAT_MOD:
6257                         if (has_charset_modifier || flagsp == &negflags) {
6258                             goto fail_modifiers;
6259                         }
6260                         posflags |= RXf_PMf_LOCALE;
6261                         negflags |= RXf_PMf_UNICODE;
6262                         has_charset_modifier = 1;
6263                         break;
6264                     case UNICODE_PAT_MOD:
6265                         if (has_charset_modifier || flagsp == &negflags) {
6266                             goto fail_modifiers;
6267                         }
6268                         posflags |= RXf_PMf_UNICODE;
6269                         negflags |= RXf_PMf_LOCALE;
6270                         has_charset_modifier = 1;
6271                         break;
6272                     case DUAL_PAT_MOD:
6273                         if (has_use_defaults
6274                             || has_charset_modifier
6275                             || flagsp == &negflags)
6276                         {
6277                             goto fail_modifiers;
6278                         }
6279                         negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6280                         has_charset_modifier = 1;
6281                         break;
6282                     case ONCE_PAT_MOD: /* 'o' */
6283                     case GLOBAL_PAT_MOD: /* 'g' */
6284                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6285                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6286                             if (! (wastedflags & wflagbit) ) {
6287                                 wastedflags |= wflagbit;
6288                                 vWARN5(
6289                                     RExC_parse + 1,
6290                                     "Useless (%s%c) - %suse /%c modifier",
6291                                     flagsp == &negflags ? "?-" : "?",
6292                                     *RExC_parse,
6293                                     flagsp == &negflags ? "don't " : "",
6294                                     *RExC_parse
6295                                 );
6296                             }
6297                         }
6298                         break;
6299
6300                     case CONTINUE_PAT_MOD: /* 'c' */
6301                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6302                             if (! (wastedflags & WASTED_C) ) {
6303                                 wastedflags |= WASTED_GC;
6304                                 vWARN3(
6305                                     RExC_parse + 1,
6306                                     "Useless (%sc) - %suse /gc modifier",
6307                                     flagsp == &negflags ? "?-" : "?",
6308                                     flagsp == &negflags ? "don't " : ""
6309                                 );
6310                             }
6311                         }
6312                         break;
6313                     case KEEPCOPY_PAT_MOD: /* 'p' */
6314                         if (flagsp == &negflags) {
6315                             if (SIZE_ONLY)
6316                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6317                         } else {
6318                             *flagsp |= RXf_PMf_KEEPCOPY;
6319                         }
6320                         break;
6321                     case '-':
6322                         /* A flag is a default iff it is following a minus, so
6323                          * if there is a minus, it means will be trying to
6324                          * re-specify a default which is an error */
6325                         if (has_use_defaults || flagsp == &negflags) {
6326             fail_modifiers:
6327                             RExC_parse++;
6328                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6329                             /*NOTREACHED*/
6330                         }
6331                         flagsp = &negflags;
6332                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6333                         break;
6334                     case ':':
6335                         paren = ':';
6336                         /*FALLTHROUGH*/
6337                     case ')':
6338                         RExC_flags |= posflags;
6339                         RExC_flags &= ~negflags;
6340                         if (paren != ':') {
6341                             oregflags |= posflags;
6342                             oregflags &= ~negflags;
6343                         }
6344                         nextchar(pRExC_state);
6345                         if (paren != ':') {
6346                             *flagp = TRYAGAIN;
6347                             return NULL;
6348                         } else {
6349                             ret = NULL;
6350                             goto parse_rest;
6351                         }
6352                         /*NOTREACHED*/
6353                     default:
6354                         RExC_parse++;
6355                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6356                         /*NOTREACHED*/
6357                     }
6358                     ++RExC_parse;
6359                 }
6360             }} /* one for the default block, one for the switch */
6361         }
6362         else {                  /* (...) */
6363           capturing_parens:
6364             parno = RExC_npar;
6365             RExC_npar++;
6366
6367             ret = reganode(pRExC_state, OPEN, parno);
6368             if (!SIZE_ONLY ){
6369                 if (!RExC_nestroot)
6370                     RExC_nestroot = parno;
6371                 if (RExC_seen & REG_SEEN_RECURSE
6372                     && !RExC_open_parens[parno-1])
6373                 {
6374                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6375                         "Setting open paren #%"IVdf" to %d\n",
6376                         (IV)parno, REG_NODE_NUM(ret)));
6377                     RExC_open_parens[parno-1]= ret;
6378                 }
6379             }
6380             Set_Node_Length(ret, 1); /* MJD */
6381             Set_Node_Offset(ret, RExC_parse); /* MJD */
6382             is_open = 1;
6383         }
6384     }
6385     else                        /* ! paren */
6386         ret = NULL;
6387
6388    parse_rest:
6389     /* Pick up the branches, linking them together. */
6390     parse_start = RExC_parse;   /* MJD */
6391     br = regbranch(pRExC_state, &flags, 1,depth+1);
6392
6393     if (freeze_paren) {
6394         if (RExC_npar > after_freeze)
6395             after_freeze = RExC_npar;
6396         RExC_npar = freeze_paren;
6397     }
6398
6399     /*     branch_len = (paren != 0); */
6400
6401     if (br == NULL)
6402         return(NULL);
6403     if (*RExC_parse == '|') {
6404         if (!SIZE_ONLY && RExC_extralen) {
6405             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6406         }
6407         else {                  /* MJD */
6408             reginsert(pRExC_state, BRANCH, br, depth+1);
6409             Set_Node_Length(br, paren != 0);
6410             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6411         }
6412         have_branch = 1;
6413         if (SIZE_ONLY)
6414             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6415     }
6416     else if (paren == ':') {
6417         *flagp |= flags&SIMPLE;
6418     }
6419     if (is_open) {                              /* Starts with OPEN. */
6420         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6421     }
6422     else if (paren != '?')              /* Not Conditional */
6423         ret = br;
6424     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6425     lastbr = br;
6426     while (*RExC_parse == '|') {
6427         if (!SIZE_ONLY && RExC_extralen) {
6428             ender = reganode(pRExC_state, LONGJMP,0);
6429             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6430         }
6431         if (SIZE_ONLY)
6432             RExC_extralen += 2;         /* Account for LONGJMP. */
6433         nextchar(pRExC_state);
6434         if (freeze_paren) {
6435             if (RExC_npar > after_freeze)
6436                 after_freeze = RExC_npar;
6437             RExC_npar = freeze_paren;
6438         }
6439         br = regbranch(pRExC_state, &flags, 0, depth+1);
6440
6441         if (br == NULL)
6442             return(NULL);
6443         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6444         lastbr = br;
6445         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6446     }
6447
6448     if (have_branch || paren != ':') {
6449         /* Make a closing node, and hook it on the end. */
6450         switch (paren) {
6451         case ':':
6452             ender = reg_node(pRExC_state, TAIL);
6453             break;
6454         case 1:
6455             ender = reganode(pRExC_state, CLOSE, parno);
6456             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6457                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6458                         "Setting close paren #%"IVdf" to %d\n",
6459                         (IV)parno, REG_NODE_NUM(ender)));
6460                 RExC_close_parens[parno-1]= ender;
6461                 if (RExC_nestroot == parno)
6462                     RExC_nestroot = 0;
6463             }
6464             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6465             Set_Node_Length(ender,1); /* MJD */
6466             break;
6467         case '<':
6468         case ',':
6469         case '=':
6470         case '!':
6471             *flagp &= ~HASWIDTH;
6472             /* FALL THROUGH */
6473         case '>':
6474             ender = reg_node(pRExC_state, SUCCEED);
6475             break;
6476         case 0:
6477             ender = reg_node(pRExC_state, END);
6478             if (!SIZE_ONLY) {
6479                 assert(!RExC_opend); /* there can only be one! */
6480                 RExC_opend = ender;
6481             }
6482             break;
6483         }
6484         REGTAIL(pRExC_state, lastbr, ender);
6485
6486         if (have_branch && !SIZE_ONLY) {
6487             if (depth==1)
6488                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6489
6490             /* Hook the tails of the branches to the closing node. */
6491             for (br = ret; br; br = regnext(br)) {
6492                 const U8 op = PL_regkind[OP(br)];
6493                 if (op == BRANCH) {
6494                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6495                 }
6496                 else if (op == BRANCHJ) {
6497                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6498                 }
6499             }
6500         }
6501     }
6502
6503     {
6504         const char *p;
6505         static const char parens[] = "=!<,>";
6506
6507         if (paren && (p = strchr(parens, paren))) {
6508             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6509             int flag = (p - parens) > 1;
6510
6511             if (paren == '>')
6512                 node = SUSPEND, flag = 0;
6513             reginsert(pRExC_state, node,ret, depth+1);
6514             Set_Node_Cur_Length(ret);
6515             Set_Node_Offset(ret, parse_start + 1);
6516             ret->flags = flag;
6517             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6518         }
6519     }
6520
6521     /* Check for proper termination. */
6522     if (paren) {
6523         RExC_flags = oregflags;
6524         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6525             RExC_parse = oregcomp_parse;
6526             vFAIL("Unmatched (");
6527         }
6528     }
6529     else if (!paren && RExC_parse < RExC_end) {
6530         if (*RExC_parse == ')') {
6531             RExC_parse++;
6532             vFAIL("Unmatched )");
6533         }
6534         else
6535             FAIL("Junk on end of regexp");      /* "Can't happen". */
6536         /* NOTREACHED */
6537     }
6538     if (after_freeze)
6539         RExC_npar = after_freeze;
6540     return(ret);
6541 }
6542
6543 /*
6544  - regbranch - one alternative of an | operator
6545  *
6546  * Implements the concatenation operator.
6547  */
6548 STATIC regnode *
6549 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6550 {
6551     dVAR;
6552     register regnode *ret;
6553     register regnode *chain = NULL;
6554     register regnode *latest;
6555     I32 flags = 0, c = 0;
6556     GET_RE_DEBUG_FLAGS_DECL;
6557
6558     PERL_ARGS_ASSERT_REGBRANCH;
6559
6560     DEBUG_PARSE("brnc");
6561
6562     if (first)
6563         ret = NULL;
6564     else {
6565         if (!SIZE_ONLY && RExC_extralen)
6566             ret = reganode(pRExC_state, BRANCHJ,0);
6567         else {
6568             ret = reg_node(pRExC_state, BRANCH);
6569             Set_Node_Length(ret, 1);
6570         }
6571     }
6572
6573     if (!first && SIZE_ONLY)
6574         RExC_extralen += 1;                     /* BRANCHJ */
6575
6576     *flagp = WORST;                     /* Tentatively. */
6577
6578     RExC_parse--;
6579     nextchar(pRExC_state);
6580     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6581         flags &= ~TRYAGAIN;
6582         latest = regpiece(pRExC_state, &flags,depth+1);
6583         if (latest == NULL) {
6584             if (flags & TRYAGAIN)
6585                 continue;
6586             return(NULL);
6587         }
6588         else if (ret == NULL)
6589             ret = latest;
6590         *flagp |= flags&(HASWIDTH|POSTPONED);
6591         if (chain == NULL)      /* First piece. */
6592             *flagp |= flags&SPSTART;
6593         else {
6594             RExC_naughty++;
6595             REGTAIL(pRExC_state, chain, latest);
6596         }
6597         chain = latest;
6598         c++;
6599     }
6600     if (chain == NULL) {        /* Loop ran zero times. */
6601         chain = reg_node(pRExC_state, NOTHING);
6602         if (ret == NULL)
6603             ret = chain;
6604     }
6605     if (c == 1) {
6606         *flagp |= flags&SIMPLE;
6607     }
6608
6609     return ret;
6610 }
6611
6612 /*
6613  - regpiece - something followed by possible [*+?]
6614  *
6615  * Note that the branching code sequences used for ? and the general cases
6616  * of * and + are somewhat optimized:  they use the same NOTHING node as
6617  * both the endmarker for their branch list and the body of the last branch.
6618  * It might seem that this node could be dispensed with entirely, but the
6619  * endmarker role is not redundant.
6620  */
6621 STATIC regnode *
6622 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6623 {
6624     dVAR;
6625     register regnode *ret;
6626     register char op;
6627     register char *next;
6628     I32 flags;
6629     const char * const origparse = RExC_parse;
6630     I32 min;
6631     I32 max = REG_INFTY;
6632     char *parse_start;
6633     const char *maxpos = NULL;
6634     GET_RE_DEBUG_FLAGS_DECL;
6635
6636     PERL_ARGS_ASSERT_REGPIECE;
6637
6638     DEBUG_PARSE("piec");
6639
6640     ret = regatom(pRExC_state, &flags,depth+1);
6641     if (ret == NULL) {
6642         if (flags & TRYAGAIN)
6643             *flagp |= TRYAGAIN;
6644         return(NULL);
6645     }
6646
6647     op = *RExC_parse;
6648
6649     if (op == '{' && regcurly(RExC_parse)) {
6650         maxpos = NULL;
6651         parse_start = RExC_parse; /* MJD */
6652         next = RExC_parse + 1;
6653         while (isDIGIT(*next) || *next == ',') {
6654             if (*next == ',') {
6655                 if (maxpos)
6656                     break;
6657                 else
6658                     maxpos = next;
6659             }
6660             next++;
6661         }
6662         if (*next == '}') {             /* got one */
6663             if (!maxpos)
6664                 maxpos = next;
6665             RExC_parse++;
6666             min = atoi(RExC_parse);
6667             if (*maxpos == ',')
6668                 maxpos++;
6669             else
6670                 maxpos = RExC_parse;
6671             max = atoi(maxpos);
6672             if (!max && *maxpos != '0')
6673                 max = REG_INFTY;                /* meaning "infinity" */
6674             else if (max >= REG_INFTY)
6675                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6676             RExC_parse = next;
6677             nextchar(pRExC_state);
6678
6679         do_curly:
6680             if ((flags&SIMPLE)) {
6681                 RExC_naughty += 2 + RExC_naughty / 2;
6682                 reginsert(pRExC_state, CURLY, ret, depth+1);
6683                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6684                 Set_Node_Cur_Length(ret);
6685             }
6686             else {
6687                 regnode * const w = reg_node(pRExC_state, WHILEM);
6688
6689                 w->flags = 0;
6690                 REGTAIL(pRExC_state, ret, w);
6691                 if (!SIZE_ONLY && RExC_extralen) {
6692                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6693                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6694                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6695                 }
6696                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6697                                 /* MJD hk */
6698                 Set_Node_Offset(ret, parse_start+1);
6699                 Set_Node_Length(ret,
6700                                 op == '{' ? (RExC_parse - parse_start) : 1);
6701
6702                 if (!SIZE_ONLY && RExC_extralen)
6703                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6704                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6705                 if (SIZE_ONLY)
6706                     RExC_whilem_seen++, RExC_extralen += 3;
6707                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6708             }
6709             ret->flags = 0;
6710
6711             if (min > 0)
6712                 *flagp = WORST;
6713             if (max > 0)
6714                 *flagp |= HASWIDTH;
6715             if (max < min)
6716                 vFAIL("Can't do {n,m} with n > m");
6717             if (!SIZE_ONLY) {
6718                 ARG1_SET(ret, (U16)min);
6719                 ARG2_SET(ret, (U16)max);
6720             }
6721
6722             goto nest_check;
6723         }
6724     }
6725
6726     if (!ISMULT1(op)) {
6727         *flagp = flags;
6728         return(ret);
6729     }
6730
6731 #if 0                           /* Now runtime fix should be reliable. */
6732
6733     /* if this is reinstated, don't forget to put this back into perldiag:
6734
6735             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6736
6737            (F) The part of the regexp subject to either the * or + quantifier
6738            could match an empty string. The {#} shows in the regular
6739            expression about where the problem was discovered.
6740
6741     */
6742
6743     if (!(flags&HASWIDTH) && op != '?')
6744       vFAIL("Regexp *+ operand could be empty");
6745 #endif
6746
6747     parse_start = RExC_parse;
6748     nextchar(pRExC_state);
6749
6750     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6751
6752     if (op == '*' && (flags&SIMPLE)) {
6753         reginsert(pRExC_state, STAR, ret, depth+1);
6754         ret->flags = 0;
6755         RExC_naughty += 4;
6756     }
6757     else if (op == '*') {
6758         min = 0;
6759         goto do_curly;
6760     }
6761     else if (op == '+' && (flags&SIMPLE)) {
6762         reginsert(pRExC_state, PLUS, ret, depth+1);
6763         ret->flags = 0;
6764         RExC_naughty += 3;
6765     }
6766     else if (op == '+') {
6767         min = 1;
6768         goto do_curly;
6769     }
6770     else if (op == '?') {
6771         min = 0; max = 1;
6772         goto do_curly;
6773     }
6774   nest_check:
6775     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6776         ckWARN3reg(RExC_parse,
6777                    "%.*s matches null string many times",
6778                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6779                    origparse);
6780     }
6781
6782     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6783         nextchar(pRExC_state);
6784         reginsert(pRExC_state, MINMOD, ret, depth+1);
6785         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6786     }
6787 #ifndef REG_ALLOW_MINMOD_SUSPEND
6788     else
6789 #endif
6790     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6791         regnode *ender;
6792         nextchar(pRExC_state);
6793         ender = reg_node(pRExC_state, SUCCEED);
6794         REGTAIL(pRExC_state, ret, ender);
6795         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6796         ret->flags = 0;
6797         ender = reg_node(pRExC_state, TAIL);
6798         REGTAIL(pRExC_state, ret, ender);
6799         /*ret= ender;*/
6800     }
6801
6802     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6803         RExC_parse++;
6804         vFAIL("Nested quantifiers");
6805     }
6806
6807     return(ret);
6808 }
6809
6810
6811 /* reg_namedseq(pRExC_state,UVp)
6812
6813    This is expected to be called by a parser routine that has
6814    recognized '\N' and needs to handle the rest. RExC_parse is
6815    expected to point at the first char following the N at the time
6816    of the call.
6817
6818    The \N may be inside (indicated by valuep not being NULL) or outside a
6819    character class.
6820
6821    \N may begin either a named sequence, or if outside a character class, mean
6822    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6823    attempted to decide which, and in the case of a named sequence converted it
6824    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6825    where c1... are the characters in the sequence.  For single-quoted regexes,
6826    the tokenizer passes the \N sequence through unchanged; this code will not
6827    attempt to determine this nor expand those.  The net effect is that if the
6828    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6829    signals that this \N occurrence means to match a non-newline.
6830
6831    Only the \N{U+...} form should occur in a character class, for the same
6832    reason that '.' inside a character class means to just match a period: it
6833    just doesn't make sense.
6834
6835    If valuep is non-null then it is assumed that we are parsing inside
6836    of a charclass definition and the first codepoint in the resolved
6837    string is returned via *valuep and the routine will return NULL.
6838    In this mode if a multichar string is returned from the charnames
6839    handler, a warning will be issued, and only the first char in the
6840    sequence will be examined. If the string returned is zero length
6841    then the value of *valuep is undefined and NON-NULL will
6842    be returned to indicate failure. (This will NOT be a valid pointer
6843    to a regnode.)
6844
6845    If valuep is null then it is assumed that we are parsing normal text and a
6846    new EXACT node is inserted into the program containing the resolved string,
6847    and a pointer to the new node is returned.  But if the string is zero length
6848    a NOTHING node is emitted instead.
6849
6850    On success RExC_parse is set to the char following the endbrace.
6851    Parsing failures will generate a fatal error via vFAIL(...)
6852  */
6853 STATIC regnode *
6854 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6855 {
6856     char * endbrace;    /* '}' following the name */
6857     regnode *ret = NULL;
6858 #ifdef DEBUGGING
6859     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6860 #endif
6861     char* p;
6862
6863     GET_RE_DEBUG_FLAGS_DECL;
6864
6865     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6866
6867     GET_RE_DEBUG_FLAGS;
6868
6869     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6870      * modifier.  The other meaning does not */
6871     p = (RExC_flags & RXf_PMf_EXTENDED)
6872         ? regwhite( pRExC_state, RExC_parse )
6873         : RExC_parse;
6874
6875     /* Disambiguate between \N meaning a named character versus \N meaning
6876      * [^\n].  The former is assumed when it can't be the latter. */
6877     if (*p != '{' || regcurly(p)) {
6878         RExC_parse = p;
6879         if (valuep) {
6880             /* no bare \N in a charclass */
6881             vFAIL("\\N in a character class must be a named character: \\N{...}");
6882         }
6883         nextchar(pRExC_state);
6884         ret = reg_node(pRExC_state, REG_ANY);
6885         *flagp |= HASWIDTH|SIMPLE;
6886         RExC_naughty++;
6887         RExC_parse--;
6888         Set_Node_Length(ret, 1); /* MJD */
6889         return ret;
6890     }
6891
6892     /* Here, we have decided it should be a named sequence */
6893
6894     /* The test above made sure that the next real character is a '{', but
6895      * under the /x modifier, it could be separated by space (or a comment and
6896      * \n) and this is not allowed (for consistency with \x{...} and the
6897      * tokenizer handling of \N{NAME}). */
6898     if (*RExC_parse != '{') {
6899         vFAIL("Missing braces on \\N{}");
6900     }
6901
6902     RExC_parse++;       /* Skip past the '{' */
6903
6904     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6905         || ! (endbrace == RExC_parse            /* nothing between the {} */
6906               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
6907                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6908     {
6909         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
6910         vFAIL("\\N{NAME} must be resolved by the lexer");
6911     }
6912
6913     if (endbrace == RExC_parse) {   /* empty: \N{} */
6914         if (! valuep) {
6915             RExC_parse = endbrace + 1;
6916             return reg_node(pRExC_state,NOTHING);
6917         }
6918
6919         if (SIZE_ONLY) {
6920             ckWARNreg(RExC_parse,
6921                     "Ignoring zero length \\N{} in character class"
6922             );
6923             RExC_parse = endbrace + 1;
6924         }
6925         *valuep = 0;
6926         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6927     }
6928
6929     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
6930     RExC_parse += 2;    /* Skip past the 'U+' */
6931
6932     if (valuep) {   /* In a bracketed char class */
6933         /* We only pay attention to the first char of
6934         multichar strings being returned. I kinda wonder
6935         if this makes sense as it does change the behaviour
6936         from earlier versions, OTOH that behaviour was broken
6937         as well. XXX Solution is to recharacterize as
6938         [rest-of-class]|multi1|multi2... */
6939
6940         STRLEN length_of_hex;
6941         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6942             | PERL_SCAN_DISALLOW_PREFIX
6943             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6944
6945         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6946         if (endchar < endbrace) {
6947             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6948         }
6949
6950         length_of_hex = (STRLEN)(endchar - RExC_parse);
6951         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6952
6953         /* The tokenizer should have guaranteed validity, but it's possible to
6954          * bypass it by using single quoting, so check */
6955         if (length_of_hex == 0
6956             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6957         {
6958             RExC_parse += length_of_hex;        /* Includes all the valid */
6959             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
6960                             ? UTF8SKIP(RExC_parse)
6961                             : 1;
6962             /* Guard against malformed utf8 */
6963             if (RExC_parse >= endchar) RExC_parse = endchar;
6964             vFAIL("Invalid hexadecimal number in \\N{U+...}");
6965         }
6966
6967         RExC_parse = endbrace + 1;
6968         if (endchar == endbrace) return NULL;
6969
6970         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
6971     }
6972     else {      /* Not a char class */
6973         char *s;            /* String to put in generated EXACT node */
6974         STRLEN len = 0;     /* Its current byte length */
6975         char *endchar;      /* Points to '.' or '}' ending cur char in the input
6976                                stream */
6977
6978         ret = reg_node(pRExC_state,
6979                         (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6980         s= STRING(ret);
6981
6982         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
6983          * the input which is of the form now 'c1.c2.c3...}' until find the
6984          * ending brace or exceed length 255.  The characters that exceed this
6985          * limit are dropped.  The limit could be relaxed should it become
6986          * desirable by reparsing this as (?:\N{NAME}), so could generate
6987          * multiple EXACT nodes, as is done for just regular input.  But this
6988          * is primarily a named character, and not intended to be a huge long
6989          * string, so 255 bytes should be good enough */
6990         while (1) {
6991             STRLEN length_of_hex;
6992             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6993                             | PERL_SCAN_DISALLOW_PREFIX
6994                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6995             UV cp;  /* Ord of current character */
6996
6997             /* Code points are separated by dots.  If none, there is only one
6998              * code point, and is terminated by the brace */
6999             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7000
7001             /* The values are Unicode even on EBCDIC machines */
7002             length_of_hex = (STRLEN)(endchar - RExC_parse);
7003             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7004             if ( length_of_hex == 0
7005                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7006             {
7007                 RExC_parse += length_of_hex;        /* Includes all the valid */
7008                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7009                                 ? UTF8SKIP(RExC_parse)
7010                                 : 1;
7011                 /* Guard against malformed utf8 */
7012                 if (RExC_parse >= endchar) RExC_parse = endchar;
7013                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7014             }
7015
7016             if (! FOLD) {       /* Not folding, just append to the string */
7017                 STRLEN unilen;
7018
7019                 /* Quit before adding this character if would exceed limit */
7020                 if (len + UNISKIP(cp) > U8_MAX) break;
7021
7022                 unilen = reguni(pRExC_state, cp, s);
7023                 if (unilen > 0) {
7024                     s   += unilen;
7025                     len += unilen;
7026                 }
7027             } else {    /* Folding, output the folded equivalent */
7028                 STRLEN foldlen,numlen;
7029                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7030                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7031
7032                 /* Quit before exceeding size limit */
7033                 if (len + foldlen > U8_MAX) break;
7034
7035                 for (foldbuf = tmpbuf;
7036                     foldlen;
7037                     foldlen -= numlen)
7038                 {
7039                     cp = utf8_to_uvchr(foldbuf, &numlen);
7040                     if (numlen > 0) {
7041                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7042                         s       += unilen;
7043                         len     += unilen;
7044                         /* In EBCDIC the numlen and unilen can differ. */
7045                         foldbuf += numlen;
7046                         if (numlen >= foldlen)
7047                             break;
7048                     }
7049                     else
7050                         break; /* "Can't happen." */
7051                 }
7052             }
7053
7054             /* Point to the beginning of the next character in the sequence. */
7055             RExC_parse = endchar + 1;
7056
7057             /* Quit if no more characters */
7058             if (RExC_parse >= endbrace) break;
7059         }
7060
7061
7062         if (SIZE_ONLY) {
7063             if (RExC_parse < endbrace) {
7064                 ckWARNreg(RExC_parse - 1,
7065                           "Using just the first characters returned by \\N{}");
7066             }
7067
7068             RExC_size += STR_SZ(len);
7069         } else {
7070             STR_LEN(ret) = len;
7071             RExC_emit += STR_SZ(len);
7072         }
7073
7074         RExC_parse = endbrace + 1;
7075
7076         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7077                                with malformed in t/re/pat_advanced.t */
7078         RExC_parse --;
7079         Set_Node_Cur_Length(ret); /* MJD */
7080         nextchar(pRExC_state);
7081     }
7082
7083     return ret;
7084 }
7085
7086
7087 /*
7088  * reg_recode
7089  *
7090  * It returns the code point in utf8 for the value in *encp.
7091  *    value: a code value in the source encoding
7092  *    encp:  a pointer to an Encode object
7093  *
7094  * If the result from Encode is not a single character,
7095  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7096  */
7097 STATIC UV
7098 S_reg_recode(pTHX_ const char value, SV **encp)
7099 {
7100     STRLEN numlen = 1;
7101     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7102     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7103     const STRLEN newlen = SvCUR(sv);
7104     UV uv = UNICODE_REPLACEMENT;
7105
7106     PERL_ARGS_ASSERT_REG_RECODE;
7107
7108     if (newlen)
7109         uv = SvUTF8(sv)
7110              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7111              : *(U8*)s;
7112
7113     if (!newlen || numlen != newlen) {
7114         uv = UNICODE_REPLACEMENT;
7115         *encp = NULL;
7116     }
7117     return uv;
7118 }
7119
7120
7121 /*
7122  - regatom - the lowest level
7123
7124    Try to identify anything special at the start of the pattern. If there
7125    is, then handle it as required. This may involve generating a single regop,
7126    such as for an assertion; or it may involve recursing, such as to
7127    handle a () structure.
7128
7129    If the string doesn't start with something special then we gobble up
7130    as much literal text as we can.
7131
7132    Once we have been able to handle whatever type of thing started the
7133    sequence, we return.
7134
7135    Note: we have to be careful with escapes, as they can be both literal
7136    and special, and in the case of \10 and friends can either, depending
7137    on context. Specifically there are two seperate switches for handling
7138    escape sequences, with the one for handling literal escapes requiring
7139    a dummy entry for all of the special escapes that are actually handled
7140    by the other.
7141 */
7142
7143 STATIC regnode *
7144 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7145 {
7146     dVAR;
7147     register regnode *ret = NULL;
7148     I32 flags;
7149     char *parse_start = RExC_parse;
7150     GET_RE_DEBUG_FLAGS_DECL;
7151     DEBUG_PARSE("atom");
7152     *flagp = WORST;             /* Tentatively. */
7153
7154     PERL_ARGS_ASSERT_REGATOM;
7155
7156 tryagain:
7157     switch ((U8)*RExC_parse) {
7158     case '^':
7159         RExC_seen_zerolen++;
7160         nextchar(pRExC_state);
7161         if (RExC_flags & RXf_PMf_MULTILINE)
7162             ret = reg_node(pRExC_state, MBOL);
7163         else if (RExC_flags & RXf_PMf_SINGLELINE)
7164             ret = reg_node(pRExC_state, SBOL);
7165         else
7166             ret = reg_node(pRExC_state, BOL);
7167         Set_Node_Length(ret, 1); /* MJD */
7168         break;
7169     case '$':
7170         nextchar(pRExC_state);
7171         if (*RExC_parse)
7172             RExC_seen_zerolen++;
7173         if (RExC_flags & RXf_PMf_MULTILINE)
7174             ret = reg_node(pRExC_state, MEOL);
7175         else if (RExC_flags & RXf_PMf_SINGLELINE)
7176             ret = reg_node(pRExC_state, SEOL);
7177         else
7178             ret = reg_node(pRExC_state, EOL);
7179         Set_Node_Length(ret, 1); /* MJD */
7180         break;
7181     case '.':
7182         nextchar(pRExC_state);
7183         if (RExC_flags & RXf_PMf_SINGLELINE)
7184             ret = reg_node(pRExC_state, SANY);
7185         else
7186             ret = reg_node(pRExC_state, REG_ANY);
7187         *flagp |= HASWIDTH|SIMPLE;
7188         RExC_naughty++;
7189         Set_Node_Length(ret, 1); /* MJD */
7190         break;
7191     case '[':
7192     {
7193         char * const oregcomp_parse = ++RExC_parse;
7194         ret = regclass(pRExC_state,depth+1);
7195         if (*RExC_parse != ']') {
7196             RExC_parse = oregcomp_parse;
7197             vFAIL("Unmatched [");
7198         }
7199         nextchar(pRExC_state);
7200         *flagp |= HASWIDTH|SIMPLE;
7201         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7202         break;
7203     }
7204     case '(':
7205         nextchar(pRExC_state);
7206         ret = reg(pRExC_state, 1, &flags,depth+1);
7207         if (ret == NULL) {
7208                 if (flags & TRYAGAIN) {
7209                     if (RExC_parse == RExC_end) {
7210                          /* Make parent create an empty node if needed. */
7211                         *flagp |= TRYAGAIN;
7212                         return(NULL);
7213                     }
7214                     goto tryagain;
7215                 }
7216                 return(NULL);
7217         }
7218         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7219         break;
7220     case '|':
7221     case ')':
7222         if (flags & TRYAGAIN) {
7223             *flagp |= TRYAGAIN;
7224             return NULL;
7225         }
7226         vFAIL("Internal urp");
7227                                 /* Supposed to be caught earlier. */
7228         break;
7229     case '{':
7230         if (!regcurly(RExC_parse)) {
7231             RExC_parse++;
7232             goto defchar;
7233         }
7234         /* FALL THROUGH */
7235     case '?':
7236     case '+':
7237     case '*':
7238         RExC_parse++;
7239         vFAIL("Quantifier follows nothing");
7240         break;
7241     case 0xDF:
7242     case 0xC3:
7243     case 0xCE:
7244         do_foldchar:
7245         if (!LOC && FOLD) {
7246             U32 len,cp;
7247             len=0; /* silence a spurious compiler warning */
7248             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7249                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7250                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7251                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7252                 Set_Node_Length(ret, 1); /* MJD */
7253                 nextchar(pRExC_state); /* kill whitespace under /x */
7254                 return ret;
7255             }
7256         }
7257         goto outer_default;
7258     case '\\':
7259         /* Special Escapes
7260
7261            This switch handles escape sequences that resolve to some kind
7262            of special regop and not to literal text. Escape sequnces that
7263            resolve to literal text are handled below in the switch marked
7264            "Literal Escapes".
7265
7266            Every entry in this switch *must* have a corresponding entry
7267            in the literal escape switch. However, the opposite is not
7268            required, as the default for this switch is to jump to the
7269            literal text handling code.
7270         */
7271         switch ((U8)*++RExC_parse) {
7272         case 0xDF:
7273         case 0xC3:
7274         case 0xCE:
7275                    goto do_foldchar;
7276         /* Special Escapes */
7277         case 'A':
7278             RExC_seen_zerolen++;
7279             ret = reg_node(pRExC_state, SBOL);
7280             *flagp |= SIMPLE;
7281             goto finish_meta_pat;
7282         case 'G':
7283             ret = reg_node(pRExC_state, GPOS);
7284             RExC_seen |= REG_SEEN_GPOS;
7285             *flagp |= SIMPLE;
7286             goto finish_meta_pat;
7287         case 'K':
7288             RExC_seen_zerolen++;
7289             ret = reg_node(pRExC_state, KEEPS);
7290             *flagp |= SIMPLE;
7291             /* XXX:dmq : disabling in-place substitution seems to
7292              * be necessary here to avoid cases of memory corruption, as
7293              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7294              */
7295             RExC_seen |= REG_SEEN_LOOKBEHIND;
7296             goto finish_meta_pat;
7297         case 'Z':
7298             ret = reg_node(pRExC_state, SEOL);
7299             *flagp |= SIMPLE;
7300             RExC_seen_zerolen++;                /* Do not optimize RE away */
7301             goto finish_meta_pat;
7302         case 'z':
7303             ret = reg_node(pRExC_state, EOS);
7304             *flagp |= SIMPLE;
7305             RExC_seen_zerolen++;                /* Do not optimize RE away */
7306             goto finish_meta_pat;
7307         case 'C':
7308             ret = reg_node(pRExC_state, CANY);
7309             RExC_seen |= REG_SEEN_CANY;
7310             *flagp |= HASWIDTH|SIMPLE;
7311             goto finish_meta_pat;
7312         case 'X':
7313             ret = reg_node(pRExC_state, CLUMP);
7314             *flagp |= HASWIDTH;
7315             goto finish_meta_pat;
7316         case 'w':
7317             if (LOC) {
7318                 ret = reg_node(pRExC_state, (U8)(ALNUML));
7319             } else {
7320                 ret = reg_node(pRExC_state, (U8)(ALNUM));
7321                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7322             }
7323             *flagp |= HASWIDTH|SIMPLE;
7324             goto finish_meta_pat;
7325         case 'W':
7326             if (LOC) {
7327                 ret = reg_node(pRExC_state, (U8)(NALNUML));
7328             } else {
7329                 ret = reg_node(pRExC_state, (U8)(NALNUM));
7330                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7331             }
7332             *flagp |= HASWIDTH|SIMPLE;
7333             goto finish_meta_pat;
7334         case 'b':
7335             RExC_seen_zerolen++;
7336             RExC_seen |= REG_SEEN_LOOKBEHIND;
7337             if (LOC) {
7338                 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7339             } else {
7340                 ret = reg_node(pRExC_state, (U8)(BOUND));
7341                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7342             }
7343             *flagp |= SIMPLE;
7344             goto finish_meta_pat;
7345         case 'B':
7346             RExC_seen_zerolen++;
7347             RExC_seen |= REG_SEEN_LOOKBEHIND;
7348             if (LOC) {
7349                 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7350             } else {
7351                 ret = reg_node(pRExC_state, (U8)(NBOUND));
7352                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7353             }
7354             *flagp |= SIMPLE;
7355             goto finish_meta_pat;
7356         case 's':
7357             if (LOC) {
7358                 ret = reg_node(pRExC_state, (U8)(SPACEL));
7359             } else {
7360                 ret = reg_node(pRExC_state, (U8)(SPACE));
7361                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7362             }
7363             *flagp |= HASWIDTH|SIMPLE;
7364             goto finish_meta_pat;
7365         case 'S':
7366             if (LOC) {
7367                 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7368             } else {
7369                 ret = reg_node(pRExC_state, (U8)(NSPACE));
7370                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7371             }
7372             *flagp |= HASWIDTH|SIMPLE;
7373             goto finish_meta_pat;
7374         case 'd':
7375             ret = reg_node(pRExC_state, DIGIT);
7376             *flagp |= HASWIDTH|SIMPLE;
7377             goto finish_meta_pat;
7378         case 'D':
7379             ret = reg_node(pRExC_state, NDIGIT);
7380             *flagp |= HASWIDTH|SIMPLE;
7381             goto finish_meta_pat;
7382         case 'R':
7383             ret = reg_node(pRExC_state, LNBREAK);
7384             *flagp |= HASWIDTH|SIMPLE;
7385             goto finish_meta_pat;
7386         case 'h':
7387             ret = reg_node(pRExC_state, HORIZWS);
7388             *flagp |= HASWIDTH|SIMPLE;
7389             goto finish_meta_pat;
7390         case 'H':
7391             ret = reg_node(pRExC_state, NHORIZWS);
7392             *flagp |= HASWIDTH|SIMPLE;
7393             goto finish_meta_pat;
7394         case 'v':
7395             ret = reg_node(pRExC_state, VERTWS);
7396             *flagp |= HASWIDTH|SIMPLE;
7397             goto finish_meta_pat;
7398         case 'V':
7399             ret = reg_node(pRExC_state, NVERTWS);
7400             *flagp |= HASWIDTH|SIMPLE;
7401          finish_meta_pat:
7402             nextchar(pRExC_state);
7403             Set_Node_Length(ret, 2); /* MJD */
7404             break;
7405         case 'p':
7406         case 'P':
7407             {
7408                 char* const oldregxend = RExC_end;
7409 #ifdef DEBUGGING
7410                 char* parse_start = RExC_parse - 2;
7411 #endif
7412
7413                 if (RExC_parse[1] == '{') {
7414                   /* a lovely hack--pretend we saw [\pX] instead */
7415                     RExC_end = strchr(RExC_parse, '}');
7416                     if (!RExC_end) {
7417                         const U8 c = (U8)*RExC_parse;
7418                         RExC_parse += 2;
7419                         RExC_end = oldregxend;
7420                         vFAIL2("Missing right brace on \\%c{}", c);
7421                     }
7422                     RExC_end++;
7423                 }
7424                 else {
7425                     RExC_end = RExC_parse + 2;
7426                     if (RExC_end > oldregxend)
7427                         RExC_end = oldregxend;
7428                 }
7429                 RExC_parse--;
7430
7431                 ret = regclass(pRExC_state,depth+1);
7432
7433                 RExC_end = oldregxend;
7434                 RExC_parse--;
7435
7436                 Set_Node_Offset(ret, parse_start + 2);
7437                 Set_Node_Cur_Length(ret);
7438                 nextchar(pRExC_state);
7439                 *flagp |= HASWIDTH|SIMPLE;
7440             }
7441             break;
7442         case 'N':
7443             /* Handle \N and \N{NAME} here and not below because it can be
7444             multicharacter. join_exact() will join them up later on.
7445             Also this makes sure that things like /\N{BLAH}+/ and
7446             \N{BLAH} being multi char Just Happen. dmq*/
7447             ++RExC_parse;
7448             ret= reg_namedseq(pRExC_state, NULL, flagp);
7449             break;
7450         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7451         parse_named_seq:
7452         {
7453             char ch= RExC_parse[1];
7454             if (ch != '<' && ch != '\'' && ch != '{') {
7455                 RExC_parse++;
7456                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7457             } else {
7458                 /* this pretty much dupes the code for (?P=...) in reg(), if
7459                    you change this make sure you change that */
7460                 char* name_start = (RExC_parse += 2);
7461                 U32 num = 0;
7462                 SV *sv_dat = reg_scan_name(pRExC_state,
7463                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7464                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7465                 if (RExC_parse == name_start || *RExC_parse != ch)
7466                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7467
7468                 if (!SIZE_ONLY) {
7469                     num = add_data( pRExC_state, 1, "S" );
7470                     RExC_rxi->data->data[num]=(void*)sv_dat;
7471                     SvREFCNT_inc_simple_void(sv_dat);
7472                 }
7473
7474                 RExC_sawback = 1;
7475                 ret = reganode(pRExC_state,
7476                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7477                            num);
7478                 *flagp |= HASWIDTH;
7479
7480                 /* override incorrect value set in reganode MJD */
7481                 Set_Node_Offset(ret, parse_start+1);
7482                 Set_Node_Cur_Length(ret); /* MJD */
7483                 nextchar(pRExC_state);
7484
7485             }
7486             break;
7487         }
7488         case 'g':
7489         case '1': case '2': case '3': case '4':
7490         case '5': case '6': case '7': case '8': case '9':
7491             {
7492                 I32 num;
7493                 bool isg = *RExC_parse == 'g';
7494                 bool isrel = 0;
7495                 bool hasbrace = 0;
7496                 if (isg) {
7497                     RExC_parse++;
7498                     if (*RExC_parse == '{') {
7499                         RExC_parse++;
7500                         hasbrace = 1;
7501                     }
7502                     if (*RExC_parse == '-') {
7503                         RExC_parse++;
7504                         isrel = 1;
7505                     }
7506                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7507                         if (isrel) RExC_parse--;
7508                         RExC_parse -= 2;
7509                         goto parse_named_seq;
7510                 }   }
7511                 num = atoi(RExC_parse);
7512                 if (isg && num == 0)
7513                     vFAIL("Reference to invalid group 0");
7514                 if (isrel) {
7515                     num = RExC_npar - num;
7516                     if (num < 1)
7517                         vFAIL("Reference to nonexistent or unclosed group");
7518                 }
7519                 if (!isg && num > 9 && num >= RExC_npar)
7520                     goto defchar;
7521                 else {
7522                     char * const parse_start = RExC_parse - 1; /* MJD */
7523                     while (isDIGIT(*RExC_parse))
7524                         RExC_parse++;
7525                     if (parse_start == RExC_parse - 1)
7526                         vFAIL("Unterminated \\g... pattern");
7527                     if (hasbrace) {
7528                         if (*RExC_parse != '}')
7529                             vFAIL("Unterminated \\g{...} pattern");
7530                         RExC_parse++;
7531                     }
7532                     if (!SIZE_ONLY) {
7533                         if (num > (I32)RExC_rx->nparens)
7534                             vFAIL("Reference to nonexistent group");
7535                     }
7536                     RExC_sawback = 1;
7537                     ret = reganode(pRExC_state,
7538                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7539                                    num);
7540                     *flagp |= HASWIDTH;
7541
7542                     /* override incorrect value set in reganode MJD */
7543                     Set_Node_Offset(ret, parse_start+1);
7544                     Set_Node_Cur_Length(ret); /* MJD */
7545                     RExC_parse--;
7546                     nextchar(pRExC_state);
7547                 }
7548             }
7549             break;
7550         case '\0':
7551             if (RExC_parse >= RExC_end)
7552                 FAIL("Trailing \\");
7553             /* FALL THROUGH */
7554         default:
7555             /* Do not generate "unrecognized" warnings here, we fall
7556                back into the quick-grab loop below */
7557             parse_start--;
7558             goto defchar;
7559         }
7560         break;
7561
7562     case '#':
7563         if (RExC_flags & RXf_PMf_EXTENDED) {
7564             if ( reg_skipcomment( pRExC_state ) )
7565                 goto tryagain;
7566         }
7567         /* FALL THROUGH */
7568
7569     default:
7570         outer_default:{
7571             register STRLEN len;
7572             register UV ender;
7573             register char *p;
7574             char *s;
7575             STRLEN foldlen;
7576             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7577
7578             parse_start = RExC_parse - 1;
7579
7580             RExC_parse++;
7581
7582         defchar:
7583             ender = 0;
7584             ret = reg_node(pRExC_state,
7585                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7586             s = STRING(ret);
7587             for (len = 0, p = RExC_parse - 1;
7588               len < 127 && p < RExC_end;
7589               len++)
7590             {
7591                 char * const oldp = p;
7592
7593                 if (RExC_flags & RXf_PMf_EXTENDED)
7594                     p = regwhite( pRExC_state, p );
7595                 switch ((U8)*p) {
7596                 case 0xDF:
7597                 case 0xC3:
7598                 case 0xCE:
7599                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7600                                 goto normal_default;
7601                 case '^':
7602                 case '$':
7603                 case '.':
7604                 case '[':
7605                 case '(':
7606                 case ')':
7607                 case '|':
7608                     goto loopdone;
7609                 case '\\':
7610                     /* Literal Escapes Switch
7611
7612                        This switch is meant to handle escape sequences that
7613                        resolve to a literal character.
7614
7615                        Every escape sequence that represents something
7616                        else, like an assertion or a char class, is handled
7617                        in the switch marked 'Special Escapes' above in this
7618                        routine, but also has an entry here as anything that
7619                        isn't explicitly mentioned here will be treated as
7620                        an unescaped equivalent literal.
7621                     */
7622
7623                     switch ((U8)*++p) {
7624                     /* These are all the special escapes. */
7625                     case 0xDF:
7626                     case 0xC3:
7627                     case 0xCE:
7628                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7629                                 goto normal_default;
7630                     case 'A':             /* Start assertion */
7631                     case 'b': case 'B':   /* Word-boundary assertion*/
7632                     case 'C':             /* Single char !DANGEROUS! */
7633                     case 'd': case 'D':   /* digit class */
7634                     case 'g': case 'G':   /* generic-backref, pos assertion */
7635                     case 'h': case 'H':   /* HORIZWS */
7636                     case 'k': case 'K':   /* named backref, keep marker */
7637                     case 'N':             /* named char sequence */
7638                     case 'p': case 'P':   /* Unicode property */
7639                               case 'R':   /* LNBREAK */
7640                     case 's': case 'S':   /* space class */
7641                     case 'v': case 'V':   /* VERTWS */
7642                     case 'w': case 'W':   /* word class */
7643                     case 'X':             /* eXtended Unicode "combining character sequence" */
7644                     case 'z': case 'Z':   /* End of line/string assertion */
7645                         --p;
7646                         goto loopdone;
7647
7648                     /* Anything after here is an escape that resolves to a
7649                        literal. (Except digits, which may or may not)
7650                      */
7651                     case 'n':
7652                         ender = '\n';
7653                         p++;
7654                         break;
7655                     case 'r':
7656                         ender = '\r';
7657                         p++;
7658                         break;
7659                     case 't':
7660                         ender = '\t';
7661                         p++;
7662                         break;
7663                     case 'f':
7664                         ender = '\f';
7665                         p++;
7666                         break;
7667                     case 'e':
7668                           ender = ASCII_TO_NATIVE('\033');
7669                         p++;
7670                         break;
7671                     case 'a':
7672                           ender = ASCII_TO_NATIVE('\007');
7673                         p++;
7674                         break;
7675                     case 'o':
7676                         {
7677                             STRLEN brace_len = len;
7678                             UV result;
7679                             const char* error_msg;
7680
7681                             bool valid = grok_bslash_o(p,
7682                                                        &result,
7683                                                        &brace_len,
7684                                                        &error_msg,
7685                                                        1);
7686                             p += brace_len;
7687                             if (! valid) {
7688                                 RExC_parse = p; /* going to die anyway; point
7689                                                    to exact spot of failure */
7690                                 vFAIL(error_msg);
7691                             }
7692                             else
7693                             {
7694                                 ender = result;
7695                             }
7696                             if (PL_encoding && ender < 0x100) {
7697                                 goto recode_encoding;
7698                             }
7699                             if (ender > 0xff) {
7700                                 REQUIRE_UTF8;
7701                             }
7702                             break;
7703                         }
7704                     case 'x':
7705                         if (*++p == '{') {
7706                             char* const e = strchr(p, '}');
7707
7708                             if (!e) {
7709                                 RExC_parse = p + 1;
7710                                 vFAIL("Missing right brace on \\x{}");
7711                             }
7712                             else {
7713                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7714                                     | PERL_SCAN_DISALLOW_PREFIX;
7715                                 STRLEN numlen = e - p - 1;
7716                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7717                                 if (ender > 0xff)
7718                                     REQUIRE_UTF8;
7719                                 p = e + 1;
7720                             }
7721                         }
7722                         else {
7723                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7724                             STRLEN numlen = 2;
7725                             ender = grok_hex(p, &numlen, &flags, NULL);
7726                             p += numlen;
7727                         }
7728                         if (PL_encoding && ender < 0x100)
7729                             goto recode_encoding;
7730                         break;
7731                     case 'c':
7732                         p++;
7733                         ender = grok_bslash_c(*p++, SIZE_ONLY);
7734                         break;
7735                     case '0': case '1': case '2': case '3':case '4':
7736                     case '5': case '6': case '7': case '8':case '9':
7737                         if (*p == '0' ||
7738                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7739                         {
7740                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7741                             STRLEN numlen = 3;
7742                             ender = grok_oct(p, &numlen, &flags, NULL);
7743                             if (ender > 0xff) {
7744                                 REQUIRE_UTF8;
7745                             }
7746                             p += numlen;
7747                         }
7748                         else {
7749                             --p;
7750                             goto loopdone;
7751                         }
7752                         if (PL_encoding && ender < 0x100)
7753                             goto recode_encoding;
7754                         break;
7755                     recode_encoding:
7756                         {
7757                             SV* enc = PL_encoding;
7758                             ender = reg_recode((const char)(U8)ender, &enc);
7759                             if (!enc && SIZE_ONLY)
7760                                 ckWARNreg(p, "Invalid escape in the specified encoding");
7761                             REQUIRE_UTF8;
7762                         }
7763                         break;
7764                     case '\0':
7765                         if (p >= RExC_end)
7766                             FAIL("Trailing \\");
7767                         /* FALL THROUGH */
7768                     default:
7769                         if (!SIZE_ONLY&& isALPHA(*p))
7770                             ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7771                         goto normal_default;
7772                     }
7773                     break;
7774                 default:
7775                   normal_default:
7776                     if (UTF8_IS_START(*p) && UTF) {
7777                         STRLEN numlen;
7778                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7779                                                &numlen, UTF8_ALLOW_DEFAULT);
7780                         p += numlen;
7781                     }
7782                     else
7783                         ender = *p++;
7784                     break;
7785                 }
7786                 if ( RExC_flags & RXf_PMf_EXTENDED)
7787                     p = regwhite( pRExC_state, p );
7788                 if (UTF && FOLD) {
7789                     /* Prime the casefolded buffer. */
7790                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7791                 }
7792                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7793                     if (len)
7794                         p = oldp;
7795                     else if (UTF) {
7796                          if (FOLD) {
7797                               /* Emit all the Unicode characters. */
7798                               STRLEN numlen;
7799                               for (foldbuf = tmpbuf;
7800                                    foldlen;
7801                                    foldlen -= numlen) {
7802                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7803                                    if (numlen > 0) {
7804                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7805                                         s       += unilen;
7806                                         len     += unilen;
7807                                         /* In EBCDIC the numlen
7808                                          * and unilen can differ. */
7809                                         foldbuf += numlen;
7810                                         if (numlen >= foldlen)
7811                                              break;
7812                                    }
7813                                    else
7814                                         break; /* "Can't happen." */
7815                               }
7816                          }
7817                          else {
7818                               const STRLEN unilen = reguni(pRExC_state, ender, s);
7819                               if (unilen > 0) {
7820                                    s   += unilen;
7821                                    len += unilen;
7822                               }
7823                          }
7824                     }
7825                     else {
7826                         len++;
7827                         REGC((char)ender, s++);
7828                     }
7829                     break;
7830                 }
7831                 if (UTF) {
7832                      if (FOLD) {
7833                           /* Emit all the Unicode characters. */
7834                           STRLEN numlen;
7835                           for (foldbuf = tmpbuf;
7836                                foldlen;
7837                                foldlen -= numlen) {
7838                                ender = utf8_to_uvchr(foldbuf, &numlen);
7839                                if (numlen > 0) {
7840                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
7841                                     len     += unilen;
7842                                     s       += unilen;
7843                                     /* In EBCDIC the numlen
7844                                      * and unilen can differ. */
7845                                     foldbuf += numlen;
7846                                     if (numlen >= foldlen)
7847                                          break;
7848                                }
7849                                else
7850                                     break;
7851                           }
7852                      }
7853                      else {
7854                           const STRLEN unilen = reguni(pRExC_state, ender, s);
7855                           if (unilen > 0) {
7856                                s   += unilen;
7857                                len += unilen;
7858                           }
7859                      }
7860                      len--;
7861                 }
7862                 else
7863                     REGC((char)ender, s++);
7864             }
7865         loopdone:
7866             RExC_parse = p - 1;
7867             Set_Node_Cur_Length(ret); /* MJD */
7868             nextchar(pRExC_state);
7869             {
7870                 /* len is STRLEN which is unsigned, need to copy to signed */
7871                 IV iv = len;
7872                 if (iv < 0)
7873                     vFAIL("Internal disaster");
7874             }
7875             if (len > 0)
7876                 *flagp |= HASWIDTH;
7877             if (len == 1 && UNI_IS_INVARIANT(ender))
7878                 *flagp |= SIMPLE;
7879
7880             if (SIZE_ONLY)
7881                 RExC_size += STR_SZ(len);
7882             else {
7883                 STR_LEN(ret) = len;
7884                 RExC_emit += STR_SZ(len);
7885             }
7886         }
7887         break;
7888     }
7889
7890     return(ret);
7891 }
7892
7893 STATIC char *
7894 S_regwhite( RExC_state_t *pRExC_state, char *p )
7895 {
7896     const char *e = RExC_end;
7897
7898     PERL_ARGS_ASSERT_REGWHITE;
7899
7900     while (p < e) {
7901         if (isSPACE(*p))
7902             ++p;
7903         else if (*p == '#') {
7904             bool ended = 0;
7905             do {
7906                 if (*p++ == '\n') {
7907                     ended = 1;
7908                     break;
7909                 }
7910             } while (p < e);
7911             if (!ended)
7912                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7913         }
7914         else
7915             break;
7916     }
7917     return p;
7918 }
7919
7920 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7921    Character classes ([:foo:]) can also be negated ([:^foo:]).
7922    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7923    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7924    but trigger failures because they are currently unimplemented. */
7925
7926 #define POSIXCC_DONE(c)   ((c) == ':')
7927 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7928 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7929
7930 STATIC I32
7931 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7932 {
7933     dVAR;
7934     I32 namedclass = OOB_NAMEDCLASS;
7935
7936     PERL_ARGS_ASSERT_REGPPOSIXCC;
7937
7938     if (value == '[' && RExC_parse + 1 < RExC_end &&
7939         /* I smell either [: or [= or [. -- POSIX has been here, right? */
7940         POSIXCC(UCHARAT(RExC_parse))) {
7941         const char c = UCHARAT(RExC_parse);
7942         char* const s = RExC_parse++;
7943
7944         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7945             RExC_parse++;
7946         if (RExC_parse == RExC_end)
7947             /* Grandfather lone [:, [=, [. */
7948             RExC_parse = s;
7949         else {
7950             const char* const t = RExC_parse++; /* skip over the c */
7951             assert(*t == c);
7952
7953             if (UCHARAT(RExC_parse) == ']') {
7954                 const char *posixcc = s + 1;
7955                 RExC_parse++; /* skip over the ending ] */
7956
7957                 if (*s == ':') {
7958                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7959                     const I32 skip = t - posixcc;
7960
7961                     /* Initially switch on the length of the name.  */
7962                     switch (skip) {
7963                     case 4:
7964                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7965                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7966                         break;
7967                     case 5:
7968                         /* Names all of length 5.  */
7969                         /* alnum alpha ascii blank cntrl digit graph lower
7970                            print punct space upper  */
7971                         /* Offset 4 gives the best switch position.  */
7972                         switch (posixcc[4]) {
7973                         case 'a':
7974                             if (memEQ(posixcc, "alph", 4)) /* alpha */
7975                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7976                             break;
7977                         case 'e':
7978                             if (memEQ(posixcc, "spac", 4)) /* space */
7979                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7980                             break;
7981                         case 'h':
7982                             if (memEQ(posixcc, "grap", 4)) /* graph */
7983                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7984                             break;
7985                         case 'i':
7986                             if (memEQ(posixcc, "asci", 4)) /* ascii */
7987                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7988                             break;
7989                         case 'k':
7990                             if (memEQ(posixcc, "blan", 4)) /* blank */
7991                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7992                             break;
7993                         case 'l':
7994                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7995                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7996                             break;
7997                         case 'm':
7998                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
7999                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
8000                             break;
8001                         case 'r':
8002                             if (memEQ(posixcc, "lowe", 4)) /* lower */
8003                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8004                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
8005                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8006                             break;
8007                         case 't':
8008                             if (memEQ(posixcc, "digi", 4)) /* digit */
8009                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8010                             else if (memEQ(posixcc, "prin", 4)) /* print */
8011                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8012                             else if (memEQ(posixcc, "punc", 4)) /* punct */
8013                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8014                             break;
8015                         }
8016                         break;
8017                     case 6:
8018                         if (memEQ(posixcc, "xdigit", 6))
8019                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8020                         break;
8021                     }
8022
8023                     if (namedclass == OOB_NAMEDCLASS)
8024                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8025                                       t - s - 1, s + 1);
8026                     assert (posixcc[skip] == ':');
8027                     assert (posixcc[skip+1] == ']');
8028                 } else if (!SIZE_ONLY) {
8029                     /* [[=foo=]] and [[.foo.]] are still future. */
8030
8031                     /* adjust RExC_parse so the warning shows after
8032                        the class closes */
8033                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8034                         RExC_parse++;
8035                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8036                 }
8037             } else {
8038                 /* Maternal grandfather:
8039                  * "[:" ending in ":" but not in ":]" */
8040                 RExC_parse = s;
8041             }
8042         }
8043     }
8044
8045     return namedclass;
8046 }
8047
8048 STATIC void
8049 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8050 {
8051     dVAR;
8052
8053     PERL_ARGS_ASSERT_CHECKPOSIXCC;
8054
8055     if (POSIXCC(UCHARAT(RExC_parse))) {
8056         const char *s = RExC_parse;
8057         const char  c = *s++;
8058
8059         while (isALNUM(*s))
8060             s++;
8061         if (*s && c == *s && s[1] == ']') {
8062             ckWARN3reg(s+2,
8063                        "POSIX syntax [%c %c] belongs inside character classes",
8064                        c, c);
8065
8066             /* [[=foo=]] and [[.foo.]] are still future. */
8067             if (POSIXCC_NOTYET(c)) {
8068                 /* adjust RExC_parse so the error shows after
8069                    the class closes */
8070                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8071                     NOOP;
8072                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8073             }
8074         }
8075     }
8076 }
8077
8078 /* No locale test */
8079 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
8080 ANYOF_##NAME:                                           \
8081         for (value = 0; value < 256; value++)           \
8082             if (TEST)                                   \
8083                 ANYOF_BITMAP_SET(ret, value);           \
8084     yesno = '+';                                        \
8085     what = WORD;                                        \
8086     break;                                              \
8087 case ANYOF_N##NAME:                                     \
8088         for (value = 0; value < 256; value++)           \
8089             if (!TEST)                                  \
8090                 ANYOF_BITMAP_SET(ret, value);           \
8091     yesno = '!';                                        \
8092     what = WORD;                                        \
8093     break
8094
8095 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8096  * there are two tests passed in, to use depending on that. There aren't any
8097  * cases where the label is different from the name, so no need for that
8098  * parameter */
8099 #define _C_C_T_(NAME,TEST_8,TEST_7,WORD)       \
8100 ANYOF_##NAME:                                           \
8101     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
8102     else if (UNI_SEMANTICS) {                           \
8103         for (value = 0; value < 256; value++) {         \
8104             if (TEST_8) ANYOF_BITMAP_SET(ret, value);   \
8105         }                                               \
8106     }                                                   \
8107     else {                                              \
8108         for (value = 0; value < 256; value++) {         \
8109             if (TEST_7) ANYOF_BITMAP_SET(ret, value);   \
8110         }                                               \
8111     }                                                   \
8112     yesno = '+';                                        \
8113     what = WORD;                                        \
8114     break;                                              \
8115 case ANYOF_N##NAME:                                     \
8116     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
8117     else if (UNI_SEMANTICS) {                           \
8118         for (value = 0; value < 256; value++) {         \
8119             if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
8120         }                                               \
8121     }                                                   \
8122     else {                                              \
8123         for (value = 0; value < 256; value++) {         \
8124             if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
8125         }                                               \
8126     }                                                   \
8127     yesno = '!';                                        \
8128     what = WORD;                                        \
8129     break
8130
8131 /*
8132    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8133    so that it is possible to override the option here without having to
8134    rebuild the entire core. as we are required to do if we change regcomp.h
8135    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8136 */
8137 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8138 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8139 #endif
8140
8141 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8142 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8143 #else
8144 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8145 #endif
8146
8147 /*
8148    parse a class specification and produce either an ANYOF node that
8149    matches the pattern or if the pattern matches a single char only and
8150    that char is < 256 and we are case insensitive then we produce an
8151    EXACT node instead.
8152 */
8153
8154 STATIC regnode *
8155 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8156 {
8157     dVAR;
8158     register UV nextvalue;
8159     register IV prevvalue = OOB_UNICODE;
8160     register IV range = 0;
8161     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8162     register regnode *ret;
8163     STRLEN numlen;
8164     IV namedclass;
8165     char *rangebegin = NULL;
8166     bool need_class = 0;
8167     SV *listsv = NULL;
8168     UV n;
8169     bool optimize_invert   = TRUE;
8170     AV* unicode_alternate  = NULL;
8171 #ifdef EBCDIC
8172     UV literal_endpoint = 0;
8173 #endif
8174     UV stored = 0;  /* 0, 1, or more than 1 chars stored in the class */
8175
8176     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8177         case we need to change the emitted regop to an EXACT. */
8178     const char * orig_parse = RExC_parse;
8179     GET_RE_DEBUG_FLAGS_DECL;
8180
8181     PERL_ARGS_ASSERT_REGCLASS;
8182 #ifndef DEBUGGING
8183     PERL_UNUSED_ARG(depth);
8184 #endif
8185
8186     DEBUG_PARSE("clas");
8187
8188     /* Assume we are going to generate an ANYOF node. */
8189     ret = reganode(pRExC_state, ANYOF, 0);
8190
8191     if (!SIZE_ONLY)
8192         ANYOF_FLAGS(ret) = 0;
8193
8194     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
8195         RExC_naughty++;
8196         RExC_parse++;
8197         if (!SIZE_ONLY)
8198             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8199     }
8200
8201     if (SIZE_ONLY) {
8202         RExC_size += ANYOF_SKIP;
8203         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8204     }
8205     else {
8206         RExC_emit += ANYOF_SKIP;
8207         if (FOLD)
8208             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8209         if (LOC)
8210             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8211         ANYOF_BITMAP_ZERO(ret);
8212         listsv = newSVpvs("# comment\n");
8213     }
8214
8215     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8216
8217     if (!SIZE_ONLY && POSIXCC(nextvalue))
8218         checkposixcc(pRExC_state);
8219
8220     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8221     if (UCHARAT(RExC_parse) == ']')
8222         goto charclassloop;
8223
8224 parseit:
8225     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8226
8227     charclassloop:
8228
8229         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8230
8231         if (!range)
8232             rangebegin = RExC_parse;
8233         if (UTF) {
8234             value = utf8n_to_uvchr((U8*)RExC_parse,
8235                                    RExC_end - RExC_parse,
8236                                    &numlen, UTF8_ALLOW_DEFAULT);
8237             RExC_parse += numlen;
8238         }
8239         else
8240             value = UCHARAT(RExC_parse++);
8241
8242         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8243         if (value == '[' && POSIXCC(nextvalue))
8244             namedclass = regpposixcc(pRExC_state, value);
8245         else if (value == '\\') {
8246             if (UTF) {
8247                 value = utf8n_to_uvchr((U8*)RExC_parse,
8248                                    RExC_end - RExC_parse,
8249                                    &numlen, UTF8_ALLOW_DEFAULT);
8250                 RExC_parse += numlen;
8251             }
8252             else
8253                 value = UCHARAT(RExC_parse++);
8254             /* Some compilers cannot handle switching on 64-bit integer
8255              * values, therefore value cannot be an UV.  Yes, this will
8256              * be a problem later if we want switch on Unicode.
8257              * A similar issue a little bit later when switching on
8258              * namedclass. --jhi */
8259             switch ((I32)value) {
8260             case 'w':   namedclass = ANYOF_ALNUM;       break;
8261             case 'W':   namedclass = ANYOF_NALNUM;      break;
8262             case 's':   namedclass = ANYOF_SPACE;       break;
8263             case 'S':   namedclass = ANYOF_NSPACE;      break;
8264             case 'd':   namedclass = ANYOF_DIGIT;       break;
8265             case 'D':   namedclass = ANYOF_NDIGIT;      break;
8266             case 'v':   namedclass = ANYOF_VERTWS;      break;
8267             case 'V':   namedclass = ANYOF_NVERTWS;     break;
8268             case 'h':   namedclass = ANYOF_HORIZWS;     break;
8269             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
8270             case 'N':  /* Handle \N{NAME} in class */
8271                 {
8272                     /* We only pay attention to the first char of
8273                     multichar strings being returned. I kinda wonder
8274                     if this makes sense as it does change the behaviour
8275                     from earlier versions, OTOH that behaviour was broken
8276                     as well. */
8277                     UV v; /* value is register so we cant & it /grrr */
8278                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8279                         goto parseit;
8280                     }
8281                     value= v;
8282                 }
8283                 break;
8284             case 'p':
8285             case 'P':
8286                 {
8287                 char *e;
8288                 if (RExC_parse >= RExC_end)
8289                     vFAIL2("Empty \\%c{}", (U8)value);
8290                 if (*RExC_parse == '{') {
8291                     const U8 c = (U8)value;
8292                     e = strchr(RExC_parse++, '}');
8293                     if (!e)
8294                         vFAIL2("Missing right brace on \\%c{}", c);
8295                     while (isSPACE(UCHARAT(RExC_parse)))
8296                         RExC_parse++;
8297                     if (e == RExC_parse)
8298                         vFAIL2("Empty \\%c{}", c);
8299                     n = e - RExC_parse;
8300                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8301                         n--;
8302                 }
8303                 else {
8304                     e = RExC_parse;
8305                     n = 1;
8306                 }
8307                 if (!SIZE_ONLY) {
8308                     if (UCHARAT(RExC_parse) == '^') {
8309                          RExC_parse++;
8310                          n--;
8311                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8312                          while (isSPACE(UCHARAT(RExC_parse))) {
8313                               RExC_parse++;
8314                               n--;
8315                          }
8316                     }
8317                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8318                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8319                 }
8320                 RExC_parse = e + 1;
8321                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8322                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8323                 }
8324                 break;
8325             case 'n':   value = '\n';                   break;
8326             case 'r':   value = '\r';                   break;
8327             case 't':   value = '\t';                   break;
8328             case 'f':   value = '\f';                   break;
8329             case 'b':   value = '\b';                   break;
8330             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8331             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8332             case 'o':
8333                 RExC_parse--;   /* function expects to be pointed at the 'o' */
8334                 {
8335                     const char* error_msg;
8336                     bool valid = grok_bslash_o(RExC_parse,
8337                                                &value,
8338                                                &numlen,
8339                                                &error_msg,
8340                                                SIZE_ONLY);
8341                     RExC_parse += numlen;
8342                     if (! valid) {
8343                         vFAIL(error_msg);
8344                     }
8345                 }
8346                 if (PL_encoding && value < 0x100) {
8347                     goto recode_encoding;
8348                 }
8349                 break;
8350             case 'x':
8351                 if (*RExC_parse == '{') {
8352                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8353                         | PERL_SCAN_DISALLOW_PREFIX;
8354                     char * const e = strchr(RExC_parse++, '}');
8355                     if (!e)
8356                         vFAIL("Missing right brace on \\x{}");
8357
8358                     numlen = e - RExC_parse;
8359                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8360                     RExC_parse = e + 1;
8361                 }
8362                 else {
8363                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8364                     numlen = 2;
8365                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8366                     RExC_parse += numlen;
8367                 }
8368                 if (PL_encoding && value < 0x100)
8369                     goto recode_encoding;
8370                 break;
8371             case 'c':
8372                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8373                 break;
8374             case '0': case '1': case '2': case '3': case '4':
8375             case '5': case '6': case '7':
8376                 {
8377                     /* Take 1-3 octal digits */
8378                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8379                     numlen = 3;
8380                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8381                     RExC_parse += numlen;
8382                     if (PL_encoding && value < 0x100)
8383                         goto recode_encoding;
8384                     break;
8385                 }
8386             recode_encoding:
8387                 {
8388                     SV* enc = PL_encoding;
8389                     value = reg_recode((const char)(U8)value, &enc);
8390                     if (!enc && SIZE_ONLY)
8391                         ckWARNreg(RExC_parse,
8392                                   "Invalid escape in the specified encoding");
8393                     break;
8394                 }
8395             default:
8396                 /* Allow \_ to not give an error */
8397                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8398                     ckWARN2reg(RExC_parse,
8399                                "Unrecognized escape \\%c in character class passed through",
8400                                (int)value);
8401                 }
8402                 break;
8403             }
8404         } /* end of \blah */
8405 #ifdef EBCDIC
8406         else
8407             literal_endpoint++;
8408 #endif
8409
8410         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8411
8412             /* What matches in a locale is not known until runtime, so need to
8413              * (one time per class) allocate extra space to pass to regexec.
8414              * The space will contain a bit for each named class that is to be
8415              * matched against.  This isn't needed for \p{} and pseudo-classes,
8416              * as they are not affected by locale, and hence are dealt with
8417              * separately */
8418             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8419                 need_class = 1;
8420                 if (SIZE_ONLY) {
8421                     RExC_size += ANYOF_CLASS_ADD_SKIP;
8422                 }
8423                 else {
8424                     RExC_emit += ANYOF_CLASS_ADD_SKIP;
8425                     ANYOF_CLASS_ZERO(ret);
8426                 }
8427                     ANYOF_FLAGS(ret) |= ANYOF_CLASS|ANYOF_LARGE;
8428             }
8429
8430             /* a bad range like a-\d, a-[:digit:] ? */
8431             if (range) {
8432                 if (!SIZE_ONLY) {
8433                     const int w =
8434                         RExC_parse >= rangebegin ?
8435                         RExC_parse - rangebegin : 0;
8436                     ckWARN4reg(RExC_parse,
8437                                "False [] range \"%*.*s\"",
8438                                w, w, rangebegin);
8439
8440                     if (prevvalue < 256) {
8441                         ANYOF_BITMAP_SET(ret, prevvalue);
8442                         ANYOF_BITMAP_SET(ret, '-');
8443                     }
8444                     else {
8445                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8446                         Perl_sv_catpvf(aTHX_ listsv,
8447                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8448                     }
8449                 }
8450
8451                 range = 0; /* this was not a true range */
8452             }
8453
8454
8455
8456             if (!SIZE_ONLY) {
8457                 const char *what = NULL;
8458                 char yesno = 0;
8459
8460                 if (namedclass > OOB_NAMEDCLASS)
8461                     optimize_invert = FALSE;
8462                 /* Possible truncation here but in some 64-bit environments
8463                  * the compiler gets heartburn about switch on 64-bit values.
8464                  * A similar issue a little earlier when switching on value.
8465                  * --jhi */
8466                 switch ((I32)namedclass) {
8467
8468                 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8469                 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8470                 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8471                 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8472                 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8473                 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8474                 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8475                 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8476                 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8477                 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
8478 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8479                 /* \s, \w match all unicode if utf8. */
8480                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8481                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8482 #else
8483                 /* \s, \w match ascii and locale only */
8484                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8485                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8486 #endif
8487                 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
8488                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8489                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8490                 case ANYOF_ASCII:
8491                     if (LOC)
8492                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8493                     else {
8494 #ifndef EBCDIC
8495                         for (value = 0; value < 128; value++)
8496                             ANYOF_BITMAP_SET(ret, value);
8497 #else  /* EBCDIC */
8498                         for (value = 0; value < 256; value++) {
8499                             if (isASCII(value))
8500                                 ANYOF_BITMAP_SET(ret, value);
8501                         }
8502 #endif /* EBCDIC */
8503                     }
8504                     yesno = '+';
8505                     what = "ASCII";
8506                     break;
8507                 case ANYOF_NASCII:
8508                     if (LOC)
8509                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8510                     else {
8511 #ifndef EBCDIC
8512                         for (value = 128; value < 256; value++)
8513                             ANYOF_BITMAP_SET(ret, value);
8514 #else  /* EBCDIC */
8515                         for (value = 0; value < 256; value++) {
8516                             if (!isASCII(value))
8517                                 ANYOF_BITMAP_SET(ret, value);
8518                         }
8519 #endif /* EBCDIC */
8520                     }
8521                     yesno = '!';
8522                     what = "ASCII";
8523                     break;
8524                 case ANYOF_DIGIT:
8525                     if (LOC)
8526                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8527                     else {
8528                         /* consecutive digits assumed */
8529                         for (value = '0'; value <= '9'; value++)
8530                             ANYOF_BITMAP_SET(ret, value);
8531                     }
8532                     yesno = '+';
8533                     what = POSIX_CC_UNI_NAME("Digit");
8534                     break;
8535                 case ANYOF_NDIGIT:
8536                     if (LOC)
8537                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8538                     else {
8539                         /* consecutive digits assumed */
8540                         for (value = 0; value < '0'; value++)
8541                             ANYOF_BITMAP_SET(ret, value);
8542                         for (value = '9' + 1; value < 256; value++)
8543                             ANYOF_BITMAP_SET(ret, value);
8544                     }
8545                     yesno = '!';
8546                     what = POSIX_CC_UNI_NAME("Digit");
8547                     break;
8548                 case ANYOF_MAX:
8549                     /* this is to handle \p and \P */
8550                     break;
8551                 default:
8552                     vFAIL("Invalid [::] class");
8553                     break;
8554                 }
8555                 if (what) {
8556                     /* Strings such as "+utf8::isWord\n" */
8557                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8558                 }
8559                 stored+=2; /* can't optimize this class */
8560                 continue;
8561             }
8562         } /* end of namedclass \blah */
8563
8564         if (range) {
8565             if (prevvalue > (IV)value) /* b-a */ {
8566                 const int w = RExC_parse - rangebegin;
8567                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8568                 range = 0; /* not a valid range */
8569             }
8570         }
8571         else {
8572             prevvalue = value; /* save the beginning of the range */
8573             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8574                 RExC_parse[1] != ']') {
8575                 RExC_parse++;
8576
8577                 /* a bad range like \w-, [:word:]- ? */
8578                 if (namedclass > OOB_NAMEDCLASS) {
8579                     if (ckWARN(WARN_REGEXP)) {
8580                         const int w =
8581                             RExC_parse >= rangebegin ?
8582                             RExC_parse - rangebegin : 0;
8583                         vWARN4(RExC_parse,
8584                                "False [] range \"%*.*s\"",
8585                                w, w, rangebegin);
8586                     }
8587                     if (!SIZE_ONLY)
8588                         ANYOF_BITMAP_SET(ret, '-');
8589                 } else
8590                     range = 1;  /* yeah, it's a range! */
8591                 continue;       /* but do it the next time */
8592             }
8593         }
8594
8595         /* now is the next time */
8596         /*stored += (value - prevvalue + 1);*/
8597         if (!SIZE_ONLY) {
8598             if (prevvalue < 256) {
8599                 const IV ceilvalue = value < 256 ? value : 255;
8600                 IV i;
8601 #ifdef EBCDIC
8602                 /* In EBCDIC [\x89-\x91] should include
8603                  * the \x8e but [i-j] should not. */
8604                 if (literal_endpoint == 2 &&
8605                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8606                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8607                 {
8608                     if (isLOWER(prevvalue)) {
8609                         for (i = prevvalue; i <= ceilvalue; i++)
8610                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8611                                 stored++;
8612                                 ANYOF_BITMAP_SET(ret, i);
8613                             }
8614                     } else {
8615                         for (i = prevvalue; i <= ceilvalue; i++)
8616                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8617                                 stored++;
8618                                 ANYOF_BITMAP_SET(ret, i);
8619                             }
8620                     }
8621                 }
8622                 else
8623 #endif
8624                       for (i = prevvalue; i <= ceilvalue; i++) {
8625                         if (!ANYOF_BITMAP_TEST(ret,i)) {
8626                             stored++;
8627                             ANYOF_BITMAP_SET(ret, i);
8628                         }
8629                       }
8630           }
8631           if (value > 255 || UTF) {
8632                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8633                 const UV natvalue      = NATIVE_TO_UNI(value);
8634                 stored+=2; /* can't optimize this class */
8635                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8636                 if (prevnatvalue < natvalue) { /* what about > ? */
8637                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8638                                    prevnatvalue, natvalue);
8639                 }
8640                 else if (prevnatvalue == natvalue) {
8641                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8642                     if (FOLD) {
8643                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8644                          STRLEN foldlen;
8645                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8646
8647 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8648                          if (RExC_precomp[0] == ':' &&
8649                              RExC_precomp[1] == '[' &&
8650                              (f == 0xDF || f == 0x92)) {
8651                              f = NATIVE_TO_UNI(f);
8652                         }
8653 #endif
8654                          /* If folding and foldable and a single
8655                           * character, insert also the folded version
8656                           * to the charclass. */
8657                          if (f != value) {
8658 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8659                              if ((RExC_precomp[0] == ':' &&
8660                                   RExC_precomp[1] == '[' &&
8661                                   (f == 0xA2 &&
8662                                    (value == 0xFB05 || value == 0xFB06))) ?
8663                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8664                                  foldlen == (STRLEN)UNISKIP(f) )
8665 #else
8666                               if (foldlen == (STRLEN)UNISKIP(f))
8667 #endif
8668                                   Perl_sv_catpvf(aTHX_ listsv,
8669                                                  "%04"UVxf"\n", f);
8670                               else {
8671                                   /* Any multicharacter foldings
8672                                    * require the following transform:
8673                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8674                                    * where E folds into "pq" and F folds
8675                                    * into "rst", all other characters
8676                                    * fold to single characters.  We save
8677                                    * away these multicharacter foldings,
8678                                    * to be later saved as part of the
8679                                    * additional "s" data. */
8680                                   SV *sv;
8681
8682                                   if (!unicode_alternate)
8683                                       unicode_alternate = newAV();
8684                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8685                                                      TRUE);
8686                                   av_push(unicode_alternate, sv);
8687                               }
8688                          }
8689
8690                          /* If folding and the value is one of the Greek
8691                           * sigmas insert a few more sigmas to make the
8692                           * folding rules of the sigmas to work right.
8693                           * Note that not all the possible combinations
8694                           * are handled here: some of them are handled
8695                           * by the standard folding rules, and some of
8696                           * them (literal or EXACTF cases) are handled
8697                           * during runtime in regexec.c:S_find_byclass(). */
8698                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8699                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8700                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8701                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8702                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8703                          }
8704                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8705                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8706                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8707                     }
8708                 }
8709             }
8710 #ifdef EBCDIC
8711             literal_endpoint = 0;
8712 #endif
8713         }
8714
8715         range = 0; /* this range (if it was one) is done now */
8716     }
8717
8718
8719
8720     if (SIZE_ONLY)
8721         return ret;
8722     /****** !SIZE_ONLY AFTER HERE *********/
8723
8724     if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8725         && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8726     ) {
8727         /* optimize single char class to an EXACT node
8728            but *only* when its not a UTF/high char  */
8729         const char * cur_parse= RExC_parse;
8730         RExC_emit = (regnode *)orig_emit;
8731         RExC_parse = (char *)orig_parse;
8732         ret = reg_node(pRExC_state,
8733                        (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8734         RExC_parse = (char *)cur_parse;
8735         *STRING(ret)= (char)value;
8736         STR_LEN(ret)= 1;
8737         RExC_emit += STR_SZ(1);
8738         SvREFCNT_dec(listsv);
8739         return ret;
8740     }
8741     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8742     if ( /* If the only flag is folding (plus possibly inversion). */
8743         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8744        ) {
8745         for (value = 0; value < 256; ++value) {
8746             if (ANYOF_BITMAP_TEST(ret, value)) {
8747                 UV fold = PL_fold[value];
8748
8749                 if (fold != value)
8750                     ANYOF_BITMAP_SET(ret, fold);
8751             }
8752         }
8753         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8754     }
8755
8756     /* optimize inverted simple patterns (e.g. [^a-z]) */
8757     if (optimize_invert &&
8758         /* If the only flag is inversion. */
8759         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8760         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8761             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8762         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8763     }
8764     {
8765         AV * const av = newAV();
8766         SV *rv;
8767         /* The 0th element stores the character class description
8768          * in its textual form: used later (regexec.c:Perl_regclass_swash())
8769          * to initialize the appropriate swash (which gets stored in
8770          * the 1st element), and also useful for dumping the regnode.
8771          * The 2nd element stores the multicharacter foldings,
8772          * used later (regexec.c:S_reginclass()). */
8773         av_store(av, 0, listsv);
8774         av_store(av, 1, NULL);
8775         av_store(av, 2, MUTABLE_SV(unicode_alternate));
8776         rv = newRV_noinc(MUTABLE_SV(av));
8777         n = add_data(pRExC_state, 1, "s");
8778         RExC_rxi->data->data[n] = (void*)rv;
8779         ARG_SET(ret, n);
8780     }
8781     return ret;
8782 }
8783 #undef _C_C_T_
8784
8785
8786 /* reg_skipcomment()
8787
8788    Absorbs an /x style # comments from the input stream.
8789    Returns true if there is more text remaining in the stream.
8790    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8791    terminates the pattern without including a newline.
8792
8793    Note its the callers responsibility to ensure that we are
8794    actually in /x mode
8795
8796 */
8797
8798 STATIC bool
8799 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8800 {
8801     bool ended = 0;
8802
8803     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8804
8805     while (RExC_parse < RExC_end)
8806         if (*RExC_parse++ == '\n') {
8807             ended = 1;
8808             break;
8809         }
8810     if (!ended) {
8811         /* we ran off the end of the pattern without ending
8812            the comment, so we have to add an \n when wrapping */
8813         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8814         return 0;
8815     } else
8816         return 1;
8817 }
8818
8819 /* nextchar()
8820
8821    Advances the parse position, and optionally absorbs
8822    "whitespace" from the inputstream.
8823
8824    Without /x "whitespace" means (?#...) style comments only,
8825    with /x this means (?#...) and # comments and whitespace proper.
8826
8827    Returns the RExC_parse point from BEFORE the scan occurs.
8828
8829    This is the /x friendly way of saying RExC_parse++.
8830 */
8831
8832 STATIC char*
8833 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8834 {
8835     char* const retval = RExC_parse++;
8836
8837     PERL_ARGS_ASSERT_NEXTCHAR;
8838
8839     for (;;) {
8840         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8841                 RExC_parse[2] == '#') {
8842             while (*RExC_parse != ')') {
8843                 if (RExC_parse == RExC_end)
8844                     FAIL("Sequence (?#... not terminated");
8845                 RExC_parse++;
8846             }
8847             RExC_parse++;
8848             continue;
8849         }
8850         if (RExC_flags & RXf_PMf_EXTENDED) {
8851             if (isSPACE(*RExC_parse)) {
8852                 RExC_parse++;
8853                 continue;
8854             }
8855             else if (*RExC_parse == '#') {
8856                 if ( reg_skipcomment( pRExC_state ) )
8857                     continue;
8858             }
8859         }
8860         return retval;
8861     }
8862 }
8863
8864 /*
8865 - reg_node - emit a node
8866 */
8867 STATIC regnode *                        /* Location. */
8868 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8869 {
8870     dVAR;
8871     register regnode *ptr;
8872     regnode * const ret = RExC_emit;
8873     GET_RE_DEBUG_FLAGS_DECL;
8874
8875     PERL_ARGS_ASSERT_REG_NODE;
8876
8877     if (SIZE_ONLY) {
8878         SIZE_ALIGN(RExC_size);
8879         RExC_size += 1;
8880         return(ret);
8881     }
8882     if (RExC_emit >= RExC_emit_bound)
8883         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8884
8885     NODE_ALIGN_FILL(ret);
8886     ptr = ret;
8887     FILL_ADVANCE_NODE(ptr, op);
8888     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8889 #ifdef RE_TRACK_PATTERN_OFFSETS
8890     if (RExC_offsets) {         /* MJD */
8891         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8892               "reg_node", __LINE__,
8893               PL_reg_name[op],
8894               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8895                 ? "Overwriting end of array!\n" : "OK",
8896               (UV)(RExC_emit - RExC_emit_start),
8897               (UV)(RExC_parse - RExC_start),
8898               (UV)RExC_offsets[0]));
8899         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8900     }
8901 #endif
8902     RExC_emit = ptr;
8903     return(ret);
8904 }
8905
8906 /*
8907 - reganode - emit a node with an argument
8908 */
8909 STATIC regnode *                        /* Location. */
8910 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8911 {
8912     dVAR;
8913     register regnode *ptr;
8914     regnode * const ret = RExC_emit;
8915     GET_RE_DEBUG_FLAGS_DECL;
8916
8917     PERL_ARGS_ASSERT_REGANODE;
8918
8919     if (SIZE_ONLY) {
8920         SIZE_ALIGN(RExC_size);
8921         RExC_size += 2;
8922         /*
8923            We can't do this:
8924
8925            assert(2==regarglen[op]+1);
8926
8927            Anything larger than this has to allocate the extra amount.
8928            If we changed this to be:
8929
8930            RExC_size += (1 + regarglen[op]);
8931
8932            then it wouldn't matter. Its not clear what side effect
8933            might come from that so its not done so far.
8934            -- dmq
8935         */
8936         return(ret);
8937     }
8938     if (RExC_emit >= RExC_emit_bound)
8939         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8940
8941     NODE_ALIGN_FILL(ret);
8942     ptr = ret;
8943     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8944     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8945 #ifdef RE_TRACK_PATTERN_OFFSETS
8946     if (RExC_offsets) {         /* MJD */
8947         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8948               "reganode",
8949               __LINE__,
8950               PL_reg_name[op],
8951               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8952               "Overwriting end of array!\n" : "OK",
8953               (UV)(RExC_emit - RExC_emit_start),
8954               (UV)(RExC_parse - RExC_start),
8955               (UV)RExC_offsets[0]));
8956         Set_Cur_Node_Offset;
8957     }
8958 #endif
8959     RExC_emit = ptr;
8960     return(ret);
8961 }
8962
8963 /*
8964 - reguni - emit (if appropriate) a Unicode character
8965 */
8966 STATIC STRLEN
8967 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8968 {
8969     dVAR;
8970
8971     PERL_ARGS_ASSERT_REGUNI;
8972
8973     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8974 }
8975
8976 /*
8977 - reginsert - insert an operator in front of already-emitted operand
8978 *
8979 * Means relocating the operand.
8980 */
8981 STATIC void
8982 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8983 {
8984     dVAR;
8985     register regnode *src;
8986     register regnode *dst;
8987     register regnode *place;
8988     const int offset = regarglen[(U8)op];
8989     const int size = NODE_STEP_REGNODE + offset;
8990     GET_RE_DEBUG_FLAGS_DECL;
8991
8992     PERL_ARGS_ASSERT_REGINSERT;
8993     PERL_UNUSED_ARG(depth);
8994 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8995     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8996     if (SIZE_ONLY) {
8997         RExC_size += size;
8998         return;
8999     }
9000
9001     src = RExC_emit;
9002     RExC_emit += size;
9003     dst = RExC_emit;
9004     if (RExC_open_parens) {
9005         int paren;
9006         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9007         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9008             if ( RExC_open_parens[paren] >= opnd ) {
9009                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9010                 RExC_open_parens[paren] += size;
9011             } else {
9012                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9013             }
9014             if ( RExC_close_parens[paren] >= opnd ) {
9015                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9016                 RExC_close_parens[paren] += size;
9017             } else {
9018                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9019             }
9020         }
9021     }
9022
9023     while (src > opnd) {
9024         StructCopy(--src, --dst, regnode);
9025 #ifdef RE_TRACK_PATTERN_OFFSETS
9026         if (RExC_offsets) {     /* MJD 20010112 */
9027             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9028                   "reg_insert",
9029                   __LINE__,
9030                   PL_reg_name[op],
9031                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9032                     ? "Overwriting end of array!\n" : "OK",
9033                   (UV)(src - RExC_emit_start),
9034                   (UV)(dst - RExC_emit_start),
9035                   (UV)RExC_offsets[0]));
9036             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9037             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9038         }
9039 #endif
9040     }
9041
9042
9043     place = opnd;               /* Op node, where operand used to be. */
9044 #ifdef RE_TRACK_PATTERN_OFFSETS
9045     if (RExC_offsets) {         /* MJD */
9046         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9047               "reginsert",
9048               __LINE__,
9049               PL_reg_name[op],
9050               (UV)(place - RExC_emit_start) > RExC_offsets[0]
9051               ? "Overwriting end of array!\n" : "OK",
9052               (UV)(place - RExC_emit_start),
9053               (UV)(RExC_parse - RExC_start),
9054               (UV)RExC_offsets[0]));
9055         Set_Node_Offset(place, RExC_parse);
9056         Set_Node_Length(place, 1);
9057     }
9058 #endif
9059     src = NEXTOPER(place);
9060     FILL_ADVANCE_NODE(place, op);
9061     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
9062     Zero(src, offset, regnode);
9063 }
9064
9065 /*
9066 - regtail - set the next-pointer at the end of a node chain of p to val.
9067 - SEE ALSO: regtail_study
9068 */
9069 /* TODO: All three parms should be const */
9070 STATIC void
9071 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9072 {
9073     dVAR;
9074     register regnode *scan;
9075     GET_RE_DEBUG_FLAGS_DECL;
9076
9077     PERL_ARGS_ASSERT_REGTAIL;
9078 #ifndef DEBUGGING
9079     PERL_UNUSED_ARG(depth);
9080 #endif
9081
9082     if (SIZE_ONLY)
9083         return;
9084
9085     /* Find last node. */
9086     scan = p;
9087     for (;;) {
9088         regnode * const temp = regnext(scan);
9089         DEBUG_PARSE_r({
9090             SV * const mysv=sv_newmortal();
9091             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9092             regprop(RExC_rx, mysv, scan);
9093             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9094                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9095                     (temp == NULL ? "->" : ""),
9096                     (temp == NULL ? PL_reg_name[OP(val)] : "")
9097             );
9098         });
9099         if (temp == NULL)
9100             break;
9101         scan = temp;
9102     }
9103
9104     if (reg_off_by_arg[OP(scan)]) {
9105         ARG_SET(scan, val - scan);
9106     }
9107     else {
9108         NEXT_OFF(scan) = val - scan;
9109     }
9110 }
9111
9112 #ifdef DEBUGGING
9113 /*
9114 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9115 - Look for optimizable sequences at the same time.
9116 - currently only looks for EXACT chains.
9117
9118 This is expermental code. The idea is to use this routine to perform
9119 in place optimizations on branches and groups as they are constructed,
9120 with the long term intention of removing optimization from study_chunk so
9121 that it is purely analytical.
9122
9123 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9124 to control which is which.
9125
9126 */
9127 /* TODO: All four parms should be const */
9128
9129 STATIC U8
9130 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9131 {
9132     dVAR;
9133     register regnode *scan;
9134     U8 exact = PSEUDO;
9135 #ifdef EXPERIMENTAL_INPLACESCAN
9136     I32 min = 0;
9137 #endif
9138     GET_RE_DEBUG_FLAGS_DECL;
9139
9140     PERL_ARGS_ASSERT_REGTAIL_STUDY;
9141
9142
9143     if (SIZE_ONLY)
9144         return exact;
9145
9146     /* Find last node. */
9147
9148     scan = p;
9149     for (;;) {
9150         regnode * const temp = regnext(scan);
9151 #ifdef EXPERIMENTAL_INPLACESCAN
9152         if (PL_regkind[OP(scan)] == EXACT)
9153             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9154                 return EXACT;
9155 #endif
9156         if ( exact ) {
9157             switch (OP(scan)) {
9158                 case EXACT:
9159                 case EXACTF:
9160                 case EXACTFL:
9161                         if( exact == PSEUDO )
9162                             exact= OP(scan);
9163                         else if ( exact != OP(scan) )
9164                             exact= 0;
9165                 case NOTHING:
9166                     break;
9167                 default:
9168                     exact= 0;
9169             }
9170         }
9171         DEBUG_PARSE_r({
9172             SV * const mysv=sv_newmortal();
9173             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9174             regprop(RExC_rx, mysv, scan);
9175             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9176                 SvPV_nolen_const(mysv),
9177                 REG_NODE_NUM(scan),
9178                 PL_reg_name[exact]);
9179         });
9180         if (temp == NULL)
9181             break;
9182         scan = temp;
9183     }
9184     DEBUG_PARSE_r({
9185         SV * const mysv_val=sv_newmortal();
9186         DEBUG_PARSE_MSG("");
9187         regprop(RExC_rx, mysv_val, val);
9188         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9189                       SvPV_nolen_const(mysv_val),
9190                       (IV)REG_NODE_NUM(val),
9191                       (IV)(val - scan)
9192         );
9193     });
9194     if (reg_off_by_arg[OP(scan)]) {
9195         ARG_SET(scan, val - scan);
9196     }
9197     else {
9198         NEXT_OFF(scan) = val - scan;
9199     }
9200
9201     return exact;
9202 }
9203 #endif
9204
9205 /*
9206  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9207  */
9208 #ifdef DEBUGGING
9209 static void
9210 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9211 {
9212     int bit;
9213     int set=0;
9214
9215     for (bit=0; bit<32; bit++) {
9216         if (flags & (1<<bit)) {
9217             if (!set++ && lead)
9218                 PerlIO_printf(Perl_debug_log, "%s",lead);
9219             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9220         }
9221     }
9222     if (lead)  {
9223         if (set)
9224             PerlIO_printf(Perl_debug_log, "\n");
9225         else
9226             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9227     }
9228 }
9229 #endif
9230
9231 void
9232 Perl_regdump(pTHX_ const regexp *r)
9233 {
9234 #ifdef DEBUGGING
9235     dVAR;
9236     SV * const sv = sv_newmortal();
9237     SV *dsv= sv_newmortal();
9238     RXi_GET_DECL(r,ri);
9239     GET_RE_DEBUG_FLAGS_DECL;
9240
9241     PERL_ARGS_ASSERT_REGDUMP;
9242
9243     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9244
9245     /* Header fields of interest. */
9246     if (r->anchored_substr) {
9247         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9248             RE_SV_DUMPLEN(r->anchored_substr), 30);
9249         PerlIO_printf(Perl_debug_log,
9250                       "anchored %s%s at %"IVdf" ",
9251                       s, RE_SV_TAIL(r->anchored_substr),
9252                       (IV)r->anchored_offset);
9253     } else if (r->anchored_utf8) {
9254         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9255             RE_SV_DUMPLEN(r->anchored_utf8), 30);
9256         PerlIO_printf(Perl_debug_log,
9257                       "anchored utf8 %s%s at %"IVdf" ",
9258                       s, RE_SV_TAIL(r->anchored_utf8),
9259                       (IV)r->anchored_offset);
9260     }
9261     if (r->float_substr) {
9262         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9263             RE_SV_DUMPLEN(r->float_substr), 30);
9264         PerlIO_printf(Perl_debug_log,
9265                       "floating %s%s at %"IVdf"..%"UVuf" ",
9266                       s, RE_SV_TAIL(r->float_substr),
9267                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9268     } else if (r->float_utf8) {
9269         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9270             RE_SV_DUMPLEN(r->float_utf8), 30);
9271         PerlIO_printf(Perl_debug_log,
9272                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9273                       s, RE_SV_TAIL(r->float_utf8),
9274                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9275     }
9276     if (r->check_substr || r->check_utf8)
9277         PerlIO_printf(Perl_debug_log,
9278                       (const char *)
9279                       (r->check_substr == r->float_substr
9280                        && r->check_utf8 == r->float_utf8
9281                        ? "(checking floating" : "(checking anchored"));
9282     if (r->extflags & RXf_NOSCAN)
9283         PerlIO_printf(Perl_debug_log, " noscan");
9284     if (r->extflags & RXf_CHECK_ALL)
9285         PerlIO_printf(Perl_debug_log, " isall");
9286     if (r->check_substr || r->check_utf8)
9287         PerlIO_printf(Perl_debug_log, ") ");
9288
9289     if (ri->regstclass) {
9290         regprop(r, sv, ri->regstclass);
9291         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9292     }
9293     if (r->extflags & RXf_ANCH) {
9294         PerlIO_printf(Perl_debug_log, "anchored");
9295         if (r->extflags & RXf_ANCH_BOL)
9296             PerlIO_printf(Perl_debug_log, "(BOL)");
9297         if (r->extflags & RXf_ANCH_MBOL)
9298             PerlIO_printf(Perl_debug_log, "(MBOL)");
9299         if (r->extflags & RXf_ANCH_SBOL)
9300             PerlIO_printf(Perl_debug_log, "(SBOL)");
9301         if (r->extflags & RXf_ANCH_GPOS)
9302             PerlIO_printf(Perl_debug_log, "(GPOS)");
9303         PerlIO_putc(Perl_debug_log, ' ');
9304     }
9305     if (r->extflags & RXf_GPOS_SEEN)
9306         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9307     if (r->intflags & PREGf_SKIP)
9308         PerlIO_printf(Perl_debug_log, "plus ");
9309     if (r->intflags & PREGf_IMPLICIT)
9310         PerlIO_printf(Perl_debug_log, "implicit ");
9311     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9312     if (r->extflags & RXf_EVAL_SEEN)
9313         PerlIO_printf(Perl_debug_log, "with eval ");
9314     PerlIO_printf(Perl_debug_log, "\n");
9315     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9316 #else
9317     PERL_ARGS_ASSERT_REGDUMP;
9318     PERL_UNUSED_CONTEXT;
9319     PERL_UNUSED_ARG(r);
9320 #endif  /* DEBUGGING */
9321 }
9322
9323 /*
9324 - regprop - printable representation of opcode
9325 */
9326 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9327 STMT_START { \
9328         if (do_sep) {                           \
9329             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9330             if (flags & ANYOF_INVERT)           \
9331                 /*make sure the invert info is in each */ \
9332                 sv_catpvs(sv, "^");             \
9333             do_sep = 0;                         \
9334         }                                       \
9335 } STMT_END
9336
9337 void
9338 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9339 {
9340 #ifdef DEBUGGING
9341     dVAR;
9342     register int k;
9343     RXi_GET_DECL(prog,progi);
9344     GET_RE_DEBUG_FLAGS_DECL;
9345
9346     PERL_ARGS_ASSERT_REGPROP;
9347
9348     sv_setpvs(sv, "");
9349
9350     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9351         /* It would be nice to FAIL() here, but this may be called from
9352            regexec.c, and it would be hard to supply pRExC_state. */
9353         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9354     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9355
9356     k = PL_regkind[OP(o)];
9357
9358     if (k == EXACT) {
9359         sv_catpvs(sv, " ");
9360         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9361          * is a crude hack but it may be the best for now since
9362          * we have no flag "this EXACTish node was UTF-8"
9363          * --jhi */
9364         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9365                   PERL_PV_ESCAPE_UNI_DETECT |
9366                   PERL_PV_PRETTY_ELLIPSES   |
9367                   PERL_PV_PRETTY_LTGT       |
9368                   PERL_PV_PRETTY_NOCLEAR
9369                   );
9370     } else if (k == TRIE) {
9371         /* print the details of the trie in dumpuntil instead, as
9372          * progi->data isn't available here */
9373         const char op = OP(o);
9374         const U32 n = ARG(o);
9375         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9376                (reg_ac_data *)progi->data->data[n] :
9377                NULL;
9378         const reg_trie_data * const trie
9379             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9380
9381         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9382         DEBUG_TRIE_COMPILE_r(
9383             Perl_sv_catpvf(aTHX_ sv,
9384                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9385                 (UV)trie->startstate,
9386                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9387                 (UV)trie->wordcount,
9388                 (UV)trie->minlen,
9389                 (UV)trie->maxlen,
9390                 (UV)TRIE_CHARCOUNT(trie),
9391                 (UV)trie->uniquecharcount
9392             )
9393         );
9394         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9395             int i;
9396             int rangestart = -1;
9397             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9398             sv_catpvs(sv, "[");
9399             for (i = 0; i <= 256; i++) {
9400                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9401                     if (rangestart == -1)
9402                         rangestart = i;
9403                 } else if (rangestart != -1) {
9404                     if (i <= rangestart + 3)
9405                         for (; rangestart < i; rangestart++)
9406                             put_byte(sv, rangestart);
9407                     else {
9408                         put_byte(sv, rangestart);
9409                         sv_catpvs(sv, "-");
9410                         put_byte(sv, i - 1);
9411                     }
9412                     rangestart = -1;
9413                 }
9414             }
9415             sv_catpvs(sv, "]");
9416         }
9417
9418     } else if (k == CURLY) {
9419         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9420             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9421         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9422     }
9423     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9424         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9425     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9426         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9427         if ( RXp_PAREN_NAMES(prog) ) {
9428             if ( k != REF || OP(o) < NREF) {
9429                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9430                 SV **name= av_fetch(list, ARG(o), 0 );
9431                 if (name)
9432                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9433             }
9434             else {
9435                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9436                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9437                 I32 *nums=(I32*)SvPVX(sv_dat);
9438                 SV **name= av_fetch(list, nums[0], 0 );
9439                 I32 n;
9440                 if (name) {
9441                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9442                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9443                                     (n ? "," : ""), (IV)nums[n]);
9444                     }
9445                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9446                 }
9447             }
9448         }
9449     } else if (k == GOSUB)
9450         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9451     else if (k == VERB) {
9452         if (!o->flags)
9453             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9454                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9455     } else if (k == LOGICAL)
9456         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9457     else if (k == FOLDCHAR)
9458         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9459     else if (k == ANYOF) {
9460         int i, rangestart = -1;
9461         const U8 flags = ANYOF_FLAGS(o);
9462         int do_sep = 0;
9463
9464         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9465         static const char * const anyofs[] = {
9466             "\\w",
9467             "\\W",
9468             "\\s",
9469             "\\S",
9470             "\\d",
9471             "\\D",
9472             "[:alnum:]",
9473             "[:^alnum:]",
9474             "[:alpha:]",
9475             "[:^alpha:]",
9476             "[:ascii:]",
9477             "[:^ascii:]",
9478             "[:cntrl:]",
9479             "[:^cntrl:]",
9480             "[:graph:]",
9481             "[:^graph:]",
9482             "[:lower:]",
9483             "[:^lower:]",
9484             "[:print:]",
9485             "[:^print:]",
9486             "[:punct:]",
9487             "[:^punct:]",
9488             "[:upper:]",
9489             "[:^upper:]",
9490             "[:xdigit:]",
9491             "[:^xdigit:]",
9492             "[:space:]",
9493             "[:^space:]",
9494             "[:blank:]",
9495             "[:^blank:]"
9496         };
9497
9498         if (flags & ANYOF_LOCALE)
9499             sv_catpvs(sv, "{loc}");
9500         if (flags & ANYOF_FOLD)
9501             sv_catpvs(sv, "{i}");
9502         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9503         if (flags & ANYOF_INVERT)
9504             sv_catpvs(sv, "^");
9505
9506         /* output what the standard cp 0-255 bitmap matches */
9507         for (i = 0; i <= 256; i++) {
9508             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9509                 if (rangestart == -1)
9510                     rangestart = i;
9511             } else if (rangestart != -1) {
9512                 if (i <= rangestart + 3)
9513                     for (; rangestart < i; rangestart++)
9514                         put_byte(sv, rangestart);
9515                 else {
9516                     put_byte(sv, rangestart);
9517                     sv_catpvs(sv, "-");
9518                     put_byte(sv, i - 1);
9519                 }
9520                 do_sep = 1;
9521                 rangestart = -1;
9522             }
9523         }
9524
9525         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9526         /* output any special charclass tests (used mostly under use locale) */
9527         if (o->flags & ANYOF_CLASS)
9528             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9529                 if (ANYOF_CLASS_TEST(o,i)) {
9530                     sv_catpv(sv, anyofs[i]);
9531                     do_sep = 1;
9532                 }
9533
9534         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9535
9536         /* output information about the unicode matching */
9537         if (flags & ANYOF_UNICODE)
9538             sv_catpvs(sv, "{unicode}");
9539         else if (flags & ANYOF_UNICODE_ALL)
9540             sv_catpvs(sv, "{unicode_all}");
9541
9542         {
9543             SV *lv;
9544             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9545
9546             if (lv) {
9547                 if (sw) {
9548                     U8 s[UTF8_MAXBYTES_CASE+1];
9549
9550                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9551                         uvchr_to_utf8(s, i);
9552
9553                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9554                             if (rangestart == -1)
9555                                 rangestart = i;
9556                         } else if (rangestart != -1) {
9557                             if (i <= rangestart + 3)
9558                                 for (; rangestart < i; rangestart++) {
9559                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9560                                     U8 *p;
9561                                     for(p = s; p < e; p++)
9562                                         put_byte(sv, *p);
9563                                 }
9564                             else {
9565                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9566                                 U8 *p;
9567                                 for (p = s; p < e; p++)
9568                                     put_byte(sv, *p);
9569                                 sv_catpvs(sv, "-");
9570                                 e = uvchr_to_utf8(s, i-1);
9571                                 for (p = s; p < e; p++)
9572                                     put_byte(sv, *p);
9573                                 }
9574                                 rangestart = -1;
9575                             }
9576                         }
9577
9578                     sv_catpvs(sv, "..."); /* et cetera */
9579                 }
9580
9581                 {
9582                     char *s = savesvpv(lv);
9583                     char * const origs = s;
9584
9585                     while (*s && *s != '\n')
9586                         s++;
9587
9588                     if (*s == '\n') {
9589                         const char * const t = ++s;
9590
9591                         while (*s) {
9592                             if (*s == '\n')
9593                                 *s = ' ';
9594                             s++;
9595                         }
9596                         if (s[-1] == ' ')
9597                             s[-1] = 0;
9598
9599                         sv_catpv(sv, t);
9600                     }
9601
9602                     Safefree(origs);
9603                 }
9604             }
9605         }
9606
9607         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9608     }
9609     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9610         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9611 #else
9612     PERL_UNUSED_CONTEXT;
9613     PERL_UNUSED_ARG(sv);
9614     PERL_UNUSED_ARG(o);
9615     PERL_UNUSED_ARG(prog);
9616 #endif  /* DEBUGGING */
9617 }
9618
9619 SV *
9620 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9621 {                               /* Assume that RE_INTUIT is set */
9622     dVAR;
9623     struct regexp *const prog = (struct regexp *)SvANY(r);
9624     GET_RE_DEBUG_FLAGS_DECL;
9625
9626     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9627     PERL_UNUSED_CONTEXT;
9628
9629     DEBUG_COMPILE_r(
9630         {
9631             const char * const s = SvPV_nolen_const(prog->check_substr
9632                       ? prog->check_substr : prog->check_utf8);
9633
9634             if (!PL_colorset) reginitcolors();
9635             PerlIO_printf(Perl_debug_log,
9636                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9637                       PL_colors[4],
9638                       prog->check_substr ? "" : "utf8 ",
9639                       PL_colors[5],PL_colors[0],
9640                       s,
9641                       PL_colors[1],
9642                       (strlen(s) > 60 ? "..." : ""));
9643         } );
9644
9645     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9646 }
9647
9648 /*
9649    pregfree()
9650
9651    handles refcounting and freeing the perl core regexp structure. When
9652    it is necessary to actually free the structure the first thing it
9653    does is call the 'free' method of the regexp_engine associated to
9654    the regexp, allowing the handling of the void *pprivate; member
9655    first. (This routine is not overridable by extensions, which is why
9656    the extensions free is called first.)
9657
9658    See regdupe and regdupe_internal if you change anything here.
9659 */
9660 #ifndef PERL_IN_XSUB_RE
9661 void
9662 Perl_pregfree(pTHX_ REGEXP *r)
9663 {
9664     SvREFCNT_dec(r);
9665 }
9666
9667 void
9668 Perl_pregfree2(pTHX_ REGEXP *rx)
9669 {
9670     dVAR;
9671     struct regexp *const r = (struct regexp *)SvANY(rx);
9672     GET_RE_DEBUG_FLAGS_DECL;
9673
9674     PERL_ARGS_ASSERT_PREGFREE2;
9675
9676     if (r->mother_re) {
9677         ReREFCNT_dec(r->mother_re);
9678     } else {
9679         CALLREGFREE_PVT(rx); /* free the private data */
9680         SvREFCNT_dec(RXp_PAREN_NAMES(r));
9681     }
9682     if (r->substrs) {
9683         SvREFCNT_dec(r->anchored_substr);
9684         SvREFCNT_dec(r->anchored_utf8);
9685         SvREFCNT_dec(r->float_substr);
9686         SvREFCNT_dec(r->float_utf8);
9687         Safefree(r->substrs);
9688     }
9689     RX_MATCH_COPY_FREE(rx);
9690 #ifdef PERL_OLD_COPY_ON_WRITE
9691     SvREFCNT_dec(r->saved_copy);
9692 #endif
9693     Safefree(r->offs);
9694 }
9695
9696 /*  reg_temp_copy()
9697
9698     This is a hacky workaround to the structural issue of match results
9699     being stored in the regexp structure which is in turn stored in
9700     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9701     could be PL_curpm in multiple contexts, and could require multiple
9702     result sets being associated with the pattern simultaneously, such
9703     as when doing a recursive match with (??{$qr})
9704
9705     The solution is to make a lightweight copy of the regexp structure
9706     when a qr// is returned from the code executed by (??{$qr}) this
9707     lightweight copy doesnt actually own any of its data except for
9708     the starp/end and the actual regexp structure itself.
9709
9710 */
9711
9712
9713 REGEXP *
9714 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9715 {
9716     struct regexp *ret;
9717     struct regexp *const r = (struct regexp *)SvANY(rx);
9718     register const I32 npar = r->nparens+1;
9719
9720     PERL_ARGS_ASSERT_REG_TEMP_COPY;
9721
9722     if (!ret_x)
9723         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9724     ret = (struct regexp *)SvANY(ret_x);
9725
9726     (void)ReREFCNT_inc(rx);
9727     /* We can take advantage of the existing "copied buffer" mechanism in SVs
9728        by pointing directly at the buffer, but flagging that the allocated
9729        space in the copy is zero. As we've just done a struct copy, it's now
9730        a case of zero-ing that, rather than copying the current length.  */
9731     SvPV_set(ret_x, RX_WRAPPED(rx));
9732     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9733     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9734            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9735     SvLEN_set(ret_x, 0);
9736     SvSTASH_set(ret_x, NULL);
9737     SvMAGIC_set(ret_x, NULL);
9738     Newx(ret->offs, npar, regexp_paren_pair);
9739     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9740     if (r->substrs) {
9741         Newx(ret->substrs, 1, struct reg_substr_data);
9742         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9743
9744         SvREFCNT_inc_void(ret->anchored_substr);
9745         SvREFCNT_inc_void(ret->anchored_utf8);
9746         SvREFCNT_inc_void(ret->float_substr);
9747         SvREFCNT_inc_void(ret->float_utf8);
9748
9749         /* check_substr and check_utf8, if non-NULL, point to either their
9750            anchored or float namesakes, and don't hold a second reference.  */
9751     }
9752     RX_MATCH_COPIED_off(ret_x);
9753 #ifdef PERL_OLD_COPY_ON_WRITE
9754     ret->saved_copy = NULL;
9755 #endif
9756     ret->mother_re = rx;
9757
9758     return ret_x;
9759 }
9760 #endif
9761
9762 /* regfree_internal()
9763
9764    Free the private data in a regexp. This is overloadable by
9765    extensions. Perl takes care of the regexp structure in pregfree(),
9766    this covers the *pprivate pointer which technically perl doesn't
9767    know about, however of course we have to handle the
9768    regexp_internal structure when no extension is in use.
9769
9770    Note this is called before freeing anything in the regexp
9771    structure.
9772  */
9773
9774 void
9775 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9776 {
9777     dVAR;
9778     struct regexp *const r = (struct regexp *)SvANY(rx);
9779     RXi_GET_DECL(r,ri);
9780     GET_RE_DEBUG_FLAGS_DECL;
9781
9782     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9783
9784     DEBUG_COMPILE_r({
9785         if (!PL_colorset)
9786             reginitcolors();
9787         {
9788             SV *dsv= sv_newmortal();
9789             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9790                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9791             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9792                 PL_colors[4],PL_colors[5],s);
9793         }
9794     });
9795 #ifdef RE_TRACK_PATTERN_OFFSETS
9796     if (ri->u.offsets)
9797         Safefree(ri->u.offsets);             /* 20010421 MJD */
9798 #endif
9799     if (ri->data) {
9800         int n = ri->data->count;
9801         PAD* new_comppad = NULL;
9802         PAD* old_comppad;
9803         PADOFFSET refcnt;
9804
9805         while (--n >= 0) {
9806           /* If you add a ->what type here, update the comment in regcomp.h */
9807             switch (ri->data->what[n]) {
9808             case 'a':
9809             case 's':
9810             case 'S':
9811             case 'u':
9812                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9813                 break;
9814             case 'f':
9815                 Safefree(ri->data->data[n]);
9816                 break;
9817             case 'p':
9818                 new_comppad = MUTABLE_AV(ri->data->data[n]);
9819                 break;
9820             case 'o':
9821                 if (new_comppad == NULL)
9822                     Perl_croak(aTHX_ "panic: pregfree comppad");
9823                 PAD_SAVE_LOCAL(old_comppad,
9824                     /* Watch out for global destruction's random ordering. */
9825                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9826                 );
9827                 OP_REFCNT_LOCK;
9828                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9829                 OP_REFCNT_UNLOCK;
9830                 if (!refcnt)
9831                     op_free((OP_4tree*)ri->data->data[n]);
9832
9833                 PAD_RESTORE_LOCAL(old_comppad);
9834                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9835                 new_comppad = NULL;
9836                 break;
9837             case 'n':
9838                 break;
9839             case 'T':
9840                 { /* Aho Corasick add-on structure for a trie node.
9841                      Used in stclass optimization only */
9842                     U32 refcount;
9843                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9844                     OP_REFCNT_LOCK;
9845                     refcount = --aho->refcount;
9846                     OP_REFCNT_UNLOCK;
9847                     if ( !refcount ) {
9848                         PerlMemShared_free(aho->states);
9849                         PerlMemShared_free(aho->fail);
9850                          /* do this last!!!! */
9851                         PerlMemShared_free(ri->data->data[n]);
9852                         PerlMemShared_free(ri->regstclass);
9853                     }
9854                 }
9855                 break;
9856             case 't':
9857                 {
9858                     /* trie structure. */
9859                     U32 refcount;
9860                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9861                     OP_REFCNT_LOCK;
9862                     refcount = --trie->refcount;
9863                     OP_REFCNT_UNLOCK;
9864                     if ( !refcount ) {
9865                         PerlMemShared_free(trie->charmap);
9866                         PerlMemShared_free(trie->states);
9867                         PerlMemShared_free(trie->trans);
9868                         if (trie->bitmap)
9869                             PerlMemShared_free(trie->bitmap);
9870                         if (trie->jump)
9871                             PerlMemShared_free(trie->jump);
9872                         PerlMemShared_free(trie->wordinfo);
9873                         /* do this last!!!! */
9874                         PerlMemShared_free(ri->data->data[n]);
9875                     }
9876                 }
9877                 break;
9878             default:
9879                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9880             }
9881         }
9882         Safefree(ri->data->what);
9883         Safefree(ri->data);
9884     }
9885
9886     Safefree(ri);
9887 }
9888
9889 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
9890 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
9891 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
9892
9893 /*
9894    re_dup - duplicate a regexp.
9895
9896    This routine is expected to clone a given regexp structure. It is only
9897    compiled under USE_ITHREADS.
9898
9899    After all of the core data stored in struct regexp is duplicated
9900    the regexp_engine.dupe method is used to copy any private data
9901    stored in the *pprivate pointer. This allows extensions to handle
9902    any duplication it needs to do.
9903
9904    See pregfree() and regfree_internal() if you change anything here.
9905 */
9906 #if defined(USE_ITHREADS)
9907 #ifndef PERL_IN_XSUB_RE
9908 void
9909 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9910 {
9911     dVAR;
9912     I32 npar;
9913     const struct regexp *r = (const struct regexp *)SvANY(sstr);
9914     struct regexp *ret = (struct regexp *)SvANY(dstr);
9915
9916     PERL_ARGS_ASSERT_RE_DUP_GUTS;
9917
9918     npar = r->nparens+1;
9919     Newx(ret->offs, npar, regexp_paren_pair);
9920     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9921     if(ret->swap) {
9922         /* no need to copy these */
9923         Newx(ret->swap, npar, regexp_paren_pair);
9924     }
9925
9926     if (ret->substrs) {
9927         /* Do it this way to avoid reading from *r after the StructCopy().
9928            That way, if any of the sv_dup_inc()s dislodge *r from the L1
9929            cache, it doesn't matter.  */
9930         const bool anchored = r->check_substr
9931             ? r->check_substr == r->anchored_substr
9932             : r->check_utf8 == r->anchored_utf8;
9933         Newx(ret->substrs, 1, struct reg_substr_data);
9934         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9935
9936         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9937         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9938         ret->float_substr = sv_dup_inc(ret->float_substr, param);
9939         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9940
9941         /* check_substr and check_utf8, if non-NULL, point to either their
9942            anchored or float namesakes, and don't hold a second reference.  */
9943
9944         if (ret->check_substr) {
9945             if (anchored) {
9946                 assert(r->check_utf8 == r->anchored_utf8);
9947                 ret->check_substr = ret->anchored_substr;
9948                 ret->check_utf8 = ret->anchored_utf8;
9949             } else {
9950                 assert(r->check_substr == r->float_substr);
9951                 assert(r->check_utf8 == r->float_utf8);
9952                 ret->check_substr = ret->float_substr;
9953                 ret->check_utf8 = ret->float_utf8;
9954             }
9955         } else if (ret->check_utf8) {
9956             if (anchored) {
9957                 ret->check_utf8 = ret->anchored_utf8;
9958             } else {
9959                 ret->check_utf8 = ret->float_utf8;
9960             }
9961         }
9962     }
9963
9964     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9965
9966     if (ret->pprivate)
9967         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9968
9969     if (RX_MATCH_COPIED(dstr))
9970         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
9971     else
9972         ret->subbeg = NULL;
9973 #ifdef PERL_OLD_COPY_ON_WRITE
9974     ret->saved_copy = NULL;
9975 #endif
9976
9977     if (ret->mother_re) {
9978         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9979             /* Our storage points directly to our mother regexp, but that's
9980                1: a buffer in a different thread
9981                2: something we no longer hold a reference on
9982                so we need to copy it locally.  */
9983             /* Note we need to sue SvCUR() on our mother_re, because it, in
9984                turn, may well be pointing to its own mother_re.  */
9985             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9986                                    SvCUR(ret->mother_re)+1));
9987             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9988         }
9989         ret->mother_re      = NULL;
9990     }
9991     ret->gofs = 0;
9992 }
9993 #endif /* PERL_IN_XSUB_RE */
9994
9995 /*
9996    regdupe_internal()
9997
9998    This is the internal complement to regdupe() which is used to copy
9999    the structure pointed to by the *pprivate pointer in the regexp.
10000    This is the core version of the extension overridable cloning hook.
10001    The regexp structure being duplicated will be copied by perl prior
10002    to this and will be provided as the regexp *r argument, however
10003    with the /old/ structures pprivate pointer value. Thus this routine
10004    may override any copying normally done by perl.
10005
10006    It returns a pointer to the new regexp_internal structure.
10007 */
10008
10009 void *
10010 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10011 {
10012     dVAR;
10013     struct regexp *const r = (struct regexp *)SvANY(rx);
10014     regexp_internal *reti;
10015     int len, npar;
10016     RXi_GET_DECL(r,ri);
10017
10018     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10019
10020     npar = r->nparens+1;
10021     len = ProgLen(ri);
10022
10023     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10024     Copy(ri->program, reti->program, len+1, regnode);
10025
10026
10027     reti->regstclass = NULL;
10028
10029     if (ri->data) {
10030         struct reg_data *d;
10031         const int count = ri->data->count;
10032         int i;
10033
10034         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10035                 char, struct reg_data);
10036         Newx(d->what, count, U8);
10037
10038         d->count = count;
10039         for (i = 0; i < count; i++) {
10040             d->what[i] = ri->data->what[i];
10041             switch (d->what[i]) {
10042                 /* legal options are one of: sSfpontTua
10043                    see also regcomp.h and pregfree() */
10044             case 'a': /* actually an AV, but the dup function is identical.  */
10045             case 's':
10046             case 'S':
10047             case 'p': /* actually an AV, but the dup function is identical.  */
10048             case 'u': /* actually an HV, but the dup function is identical.  */
10049                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10050                 break;
10051             case 'f':
10052                 /* This is cheating. */
10053                 Newx(d->data[i], 1, struct regnode_charclass_class);
10054                 StructCopy(ri->data->data[i], d->data[i],
10055                             struct regnode_charclass_class);
10056                 reti->regstclass = (regnode*)d->data[i];
10057                 break;
10058             case 'o':
10059                 /* Compiled op trees are readonly and in shared memory,
10060                    and can thus be shared without duplication. */
10061                 OP_REFCNT_LOCK;
10062                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10063                 OP_REFCNT_UNLOCK;
10064                 break;
10065             case 'T':
10066                 /* Trie stclasses are readonly and can thus be shared
10067                  * without duplication. We free the stclass in pregfree
10068                  * when the corresponding reg_ac_data struct is freed.
10069                  */
10070                 reti->regstclass= ri->regstclass;
10071                 /* Fall through */
10072             case 't':
10073                 OP_REFCNT_LOCK;
10074                 ((reg_trie_data*)ri->data->data[i])->refcount++;
10075                 OP_REFCNT_UNLOCK;
10076                 /* Fall through */
10077             case 'n':
10078                 d->data[i] = ri->data->data[i];
10079                 break;
10080             default:
10081                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10082             }
10083         }
10084
10085         reti->data = d;
10086     }
10087     else
10088         reti->data = NULL;
10089
10090     reti->name_list_idx = ri->name_list_idx;
10091
10092 #ifdef RE_TRACK_PATTERN_OFFSETS
10093     if (ri->u.offsets) {
10094         Newx(reti->u.offsets, 2*len+1, U32);
10095         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10096     }
10097 #else
10098     SetProgLen(reti,len);
10099 #endif
10100
10101     return (void*)reti;
10102 }
10103
10104 #endif    /* USE_ITHREADS */
10105
10106 #ifndef PERL_IN_XSUB_RE
10107
10108 /*
10109  - regnext - dig the "next" pointer out of a node
10110  */
10111 regnode *
10112 Perl_regnext(pTHX_ register regnode *p)
10113 {
10114     dVAR;
10115     register I32 offset;
10116
10117     if (!p)
10118         return(NULL);
10119
10120     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
10121         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10122     }
10123
10124     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10125     if (offset == 0)
10126         return(NULL);
10127
10128     return(p+offset);
10129 }
10130 #endif
10131
10132 STATIC void
10133 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10134 {
10135     va_list args;
10136     STRLEN l1 = strlen(pat1);
10137     STRLEN l2 = strlen(pat2);
10138     char buf[512];
10139     SV *msv;
10140     const char *message;
10141
10142     PERL_ARGS_ASSERT_RE_CROAK2;
10143
10144     if (l1 > 510)
10145         l1 = 510;
10146     if (l1 + l2 > 510)
10147         l2 = 510 - l1;
10148     Copy(pat1, buf, l1 , char);
10149     Copy(pat2, buf + l1, l2 , char);
10150     buf[l1 + l2] = '\n';
10151     buf[l1 + l2 + 1] = '\0';
10152 #ifdef I_STDARG
10153     /* ANSI variant takes additional second argument */
10154     va_start(args, pat2);
10155 #else
10156     va_start(args);
10157 #endif
10158     msv = vmess(buf, &args);
10159     va_end(args);
10160     message = SvPV_const(msv,l1);
10161     if (l1 > 512)
10162         l1 = 512;
10163     Copy(message, buf, l1 , char);
10164     buf[l1-1] = '\0';                   /* Overwrite \n */
10165     Perl_croak(aTHX_ "%s", buf);
10166 }
10167
10168 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
10169
10170 #ifndef PERL_IN_XSUB_RE
10171 void
10172 Perl_save_re_context(pTHX)
10173 {
10174     dVAR;
10175
10176     struct re_save_state *state;
10177
10178     SAVEVPTR(PL_curcop);
10179     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10180
10181     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10182     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10183     SSPUSHUV(SAVEt_RE_STATE);
10184
10185     Copy(&PL_reg_state, state, 1, struct re_save_state);
10186
10187     PL_reg_start_tmp = 0;
10188     PL_reg_start_tmpl = 0;
10189     PL_reg_oldsaved = NULL;
10190     PL_reg_oldsavedlen = 0;
10191     PL_reg_maxiter = 0;
10192     PL_reg_leftiter = 0;
10193     PL_reg_poscache = NULL;
10194     PL_reg_poscache_size = 0;
10195 #ifdef PERL_OLD_COPY_ON_WRITE
10196     PL_nrs = NULL;
10197 #endif
10198
10199     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10200     if (PL_curpm) {
10201         const REGEXP * const rx = PM_GETRE(PL_curpm);
10202         if (rx) {
10203             U32 i;
10204             for (i = 1; i <= RX_NPARENS(rx); i++) {
10205                 char digits[TYPE_CHARS(long)];
10206                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10207                 GV *const *const gvp
10208                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10209
10210                 if (gvp) {
10211                     GV * const gv = *gvp;
10212                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10213                         save_scalar(gv);
10214                 }
10215             }
10216         }
10217     }
10218 }
10219 #endif
10220
10221 static void
10222 clear_re(pTHX_ void *r)
10223 {
10224     dVAR;
10225     ReREFCNT_dec((REGEXP *)r);
10226 }
10227
10228 #ifdef DEBUGGING
10229
10230 STATIC void
10231 S_put_byte(pTHX_ SV *sv, int c)
10232 {
10233     PERL_ARGS_ASSERT_PUT_BYTE;
10234
10235     /* Our definition of isPRINT() ignores locales, so only bytes that are
10236        not part of UTF-8 are considered printable. I assume that the same
10237        holds for UTF-EBCDIC.
10238        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10239        which Wikipedia says:
10240
10241        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10242        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10243        identical, to the ASCII delete (DEL) or rubout control character.
10244        ) So the old condition can be simplified to !isPRINT(c)  */
10245     if (!isPRINT(c))
10246         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
10247     else {
10248         const char string = c;
10249         if (c == '-' || c == ']' || c == '\\' || c == '^')
10250             sv_catpvs(sv, "\\");
10251         sv_catpvn(sv, &string, 1);
10252     }
10253 }
10254
10255
10256 #define CLEAR_OPTSTART \
10257     if (optstart) STMT_START { \
10258             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10259             optstart=NULL; \
10260     } STMT_END
10261
10262 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10263
10264 STATIC const regnode *
10265 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10266             const regnode *last, const regnode *plast,
10267             SV* sv, I32 indent, U32 depth)
10268 {
10269     dVAR;
10270     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
10271     register const regnode *next;
10272     const regnode *optstart= NULL;
10273
10274     RXi_GET_DECL(r,ri);
10275     GET_RE_DEBUG_FLAGS_DECL;
10276
10277     PERL_ARGS_ASSERT_DUMPUNTIL;
10278
10279 #ifdef DEBUG_DUMPUNTIL
10280     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10281         last ? last-start : 0,plast ? plast-start : 0);
10282 #endif
10283
10284     if (plast && plast < last)
10285         last= plast;
10286
10287     while (PL_regkind[op] != END && (!last || node < last)) {
10288         /* While that wasn't END last time... */
10289         NODE_ALIGN(node);
10290         op = OP(node);
10291         if (op == CLOSE || op == WHILEM)
10292             indent--;
10293         next = regnext((regnode *)node);
10294
10295         /* Where, what. */
10296         if (OP(node) == OPTIMIZED) {
10297             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10298                 optstart = node;
10299             else
10300                 goto after_print;
10301         } else
10302             CLEAR_OPTSTART;
10303
10304         regprop(r, sv, node);
10305         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10306                       (int)(2*indent + 1), "", SvPVX_const(sv));
10307
10308         if (OP(node) != OPTIMIZED) {
10309             if (next == NULL)           /* Next ptr. */
10310                 PerlIO_printf(Perl_debug_log, " (0)");
10311             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10312                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10313             else
10314                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10315             (void)PerlIO_putc(Perl_debug_log, '\n');
10316         }
10317
10318       after_print:
10319         if (PL_regkind[(U8)op] == BRANCHJ) {
10320             assert(next);
10321             {
10322                 register const regnode *nnode = (OP(next) == LONGJMP
10323                                              ? regnext((regnode *)next)
10324                                              : next);
10325                 if (last && nnode > last)
10326                     nnode = last;
10327                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10328             }
10329         }
10330         else if (PL_regkind[(U8)op] == BRANCH) {
10331             assert(next);
10332             DUMPUNTIL(NEXTOPER(node), next);
10333         }
10334         else if ( PL_regkind[(U8)op]  == TRIE ) {
10335             const regnode *this_trie = node;
10336             const char op = OP(node);
10337             const U32 n = ARG(node);
10338             const reg_ac_data * const ac = op>=AHOCORASICK ?
10339                (reg_ac_data *)ri->data->data[n] :
10340                NULL;
10341             const reg_trie_data * const trie =
10342                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10343 #ifdef DEBUGGING
10344             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10345 #endif
10346             const regnode *nextbranch= NULL;
10347             I32 word_idx;
10348             sv_setpvs(sv, "");
10349             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10350                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10351
10352                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10353                    (int)(2*(indent+3)), "",
10354                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10355                             PL_colors[0], PL_colors[1],
10356                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10357                             PERL_PV_PRETTY_ELLIPSES    |
10358                             PERL_PV_PRETTY_LTGT
10359                             )
10360                             : "???"
10361                 );
10362                 if (trie->jump) {
10363                     U16 dist= trie->jump[word_idx+1];
10364                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10365                                   (UV)((dist ? this_trie + dist : next) - start));
10366                     if (dist) {
10367                         if (!nextbranch)
10368                             nextbranch= this_trie + trie->jump[0];
10369                         DUMPUNTIL(this_trie + dist, nextbranch);
10370                     }
10371                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10372                         nextbranch= regnext((regnode *)nextbranch);
10373                 } else {
10374                     PerlIO_printf(Perl_debug_log, "\n");
10375                 }
10376             }
10377             if (last && next > last)
10378                 node= last;
10379             else
10380                 node= next;
10381         }
10382         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10383             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10384                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10385         }
10386         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10387             assert(next);
10388             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10389         }
10390         else if ( op == PLUS || op == STAR) {
10391             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10392         }
10393         else if (op == ANYOF) {
10394             /* arglen 1 + class block */
10395             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10396                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10397             node = NEXTOPER(node);
10398         }
10399         else if (PL_regkind[(U8)op] == EXACT) {
10400             /* Literal string, where present. */
10401             node += NODE_SZ_STR(node) - 1;
10402             node = NEXTOPER(node);
10403         }
10404         else {
10405             node = NEXTOPER(node);
10406             node += regarglen[(U8)op];
10407         }
10408         if (op == CURLYX || op == OPEN)
10409             indent++;
10410     }
10411     CLEAR_OPTSTART;
10412 #ifdef DEBUG_DUMPUNTIL
10413     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10414 #endif
10415     return node;
10416 }
10417
10418 #endif  /* DEBUGGING */
10419
10420 /*
10421  * Local variables:
10422  * c-indentation-style: bsd
10423  * c-basic-offset: 4
10424  * indent-tabs-mode: t
10425  * End:
10426  *
10427  * ex: set ts=8 sts=4 sw=4 noet:
10428  */