orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     HV          *paren_names;           /* Paren names */
 138
 139     regnode     **recurse;              /* Recurse regops */
 140     I32         recurse_count;          /* Number of recurse regops */
 141 #if ADD_TO_REGEXEC
 142     char        *starttry;              /* -Dr: where regtry was called. */
 143 #define RExC_starttry   (pRExC_state->starttry)
 144 #endif
 145 #ifdef DEBUGGING
 146     const char  *lastparse;
 147     I32         lastnum;
 148     AV          *paren_name_list;       /* idx -> name */
 149 #define RExC_lastparse  (pRExC_state->lastparse)
 150 #define RExC_lastnum    (pRExC_state->lastnum)
 151 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 152 #endif
 153 } RExC_state_t;
 154
 155 #define RExC_flags      (pRExC_state->flags)
 156 #define RExC_precomp    (pRExC_state->precomp)
 157 #define RExC_rx_sv      (pRExC_state->rx_sv)
 158 #define RExC_rx         (pRExC_state->rx)
 159 #define RExC_rxi        (pRExC_state->rxi)
 160 #define RExC_start      (pRExC_state->start)
 161 #define RExC_end        (pRExC_state->end)
 162 #define RExC_parse      (pRExC_state->parse)
 163 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 164 #ifdef RE_TRACK_PATTERN_OFFSETS
 165 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 166 #endif
 167 #define RExC_emit       (pRExC_state->emit)
 168 #define RExC_emit_start (pRExC_state->emit_start)
 169 #define RExC_emit_bound (pRExC_state->emit_bound)
 170 #define RExC_naughty    (pRExC_state->naughty)
 171 #define RExC_sawback    (pRExC_state->sawback)
 172 #define RExC_seen       (pRExC_state->seen)
 173 #define RExC_size       (pRExC_state->size)
 174 #define RExC_npar       (pRExC_state->npar)
 175 #define RExC_nestroot   (pRExC_state->nestroot)
 176 #define RExC_extralen   (pRExC_state->extralen)
 177 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 178 #define RExC_seen_evals (pRExC_state->seen_evals)
 179 #define RExC_utf8       (pRExC_state->utf8)
 180 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 181 #define RExC_open_parens        (pRExC_state->open_parens)
 182 #define RExC_close_parens       (pRExC_state->close_parens)
 183 #define RExC_opend      (pRExC_state->opend)
 184 #define RExC_paren_names        (pRExC_state->paren_names)
 185 #define RExC_recurse    (pRExC_state->recurse)
 186 #define RExC_recurse_count      (pRExC_state->recurse_count)
 187
 188
 189 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 190 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 191         ((*s) == '{' && regcurly(s)))
 192
 193 #ifdef SPSTART
 194 #undef SPSTART          /* dratted cpp namespace... */
 195 #endif
 196 /*
 197  * Flags to be passed up and down.
 198  */
 199 #define WORST           0       /* Worst case. */
 200 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 201
 202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 203  * character, and if utf8, must be invariant. */
 204 #define SIMPLE          0x02
 205 #define SPSTART         0x04    /* Starts with * or +. */
 206 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 207 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 208
 209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 210
 211 /* whether trie related optimizations are enabled */
 212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 213 #define TRIE_STUDY_OPT
 214 #define FULL_TRIE_STUDY
 215 #define TRIE_STCLASS
 216 #endif
 217
 218
 219
 220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 221 #define PBITVAL(paren) (1 << ((paren) & 7))
 222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 225
 226 /* If not already in utf8, do a longjmp back to the beginning */
 227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 228 #define REQUIRE_UTF8    STMT_START {                                       \
 229                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 230                         } STMT_END
 231
 232 /* About scan_data_t.
 233
 234   During optimisation we recurse through the regexp program performing
 235   various inplace (keyhole style) optimisations. In addition study_chunk
 236   and scan_commit populate this data structure with information about
 237   what strings MUST appear in the pattern. We look for the longest
 238   string that must appear at a fixed location, and we look for the
 239   longest string that may appear at a floating location. So for instance
 240   in the pattern:
 241
 242     /FOO[xX]A.*B[xX]BAR/
 243
 244   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 245   strings (because they follow a .* construct). study_chunk will identify
 246   both FOO and BAR as being the longest fixed and floating strings respectively.
 247
 248   The strings can be composites, for instance
 249
 250      /(f)(o)(o)/
 251
 252   will result in a composite fixed substring 'foo'.
 253
 254   For each string some basic information is maintained:
 255
 256   - offset or min_offset
 257     This is the position the string must appear at, or not before.
 258     It also implicitly (when combined with minlenp) tells us how many
 259     characters must match before the string we are searching for.
 260     Likewise when combined with minlenp and the length of the string it
 261     tells us how many characters must appear after the string we have
 262     found.
 263
 264   - max_offset
 265     Only used for floating strings. This is the rightmost point that
 266     the string can appear at. If set to I32 max it indicates that the
 267     string can occur infinitely far to the right.
 268
 269   - minlenp
 270     A pointer to the minimum length of the pattern that the string
 271     was found inside. This is important as in the case of positive
 272     lookahead or positive lookbehind we can have multiple patterns
 273     involved. Consider
 274
 275     /(?=FOO).*F/
 276
 277     The minimum length of the pattern overall is 3, the minimum length
 278     of the lookahead part is 3, but the minimum length of the part that
 279     will actually match is 1. So 'FOO's minimum length is 3, but the
 280     minimum length for the F is 1. This is important as the minimum length
 281     is used to determine offsets in front of and behind the string being
 282     looked for.  Since strings can be composites this is the length of the
 283     pattern at the time it was commited with a scan_commit. Note that
 284     the length is calculated by study_chunk, so that the minimum lengths
 285     are not known until the full pattern has been compiled, thus the
 286     pointer to the value.
 287
 288   - lookbehind
 289
 290     In the case of lookbehind the string being searched for can be
 291     offset past the start point of the final matching string.
 292     If this value was just blithely removed from the min_offset it would
 293     invalidate some of the calculations for how many chars must match
 294     before or after (as they are derived from min_offset and minlen and
 295     the length of the string being searched for).
 296     When the final pattern is compiled and the data is moved from the
 297     scan_data_t structure into the regexp structure the information
 298     about lookbehind is factored in, with the information that would
 299     have been lost precalculated in the end_shift field for the
 300     associated string.
 301
 302   The fields pos_min and pos_delta are used to store the minimum offset
 303   and the delta to the maximum offset at the current point in the pattern.
 304
 305 */
 306
 307 typedef struct scan_data_t {
 308     /*I32 len_min;      unused */
 309     /*I32 len_delta;    unused */
 310     I32 pos_min;
 311     I32 pos_delta;
 312     SV *last_found;
 313     I32 last_end;           /* min value, <0 unless valid. */
 314     I32 last_start_min;
 315     I32 last_start_max;
 316     SV **longest;           /* Either &l_fixed, or &l_float. */
 317     SV *longest_fixed;      /* longest fixed string found in pattern */
 318     I32 offset_fixed;       /* offset where it starts */
 319     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 320     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 321     SV *longest_float;      /* longest floating string found in pattern */
 322     I32 offset_float_min;   /* earliest point in string it can appear */
 323     I32 offset_float_max;   /* latest point in string it can appear */
 324     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 325     I32 lookbehind_float;   /* is the position of the string modified by LB */
 326     I32 flags;
 327     I32 whilem_c;
 328     I32 *last_closep;
 329     struct regnode_charclass_class *start_class;
 330 } scan_data_t;
 331
 332 /*
 333  * Forward declarations for pregcomp()'s friends.
 334  */
 335
 336 static const scan_data_t zero_scan_data =
 337   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 338
 339 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 340 #define SF_BEFORE_SEOL          0x0001
 341 #define SF_BEFORE_MEOL          0x0002
 342 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 343 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 344
 345 #ifdef NO_UNARY_PLUS
 346 #  define SF_FIX_SHIFT_EOL      (0+2)
 347 #  define SF_FL_SHIFT_EOL               (0+4)
 348 #else
 349 #  define SF_FIX_SHIFT_EOL      (+2)
 350 #  define SF_FL_SHIFT_EOL               (+4)
 351 #endif
 352
 353 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 354 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 355
 356 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 357 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 358 #define SF_IS_INF               0x0040
 359 #define SF_HAS_PAR              0x0080
 360 #define SF_IN_PAR               0x0100
 361 #define SF_HAS_EVAL             0x0200
 362 #define SCF_DO_SUBSTR           0x0400
 363 #define SCF_DO_STCLASS_AND      0x0800
 364 #define SCF_DO_STCLASS_OR       0x1000
 365 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 366 #define SCF_WHILEM_VISITED_POS  0x2000
 367
 368 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 369 #define SCF_SEEN_ACCEPT         0x8000
 370
 371 #define UTF cBOOL(RExC_utf8)
 372 #define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
 373 #define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
 374 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 375
 376 #define OOB_UNICODE             12345678
 377 #define OOB_NAMEDCLASS          -1
 378
 379 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 380 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 381
 382
 383 /* length of regex to show in messages that don't mark a position within */
 384 #define RegexLengthToShowInErrorMessages 127
 385
 386 /*
 387  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 388  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 389  * op/pragma/warn/regcomp.
 390  */
 391 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 392 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 393
 394 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 395
 396 /*
 397  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 398  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 399  * "...".
 400  */
 401 #define _FAIL(code) STMT_START {                                        \
 402     const char *ellipses = "";                                          \
 403     IV len = RExC_end - RExC_precomp;                                   \
 404                                                                         \
 405     if (!SIZE_ONLY)                                                     \
 406         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 407     if (len > RegexLengthToShowInErrorMessages) {                       \
 408         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 409         len = RegexLengthToShowInErrorMessages - 10;                    \
 410         ellipses = "...";                                               \
 411     }                                                                   \
 412     code;                                                               \
 413 } STMT_END
 414
 415 #define FAIL(msg) _FAIL(                            \
 416     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 417             msg, (int)len, RExC_precomp, ellipses))
 418
 419 #define FAIL2(msg,arg) _FAIL(                       \
 420     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 421             arg, (int)len, RExC_precomp, ellipses))
 422
 423 /*
 424  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 425  */
 426 #define Simple_vFAIL(m) STMT_START {                                    \
 427     const IV offset = RExC_parse - RExC_precomp;                        \
 428     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 429             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 430 } STMT_END
 431
 432 /*
 433  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 434  */
 435 #define vFAIL(m) STMT_START {                           \
 436     if (!SIZE_ONLY)                                     \
 437         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 438     Simple_vFAIL(m);                                    \
 439 } STMT_END
 440
 441 /*
 442  * Like Simple_vFAIL(), but accepts two arguments.
 443  */
 444 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 445     const IV offset = RExC_parse - RExC_precomp;                        \
 446     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 447             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 448 } STMT_END
 449
 450 /*
 451  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 452  */
 453 #define vFAIL2(m,a1) STMT_START {                       \
 454     if (!SIZE_ONLY)                                     \
 455         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 456     Simple_vFAIL2(m, a1);                               \
 457 } STMT_END
 458
 459
 460 /*
 461  * Like Simple_vFAIL(), but accepts three arguments.
 462  */
 463 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 464     const IV offset = RExC_parse - RExC_precomp;                \
 465     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 466             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 467 } STMT_END
 468
 469 /*
 470  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 471  */
 472 #define vFAIL3(m,a1,a2) STMT_START {                    \
 473     if (!SIZE_ONLY)                                     \
 474         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 475     Simple_vFAIL3(m, a1, a2);                           \
 476 } STMT_END
 477
 478 /*
 479  * Like Simple_vFAIL(), but accepts four arguments.
 480  */
 481 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 482     const IV offset = RExC_parse - RExC_precomp;                \
 483     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 484             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 485 } STMT_END
 486
 487 #define ckWARNreg(loc,m) STMT_START {                                   \
 488     const IV offset = loc - RExC_precomp;                               \
 489     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 490             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 491 } STMT_END
 492
 493 #define ckWARNregdep(loc,m) STMT_START {                                \
 494     const IV offset = loc - RExC_precomp;                               \
 495     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 496             m REPORT_LOCATION,                                          \
 497             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 498 } STMT_END
 499
 500 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 501     const IV offset = loc - RExC_precomp;                               \
 502     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 503             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 504 } STMT_END
 505
 506 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 507     const IV offset = loc - RExC_precomp;                               \
 508     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 509             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 510 } STMT_END
 511
 512 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 515             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 516 } STMT_END
 517
 518 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 519     const IV offset = loc - RExC_precomp;                               \
 520     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 521             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 522 } STMT_END
 523
 524 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 525     const IV offset = loc - RExC_precomp;                               \
 526     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 527             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 528 } STMT_END
 529
 530 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 531     const IV offset = loc - RExC_precomp;                               \
 532     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 533             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 534 } STMT_END
 535
 536
 537 /* Allow for side effects in s */
 538 #define REGC(c,s) STMT_START {                  \
 539     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 540 } STMT_END
 541
 542 /* Macros for recording node offsets.   20001227 mjd@plover.com
 543  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 544  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 545  * Element 0 holds the number n.
 546  * Position is 1 indexed.
 547  */
 548 #ifndef RE_TRACK_PATTERN_OFFSETS
 549 #define Set_Node_Offset_To_R(node,byte)
 550 #define Set_Node_Offset(node,byte)
 551 #define Set_Cur_Node_Offset
 552 #define Set_Node_Length_To_R(node,len)
 553 #define Set_Node_Length(node,len)
 554 #define Set_Node_Cur_Length(node)
 555 #define Node_Offset(n)
 556 #define Node_Length(n)
 557 #define Set_Node_Offset_Length(node,offset,len)
 558 #define ProgLen(ri) ri->u.proglen
 559 #define SetProgLen(ri,x) ri->u.proglen = x
 560 #else
 561 #define ProgLen(ri) ri->u.offsets[0]
 562 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 563 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 564     if (! SIZE_ONLY) {                                                  \
 565         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 566                     __LINE__, (int)(node), (int)(byte)));               \
 567         if((node) < 0) {                                                \
 568             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 569         } else {                                                        \
 570             RExC_offsets[2*(node)-1] = (byte);                          \
 571         }                                                               \
 572     }                                                                   \
 573 } STMT_END
 574
 575 #define Set_Node_Offset(node,byte) \
 576     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 577 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 578
 579 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 580     if (! SIZE_ONLY) {                                                  \
 581         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 582                 __LINE__, (int)(node), (int)(len)));                    \
 583         if((node) < 0) {                                                \
 584             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 585         } else {                                                        \
 586             RExC_offsets[2*(node)] = (len);                             \
 587         }                                                               \
 588     }                                                                   \
 589 } STMT_END
 590
 591 #define Set_Node_Length(node,len) \
 592     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 593 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 594 #define Set_Node_Cur_Length(node) \
 595     Set_Node_Length(node, RExC_parse - parse_start)
 596
 597 /* Get offsets and lengths */
 598 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 599 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 600
 601 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 602     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 603     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 604 } STMT_END
 605 #endif
 606
 607 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 608 #define EXPERIMENTAL_INPLACESCAN
 609 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 610
 611 #define DEBUG_STUDYDATA(str,data,depth)                              \
 612 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 613     PerlIO_printf(Perl_debug_log,                                    \
 614         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 615         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 616         (int)(depth)*2, "",                                          \
 617         (IV)((data)->pos_min),                                       \
 618         (IV)((data)->pos_delta),                                     \
 619         (UV)((data)->flags),                                         \
 620         (IV)((data)->whilem_c),                                      \
 621         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 622         is_inf ? "INF " : ""                                         \
 623     );                                                               \
 624     if ((data)->last_found)                                          \
 625         PerlIO_printf(Perl_debug_log,                                \
 626             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 627             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 628             SvPVX_const((data)->last_found),                         \
 629             (IV)((data)->last_end),                                  \
 630             (IV)((data)->last_start_min),                            \
 631             (IV)((data)->last_start_max),                            \
 632             ((data)->longest &&                                      \
 633              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 634             SvPVX_const((data)->longest_fixed),                      \
 635             (IV)((data)->offset_fixed),                              \
 636             ((data)->longest &&                                      \
 637              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 638             SvPVX_const((data)->longest_float),                      \
 639             (IV)((data)->offset_float_min),                          \
 640             (IV)((data)->offset_float_max)                           \
 641         );                                                           \
 642     PerlIO_printf(Perl_debug_log,"\n");                              \
 643 });
 644
 645 static void clear_re(pTHX_ void *r);
 646
 647 /* Mark that we cannot extend a found fixed substring at this point.
 648    Update the longest found anchored substring and the longest found
 649    floating substrings if needed. */
 650
 651 STATIC void
 652 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 653 {
 654     const STRLEN l = CHR_SVLEN(data->last_found);
 655     const STRLEN old_l = CHR_SVLEN(*data->longest);
 656     GET_RE_DEBUG_FLAGS_DECL;
 657
 658     PERL_ARGS_ASSERT_SCAN_COMMIT;
 659
 660     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 661         SvSetMagicSV(*data->longest, data->last_found);
 662         if (*data->longest == data->longest_fixed) {
 663             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 664             if (data->flags & SF_BEFORE_EOL)
 665                 data->flags
 666                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 667             else
 668                 data->flags &= ~SF_FIX_BEFORE_EOL;
 669             data->minlen_fixed=minlenp;
 670             data->lookbehind_fixed=0;
 671         }
 672         else { /* *data->longest == data->longest_float */
 673             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 674             data->offset_float_max = (l
 675                                       ? data->last_start_max
 676                                       : data->pos_min + data->pos_delta);
 677             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 678                 data->offset_float_max = I32_MAX;
 679             if (data->flags & SF_BEFORE_EOL)
 680                 data->flags
 681                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 682             else
 683                 data->flags &= ~SF_FL_BEFORE_EOL;
 684             data->minlen_float=minlenp;
 685             data->lookbehind_float=0;
 686         }
 687     }
 688     SvCUR_set(data->last_found, 0);
 689     {
 690         SV * const sv = data->last_found;
 691         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 692             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 693             if (mg)
 694                 mg->mg_len = 0;
 695         }
 696     }
 697     data->last_end = -1;
 698     data->flags &= ~SF_BEFORE_EOL;
 699     DEBUG_STUDYDATA("commit: ",data,0);
 700 }
 701
 702 /* Can match anything (initialization) */
 703 STATIC void
 704 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 705 {
 706     PERL_ARGS_ASSERT_CL_ANYTHING;
 707
 708     ANYOF_CLASS_ZERO(cl);
 709     ANYOF_BITMAP_SETALL(cl);
 710     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 711     if (LOC)
 712         cl->flags |= ANYOF_LOCALE;
 713     cl->flags |= ANYOF_FOLD;
 714 }
 715
 716 /* Can match anything (initialization) */
 717 STATIC int
 718 S_cl_is_anything(const struct regnode_charclass_class *cl)
 719 {
 720     int value;
 721
 722     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 723
 724     for (value = 0; value <= ANYOF_MAX; value += 2)
 725         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 726             return 1;
 727     if (!(cl->flags & ANYOF_UNICODE_ALL))
 728         return 0;
 729     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 730         return 0;
 731     return 1;
 732 }
 733
 734 /* Can match anything (initialization) */
 735 STATIC void
 736 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 737 {
 738     PERL_ARGS_ASSERT_CL_INIT;
 739
 740     Zero(cl, 1, struct regnode_charclass_class);
 741     cl->type = ANYOF;
 742     cl_anything(pRExC_state, cl);
 743 }
 744
 745 STATIC void
 746 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 747 {
 748     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 749
 750     Zero(cl, 1, struct regnode_charclass_class);
 751     cl->type = ANYOF;
 752     cl_anything(pRExC_state, cl);
 753     if (LOC)
 754         cl->flags |= ANYOF_LOCALE;
 755 }
 756
 757 /* 'And' a given class with another one.  Can create false positives */
 758 /* We assume that cl is not inverted */
 759 STATIC void
 760 S_cl_and(struct regnode_charclass_class *cl,
 761         const struct regnode_charclass_class *and_with)
 762 {
 763     PERL_ARGS_ASSERT_CL_AND;
 764
 765     assert(and_with->type == ANYOF);
 766     if (!(and_with->flags & ANYOF_CLASS)
 767         && !(cl->flags & ANYOF_CLASS)
 768         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 769         && !(and_with->flags & ANYOF_FOLD)
 770         && !(cl->flags & ANYOF_FOLD)) {
 771         int i;
 772
 773         if (and_with->flags & ANYOF_INVERT)
 774             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 775                 cl->bitmap[i] &= ~and_with->bitmap[i];
 776         else
 777             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 778                 cl->bitmap[i] &= and_with->bitmap[i];
 779     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 780     if (!(and_with->flags & ANYOF_EOS))
 781         cl->flags &= ~ANYOF_EOS;
 782
 783     if (!(and_with->flags & ANYOF_FOLD))
 784         cl->flags &= ~ANYOF_FOLD;
 785
 786     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
 787         !(and_with->flags & ANYOF_INVERT)) {
 788         cl->flags &= ~ANYOF_UNICODE_ALL;
 789         cl->flags |= ANYOF_UNICODE;
 790         ARG_SET(cl, ARG(and_with));
 791     }
 792     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 793         !(and_with->flags & ANYOF_INVERT))
 794         cl->flags &= ~ANYOF_UNICODE_ALL;
 795     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
 796         !(and_with->flags & ANYOF_INVERT))
 797         cl->flags &= ~ANYOF_UNICODE;
 798 }
 799
 800 /* 'OR' a given class with another one.  Can create false positives */
 801 /* We assume that cl is not inverted */
 802 STATIC void
 803 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 804 {
 805     PERL_ARGS_ASSERT_CL_OR;
 806
 807     if (or_with->flags & ANYOF_INVERT) {
 808         /* We do not use
 809          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 810          *   <= (B1 | !B2) | (CL1 | !CL2)
 811          * which is wasteful if CL2 is small, but we ignore CL2:
 812          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 813          * XXXX Can we handle case-fold?  Unclear:
 814          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 815          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 816          */
 817         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 818              && !(or_with->flags & ANYOF_FOLD)
 819              && !(cl->flags & ANYOF_FOLD) ) {
 820             int i;
 821
 822             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 823                 cl->bitmap[i] |= ~or_with->bitmap[i];
 824         } /* XXXX: logic is complicated otherwise */
 825         else {
 826             cl_anything(pRExC_state, cl);
 827         }
 828     } else {
 829         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 830         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 831              && (!(or_with->flags & ANYOF_FOLD)
 832                  || (cl->flags & ANYOF_FOLD)) ) {
 833             int i;
 834
 835             /* OR char bitmap and class bitmap separately */
 836             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 837                 cl->bitmap[i] |= or_with->bitmap[i];
 838             if (or_with->flags & ANYOF_CLASS) {
 839                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 840                     cl->classflags[i] |= or_with->classflags[i];
 841                 cl->flags |= ANYOF_CLASS;
 842             }
 843         }
 844         else { /* XXXX: logic is complicated, leave it along for a moment. */
 845             cl_anything(pRExC_state, cl);
 846         }
 847     }
 848     if (or_with->flags & ANYOF_EOS)
 849         cl->flags |= ANYOF_EOS;
 850
 851     if (or_with->flags & ANYOF_FOLD)
 852         cl->flags |= ANYOF_FOLD;
 853
 854     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
 855         ARG(cl) != ARG(or_with)) {
 856         cl->flags |= ANYOF_UNICODE_ALL;
 857         cl->flags &= ~ANYOF_UNICODE;
 858     }
 859     if (or_with->flags & ANYOF_UNICODE_ALL) {
 860         cl->flags |= ANYOF_UNICODE_ALL;
 861         cl->flags &= ~ANYOF_UNICODE;
 862     }
 863 }
 864
 865 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 866 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 867 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 868 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 869
 870
 871 #ifdef DEBUGGING
 872 /*
 873    dump_trie(trie,widecharmap,revcharmap)
 874    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 875    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 876
 877    These routines dump out a trie in a somewhat readable format.
 878    The _interim_ variants are used for debugging the interim
 879    tables that are used to generate the final compressed
 880    representation which is what dump_trie expects.
 881
 882    Part of the reason for their existance is to provide a form
 883    of documentation as to how the different representations function.
 884
 885 */
 886
 887 /*
 888   Dumps the final compressed table form of the trie to Perl_debug_log.
 889   Used for debugging make_trie().
 890 */
 891
 892 STATIC void
 893 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 894             AV *revcharmap, U32 depth)
 895 {
 896     U32 state;
 897     SV *sv=sv_newmortal();
 898     int colwidth= widecharmap ? 6 : 4;
 899     U16 word;
 900     GET_RE_DEBUG_FLAGS_DECL;
 901
 902     PERL_ARGS_ASSERT_DUMP_TRIE;
 903
 904     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 905         (int)depth * 2 + 2,"",
 906         "Match","Base","Ofs" );
 907
 908     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 909         SV ** const tmp = av_fetch( revcharmap, state, 0);
 910         if ( tmp ) {
 911             PerlIO_printf( Perl_debug_log, "%*s",
 912                 colwidth,
 913                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 914                             PL_colors[0], PL_colors[1],
 915                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 916                             PERL_PV_ESCAPE_FIRSTCHAR
 917                 )
 918             );
 919         }
 920     }
 921     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 922         (int)depth * 2 + 2,"");
 923
 924     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 925         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 926     PerlIO_printf( Perl_debug_log, "\n");
 927
 928     for( state = 1 ; state < trie->statecount ; state++ ) {
 929         const U32 base = trie->states[ state ].trans.base;
 930
 931         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 932
 933         if ( trie->states[ state ].wordnum ) {
 934             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 935         } else {
 936             PerlIO_printf( Perl_debug_log, "%6s", "" );
 937         }
 938
 939         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 940
 941         if ( base ) {
 942             U32 ofs = 0;
 943
 944             while( ( base + ofs  < trie->uniquecharcount ) ||
 945                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 946                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 947                     ofs++;
 948
 949             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 950
 951             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 952                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 953                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 954                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 955                 {
 956                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 957                     colwidth,
 958                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 959                 } else {
 960                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 961                 }
 962             }
 963
 964             PerlIO_printf( Perl_debug_log, "]");
 965
 966         }
 967         PerlIO_printf( Perl_debug_log, "\n" );
 968     }
 969     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 970     for (word=1; word <= trie->wordcount; word++) {
 971         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 972             (int)word, (int)(trie->wordinfo[word].prev),
 973             (int)(trie->wordinfo[word].len));
 974     }
 975     PerlIO_printf(Perl_debug_log, "\n" );
 976 }
 977 /*
 978   Dumps a fully constructed but uncompressed trie in list form.
 979   List tries normally only are used for construction when the number of
 980   possible chars (trie->uniquecharcount) is very high.
 981   Used for debugging make_trie().
 982 */
 983 STATIC void
 984 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 985                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
 986                          U32 depth)
 987 {
 988     U32 state;
 989     SV *sv=sv_newmortal();
 990     int colwidth= widecharmap ? 6 : 4;
 991     GET_RE_DEBUG_FLAGS_DECL;
 992
 993     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 994
 995     /* print out the table precompression.  */
 996     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
 997         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
 998         "------:-----+-----------------\n" );
 999
1000     for( state=1 ; state < next_alloc ; state ++ ) {
1001         U16 charid;
1002
1003         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1004             (int)depth * 2 + 2,"", (UV)state  );
1005         if ( ! trie->states[ state ].wordnum ) {
1006             PerlIO_printf( Perl_debug_log, "%5s| ","");
1007         } else {
1008             PerlIO_printf( Perl_debug_log, "W%4x| ",
1009                 trie->states[ state ].wordnum
1010             );
1011         }
1012         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1013             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1014             if ( tmp ) {
1015                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1016                     colwidth,
1017                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1018                             PL_colors[0], PL_colors[1],
1019                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1020                             PERL_PV_ESCAPE_FIRSTCHAR
1021                     ) ,
1022                     TRIE_LIST_ITEM(state,charid).forid,
1023                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1024                 );
1025                 if (!(charid % 10))
1026                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1027                         (int)((depth * 2) + 14), "");
1028             }
1029         }
1030         PerlIO_printf( Perl_debug_log, "\n");
1031     }
1032 }
1033
1034 /*
1035   Dumps a fully constructed but uncompressed trie in table form.
1036   This is the normal DFA style state transition table, with a few
1037   twists to facilitate compression later.
1038   Used for debugging make_trie().
1039 */
1040 STATIC void
1041 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1042                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1043                           U32 depth)
1044 {
1045     U32 state;
1046     U16 charid;
1047     SV *sv=sv_newmortal();
1048     int colwidth= widecharmap ? 6 : 4;
1049     GET_RE_DEBUG_FLAGS_DECL;
1050
1051     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1052
1053     /*
1054        print out the table precompression so that we can do a visual check
1055        that they are identical.
1056      */
1057
1058     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1059
1060     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1061         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1062         if ( tmp ) {
1063             PerlIO_printf( Perl_debug_log, "%*s",
1064                 colwidth,
1065                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1066                             PL_colors[0], PL_colors[1],
1067                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1068                             PERL_PV_ESCAPE_FIRSTCHAR
1069                 )
1070             );
1071         }
1072     }
1073
1074     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1075
1076     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1077         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1078     }
1079
1080     PerlIO_printf( Perl_debug_log, "\n" );
1081
1082     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1083
1084         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1085             (int)depth * 2 + 2,"",
1086             (UV)TRIE_NODENUM( state ) );
1087
1088         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1089             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1090             if (v)
1091                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1092             else
1093                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1094         }
1095         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1096             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1097         } else {
1098             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1099             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1100         }
1101     }
1102 }
1103
1104 #endif
1105
1106
1107 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1108   startbranch: the first branch in the whole branch sequence
1109   first      : start branch of sequence of branch-exact nodes.
1110                May be the same as startbranch
1111   last       : Thing following the last branch.
1112                May be the same as tail.
1113   tail       : item following the branch sequence
1114   count      : words in the sequence
1115   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1116   depth      : indent depth
1117
1118 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1119
1120 A trie is an N'ary tree where the branches are determined by digital
1121 decomposition of the key. IE, at the root node you look up the 1st character and
1122 follow that branch repeat until you find the end of the branches. Nodes can be
1123 marked as "accepting" meaning they represent a complete word. Eg:
1124
1125   /he|she|his|hers/
1126
1127 would convert into the following structure. Numbers represent states, letters
1128 following numbers represent valid transitions on the letter from that state, if
1129 the number is in square brackets it represents an accepting state, otherwise it
1130 will be in parenthesis.
1131
1132       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1133       |    |
1134       |   (2)
1135       |    |
1136      (1)   +-i->(6)-+-s->[7]
1137       |
1138       +-s->(3)-+-h->(4)-+-e->[5]
1139
1140       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1141
1142 This shows that when matching against the string 'hers' we will begin at state 1
1143 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1144 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1145 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1146 single traverse. We store a mapping from accepting to state to which word was
1147 matched, and then when we have multiple possibilities we try to complete the
1148 rest of the regex in the order in which they occured in the alternation.
1149
1150 The only prior NFA like behaviour that would be changed by the TRIE support is
1151 the silent ignoring of duplicate alternations which are of the form:
1152
1153  / (DUPE|DUPE) X? (?{ ... }) Y /x
1154
1155 Thus EVAL blocks follwing a trie may be called a different number of times with
1156 and without the optimisation. With the optimisations dupes will be silently
1157 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1158 the following demonstrates:
1159
1160  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1161
1162 which prints out 'word' three times, but
1163
1164  'words'=~/(word|word|word)(?{ print $1 })S/
1165
1166 which doesnt print it out at all. This is due to other optimisations kicking in.
1167
1168 Example of what happens on a structural level:
1169
1170 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1171
1172    1: CURLYM[1] {1,32767}(18)
1173    5:   BRANCH(8)
1174    6:     EXACT <ac>(16)
1175    8:   BRANCH(11)
1176    9:     EXACT <ad>(16)
1177   11:   BRANCH(14)
1178   12:     EXACT <ab>(16)
1179   16:   SUCCEED(0)
1180   17:   NOTHING(18)
1181   18: END(0)
1182
1183 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1184 and should turn into:
1185
1186    1: CURLYM[1] {1,32767}(18)
1187    5:   TRIE(16)
1188         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1189           <ac>
1190           <ad>
1191           <ab>
1192   16:   SUCCEED(0)
1193   17:   NOTHING(18)
1194   18: END(0)
1195
1196 Cases where tail != last would be like /(?foo|bar)baz/:
1197
1198    1: BRANCH(4)
1199    2:   EXACT <foo>(8)
1200    4: BRANCH(7)
1201    5:   EXACT <bar>(8)
1202    7: TAIL(8)
1203    8: EXACT <baz>(10)
1204   10: END(0)
1205
1206 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1207 and would end up looking like:
1208
1209     1: TRIE(8)
1210       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1211         <foo>
1212         <bar>
1213    7: TAIL(8)
1214    8: EXACT <baz>(10)
1215   10: END(0)
1216
1217     d = uvuni_to_utf8_flags(d, uv, 0);
1218
1219 is the recommended Unicode-aware way of saying
1220
1221     *(d++) = uv;
1222 */
1223
1224 #define TRIE_STORE_REVCHAR                                                 \
1225     STMT_START {                                                           \
1226         if (UTF) {                                                         \
1227             SV *zlopp = newSV(2);                                          \
1228             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1229             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1230             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1231             SvPOK_on(zlopp);                                               \
1232             SvUTF8_on(zlopp);                                              \
1233             av_push(revcharmap, zlopp);                                    \
1234         } else {                                                           \
1235             char ooooff = (char)uvc;                                               \
1236             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1237         }                                                                  \
1238         } STMT_END
1239
1240 #define TRIE_READ_CHAR STMT_START {                                           \
1241     wordlen++;                                                                \
1242     if ( UTF ) {                                                              \
1243         if ( folder ) {                                                       \
1244             if ( foldlen > 0 ) {                                              \
1245                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1246                foldlen -= len;                                                \
1247                scan += len;                                                   \
1248                len = 0;                                                       \
1249             } else {                                                          \
1250                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1251                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1252                 foldlen -= UNISKIP( uvc );                                    \
1253                 scan = foldbuf + UNISKIP( uvc );                              \
1254             }                                                                 \
1255         } else {                                                              \
1256             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1257         }                                                                     \
1258     } else {                                                                  \
1259         uvc = (U32)*uc;                                                       \
1260         len = 1;                                                              \
1261     }                                                                         \
1262 } STMT_END
1263
1264
1265
1266 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1267     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1268         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1269         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1270     }                                                           \
1271     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1272     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1273     TRIE_LIST_CUR( state )++;                                   \
1274 } STMT_END
1275
1276 #define TRIE_LIST_NEW(state) STMT_START {                       \
1277     Newxz( trie->states[ state ].trans.list,               \
1278         4, reg_trie_trans_le );                                 \
1279      TRIE_LIST_CUR( state ) = 1;                                \
1280      TRIE_LIST_LEN( state ) = 4;                                \
1281 } STMT_END
1282
1283 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1284     U16 dupe= trie->states[ state ].wordnum;                    \
1285     regnode * const noper_next = regnext( noper );              \
1286                                                                 \
1287     DEBUG_r({                                                   \
1288         /* store the word for dumping */                        \
1289         SV* tmp;                                                \
1290         if (OP(noper) != NOTHING)                               \
1291             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1292         else                                                    \
1293             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1294         av_push( trie_words, tmp );                             \
1295     });                                                         \
1296                                                                 \
1297     curword++;                                                  \
1298     trie->wordinfo[curword].prev   = 0;                         \
1299     trie->wordinfo[curword].len    = wordlen;                   \
1300     trie->wordinfo[curword].accept = state;                     \
1301                                                                 \
1302     if ( noper_next < tail ) {                                  \
1303         if (!trie->jump)                                        \
1304             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1305         trie->jump[curword] = (U16)(noper_next - convert);      \
1306         if (!jumper)                                            \
1307             jumper = noper_next;                                \
1308         if (!nextbranch)                                        \
1309             nextbranch= regnext(cur);                           \
1310     }                                                           \
1311                                                                 \
1312     if ( dupe ) {                                               \
1313         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1314         /* chain, so that when the bits of chain are later    */\
1315         /* linked together, the dups appear in the chain      */\
1316         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1317         trie->wordinfo[dupe].prev = curword;                    \
1318     } else {                                                    \
1319         /* we haven't inserted this word yet.                */ \
1320         trie->states[ state ].wordnum = curword;                \
1321     }                                                           \
1322 } STMT_END
1323
1324
1325 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1326      ( ( base + charid >=  ucharcount                                   \
1327          && base + charid < ubound                                      \
1328          && state == trie->trans[ base - ucharcount + charid ].check    \
1329          && trie->trans[ base - ucharcount + charid ].next )            \
1330            ? trie->trans[ base - ucharcount + charid ].next             \
1331            : ( state==1 ? special : 0 )                                 \
1332       )
1333
1334 #define MADE_TRIE       1
1335 #define MADE_JUMP_TRIE  2
1336 #define MADE_EXACT_TRIE 4
1337
1338 STATIC I32
1339 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1340 {
1341     dVAR;
1342     /* first pass, loop through and scan words */
1343     reg_trie_data *trie;
1344     HV *widecharmap = NULL;
1345     AV *revcharmap = newAV();
1346     regnode *cur;
1347     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1348     STRLEN len = 0;
1349     UV uvc = 0;
1350     U16 curword = 0;
1351     U32 next_alloc = 0;
1352     regnode *jumper = NULL;
1353     regnode *nextbranch = NULL;
1354     regnode *convert = NULL;
1355     U32 *prev_states; /* temp array mapping each state to previous one */
1356     /* we just use folder as a flag in utf8 */
1357     const U8 * const folder = ( flags == EXACTF
1358                        ? PL_fold
1359                        : ( flags == EXACTFL
1360                            ? PL_fold_locale
1361                            : NULL
1362                          )
1363                      );
1364
1365 #ifdef DEBUGGING
1366     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1367     AV *trie_words = NULL;
1368     /* along with revcharmap, this only used during construction but both are
1369      * useful during debugging so we store them in the struct when debugging.
1370      */
1371 #else
1372     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1373     STRLEN trie_charcount=0;
1374 #endif
1375     SV *re_trie_maxbuff;
1376     GET_RE_DEBUG_FLAGS_DECL;
1377
1378     PERL_ARGS_ASSERT_MAKE_TRIE;
1379 #ifndef DEBUGGING
1380     PERL_UNUSED_ARG(depth);
1381 #endif
1382
1383     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1384     trie->refcount = 1;
1385     trie->startstate = 1;
1386     trie->wordcount = word_count;
1387     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1388     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1389     if (!(UTF && folder))
1390         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1391     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1392                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1393
1394     DEBUG_r({
1395         trie_words = newAV();
1396     });
1397
1398     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1399     if (!SvIOK(re_trie_maxbuff)) {
1400         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1401     }
1402     DEBUG_OPTIMISE_r({
1403                 PerlIO_printf( Perl_debug_log,
1404                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1405                   (int)depth * 2 + 2, "",
1406                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1407                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1408                   (int)depth);
1409     });
1410
1411    /* Find the node we are going to overwrite */
1412     if ( first == startbranch && OP( last ) != BRANCH ) {
1413         /* whole branch chain */
1414         convert = first;
1415     } else {
1416         /* branch sub-chain */
1417         convert = NEXTOPER( first );
1418     }
1419
1420     /*  -- First loop and Setup --
1421
1422        We first traverse the branches and scan each word to determine if it
1423        contains widechars, and how many unique chars there are, this is
1424        important as we have to build a table with at least as many columns as we
1425        have unique chars.
1426
1427        We use an array of integers to represent the character codes 0..255
1428        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1429        native representation of the character value as the key and IV's for the
1430        coded index.
1431
1432        *TODO* If we keep track of how many times each character is used we can
1433        remap the columns so that the table compression later on is more
1434        efficient in terms of memory by ensuring the most common value is in the
1435        middle and the least common are on the outside.  IMO this would be better
1436        than a most to least common mapping as theres a decent chance the most
1437        common letter will share a node with the least common, meaning the node
1438        will not be compressable. With a middle is most common approach the worst
1439        case is when we have the least common nodes twice.
1440
1441      */
1442
1443     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1444         regnode * const noper = NEXTOPER( cur );
1445         const U8 *uc = (U8*)STRING( noper );
1446         const U8 * const e  = uc + STR_LEN( noper );
1447         STRLEN foldlen = 0;
1448         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1449         const U8 *scan = (U8*)NULL;
1450         U32 wordlen      = 0;         /* required init */
1451         STRLEN chars = 0;
1452         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1453
1454         if (OP(noper) == NOTHING) {
1455             trie->minlen= 0;
1456             continue;
1457         }
1458         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1459             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1460                                           regardless of encoding */
1461
1462         for ( ; uc < e ; uc += len ) {
1463             TRIE_CHARCOUNT(trie)++;
1464             TRIE_READ_CHAR;
1465             chars++;
1466             if ( uvc < 256 ) {
1467                 if ( !trie->charmap[ uvc ] ) {
1468                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1469                     if ( folder )
1470                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1471                     TRIE_STORE_REVCHAR;
1472                 }
1473                 if ( set_bit ) {
1474                     /* store the codepoint in the bitmap, and if its ascii
1475                        also store its folded equivelent. */
1476                     TRIE_BITMAP_SET(trie,uvc);
1477
1478                     /* store the folded codepoint */
1479                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1480
1481                     if ( !UTF ) {
1482                         /* store first byte of utf8 representation of
1483                            codepoints in the 127 < uvc < 256 range */
1484                         if (127 < uvc && uvc < 192) {
1485                             TRIE_BITMAP_SET(trie,194);
1486                         } else if (191 < uvc ) {
1487                             TRIE_BITMAP_SET(trie,195);
1488                         /* && uvc < 256 -- we know uvc is < 256 already */
1489                         }
1490                     }
1491                     set_bit = 0; /* We've done our bit :-) */
1492                 }
1493             } else {
1494                 SV** svpp;
1495                 if ( !widecharmap )
1496                     widecharmap = newHV();
1497
1498                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1499
1500                 if ( !svpp )
1501                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1502
1503                 if ( !SvTRUE( *svpp ) ) {
1504                     sv_setiv( *svpp, ++trie->uniquecharcount );
1505                     TRIE_STORE_REVCHAR;
1506                 }
1507             }
1508         }
1509         if( cur == first ) {
1510             trie->minlen=chars;
1511             trie->maxlen=chars;
1512         } else if (chars < trie->minlen) {
1513             trie->minlen=chars;
1514         } else if (chars > trie->maxlen) {
1515             trie->maxlen=chars;
1516         }
1517
1518     } /* end first pass */
1519     DEBUG_TRIE_COMPILE_r(
1520         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1521                 (int)depth * 2 + 2,"",
1522                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1523                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1524                 (int)trie->minlen, (int)trie->maxlen )
1525     );
1526
1527     /*
1528         We now know what we are dealing with in terms of unique chars and
1529         string sizes so we can calculate how much memory a naive
1530         representation using a flat table  will take. If it's over a reasonable
1531         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1532         conservative but potentially much slower representation using an array
1533         of lists.
1534
1535         At the end we convert both representations into the same compressed
1536         form that will be used in regexec.c for matching with. The latter
1537         is a form that cannot be used to construct with but has memory
1538         properties similar to the list form and access properties similar
1539         to the table form making it both suitable for fast searches and
1540         small enough that its feasable to store for the duration of a program.
1541
1542         See the comment in the code where the compressed table is produced
1543         inplace from the flat tabe representation for an explanation of how
1544         the compression works.
1545
1546     */
1547
1548
1549     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1550     prev_states[1] = 0;
1551
1552     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1553         /*
1554             Second Pass -- Array Of Lists Representation
1555
1556             Each state will be represented by a list of charid:state records
1557             (reg_trie_trans_le) the first such element holds the CUR and LEN
1558             points of the allocated array. (See defines above).
1559
1560             We build the initial structure using the lists, and then convert
1561             it into the compressed table form which allows faster lookups
1562             (but cant be modified once converted).
1563         */
1564
1565         STRLEN transcount = 1;
1566
1567         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1568             "%*sCompiling trie using list compiler\n",
1569             (int)depth * 2 + 2, ""));
1570
1571         trie->states = (reg_trie_state *)
1572             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1573                                   sizeof(reg_trie_state) );
1574         TRIE_LIST_NEW(1);
1575         next_alloc = 2;
1576
1577         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1578
1579             regnode * const noper = NEXTOPER( cur );
1580             U8 *uc           = (U8*)STRING( noper );
1581             const U8 * const e = uc + STR_LEN( noper );
1582             U32 state        = 1;         /* required init */
1583             U16 charid       = 0;         /* sanity init */
1584             U8 *scan         = (U8*)NULL; /* sanity init */
1585             STRLEN foldlen   = 0;         /* required init */
1586             U32 wordlen      = 0;         /* required init */
1587             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1588
1589             if (OP(noper) != NOTHING) {
1590                 for ( ; uc < e ; uc += len ) {
1591
1592                     TRIE_READ_CHAR;
1593
1594                     if ( uvc < 256 ) {
1595                         charid = trie->charmap[ uvc ];
1596                     } else {
1597                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1598                         if ( !svpp ) {
1599                             charid = 0;
1600                         } else {
1601                             charid=(U16)SvIV( *svpp );
1602                         }
1603                     }
1604                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1605                     if ( charid ) {
1606
1607                         U16 check;
1608                         U32 newstate = 0;
1609
1610                         charid--;
1611                         if ( !trie->states[ state ].trans.list ) {
1612                             TRIE_LIST_NEW( state );
1613                         }
1614                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1615                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1616                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1617                                 break;
1618                             }
1619                         }
1620                         if ( ! newstate ) {
1621                             newstate = next_alloc++;
1622                             prev_states[newstate] = state;
1623                             TRIE_LIST_PUSH( state, charid, newstate );
1624                             transcount++;
1625                         }
1626                         state = newstate;
1627                     } else {
1628                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1629                     }
1630                 }
1631             }
1632             TRIE_HANDLE_WORD(state);
1633
1634         } /* end second pass */
1635
1636         /* next alloc is the NEXT state to be allocated */
1637         trie->statecount = next_alloc;
1638         trie->states = (reg_trie_state *)
1639             PerlMemShared_realloc( trie->states,
1640                                    next_alloc
1641                                    * sizeof(reg_trie_state) );
1642
1643         /* and now dump it out before we compress it */
1644         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1645                                                          revcharmap, next_alloc,
1646                                                          depth+1)
1647         );
1648
1649         trie->trans = (reg_trie_trans *)
1650             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1651         {
1652             U32 state;
1653             U32 tp = 0;
1654             U32 zp = 0;
1655
1656
1657             for( state=1 ; state < next_alloc ; state ++ ) {
1658                 U32 base=0;
1659
1660                 /*
1661                 DEBUG_TRIE_COMPILE_MORE_r(
1662                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1663                 );
1664                 */
1665
1666                 if (trie->states[state].trans.list) {
1667                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1668                     U16 maxid=minid;
1669                     U16 idx;
1670
1671                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1672                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1673                         if ( forid < minid ) {
1674                             minid=forid;
1675                         } else if ( forid > maxid ) {
1676                             maxid=forid;
1677                         }
1678                     }
1679                     if ( transcount < tp + maxid - minid + 1) {
1680                         transcount *= 2;
1681                         trie->trans = (reg_trie_trans *)
1682                             PerlMemShared_realloc( trie->trans,
1683                                                      transcount
1684                                                      * sizeof(reg_trie_trans) );
1685                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1686                     }
1687                     base = trie->uniquecharcount + tp - minid;
1688                     if ( maxid == minid ) {
1689                         U32 set = 0;
1690                         for ( ; zp < tp ; zp++ ) {
1691                             if ( ! trie->trans[ zp ].next ) {
1692                                 base = trie->uniquecharcount + zp - minid;
1693                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1694                                 trie->trans[ zp ].check = state;
1695                                 set = 1;
1696                                 break;
1697                             }
1698                         }
1699                         if ( !set ) {
1700                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1701                             trie->trans[ tp ].check = state;
1702                             tp++;
1703                             zp = tp;
1704                         }
1705                     } else {
1706                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1707                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1708                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1709                             trie->trans[ tid ].check = state;
1710                         }
1711                         tp += ( maxid - minid + 1 );
1712                     }
1713                     Safefree(trie->states[ state ].trans.list);
1714                 }
1715                 /*
1716                 DEBUG_TRIE_COMPILE_MORE_r(
1717                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1718                 );
1719                 */
1720                 trie->states[ state ].trans.base=base;
1721             }
1722             trie->lasttrans = tp + 1;
1723         }
1724     } else {
1725         /*
1726            Second Pass -- Flat Table Representation.
1727
1728            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1729            We know that we will need Charcount+1 trans at most to store the data
1730            (one row per char at worst case) So we preallocate both structures
1731            assuming worst case.
1732
1733            We then construct the trie using only the .next slots of the entry
1734            structs.
1735
1736            We use the .check field of the first entry of the node temporarily to
1737            make compression both faster and easier by keeping track of how many non
1738            zero fields are in the node.
1739
1740            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1741            transition.
1742
1743            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1744            number representing the first entry of the node, and state as a
1745            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1746            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1747            are 2 entrys per node. eg:
1748
1749              A B       A B
1750           1. 2 4    1. 3 7
1751           2. 0 3    3. 0 5
1752           3. 0 0    5. 0 0
1753           4. 0 0    7. 0 0
1754
1755            The table is internally in the right hand, idx form. However as we also
1756            have to deal with the states array which is indexed by nodenum we have to
1757            use TRIE_NODENUM() to convert.
1758
1759         */
1760         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1761             "%*sCompiling trie using table compiler\n",
1762             (int)depth * 2 + 2, ""));
1763
1764         trie->trans = (reg_trie_trans *)
1765             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1766                                   * trie->uniquecharcount + 1,
1767                                   sizeof(reg_trie_trans) );
1768         trie->states = (reg_trie_state *)
1769             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1770                                   sizeof(reg_trie_state) );
1771         next_alloc = trie->uniquecharcount + 1;
1772
1773
1774         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1775
1776             regnode * const noper   = NEXTOPER( cur );
1777             const U8 *uc     = (U8*)STRING( noper );
1778             const U8 * const e = uc + STR_LEN( noper );
1779
1780             U32 state        = 1;         /* required init */
1781
1782             U16 charid       = 0;         /* sanity init */
1783             U32 accept_state = 0;         /* sanity init */
1784             U8 *scan         = (U8*)NULL; /* sanity init */
1785
1786             STRLEN foldlen   = 0;         /* required init */
1787             U32 wordlen      = 0;         /* required init */
1788             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1789
1790             if ( OP(noper) != NOTHING ) {
1791                 for ( ; uc < e ; uc += len ) {
1792
1793                     TRIE_READ_CHAR;
1794
1795                     if ( uvc < 256 ) {
1796                         charid = trie->charmap[ uvc ];
1797                     } else {
1798                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1799                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1800                     }
1801                     if ( charid ) {
1802                         charid--;
1803                         if ( !trie->trans[ state + charid ].next ) {
1804                             trie->trans[ state + charid ].next = next_alloc;
1805                             trie->trans[ state ].check++;
1806                             prev_states[TRIE_NODENUM(next_alloc)]
1807                                     = TRIE_NODENUM(state);
1808                             next_alloc += trie->uniquecharcount;
1809                         }
1810                         state = trie->trans[ state + charid ].next;
1811                     } else {
1812                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1813                     }
1814                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1815                 }
1816             }
1817             accept_state = TRIE_NODENUM( state );
1818             TRIE_HANDLE_WORD(accept_state);
1819
1820         } /* end second pass */
1821
1822         /* and now dump it out before we compress it */
1823         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1824                                                           revcharmap,
1825                                                           next_alloc, depth+1));
1826
1827         {
1828         /*
1829            * Inplace compress the table.*
1830
1831            For sparse data sets the table constructed by the trie algorithm will
1832            be mostly 0/FAIL transitions or to put it another way mostly empty.
1833            (Note that leaf nodes will not contain any transitions.)
1834
1835            This algorithm compresses the tables by eliminating most such
1836            transitions, at the cost of a modest bit of extra work during lookup:
1837
1838            - Each states[] entry contains a .base field which indicates the
1839            index in the state[] array wheres its transition data is stored.
1840
1841            - If .base is 0 there are no valid transitions from that node.
1842
1843            - If .base is nonzero then charid is added to it to find an entry in
1844            the trans array.
1845
1846            -If trans[states[state].base+charid].check!=state then the
1847            transition is taken to be a 0/Fail transition. Thus if there are fail
1848            transitions at the front of the node then the .base offset will point
1849            somewhere inside the previous nodes data (or maybe even into a node
1850            even earlier), but the .check field determines if the transition is
1851            valid.
1852
1853            XXX - wrong maybe?
1854            The following process inplace converts the table to the compressed
1855            table: We first do not compress the root node 1,and mark all its
1856            .check pointers as 1 and set its .base pointer as 1 as well. This
1857            allows us to do a DFA construction from the compressed table later,
1858            and ensures that any .base pointers we calculate later are greater
1859            than 0.
1860
1861            - We set 'pos' to indicate the first entry of the second node.
1862
1863            - We then iterate over the columns of the node, finding the first and
1864            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1865            and set the .check pointers accordingly, and advance pos
1866            appropriately and repreat for the next node. Note that when we copy
1867            the next pointers we have to convert them from the original
1868            NODEIDX form to NODENUM form as the former is not valid post
1869            compression.
1870
1871            - If a node has no transitions used we mark its base as 0 and do not
1872            advance the pos pointer.
1873
1874            - If a node only has one transition we use a second pointer into the
1875            structure to fill in allocated fail transitions from other states.
1876            This pointer is independent of the main pointer and scans forward
1877            looking for null transitions that are allocated to a state. When it
1878            finds one it writes the single transition into the "hole".  If the
1879            pointer doesnt find one the single transition is appended as normal.
1880
1881            - Once compressed we can Renew/realloc the structures to release the
1882            excess space.
1883
1884            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1885            specifically Fig 3.47 and the associated pseudocode.
1886
1887            demq
1888         */
1889         const U32 laststate = TRIE_NODENUM( next_alloc );
1890         U32 state, charid;
1891         U32 pos = 0, zp=0;
1892         trie->statecount = laststate;
1893
1894         for ( state = 1 ; state < laststate ; state++ ) {
1895             U8 flag = 0;
1896             const U32 stateidx = TRIE_NODEIDX( state );
1897             const U32 o_used = trie->trans[ stateidx ].check;
1898             U32 used = trie->trans[ stateidx ].check;
1899             trie->trans[ stateidx ].check = 0;
1900
1901             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1902                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1903                     if ( trie->trans[ stateidx + charid ].next ) {
1904                         if (o_used == 1) {
1905                             for ( ; zp < pos ; zp++ ) {
1906                                 if ( ! trie->trans[ zp ].next ) {
1907                                     break;
1908                                 }
1909                             }
1910                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1911                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1912                             trie->trans[ zp ].check = state;
1913                             if ( ++zp > pos ) pos = zp;
1914                             break;
1915                         }
1916                         used--;
1917                     }
1918                     if ( !flag ) {
1919                         flag = 1;
1920                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1921                     }
1922                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1923                     trie->trans[ pos ].check = state;
1924                     pos++;
1925                 }
1926             }
1927         }
1928         trie->lasttrans = pos + 1;
1929         trie->states = (reg_trie_state *)
1930             PerlMemShared_realloc( trie->states, laststate
1931                                    * sizeof(reg_trie_state) );
1932         DEBUG_TRIE_COMPILE_MORE_r(
1933                 PerlIO_printf( Perl_debug_log,
1934                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1935                     (int)depth * 2 + 2,"",
1936                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1937                     (IV)next_alloc,
1938                     (IV)pos,
1939                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1940             );
1941
1942         } /* end table compress */
1943     }
1944     DEBUG_TRIE_COMPILE_MORE_r(
1945             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1946                 (int)depth * 2 + 2, "",
1947                 (UV)trie->statecount,
1948                 (UV)trie->lasttrans)
1949     );
1950     /* resize the trans array to remove unused space */
1951     trie->trans = (reg_trie_trans *)
1952         PerlMemShared_realloc( trie->trans, trie->lasttrans
1953                                * sizeof(reg_trie_trans) );
1954
1955     {   /* Modify the program and insert the new TRIE node */
1956         U8 nodetype =(U8)(flags & 0xFF);
1957         char *str=NULL;
1958
1959 #ifdef DEBUGGING
1960         regnode *optimize = NULL;
1961 #ifdef RE_TRACK_PATTERN_OFFSETS
1962
1963         U32 mjd_offset = 0;
1964         U32 mjd_nodelen = 0;
1965 #endif /* RE_TRACK_PATTERN_OFFSETS */
1966 #endif /* DEBUGGING */
1967         /*
1968            This means we convert either the first branch or the first Exact,
1969            depending on whether the thing following (in 'last') is a branch
1970            or not and whther first is the startbranch (ie is it a sub part of
1971            the alternation or is it the whole thing.)
1972            Assuming its a sub part we convert the EXACT otherwise we convert
1973            the whole branch sequence, including the first.
1974          */
1975         /* Find the node we are going to overwrite */
1976         if ( first != startbranch || OP( last ) == BRANCH ) {
1977             /* branch sub-chain */
1978             NEXT_OFF( first ) = (U16)(last - first);
1979 #ifdef RE_TRACK_PATTERN_OFFSETS
1980             DEBUG_r({
1981                 mjd_offset= Node_Offset((convert));
1982                 mjd_nodelen= Node_Length((convert));
1983             });
1984 #endif
1985             /* whole branch chain */
1986         }
1987 #ifdef RE_TRACK_PATTERN_OFFSETS
1988         else {
1989             DEBUG_r({
1990                 const  regnode *nop = NEXTOPER( convert );
1991                 mjd_offset= Node_Offset((nop));
1992                 mjd_nodelen= Node_Length((nop));
1993             });
1994         }
1995         DEBUG_OPTIMISE_r(
1996             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1997                 (int)depth * 2 + 2, "",
1998                 (UV)mjd_offset, (UV)mjd_nodelen)
1999         );
2000 #endif
2001         /* But first we check to see if there is a common prefix we can
2002            split out as an EXACT and put in front of the TRIE node.  */
2003         trie->startstate= 1;
2004         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2005             U32 state;
2006             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2007                 U32 ofs = 0;
2008                 I32 idx = -1;
2009                 U32 count = 0;
2010                 const U32 base = trie->states[ state ].trans.base;
2011
2012                 if ( trie->states[state].wordnum )
2013                         count = 1;
2014
2015                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2016                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2017                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2018                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2019                     {
2020                         if ( ++count > 1 ) {
2021                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2022                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2023                             if ( state == 1 ) break;
2024                             if ( count == 2 ) {
2025                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2026                                 DEBUG_OPTIMISE_r(
2027                                     PerlIO_printf(Perl_debug_log,
2028                                         "%*sNew Start State=%"UVuf" Class: [",
2029                                         (int)depth * 2 + 2, "",
2030                                         (UV)state));
2031                                 if (idx >= 0) {
2032                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2033                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2034
2035                                     TRIE_BITMAP_SET(trie,*ch);
2036                                     if ( folder )
2037                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2038                                     DEBUG_OPTIMISE_r(
2039                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2040                                     );
2041                                 }
2042                             }
2043                             TRIE_BITMAP_SET(trie,*ch);
2044                             if ( folder )
2045                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2046                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2047                         }
2048                         idx = ofs;
2049                     }
2050                 }
2051                 if ( count == 1 ) {
2052                     SV **tmp = av_fetch( revcharmap, idx, 0);
2053                     STRLEN len;
2054                     char *ch = SvPV( *tmp, len );
2055                     DEBUG_OPTIMISE_r({
2056                         SV *sv=sv_newmortal();
2057                         PerlIO_printf( Perl_debug_log,
2058                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2059                             (int)depth * 2 + 2, "",
2060                             (UV)state, (UV)idx,
2061                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2062                                 PL_colors[0], PL_colors[1],
2063                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2064                                 PERL_PV_ESCAPE_FIRSTCHAR
2065                             )
2066                         );
2067                     });
2068                     if ( state==1 ) {
2069                         OP( convert ) = nodetype;
2070                         str=STRING(convert);
2071                         STR_LEN(convert)=0;
2072                     }
2073                     STR_LEN(convert) += len;
2074                     while (len--)
2075                         *str++ = *ch++;
2076                 } else {
2077 #ifdef DEBUGGING
2078                     if (state>1)
2079                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2080 #endif
2081                     break;
2082                 }
2083             }
2084             trie->prefixlen = (state-1);
2085             if (str) {
2086                 regnode *n = convert+NODE_SZ_STR(convert);
2087                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2088                 trie->startstate = state;
2089                 trie->minlen -= (state - 1);
2090                 trie->maxlen -= (state - 1);
2091 #ifdef DEBUGGING
2092                /* At least the UNICOS C compiler choked on this
2093                 * being argument to DEBUG_r(), so let's just have
2094                 * it right here. */
2095                if (
2096 #ifdef PERL_EXT_RE_BUILD
2097                    1
2098 #else
2099                    DEBUG_r_TEST
2100 #endif
2101                    ) {
2102                    regnode *fix = convert;
2103                    U32 word = trie->wordcount;
2104                    mjd_nodelen++;
2105                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2106                    while( ++fix < n ) {
2107                        Set_Node_Offset_Length(fix, 0, 0);
2108                    }
2109                    while (word--) {
2110                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2111                        if (tmp) {
2112                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2113                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2114                            else
2115                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2116                        }
2117                    }
2118                }
2119 #endif
2120                 if (trie->maxlen) {
2121                     convert = n;
2122                 } else {
2123                     NEXT_OFF(convert) = (U16)(tail - convert);
2124                     DEBUG_r(optimize= n);
2125                 }
2126             }
2127         }
2128         if (!jumper)
2129             jumper = last;
2130         if ( trie->maxlen ) {
2131             NEXT_OFF( convert ) = (U16)(tail - convert);
2132             ARG_SET( convert, data_slot );
2133             /* Store the offset to the first unabsorbed branch in
2134                jump[0], which is otherwise unused by the jump logic.
2135                We use this when dumping a trie and during optimisation. */
2136             if (trie->jump)
2137                 trie->jump[0] = (U16)(nextbranch - convert);
2138
2139             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2140              *   and there is a bitmap
2141              *   and the first "jump target" node we found leaves enough room
2142              * then convert the TRIE node into a TRIEC node, with the bitmap
2143              * embedded inline in the opcode - this is hypothetically faster.
2144              */
2145             if ( !trie->states[trie->startstate].wordnum
2146                  && trie->bitmap
2147                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2148             {
2149                 OP( convert ) = TRIEC;
2150                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2151                 PerlMemShared_free(trie->bitmap);
2152                 trie->bitmap= NULL;
2153             } else
2154                 OP( convert ) = TRIE;
2155
2156             /* store the type in the flags */
2157             convert->flags = nodetype;
2158             DEBUG_r({
2159             optimize = convert
2160                       + NODE_STEP_REGNODE
2161                       + regarglen[ OP( convert ) ];
2162             });
2163             /* XXX We really should free up the resource in trie now,
2164                    as we won't use them - (which resources?) dmq */
2165         }
2166         /* needed for dumping*/
2167         DEBUG_r(if (optimize) {
2168             regnode *opt = convert;
2169
2170             while ( ++opt < optimize) {
2171                 Set_Node_Offset_Length(opt,0,0);
2172             }
2173             /*
2174                 Try to clean up some of the debris left after the
2175                 optimisation.
2176              */
2177             while( optimize < jumper ) {
2178                 mjd_nodelen += Node_Length((optimize));
2179                 OP( optimize ) = OPTIMIZED;
2180                 Set_Node_Offset_Length(optimize,0,0);
2181                 optimize++;
2182             }
2183             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2184         });
2185     } /* end node insert */
2186
2187     /*  Finish populating the prev field of the wordinfo array.  Walk back
2188      *  from each accept state until we find another accept state, and if
2189      *  so, point the first word's .prev field at the second word. If the
2190      *  second already has a .prev field set, stop now. This will be the
2191      *  case either if we've already processed that word's accept state,
2192      *  or that state had multiple words, and the overspill words were
2193      *  already linked up earlier.
2194      */
2195     {
2196         U16 word;
2197         U32 state;
2198         U16 prev;
2199
2200         for (word=1; word <= trie->wordcount; word++) {
2201             prev = 0;
2202             if (trie->wordinfo[word].prev)
2203                 continue;
2204             state = trie->wordinfo[word].accept;
2205             while (state) {
2206                 state = prev_states[state];
2207                 if (!state)
2208                     break;
2209                 prev = trie->states[state].wordnum;
2210                 if (prev)
2211                     break;
2212             }
2213             trie->wordinfo[word].prev = prev;
2214         }
2215         Safefree(prev_states);
2216     }
2217
2218
2219     /* and now dump out the compressed format */
2220     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2221
2222     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2223 #ifdef DEBUGGING
2224     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2225     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2226 #else
2227     SvREFCNT_dec(revcharmap);
2228 #endif
2229     return trie->jump
2230            ? MADE_JUMP_TRIE
2231            : trie->startstate>1
2232              ? MADE_EXACT_TRIE
2233              : MADE_TRIE;
2234 }
2235
2236 STATIC void
2237 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2238 {
2239 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2240
2241    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2242    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2243    ISBN 0-201-10088-6
2244
2245    We find the fail state for each state in the trie, this state is the longest proper
2246    suffix of the current state's 'word' that is also a proper prefix of another word in our
2247    trie. State 1 represents the word '' and is thus the default fail state. This allows
2248    the DFA not to have to restart after its tried and failed a word at a given point, it
2249    simply continues as though it had been matching the other word in the first place.
2250    Consider
2251       'abcdgu'=~/abcdefg|cdgu/
2252    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2253    fail, which would bring us to the state representing 'd' in the second word where we would
2254    try 'g' and succeed, proceeding to match 'cdgu'.
2255  */
2256  /* add a fail transition */
2257     const U32 trie_offset = ARG(source);
2258     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2259     U32 *q;
2260     const U32 ucharcount = trie->uniquecharcount;
2261     const U32 numstates = trie->statecount;
2262     const U32 ubound = trie->lasttrans + ucharcount;
2263     U32 q_read = 0;
2264     U32 q_write = 0;
2265     U32 charid;
2266     U32 base = trie->states[ 1 ].trans.base;
2267     U32 *fail;
2268     reg_ac_data *aho;
2269     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2270     GET_RE_DEBUG_FLAGS_DECL;
2271
2272     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2273 #ifndef DEBUGGING
2274     PERL_UNUSED_ARG(depth);
2275 #endif
2276
2277
2278     ARG_SET( stclass, data_slot );
2279     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2280     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2281     aho->trie=trie_offset;
2282     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2283     Copy( trie->states, aho->states, numstates, reg_trie_state );
2284     Newxz( q, numstates, U32);
2285     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2286     aho->refcount = 1;
2287     fail = aho->fail;
2288     /* initialize fail[0..1] to be 1 so that we always have
2289        a valid final fail state */
2290     fail[ 0 ] = fail[ 1 ] = 1;
2291
2292     for ( charid = 0; charid < ucharcount ; charid++ ) {
2293         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2294         if ( newstate ) {
2295             q[ q_write ] = newstate;
2296             /* set to point at the root */
2297             fail[ q[ q_write++ ] ]=1;
2298         }
2299     }
2300     while ( q_read < q_write) {
2301         const U32 cur = q[ q_read++ % numstates ];
2302         base = trie->states[ cur ].trans.base;
2303
2304         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2305             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2306             if (ch_state) {
2307                 U32 fail_state = cur;
2308                 U32 fail_base;
2309                 do {
2310                     fail_state = fail[ fail_state ];
2311                     fail_base = aho->states[ fail_state ].trans.base;
2312                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2313
2314                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2315                 fail[ ch_state ] = fail_state;
2316                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2317                 {
2318                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2319                 }
2320                 q[ q_write++ % numstates] = ch_state;
2321             }
2322         }
2323     }
2324     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2325        when we fail in state 1, this allows us to use the
2326        charclass scan to find a valid start char. This is based on the principle
2327        that theres a good chance the string being searched contains lots of stuff
2328        that cant be a start char.
2329      */
2330     fail[ 0 ] = fail[ 1 ] = 0;
2331     DEBUG_TRIE_COMPILE_r({
2332         PerlIO_printf(Perl_debug_log,
2333                       "%*sStclass Failtable (%"UVuf" states): 0",
2334                       (int)(depth * 2), "", (UV)numstates
2335         );
2336         for( q_read=1; q_read<numstates; q_read++ ) {
2337             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2338         }
2339         PerlIO_printf(Perl_debug_log, "\n");
2340     });
2341     Safefree(q);
2342     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2343 }
2344
2345
2346 /*
2347  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2348  * These need to be revisited when a newer toolchain becomes available.
2349  */
2350 #if defined(__sparc64__) && defined(__GNUC__)
2351 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2352 #       undef  SPARC64_GCC_WORKAROUND
2353 #       define SPARC64_GCC_WORKAROUND 1
2354 #   endif
2355 #endif
2356
2357 #define DEBUG_PEEP(str,scan,depth) \
2358     DEBUG_OPTIMISE_r({if (scan){ \
2359        SV * const mysv=sv_newmortal(); \
2360        regnode *Next = regnext(scan); \
2361        regprop(RExC_rx, mysv, scan); \
2362        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2363        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2364        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2365    }});
2366
2367
2368
2369
2370
2371 #define JOIN_EXACT(scan,min,flags) \
2372     if (PL_regkind[OP(scan)] == EXACT) \
2373         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2374
2375 STATIC U32
2376 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2377     /* Merge several consecutive EXACTish nodes into one. */
2378     regnode *n = regnext(scan);
2379     U32 stringok = 1;
2380     regnode *next = scan + NODE_SZ_STR(scan);
2381     U32 merged = 0;
2382     U32 stopnow = 0;
2383 #ifdef DEBUGGING
2384     regnode *stop = scan;
2385     GET_RE_DEBUG_FLAGS_DECL;
2386 #else
2387     PERL_UNUSED_ARG(depth);
2388 #endif
2389
2390     PERL_ARGS_ASSERT_JOIN_EXACT;
2391 #ifndef EXPERIMENTAL_INPLACESCAN
2392     PERL_UNUSED_ARG(flags);
2393     PERL_UNUSED_ARG(val);
2394 #endif
2395     DEBUG_PEEP("join",scan,depth);
2396
2397     /* Skip NOTHING, merge EXACT*. */
2398     while (n &&
2399            ( PL_regkind[OP(n)] == NOTHING ||
2400              (stringok && (OP(n) == OP(scan))))
2401            && NEXT_OFF(n)
2402            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2403
2404         if (OP(n) == TAIL || n > next)
2405             stringok = 0;
2406         if (PL_regkind[OP(n)] == NOTHING) {
2407             DEBUG_PEEP("skip:",n,depth);
2408             NEXT_OFF(scan) += NEXT_OFF(n);
2409             next = n + NODE_STEP_REGNODE;
2410 #ifdef DEBUGGING
2411             if (stringok)
2412                 stop = n;
2413 #endif
2414             n = regnext(n);
2415         }
2416         else if (stringok) {
2417             const unsigned int oldl = STR_LEN(scan);
2418             regnode * const nnext = regnext(n);
2419
2420             DEBUG_PEEP("merg",n,depth);
2421
2422             merged++;
2423             if (oldl + STR_LEN(n) > U8_MAX)
2424                 break;
2425             NEXT_OFF(scan) += NEXT_OFF(n);
2426             STR_LEN(scan) += STR_LEN(n);
2427             next = n + NODE_SZ_STR(n);
2428             /* Now we can overwrite *n : */
2429             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2430 #ifdef DEBUGGING
2431             stop = next - 1;
2432 #endif
2433             n = nnext;
2434             if (stopnow) break;
2435         }
2436
2437 #ifdef EXPERIMENTAL_INPLACESCAN
2438         if (flags && !NEXT_OFF(n)) {
2439             DEBUG_PEEP("atch", val, depth);
2440             if (reg_off_by_arg[OP(n)]) {
2441                 ARG_SET(n, val - n);
2442             }
2443             else {
2444                 NEXT_OFF(n) = val - n;
2445             }
2446             stopnow = 1;
2447         }
2448 #endif
2449     }
2450
2451     if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2452     /*
2453     Two problematic code points in Unicode casefolding of EXACT nodes:
2454
2455     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2456     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2457
2458     which casefold to
2459
2460     Unicode                      UTF-8
2461
2462     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2463     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2464
2465     This means that in case-insensitive matching (or "loose matching",
2466     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2467     length of the above casefolded versions) can match a target string
2468     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2469     This would rather mess up the minimum length computation.
2470
2471     What we'll do is to look for the tail four bytes, and then peek
2472     at the preceding two bytes to see whether we need to decrease
2473     the minimum length by four (six minus two).
2474
2475     Thanks to the design of UTF-8, there cannot be false matches:
2476     A sequence of valid UTF-8 bytes cannot be a subsequence of
2477     another valid sequence of UTF-8 bytes.
2478
2479     */
2480          char * const s0 = STRING(scan), *s, *t;
2481          char * const s1 = s0 + STR_LEN(scan) - 1;
2482          char * const s2 = s1 - 4;
2483 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2484          const char t0[] = "\xaf\x49\xaf\x42";
2485 #else
2486          const char t0[] = "\xcc\x88\xcc\x81";
2487 #endif
2488          const char * const t1 = t0 + 3;
2489
2490          for (s = s0 + 2;
2491               s < s2 && (t = ninstr(s, s1, t0, t1));
2492               s = t + 4) {
2493 #ifdef EBCDIC
2494               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2495                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2496 #else
2497               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2498                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2499 #endif
2500                    *min -= 4;
2501          }
2502     }
2503
2504 #ifdef DEBUGGING
2505     /* Allow dumping */
2506     n = scan + NODE_SZ_STR(scan);
2507     while (n <= stop) {
2508         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2509             OP(n) = OPTIMIZED;
2510             NEXT_OFF(n) = 0;
2511         }
2512         n++;
2513     }
2514 #endif
2515     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2516     return stopnow;
2517 }
2518
2519 /* REx optimizer.  Converts nodes into quickier variants "in place".
2520    Finds fixed substrings.  */
2521
2522 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2523    to the position after last scanned or to NULL. */
2524
2525 #define INIT_AND_WITHP \
2526     assert(!and_withp); \
2527     Newx(and_withp,1,struct regnode_charclass_class); \
2528     SAVEFREEPV(and_withp)
2529
2530 /* this is a chain of data about sub patterns we are processing that
2531    need to be handled seperately/specially in study_chunk. Its so
2532    we can simulate recursion without losing state.  */
2533 struct scan_frame;
2534 typedef struct scan_frame {
2535     regnode *last;  /* last node to process in this frame */
2536     regnode *next;  /* next node to process when last is reached */
2537     struct scan_frame *prev; /*previous frame*/
2538     I32 stop; /* what stopparen do we use */
2539 } scan_frame;
2540
2541
2542 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2543
2544 #define CASE_SYNST_FNC(nAmE)                                       \
2545 case nAmE:                                                         \
2546     if (flags & SCF_DO_STCLASS_AND) {                              \
2547             for (value = 0; value < 256; value++)                  \
2548                 if (!is_ ## nAmE ## _cp(value))                       \
2549                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2550     }                                                              \
2551     else {                                                         \
2552             for (value = 0; value < 256; value++)                  \
2553                 if (is_ ## nAmE ## _cp(value))                        \
2554                     ANYOF_BITMAP_SET(data->start_class, value);    \
2555     }                                                              \
2556     break;                                                         \
2557 case N ## nAmE:                                                    \
2558     if (flags & SCF_DO_STCLASS_AND) {                              \
2559             for (value = 0; value < 256; value++)                   \
2560                 if (is_ ## nAmE ## _cp(value))                         \
2561                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2562     }                                                               \
2563     else {                                                          \
2564             for (value = 0; value < 256; value++)                   \
2565                 if (!is_ ## nAmE ## _cp(value))                        \
2566                     ANYOF_BITMAP_SET(data->start_class, value);     \
2567     }                                                               \
2568     break
2569
2570
2571
2572 STATIC I32
2573 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2574                         I32 *minlenp, I32 *deltap,
2575                         regnode *last,
2576                         scan_data_t *data,
2577                         I32 stopparen,
2578                         U8* recursed,
2579                         struct regnode_charclass_class *and_withp,
2580                         U32 flags, U32 depth)
2581                         /* scanp: Start here (read-write). */
2582                         /* deltap: Write maxlen-minlen here. */
2583                         /* last: Stop before this one. */
2584                         /* data: string data about the pattern */
2585                         /* stopparen: treat close N as END */
2586                         /* recursed: which subroutines have we recursed into */
2587                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2588 {
2589     dVAR;
2590     I32 min = 0, pars = 0, code;
2591     regnode *scan = *scanp, *next;
2592     I32 delta = 0;
2593     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2594     int is_inf_internal = 0;            /* The studied chunk is infinite */
2595     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2596     scan_data_t data_fake;
2597     SV *re_trie_maxbuff = NULL;
2598     regnode *first_non_open = scan;
2599     I32 stopmin = I32_MAX;
2600     scan_frame *frame = NULL;
2601     GET_RE_DEBUG_FLAGS_DECL;
2602
2603     PERL_ARGS_ASSERT_STUDY_CHUNK;
2604
2605 #ifdef DEBUGGING
2606     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2607 #endif
2608
2609     if ( depth == 0 ) {
2610         while (first_non_open && OP(first_non_open) == OPEN)
2611             first_non_open=regnext(first_non_open);
2612     }
2613
2614
2615   fake_study_recurse:
2616     while ( scan && OP(scan) != END && scan < last ){
2617         /* Peephole optimizer: */
2618         DEBUG_STUDYDATA("Peep:", data,depth);
2619         DEBUG_PEEP("Peep",scan,depth);
2620         JOIN_EXACT(scan,&min,0);
2621
2622         /* Follow the next-chain of the current node and optimize
2623            away all the NOTHINGs from it.  */
2624         if (OP(scan) != CURLYX) {
2625             const int max = (reg_off_by_arg[OP(scan)]
2626                        ? I32_MAX
2627                        /* I32 may be smaller than U16 on CRAYs! */
2628                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2629             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2630             int noff;
2631             regnode *n = scan;
2632
2633             /* Skip NOTHING and LONGJMP. */
2634             while ((n = regnext(n))
2635                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2636                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2637                    && off + noff < max)
2638                 off += noff;
2639             if (reg_off_by_arg[OP(scan)])
2640                 ARG(scan) = off;
2641             else
2642                 NEXT_OFF(scan) = off;
2643         }
2644
2645
2646
2647         /* The principal pseudo-switch.  Cannot be a switch, since we
2648            look into several different things.  */
2649         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2650                    || OP(scan) == IFTHEN) {
2651             next = regnext(scan);
2652             code = OP(scan);
2653             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2654
2655             if (OP(next) == code || code == IFTHEN) {
2656                 /* NOTE - There is similar code to this block below for handling
2657                    TRIE nodes on a re-study.  If you change stuff here check there
2658                    too. */
2659                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2660                 struct regnode_charclass_class accum;
2661                 regnode * const startbranch=scan;
2662
2663                 if (flags & SCF_DO_SUBSTR)
2664                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2665                 if (flags & SCF_DO_STCLASS)
2666                     cl_init_zero(pRExC_state, &accum);
2667
2668                 while (OP(scan) == code) {
2669                     I32 deltanext, minnext, f = 0, fake;
2670                     struct regnode_charclass_class this_class;
2671
2672                     num++;
2673                     data_fake.flags = 0;
2674                     if (data) {
2675                         data_fake.whilem_c = data->whilem_c;
2676                         data_fake.last_closep = data->last_closep;
2677                     }
2678                     else
2679                         data_fake.last_closep = &fake;
2680
2681                     data_fake.pos_delta = delta;
2682                     next = regnext(scan);
2683                     scan = NEXTOPER(scan);
2684                     if (code != BRANCH)
2685                         scan = NEXTOPER(scan);
2686                     if (flags & SCF_DO_STCLASS) {
2687                         cl_init(pRExC_state, &this_class);
2688                         data_fake.start_class = &this_class;
2689                         f = SCF_DO_STCLASS_AND;
2690                     }
2691                     if (flags & SCF_WHILEM_VISITED_POS)
2692                         f |= SCF_WHILEM_VISITED_POS;
2693
2694                     /* we suppose the run is continuous, last=next...*/
2695                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2696                                           next, &data_fake,
2697                                           stopparen, recursed, NULL, f,depth+1);
2698                     if (min1 > minnext)
2699                         min1 = minnext;
2700                     if (max1 < minnext + deltanext)
2701                         max1 = minnext + deltanext;
2702                     if (deltanext == I32_MAX)
2703                         is_inf = is_inf_internal = 1;
2704                     scan = next;
2705                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2706                         pars++;
2707                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2708                         if ( stopmin > minnext)
2709                             stopmin = min + min1;
2710                         flags &= ~SCF_DO_SUBSTR;
2711                         if (data)
2712                             data->flags |= SCF_SEEN_ACCEPT;
2713                     }
2714                     if (data) {
2715                         if (data_fake.flags & SF_HAS_EVAL)
2716                             data->flags |= SF_HAS_EVAL;
2717                         data->whilem_c = data_fake.whilem_c;
2718                     }
2719                     if (flags & SCF_DO_STCLASS)
2720                         cl_or(pRExC_state, &accum, &this_class);
2721                 }
2722                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2723                     min1 = 0;
2724                 if (flags & SCF_DO_SUBSTR) {
2725                     data->pos_min += min1;
2726                     data->pos_delta += max1 - min1;
2727                     if (max1 != min1 || is_inf)
2728                         data->longest = &(data->longest_float);
2729                 }
2730                 min += min1;
2731                 delta += max1 - min1;
2732                 if (flags & SCF_DO_STCLASS_OR) {
2733                     cl_or(pRExC_state, data->start_class, &accum);
2734                     if (min1) {
2735                         cl_and(data->start_class, and_withp);
2736                         flags &= ~SCF_DO_STCLASS;
2737                     }
2738                 }
2739                 else if (flags & SCF_DO_STCLASS_AND) {
2740                     if (min1) {
2741                         cl_and(data->start_class, &accum);
2742                         flags &= ~SCF_DO_STCLASS;
2743                     }
2744                     else {
2745                         /* Switch to OR mode: cache the old value of
2746                          * data->start_class */
2747                         INIT_AND_WITHP;
2748                         StructCopy(data->start_class, and_withp,
2749                                    struct regnode_charclass_class);
2750                         flags &= ~SCF_DO_STCLASS_AND;
2751                         StructCopy(&accum, data->start_class,
2752                                    struct regnode_charclass_class);
2753                         flags |= SCF_DO_STCLASS_OR;
2754                         data->start_class->flags |= ANYOF_EOS;
2755                     }
2756                 }
2757
2758                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2759                 /* demq.
2760
2761                    Assuming this was/is a branch we are dealing with: 'scan' now
2762                    points at the item that follows the branch sequence, whatever
2763                    it is. We now start at the beginning of the sequence and look
2764                    for subsequences of
2765
2766                    BRANCH->EXACT=>x1
2767                    BRANCH->EXACT=>x2
2768                    tail
2769
2770                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2771
2772                    If we can find such a subseqence we need to turn the first
2773                    element into a trie and then add the subsequent branch exact
2774                    strings to the trie.
2775
2776                    We have two cases
2777
2778                      1. patterns where the whole set of branches can be converted.
2779
2780                      2. patterns where only a subset can be converted.
2781
2782                    In case 1 we can replace the whole set with a single regop
2783                    for the trie. In case 2 we need to keep the start and end
2784                    branches so
2785
2786                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2787                      becomes BRANCH TRIE; BRANCH X;
2788
2789                   There is an additional case, that being where there is a
2790                   common prefix, which gets split out into an EXACT like node
2791                   preceding the TRIE node.
2792
2793                   If x(1..n)==tail then we can do a simple trie, if not we make
2794                   a "jump" trie, such that when we match the appropriate word
2795                   we "jump" to the appopriate tail node. Essentailly we turn
2796                   a nested if into a case structure of sorts.
2797
2798                 */
2799
2800                     int made=0;
2801                     if (!re_trie_maxbuff) {
2802                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2803                         if (!SvIOK(re_trie_maxbuff))
2804                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2805                     }
2806                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2807                         regnode *cur;
2808                         regnode *first = (regnode *)NULL;
2809                         regnode *last = (regnode *)NULL;
2810                         regnode *tail = scan;
2811                         U8 optype = 0;
2812                         U32 count=0;
2813
2814 #ifdef DEBUGGING
2815                         SV * const mysv = sv_newmortal();       /* for dumping */
2816 #endif
2817                         /* var tail is used because there may be a TAIL
2818                            regop in the way. Ie, the exacts will point to the
2819                            thing following the TAIL, but the last branch will
2820                            point at the TAIL. So we advance tail. If we
2821                            have nested (?:) we may have to move through several
2822                            tails.
2823                          */
2824
2825                         while ( OP( tail ) == TAIL ) {
2826                             /* this is the TAIL generated by (?:) */
2827                             tail = regnext( tail );
2828                         }
2829
2830
2831                         DEBUG_OPTIMISE_r({
2832                             regprop(RExC_rx, mysv, tail );
2833                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2834                                 (int)depth * 2 + 2, "",
2835                                 "Looking for TRIE'able sequences. Tail node is: ",
2836                                 SvPV_nolen_const( mysv )
2837                             );
2838                         });
2839
2840                         /*
2841
2842                            step through the branches, cur represents each
2843                            branch, noper is the first thing to be matched
2844                            as part of that branch and noper_next is the
2845                            regnext() of that node. if noper is an EXACT
2846                            and noper_next is the same as scan (our current
2847                            position in the regex) then the EXACT branch is
2848                            a possible optimization target. Once we have
2849                            two or more consequetive such branches we can
2850                            create a trie of the EXACT's contents and stich
2851                            it in place. If the sequence represents all of
2852                            the branches we eliminate the whole thing and
2853                            replace it with a single TRIE. If it is a
2854                            subsequence then we need to stitch it in. This
2855                            means the first branch has to remain, and needs
2856                            to be repointed at the item on the branch chain
2857                            following the last branch optimized. This could
2858                            be either a BRANCH, in which case the
2859                            subsequence is internal, or it could be the
2860                            item following the branch sequence in which
2861                            case the subsequence is at the end.
2862
2863                         */
2864
2865                         /* dont use tail as the end marker for this traverse */
2866                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2867                             regnode * const noper = NEXTOPER( cur );
2868 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2869                             regnode * const noper_next = regnext( noper );
2870 #endif
2871
2872                             DEBUG_OPTIMISE_r({
2873                                 regprop(RExC_rx, mysv, cur);
2874                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2875                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2876
2877                                 regprop(RExC_rx, mysv, noper);
2878                                 PerlIO_printf( Perl_debug_log, " -> %s",
2879                                     SvPV_nolen_const(mysv));
2880
2881                                 if ( noper_next ) {
2882                                   regprop(RExC_rx, mysv, noper_next );
2883                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2884                                     SvPV_nolen_const(mysv));
2885                                 }
2886                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2887                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2888                             });
2889                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2890                                          : PL_regkind[ OP( noper ) ] == EXACT )
2891                                   || OP(noper) == NOTHING )
2892 #ifdef NOJUMPTRIE
2893                                   && noper_next == tail
2894 #endif
2895                                   && count < U16_MAX)
2896                             {
2897                                 count++;
2898                                 if ( !first || optype == NOTHING ) {
2899                                     if (!first) first = cur;
2900                                     optype = OP( noper );
2901                                 } else {
2902                                     last = cur;
2903                                 }
2904                             } else {
2905 /*
2906     Currently we do not believe that the trie logic can
2907     handle case insensitive matching properly when the
2908     pattern is not unicode (thus forcing unicode semantics).
2909
2910     If/when this is fixed the following define can be swapped
2911     in below to fully enable trie logic.
2912
2913 #define TRIE_TYPE_IS_SAFE 1
2914
2915 */
2916 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2917
2918                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2919                                     make_trie( pRExC_state,
2920                                             startbranch, first, cur, tail, count,
2921                                             optype, depth+1 );
2922                                 }
2923                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2924 #ifdef NOJUMPTRIE
2925                                      && noper_next == tail
2926 #endif
2927                                 ){
2928                                     count = 1;
2929                                     first = cur;
2930                                     optype = OP( noper );
2931                                 } else {
2932                                     count = 0;
2933                                     first = NULL;
2934                                     optype = 0;
2935                                 }
2936                                 last = NULL;
2937                             }
2938                         }
2939                         DEBUG_OPTIMISE_r({
2940                             regprop(RExC_rx, mysv, cur);
2941                             PerlIO_printf( Perl_debug_log,
2942                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2943                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2944
2945                         });
2946
2947                         if ( last && TRIE_TYPE_IS_SAFE ) {
2948                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2949 #ifdef TRIE_STUDY_OPT
2950                             if ( ((made == MADE_EXACT_TRIE &&
2951                                  startbranch == first)
2952                                  || ( first_non_open == first )) &&
2953                                  depth==0 ) {
2954                                 flags |= SCF_TRIE_RESTUDY;
2955                                 if ( startbranch == first
2956                                      && scan == tail )
2957                                 {
2958                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2959                                 }
2960                             }
2961 #endif
2962                         }
2963                     }
2964
2965                 } /* do trie */
2966
2967             }
2968             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2969                 scan = NEXTOPER(NEXTOPER(scan));
2970             } else                      /* single branch is optimized. */
2971                 scan = NEXTOPER(scan);
2972             continue;
2973         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2974             scan_frame *newframe = NULL;
2975             I32 paren;
2976             regnode *start;
2977             regnode *end;
2978
2979             if (OP(scan) != SUSPEND) {
2980             /* set the pointer */
2981                 if (OP(scan) == GOSUB) {
2982                     paren = ARG(scan);
2983                     RExC_recurse[ARG2L(scan)] = scan;
2984                     start = RExC_open_parens[paren-1];
2985                     end   = RExC_close_parens[paren-1];
2986                 } else {
2987                     paren = 0;
2988                     start = RExC_rxi->program + 1;
2989                     end   = RExC_opend;
2990                 }
2991                 if (!recursed) {
2992                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2993                     SAVEFREEPV(recursed);
2994                 }
2995                 if (!PAREN_TEST(recursed,paren+1)) {
2996                     PAREN_SET(recursed,paren+1);
2997                     Newx(newframe,1,scan_frame);
2998                 } else {
2999                     if (flags & SCF_DO_SUBSTR) {
3000                         SCAN_COMMIT(pRExC_state,data,minlenp);
3001                         data->longest = &(data->longest_float);
3002                     }
3003                     is_inf = is_inf_internal = 1;
3004                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3005                         cl_anything(pRExC_state, data->start_class);
3006                     flags &= ~SCF_DO_STCLASS;
3007                 }
3008             } else {
3009                 Newx(newframe,1,scan_frame);
3010                 paren = stopparen;
3011                 start = scan+2;
3012                 end = regnext(scan);
3013             }
3014             if (newframe) {
3015                 assert(start);
3016                 assert(end);
3017                 SAVEFREEPV(newframe);
3018                 newframe->next = regnext(scan);
3019                 newframe->last = last;
3020                 newframe->stop = stopparen;
3021                 newframe->prev = frame;
3022
3023                 frame = newframe;
3024                 scan =  start;
3025                 stopparen = paren;
3026                 last = end;
3027
3028                 continue;
3029             }
3030         }
3031         else if (OP(scan) == EXACT) {
3032             I32 l = STR_LEN(scan);
3033             UV uc;
3034             if (UTF) {
3035                 const U8 * const s = (U8*)STRING(scan);
3036                 l = utf8_length(s, s + l);
3037                 uc = utf8_to_uvchr(s, NULL);
3038             } else {
3039                 uc = *((U8*)STRING(scan));
3040             }
3041             min += l;
3042             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3043                 /* The code below prefers earlier match for fixed
3044                    offset, later match for variable offset.  */
3045                 if (data->last_end == -1) { /* Update the start info. */
3046                     data->last_start_min = data->pos_min;
3047                     data->last_start_max = is_inf
3048                         ? I32_MAX : data->pos_min + data->pos_delta;
3049                 }
3050                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3051                 if (UTF)
3052                     SvUTF8_on(data->last_found);
3053                 {
3054                     SV * const sv = data->last_found;
3055                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3056                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3057                     if (mg && mg->mg_len >= 0)
3058                         mg->mg_len += utf8_length((U8*)STRING(scan),
3059                                                   (U8*)STRING(scan)+STR_LEN(scan));
3060                 }
3061                 data->last_end = data->pos_min + l;
3062                 data->pos_min += l; /* As in the first entry. */
3063                 data->flags &= ~SF_BEFORE_EOL;
3064             }
3065             if (flags & SCF_DO_STCLASS_AND) {
3066                 /* Check whether it is compatible with what we know already! */
3067                 int compat = 1;
3068
3069                 if (uc >= 0x100 ||
3070                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3071                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3072                     && (!(data->start_class->flags & ANYOF_FOLD)
3073                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3074                     )
3075                     compat = 0;
3076                 ANYOF_CLASS_ZERO(data->start_class);
3077                 ANYOF_BITMAP_ZERO(data->start_class);
3078                 if (compat)
3079                     ANYOF_BITMAP_SET(data->start_class, uc);
3080                 data->start_class->flags &= ~ANYOF_EOS;
3081                 if (uc < 0x100)
3082                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3083             }
3084             else if (flags & SCF_DO_STCLASS_OR) {
3085                 /* false positive possible if the class is case-folded */
3086                 if (uc < 0x100)
3087                     ANYOF_BITMAP_SET(data->start_class, uc);
3088                 else
3089                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3090                 data->start_class->flags &= ~ANYOF_EOS;
3091                 cl_and(data->start_class, and_withp);
3092             }
3093             flags &= ~SCF_DO_STCLASS;
3094         }
3095         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3096             I32 l = STR_LEN(scan);
3097             UV uc = *((U8*)STRING(scan));
3098
3099             /* Search for fixed substrings supports EXACT only. */
3100             if (flags & SCF_DO_SUBSTR) {
3101                 assert(data);
3102                 SCAN_COMMIT(pRExC_state, data, minlenp);
3103             }
3104             if (UTF) {
3105                 const U8 * const s = (U8 *)STRING(scan);
3106                 l = utf8_length(s, s + l);
3107                 uc = utf8_to_uvchr(s, NULL);
3108             }
3109             min += l;
3110             if (flags & SCF_DO_SUBSTR)
3111                 data->pos_min += l;
3112             if (flags & SCF_DO_STCLASS_AND) {
3113                 /* Check whether it is compatible with what we know already! */
3114                 int compat = 1;
3115
3116                 if (uc >= 0x100 ||
3117                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3118                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3119                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3120                     compat = 0;
3121                 ANYOF_CLASS_ZERO(data->start_class);
3122                 ANYOF_BITMAP_ZERO(data->start_class);
3123                 if (compat) {
3124                     ANYOF_BITMAP_SET(data->start_class, uc);
3125                     data->start_class->flags &= ~ANYOF_EOS;
3126                     data->start_class->flags |= ANYOF_FOLD;
3127                     if (OP(scan) == EXACTFL)
3128                         data->start_class->flags |= ANYOF_LOCALE;
3129                 }
3130             }
3131             else if (flags & SCF_DO_STCLASS_OR) {
3132                 if (data->start_class->flags & ANYOF_FOLD) {
3133                     /* false positive possible if the class is case-folded.
3134                        Assume that the locale settings are the same... */
3135                     if (uc < 0x100)
3136                         ANYOF_BITMAP_SET(data->start_class, uc);
3137                     data->start_class->flags &= ~ANYOF_EOS;
3138                 }
3139                 cl_and(data->start_class, and_withp);
3140             }
3141             flags &= ~SCF_DO_STCLASS;
3142         }
3143         else if (REGNODE_VARIES(OP(scan))) {
3144             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3145             I32 f = flags, pos_before = 0;
3146             regnode * const oscan = scan;
3147             struct regnode_charclass_class this_class;
3148             struct regnode_charclass_class *oclass = NULL;
3149             I32 next_is_eval = 0;
3150
3151             switch (PL_regkind[OP(scan)]) {
3152             case WHILEM:                /* End of (?:...)* . */
3153                 scan = NEXTOPER(scan);
3154                 goto finish;
3155             case PLUS:
3156                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3157                     next = NEXTOPER(scan);
3158                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3159                         mincount = 1;
3160                         maxcount = REG_INFTY;
3161                         next = regnext(scan);
3162                         scan = NEXTOPER(scan);
3163                         goto do_curly;
3164                     }
3165                 }
3166                 if (flags & SCF_DO_SUBSTR)
3167                     data->pos_min++;
3168                 min++;
3169                 /* Fall through. */
3170             case STAR:
3171                 if (flags & SCF_DO_STCLASS) {
3172                     mincount = 0;
3173                     maxcount = REG_INFTY;
3174                     next = regnext(scan);
3175                     scan = NEXTOPER(scan);
3176                     goto do_curly;
3177                 }
3178                 is_inf = is_inf_internal = 1;
3179                 scan = regnext(scan);
3180                 if (flags & SCF_DO_SUBSTR) {
3181                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3182                     data->longest = &(data->longest_float);
3183                 }
3184                 goto optimize_curly_tail;
3185             case CURLY:
3186                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3187                     && (scan->flags == stopparen))
3188                 {
3189                     mincount = 1;
3190                     maxcount = 1;
3191                 } else {
3192                     mincount = ARG1(scan);
3193                     maxcount = ARG2(scan);
3194                 }
3195                 next = regnext(scan);
3196                 if (OP(scan) == CURLYX) {
3197                     I32 lp = (data ? *(data->last_closep) : 0);
3198                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3199                 }
3200                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3201                 next_is_eval = (OP(scan) == EVAL);
3202               do_curly:
3203                 if (flags & SCF_DO_SUBSTR) {
3204                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3205                     pos_before = data->pos_min;
3206                 }
3207                 if (data) {
3208                     fl = data->flags;
3209                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3210                     if (is_inf)
3211                         data->flags |= SF_IS_INF;
3212                 }
3213                 if (flags & SCF_DO_STCLASS) {
3214                     cl_init(pRExC_state, &this_class);
3215                     oclass = data->start_class;
3216                     data->start_class = &this_class;
3217                     f |= SCF_DO_STCLASS_AND;
3218                     f &= ~SCF_DO_STCLASS_OR;
3219                 }
3220                 /* These are the cases when once a subexpression
3221                    fails at a particular position, it cannot succeed
3222                    even after backtracking at the enclosing scope.
3223
3224                    XXXX what if minimal match and we are at the
3225                         initial run of {n,m}? */
3226                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3227                     f &= ~SCF_WHILEM_VISITED_POS;
3228
3229                 /* This will finish on WHILEM, setting scan, or on NULL: */
3230                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3231                                       last, data, stopparen, recursed, NULL,
3232                                       (mincount == 0
3233                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3234
3235                 if (flags & SCF_DO_STCLASS)
3236                     data->start_class = oclass;
3237                 if (mincount == 0 || minnext == 0) {
3238                     if (flags & SCF_DO_STCLASS_OR) {
3239                         cl_or(pRExC_state, data->start_class, &this_class);
3240                     }
3241                     else if (flags & SCF_DO_STCLASS_AND) {
3242                         /* Switch to OR mode: cache the old value of
3243                          * data->start_class */
3244                         INIT_AND_WITHP;
3245                         StructCopy(data->start_class, and_withp,
3246                                    struct regnode_charclass_class);
3247                         flags &= ~SCF_DO_STCLASS_AND;
3248                         StructCopy(&this_class, data->start_class,
3249                                    struct regnode_charclass_class);
3250                         flags |= SCF_DO_STCLASS_OR;
3251                         data->start_class->flags |= ANYOF_EOS;
3252                     }
3253                 } else {                /* Non-zero len */
3254                     if (flags & SCF_DO_STCLASS_OR) {
3255                         cl_or(pRExC_state, data->start_class, &this_class);
3256                         cl_and(data->start_class, and_withp);
3257                     }
3258                     else if (flags & SCF_DO_STCLASS_AND)
3259                         cl_and(data->start_class, &this_class);
3260                     flags &= ~SCF_DO_STCLASS;
3261                 }
3262                 if (!scan)              /* It was not CURLYX, but CURLY. */
3263                     scan = next;
3264                 if ( /* ? quantifier ok, except for (?{ ... }) */
3265                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3266                     && (minnext == 0) && (deltanext == 0)
3267                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3268                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3269                 {
3270                     ckWARNreg(RExC_parse,
3271                               "Quantifier unexpected on zero-length expression");
3272                 }
3273
3274                 min += minnext * mincount;
3275                 is_inf_internal |= ((maxcount == REG_INFTY
3276                                      && (minnext + deltanext) > 0)
3277                                     || deltanext == I32_MAX);
3278                 is_inf |= is_inf_internal;
3279                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3280
3281                 /* Try powerful optimization CURLYX => CURLYN. */
3282                 if (  OP(oscan) == CURLYX && data
3283                       && data->flags & SF_IN_PAR
3284                       && !(data->flags & SF_HAS_EVAL)
3285                       && !deltanext && minnext == 1 ) {
3286                     /* Try to optimize to CURLYN.  */
3287                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3288                     regnode * const nxt1 = nxt;
3289 #ifdef DEBUGGING
3290                     regnode *nxt2;
3291 #endif
3292
3293                     /* Skip open. */
3294                     nxt = regnext(nxt);
3295                     if (!REGNODE_SIMPLE(OP(nxt))
3296                         && !(PL_regkind[OP(nxt)] == EXACT
3297                              && STR_LEN(nxt) == 1))
3298                         goto nogo;
3299 #ifdef DEBUGGING
3300                     nxt2 = nxt;
3301 #endif
3302                     nxt = regnext(nxt);
3303                     if (OP(nxt) != CLOSE)
3304                         goto nogo;
3305                     if (RExC_open_parens) {
3306                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3307                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3308                     }
3309                     /* Now we know that nxt2 is the only contents: */
3310                     oscan->flags = (U8)ARG(nxt);
3311                     OP(oscan) = CURLYN;
3312                     OP(nxt1) = NOTHING; /* was OPEN. */
3313
3314 #ifdef DEBUGGING
3315                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3316                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3317                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3318                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3319                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3320                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3321 #endif
3322                 }
3323               nogo:
3324
3325                 /* Try optimization CURLYX => CURLYM. */
3326                 if (  OP(oscan) == CURLYX && data
3327                       && !(data->flags & SF_HAS_PAR)
3328                       && !(data->flags & SF_HAS_EVAL)
3329                       && !deltanext     /* atom is fixed width */
3330                       && minnext != 0   /* CURLYM can't handle zero width */
3331                 ) {
3332                     /* XXXX How to optimize if data == 0? */
3333                     /* Optimize to a simpler form.  */
3334                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3335                     regnode *nxt2;
3336
3337                     OP(oscan) = CURLYM;
3338                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3339                             && (OP(nxt2) != WHILEM))
3340                         nxt = nxt2;
3341                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3342                     /* Need to optimize away parenths. */
3343                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3344                         /* Set the parenth number.  */
3345                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3346
3347                         oscan->flags = (U8)ARG(nxt);
3348                         if (RExC_open_parens) {
3349                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3350                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3351                         }
3352                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3353                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3354
3355 #ifdef DEBUGGING
3356                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3357                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3358                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3359                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3360 #endif
3361 #if 0
3362                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3363                             regnode *nnxt = regnext(nxt1);
3364                             if (nnxt == nxt) {
3365                                 if (reg_off_by_arg[OP(nxt1)])
3366                                     ARG_SET(nxt1, nxt2 - nxt1);
3367                                 else if (nxt2 - nxt1 < U16_MAX)
3368                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3369                                 else
3370                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3371                             }
3372                             nxt1 = nnxt;
3373                         }
3374 #endif
3375                         /* Optimize again: */
3376                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3377                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3378                     }
3379                     else
3380                         oscan->flags = 0;
3381                 }
3382                 else if ((OP(oscan) == CURLYX)
3383                          && (flags & SCF_WHILEM_VISITED_POS)
3384                          /* See the comment on a similar expression above.
3385                             However, this time it's not a subexpression
3386                             we care about, but the expression itself. */
3387                          && (maxcount == REG_INFTY)
3388                          && data && ++data->whilem_c < 16) {
3389                     /* This stays as CURLYX, we can put the count/of pair. */
3390                     /* Find WHILEM (as in regexec.c) */
3391                     regnode *nxt = oscan + NEXT_OFF(oscan);
3392
3393                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3394                         nxt += ARG(nxt);
3395                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3396                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3397                 }
3398                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3399                     pars++;
3400                 if (flags & SCF_DO_SUBSTR) {
3401                     SV *last_str = NULL;
3402                     int counted = mincount != 0;
3403
3404                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3405 #if defined(SPARC64_GCC_WORKAROUND)
3406                         I32 b = 0;
3407                         STRLEN l = 0;
3408                         const char *s = NULL;
3409                         I32 old = 0;
3410
3411                         if (pos_before >= data->last_start_min)
3412                             b = pos_before;
3413                         else
3414                             b = data->last_start_min;
3415
3416                         l = 0;
3417                         s = SvPV_const(data->last_found, l);
3418                         old = b - data->last_start_min;
3419
3420 #else
3421                         I32 b = pos_before >= data->last_start_min
3422                             ? pos_before : data->last_start_min;
3423                         STRLEN l;
3424                         const char * const s = SvPV_const(data->last_found, l);
3425                         I32 old = b - data->last_start_min;
3426 #endif
3427
3428                         if (UTF)
3429                             old = utf8_hop((U8*)s, old) - (U8*)s;
3430                         l -= old;
3431                         /* Get the added string: */
3432                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3433                         if (deltanext == 0 && pos_before == b) {
3434                             /* What was added is a constant string */
3435                             if (mincount > 1) {
3436                                 SvGROW(last_str, (mincount * l) + 1);
3437                                 repeatcpy(SvPVX(last_str) + l,
3438                                           SvPVX_const(last_str), l, mincount - 1);
3439                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3440                                 /* Add additional parts. */
3441                                 SvCUR_set(data->last_found,
3442                                           SvCUR(data->last_found) - l);
3443                                 sv_catsv(data->last_found, last_str);
3444                                 {
3445                                     SV * sv = data->last_found;
3446                                     MAGIC *mg =
3447                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3448                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3449                                     if (mg && mg->mg_len >= 0)
3450                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3451                                 }
3452                                 data->last_end += l * (mincount - 1);
3453                             }
3454                         } else {
3455                             /* start offset must point into the last copy */
3456                             data->last_start_min += minnext * (mincount - 1);
3457                             data->last_start_max += is_inf ? I32_MAX
3458                                 : (maxcount - 1) * (minnext + data->pos_delta);
3459                         }
3460                     }
3461                     /* It is counted once already... */
3462                     data->pos_min += minnext * (mincount - counted);
3463                     data->pos_delta += - counted * deltanext +
3464                         (minnext + deltanext) * maxcount - minnext * mincount;
3465                     if (mincount != maxcount) {
3466                          /* Cannot extend fixed substrings found inside
3467                             the group.  */
3468                         SCAN_COMMIT(pRExC_state,data,minlenp);
3469                         if (mincount && last_str) {
3470                             SV * const sv = data->last_found;
3471                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3472                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3473
3474                             if (mg)
3475                                 mg->mg_len = -1;
3476                             sv_setsv(sv, last_str);
3477                             data->last_end = data->pos_min;
3478                             data->last_start_min =
3479                                 data->pos_min - CHR_SVLEN(last_str);
3480                             data->last_start_max = is_inf
3481                                 ? I32_MAX
3482                                 : data->pos_min + data->pos_delta
3483                                 - CHR_SVLEN(last_str);
3484                         }
3485                         data->longest = &(data->longest_float);
3486                     }
3487                     SvREFCNT_dec(last_str);
3488                 }
3489                 if (data && (fl & SF_HAS_EVAL))
3490                     data->flags |= SF_HAS_EVAL;
3491               optimize_curly_tail:
3492                 if (OP(oscan) != CURLYX) {
3493                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3494                            && NEXT_OFF(next))
3495                         NEXT_OFF(oscan) += NEXT_OFF(next);
3496                 }
3497                 continue;
3498             default:                    /* REF and CLUMP only? */
3499                 if (flags & SCF_DO_SUBSTR) {
3500                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3501                     data->longest = &(data->longest_float);
3502                 }
3503                 is_inf = is_inf_internal = 1;
3504                 if (flags & SCF_DO_STCLASS_OR)
3505                     cl_anything(pRExC_state, data->start_class);
3506                 flags &= ~SCF_DO_STCLASS;
3507                 break;
3508             }
3509         }
3510         else if (OP(scan) == LNBREAK) {
3511             if (flags & SCF_DO_STCLASS) {
3512                 int value = 0;
3513                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3514                 if (flags & SCF_DO_STCLASS_AND) {
3515                     for (value = 0; value < 256; value++)
3516                         if (!is_VERTWS_cp(value))
3517                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3518                 }
3519                 else {
3520                     for (value = 0; value < 256; value++)
3521                         if (is_VERTWS_cp(value))
3522                             ANYOF_BITMAP_SET(data->start_class, value);
3523                 }
3524                 if (flags & SCF_DO_STCLASS_OR)
3525                     cl_and(data->start_class, and_withp);
3526                 flags &= ~SCF_DO_STCLASS;
3527             }
3528             min += 1;
3529             delta += 1;
3530             if (flags & SCF_DO_SUBSTR) {
3531                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3532                 data->pos_min += 1;
3533                 data->pos_delta += 1;
3534                 data->longest = &(data->longest_float);
3535             }
3536         }
3537         else if (OP(scan) == FOLDCHAR) {
3538             int d = ARG(scan)==0xDF ? 1 : 2;
3539             flags &= ~SCF_DO_STCLASS;
3540             min += 1;
3541             delta += d;
3542             if (flags & SCF_DO_SUBSTR) {
3543                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3544                 data->pos_min += 1;
3545                 data->pos_delta += d;
3546                 data->longest = &(data->longest_float);
3547             }
3548         }
3549         else if (REGNODE_SIMPLE(OP(scan))) {
3550             int value = 0;
3551
3552             if (flags & SCF_DO_SUBSTR) {
3553                 SCAN_COMMIT(pRExC_state,data,minlenp);
3554                 data->pos_min++;
3555             }
3556             min++;
3557             if (flags & SCF_DO_STCLASS) {
3558                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3559
3560                 /* Some of the logic below assumes that switching
3561                    locale on will only add false positives. */
3562                 switch (PL_regkind[OP(scan)]) {
3563                 case SANY:
3564                 default:
3565                   do_default:
3566                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3567                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3568                         cl_anything(pRExC_state, data->start_class);
3569                     break;
3570                 case REG_ANY:
3571                     if (OP(scan) == SANY)
3572                         goto do_default;
3573                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3574                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3575                                  || (data->start_class->flags & ANYOF_CLASS));
3576                         cl_anything(pRExC_state, data->start_class);
3577                     }
3578                     if (flags & SCF_DO_STCLASS_AND || !value)
3579                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3580                     break;
3581                 case ANYOF:
3582                     if (flags & SCF_DO_STCLASS_AND)
3583                         cl_and(data->start_class,
3584                                (struct regnode_charclass_class*)scan);
3585                     else
3586                         cl_or(pRExC_state, data->start_class,
3587                               (struct regnode_charclass_class*)scan);
3588                     break;
3589                 case ALNUM:
3590                     if (flags & SCF_DO_STCLASS_AND) {
3591                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3592                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3593                             if (FLAGS(scan) & USE_UNI) {
3594                                 for (value = 0; value < 256; value++) {
3595                                     if (!isWORDCHAR_L1(value)) {
3596                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3597                                     }
3598                                 }
3599                             } else {
3600                                 for (value = 0; value < 256; value++) {
3601                                     if (!isALNUM(value)) {
3602                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3603                                     }
3604                                 }
3605                             }
3606                         }
3607                     }
3608                     else {
3609                         if (data->start_class->flags & ANYOF_LOCALE)
3610                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3611                         else if (FLAGS(scan) & USE_UNI) {
3612                             for (value = 0; value < 256; value++) {
3613                                 if (isWORDCHAR_L1(value)) {
3614                                     ANYOF_BITMAP_SET(data->start_class, value);
3615                                 }
3616                             }
3617                         } else {
3618                             for (value = 0; value < 256; value++) {
3619                                 if (isALNUM(value)) {
3620                                     ANYOF_BITMAP_SET(data->start_class, value);
3621                                 }
3622                             }
3623                         }
3624                     }
3625                     break;
3626                 case ALNUML:
3627                     if (flags & SCF_DO_STCLASS_AND) {
3628                         if (data->start_class->flags & ANYOF_LOCALE)
3629                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3630                     }
3631                     else {
3632                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3633                         data->start_class->flags |= ANYOF_LOCALE;
3634                     }
3635                     break;
3636                 case NALNUM:
3637                     if (flags & SCF_DO_STCLASS_AND) {
3638                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3639                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3640                             if (FLAGS(scan) & USE_UNI) {
3641                                 for (value = 0; value < 256; value++) {
3642                                     if (isWORDCHAR_L1(value)) {
3643                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3644                                     }
3645                                 }
3646                             } else {
3647                                 for (value = 0; value < 256; value++) {
3648                                     if (isALNUM(value)) {
3649                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3650                                     }
3651                                 }
3652                             }
3653                         }
3654                     }
3655                     else {
3656                         if (data->start_class->flags & ANYOF_LOCALE)
3657                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3658                         else {
3659                             for (value = 0; value < 256; value++)
3660                                 if (!isALNUM(value))
3661                                     ANYOF_BITMAP_SET(data->start_class, value);
3662                         }
3663                     }
3664                     break;
3665                 case NALNUML:
3666                     if (flags & SCF_DO_STCLASS_AND) {
3667                         if (data->start_class->flags & ANYOF_LOCALE)
3668                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3669                     }
3670                     else {
3671                         data->start_class->flags |= ANYOF_LOCALE;
3672                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3673                     }
3674                     break;
3675                 case SPACE:
3676                     if (flags & SCF_DO_STCLASS_AND) {
3677                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3678                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3679                             if (FLAGS(scan) & USE_UNI) {
3680                                 for (value = 0; value < 256; value++) {
3681                                     if (!isSPACE_L1(value)) {
3682                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3683                                     }
3684                                 }
3685                             } else {
3686                                 for (value = 0; value < 256; value++) {
3687                                     if (!isSPACE(value)) {
3688                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3689                                     }
3690                                 }
3691                             }
3692                         }
3693                     }
3694                     else {
3695                         if (data->start_class->flags & ANYOF_LOCALE) {
3696                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3697                         }
3698                         else if (FLAGS(scan) & USE_UNI) {
3699                             for (value = 0; value < 256; value++) {
3700                                 if (isSPACE_L1(value)) {
3701                                     ANYOF_BITMAP_SET(data->start_class, value);
3702                                 }
3703                             }
3704                         } else {
3705                             for (value = 0; value < 256; value++) {
3706                                 if (isSPACE(value)) {
3707                                     ANYOF_BITMAP_SET(data->start_class, value);
3708                                 }
3709                             }
3710                         }
3711                     }
3712                     break;
3713                 case SPACEL:
3714                     if (flags & SCF_DO_STCLASS_AND) {
3715                         if (data->start_class->flags & ANYOF_LOCALE)
3716                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3717                     }
3718                     else {
3719                         data->start_class->flags |= ANYOF_LOCALE;
3720                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3721                     }
3722                     break;
3723                 case NSPACE:
3724                     if (flags & SCF_DO_STCLASS_AND) {
3725                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3726                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3727                             if (FLAGS(scan) & USE_UNI) {
3728                                 for (value = 0; value < 256; value++) {
3729                                     if (isSPACE_L1(value)) {
3730                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3731                                     }
3732                                 }
3733                             } else {
3734                                 for (value = 0; value < 256; value++) {
3735                                     if (isSPACE(value)) {
3736                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3737                                     }
3738                                 }
3739                             }
3740                         }
3741                     }
3742                     else {
3743                         if (data->start_class->flags & ANYOF_LOCALE)
3744                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3745                         else if (FLAGS(scan) & USE_UNI) {
3746                             for (value = 0; value < 256; value++) {
3747                                 if (!isSPACE_L1(value)) {
3748                                     ANYOF_BITMAP_SET(data->start_class, value);
3749                                 }
3750                             }
3751                         }
3752                         else {
3753                             for (value = 0; value < 256; value++) {
3754                                 if (!isSPACE(value)) {
3755                                     ANYOF_BITMAP_SET(data->start_class, value);
3756                                 }
3757                             }
3758                         }
3759                     }
3760                     break;
3761                 case NSPACEL:
3762                     if (flags & SCF_DO_STCLASS_AND) {
3763                         if (data->start_class->flags & ANYOF_LOCALE) {
3764                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3765                             for (value = 0; value < 256; value++)
3766                                 if (!isSPACE(value))
3767                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3768                         }
3769                     }
3770                     else {
3771                         data->start_class->flags |= ANYOF_LOCALE;
3772                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3773                     }
3774                     break;
3775                 case DIGIT:
3776                     if (flags & SCF_DO_STCLASS_AND) {
3777                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3778                         for (value = 0; value < 256; value++)
3779                             if (!isDIGIT(value))
3780                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3781                     }
3782                     else {
3783                         if (data->start_class->flags & ANYOF_LOCALE)
3784                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3785                         else {
3786                             for (value = 0; value < 256; value++)
3787                                 if (isDIGIT(value))
3788                                     ANYOF_BITMAP_SET(data->start_class, value);
3789                         }
3790                     }
3791                     break;
3792                 case NDIGIT:
3793                     if (flags & SCF_DO_STCLASS_AND) {
3794                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3795                         for (value = 0; value < 256; value++)
3796                             if (isDIGIT(value))
3797                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3798                     }
3799                     else {
3800                         if (data->start_class->flags & ANYOF_LOCALE)
3801                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3802                         else {
3803                             for (value = 0; value < 256; value++)
3804                                 if (!isDIGIT(value))
3805                                     ANYOF_BITMAP_SET(data->start_class, value);
3806                         }
3807                     }
3808                     break;
3809                 CASE_SYNST_FNC(VERTWS);
3810                 CASE_SYNST_FNC(HORIZWS);
3811
3812                 }
3813                 if (flags & SCF_DO_STCLASS_OR)
3814                     cl_and(data->start_class, and_withp);
3815                 flags &= ~SCF_DO_STCLASS;
3816             }
3817         }
3818         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3819             data->flags |= (OP(scan) == MEOL
3820                             ? SF_BEFORE_MEOL
3821                             : SF_BEFORE_SEOL);
3822         }
3823         else if (  PL_regkind[OP(scan)] == BRANCHJ
3824                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3825                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3826                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3827             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3828                 || OP(scan) == UNLESSM )
3829             {
3830                 /* Negative Lookahead/lookbehind
3831                    In this case we can't do fixed string optimisation.
3832                 */
3833
3834                 I32 deltanext, minnext, fake = 0;
3835                 regnode *nscan;
3836                 struct regnode_charclass_class intrnl;
3837                 int f = 0;
3838
3839                 data_fake.flags = 0;
3840                 if (data) {
3841                     data_fake.whilem_c = data->whilem_c;
3842                     data_fake.last_closep = data->last_closep;
3843                 }
3844                 else
3845                     data_fake.last_closep = &fake;
3846                 data_fake.pos_delta = delta;
3847                 if ( flags & SCF_DO_STCLASS && !scan->flags
3848                      && OP(scan) == IFMATCH ) { /* Lookahead */
3849                     cl_init(pRExC_state, &intrnl);
3850                     data_fake.start_class = &intrnl;
3851                     f |= SCF_DO_STCLASS_AND;
3852                 }
3853                 if (flags & SCF_WHILEM_VISITED_POS)
3854                     f |= SCF_WHILEM_VISITED_POS;
3855                 next = regnext(scan);
3856                 nscan = NEXTOPER(NEXTOPER(scan));
3857                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3858                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3859                 if (scan->flags) {
3860                     if (deltanext) {
3861                         FAIL("Variable length lookbehind not implemented");
3862                     }
3863                     else if (minnext > (I32)U8_MAX) {
3864                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3865                     }
3866                     scan->flags = (U8)minnext;
3867                 }
3868                 if (data) {
3869                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3870                         pars++;
3871                     if (data_fake.flags & SF_HAS_EVAL)
3872                         data->flags |= SF_HAS_EVAL;
3873                     data->whilem_c = data_fake.whilem_c;
3874                 }
3875                 if (f & SCF_DO_STCLASS_AND) {
3876                     if (flags & SCF_DO_STCLASS_OR) {
3877                         /* OR before, AND after: ideally we would recurse with
3878                          * data_fake to get the AND applied by study of the
3879                          * remainder of the pattern, and then derecurse;
3880                          * *** HACK *** for now just treat as "no information".
3881                          * See [perl #56690].
3882                          */
3883                         cl_init(pRExC_state, data->start_class);
3884                     }  else {
3885                         /* AND before and after: combine and continue */
3886                         const int was = (data->start_class->flags & ANYOF_EOS);
3887
3888                         cl_and(data->start_class, &intrnl);
3889                         if (was)
3890                             data->start_class->flags |= ANYOF_EOS;
3891                     }
3892                 }
3893             }
3894 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3895             else {
3896                 /* Positive Lookahead/lookbehind
3897                    In this case we can do fixed string optimisation,
3898                    but we must be careful about it. Note in the case of
3899                    lookbehind the positions will be offset by the minimum
3900                    length of the pattern, something we won't know about
3901                    until after the recurse.
3902                 */
3903                 I32 deltanext, fake = 0;
3904                 regnode *nscan;
3905                 struct regnode_charclass_class intrnl;
3906                 int f = 0;
3907                 /* We use SAVEFREEPV so that when the full compile
3908                     is finished perl will clean up the allocated
3909                     minlens when it's all done. This way we don't
3910                     have to worry about freeing them when we know
3911                     they wont be used, which would be a pain.
3912                  */
3913                 I32 *minnextp;
3914                 Newx( minnextp, 1, I32 );
3915                 SAVEFREEPV(minnextp);
3916
3917                 if (data) {
3918                     StructCopy(data, &data_fake, scan_data_t);
3919                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3920                         f |= SCF_DO_SUBSTR;
3921                         if (scan->flags)
3922                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3923                         data_fake.last_found=newSVsv(data->last_found);
3924                     }
3925                 }
3926                 else
3927                     data_fake.last_closep = &fake;
3928                 data_fake.flags = 0;
3929                 data_fake.pos_delta = delta;
3930                 if (is_inf)
3931                     data_fake.flags |= SF_IS_INF;
3932                 if ( flags & SCF_DO_STCLASS && !scan->flags
3933                      && OP(scan) == IFMATCH ) { /* Lookahead */
3934                     cl_init(pRExC_state, &intrnl);
3935                     data_fake.start_class = &intrnl;
3936                     f |= SCF_DO_STCLASS_AND;
3937                 }
3938                 if (flags & SCF_WHILEM_VISITED_POS)
3939                     f |= SCF_WHILEM_VISITED_POS;
3940                 next = regnext(scan);
3941                 nscan = NEXTOPER(NEXTOPER(scan));
3942
3943                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3944                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3945                 if (scan->flags) {
3946                     if (deltanext) {
3947                         FAIL("Variable length lookbehind not implemented");
3948                     }
3949                     else if (*minnextp > (I32)U8_MAX) {
3950                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3951                     }
3952                     scan->flags = (U8)*minnextp;
3953                 }
3954
3955                 *minnextp += min;
3956
3957                 if (f & SCF_DO_STCLASS_AND) {
3958                     const int was = (data->start_class->flags & ANYOF_EOS);
3959
3960                     cl_and(data->start_class, &intrnl);
3961                     if (was)
3962                         data->start_class->flags |= ANYOF_EOS;
3963                 }
3964                 if (data) {
3965                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3966                         pars++;
3967                     if (data_fake.flags & SF_HAS_EVAL)
3968                         data->flags |= SF_HAS_EVAL;
3969                     data->whilem_c = data_fake.whilem_c;
3970                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3971                         if (RExC_rx->minlen<*minnextp)
3972                             RExC_rx->minlen=*minnextp;
3973                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3974                         SvREFCNT_dec(data_fake.last_found);
3975
3976                         if ( data_fake.minlen_fixed != minlenp )
3977                         {
3978                             data->offset_fixed= data_fake.offset_fixed;
3979                             data->minlen_fixed= data_fake.minlen_fixed;
3980                             data->lookbehind_fixed+= scan->flags;
3981                         }
3982                         if ( data_fake.minlen_float != minlenp )
3983                         {
3984                             data->minlen_float= data_fake.minlen_float;
3985                             data->offset_float_min=data_fake.offset_float_min;
3986                             data->offset_float_max=data_fake.offset_float_max;
3987                             data->lookbehind_float+= scan->flags;
3988                         }
3989                     }
3990                 }
3991
3992
3993             }
3994 #endif
3995         }
3996         else if (OP(scan) == OPEN) {
3997             if (stopparen != (I32)ARG(scan))
3998                 pars++;
3999         }
4000         else if (OP(scan) == CLOSE) {
4001             if (stopparen == (I32)ARG(scan)) {
4002                 break;
4003             }
4004             if ((I32)ARG(scan) == is_par) {
4005                 next = regnext(scan);
4006
4007                 if ( next && (OP(next) != WHILEM) && next < last)
4008                     is_par = 0;         /* Disable optimization */
4009             }
4010             if (data)
4011                 *(data->last_closep) = ARG(scan);
4012         }
4013         else if (OP(scan) == EVAL) {
4014                 if (data)
4015                     data->flags |= SF_HAS_EVAL;
4016         }
4017         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4018             if (flags & SCF_DO_SUBSTR) {
4019                 SCAN_COMMIT(pRExC_state,data,minlenp);
4020                 flags &= ~SCF_DO_SUBSTR;
4021             }
4022             if (data && OP(scan)==ACCEPT) {
4023                 data->flags |= SCF_SEEN_ACCEPT;
4024                 if (stopmin > min)
4025                     stopmin = min;
4026             }
4027         }
4028         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4029         {
4030                 if (flags & SCF_DO_SUBSTR) {
4031                     SCAN_COMMIT(pRExC_state,data,minlenp);
4032                     data->longest = &(data->longest_float);
4033                 }
4034                 is_inf = is_inf_internal = 1;
4035                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4036                     cl_anything(pRExC_state, data->start_class);
4037                 flags &= ~SCF_DO_STCLASS;
4038         }
4039         else if (OP(scan) == GPOS) {
4040             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4041                 !(delta || is_inf || (data && data->pos_delta)))
4042             {
4043                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4044                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4045                 if (RExC_rx->gofs < (U32)min)
4046                     RExC_rx->gofs = min;
4047             } else {
4048                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4049                 RExC_rx->gofs = 0;
4050             }
4051         }
4052 #ifdef TRIE_STUDY_OPT
4053 #ifdef FULL_TRIE_STUDY
4054         else if (PL_regkind[OP(scan)] == TRIE) {
4055             /* NOTE - There is similar code to this block above for handling
4056                BRANCH nodes on the initial study.  If you change stuff here
4057                check there too. */
4058             regnode *trie_node= scan;
4059             regnode *tail= regnext(scan);
4060             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4061             I32 max1 = 0, min1 = I32_MAX;
4062             struct regnode_charclass_class accum;
4063
4064             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4065                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4066             if (flags & SCF_DO_STCLASS)
4067                 cl_init_zero(pRExC_state, &accum);
4068
4069             if (!trie->jump) {
4070                 min1= trie->minlen;
4071                 max1= trie->maxlen;
4072             } else {
4073                 const regnode *nextbranch= NULL;
4074                 U32 word;
4075
4076                 for ( word=1 ; word <= trie->wordcount ; word++)
4077                 {
4078                     I32 deltanext=0, minnext=0, f = 0, fake;
4079                     struct regnode_charclass_class this_class;
4080
4081                     data_fake.flags = 0;
4082                     if (data) {
4083                         data_fake.whilem_c = data->whilem_c;
4084                         data_fake.last_closep = data->last_closep;
4085                     }
4086                     else
4087                         data_fake.last_closep = &fake;
4088                     data_fake.pos_delta = delta;
4089                     if (flags & SCF_DO_STCLASS) {
4090                         cl_init(pRExC_state, &this_class);
4091                         data_fake.start_class = &this_class;
4092                         f = SCF_DO_STCLASS_AND;
4093                     }
4094                     if (flags & SCF_WHILEM_VISITED_POS)
4095                         f |= SCF_WHILEM_VISITED_POS;
4096
4097                     if (trie->jump[word]) {
4098                         if (!nextbranch)
4099                             nextbranch = trie_node + trie->jump[0];
4100                         scan= trie_node + trie->jump[word];
4101                         /* We go from the jump point to the branch that follows
4102                            it. Note this means we need the vestigal unused branches
4103                            even though they arent otherwise used.
4104                          */
4105                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4106                             &deltanext, (regnode *)nextbranch, &data_fake,
4107                             stopparen, recursed, NULL, f,depth+1);
4108                     }
4109                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4110                         nextbranch= regnext((regnode*)nextbranch);
4111
4112                     if (min1 > (I32)(minnext + trie->minlen))
4113                         min1 = minnext + trie->minlen;
4114                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4115                         max1 = minnext + deltanext + trie->maxlen;
4116                     if (deltanext == I32_MAX)
4117                         is_inf = is_inf_internal = 1;
4118
4119                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4120                         pars++;
4121                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4122                         if ( stopmin > min + min1)
4123                             stopmin = min + min1;
4124                         flags &= ~SCF_DO_SUBSTR;
4125                         if (data)
4126                             data->flags |= SCF_SEEN_ACCEPT;
4127                     }
4128                     if (data) {
4129                         if (data_fake.flags & SF_HAS_EVAL)
4130                             data->flags |= SF_HAS_EVAL;
4131                         data->whilem_c = data_fake.whilem_c;
4132                     }
4133                     if (flags & SCF_DO_STCLASS)
4134                         cl_or(pRExC_state, &accum, &this_class);
4135                 }
4136             }
4137             if (flags & SCF_DO_SUBSTR) {
4138                 data->pos_min += min1;
4139                 data->pos_delta += max1 - min1;
4140                 if (max1 != min1 || is_inf)
4141                     data->longest = &(data->longest_float);
4142             }
4143             min += min1;
4144             delta += max1 - min1;
4145             if (flags & SCF_DO_STCLASS_OR) {
4146                 cl_or(pRExC_state, data->start_class, &accum);
4147                 if (min1) {
4148                     cl_and(data->start_class, and_withp);
4149                     flags &= ~SCF_DO_STCLASS;
4150                 }
4151             }
4152             else if (flags & SCF_DO_STCLASS_AND) {
4153                 if (min1) {
4154                     cl_and(data->start_class, &accum);
4155                     flags &= ~SCF_DO_STCLASS;
4156                 }
4157                 else {
4158                     /* Switch to OR mode: cache the old value of
4159                      * data->start_class */
4160                     INIT_AND_WITHP;
4161                     StructCopy(data->start_class, and_withp,
4162                                struct regnode_charclass_class);
4163                     flags &= ~SCF_DO_STCLASS_AND;
4164                     StructCopy(&accum, data->start_class,
4165                                struct regnode_charclass_class);
4166                     flags |= SCF_DO_STCLASS_OR;
4167                     data->start_class->flags |= ANYOF_EOS;
4168                 }
4169             }
4170             scan= tail;
4171             continue;
4172         }
4173 #else
4174         else if (PL_regkind[OP(scan)] == TRIE) {
4175             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4176             U8*bang=NULL;
4177
4178             min += trie->minlen;
4179             delta += (trie->maxlen - trie->minlen);
4180             flags &= ~SCF_DO_STCLASS; /* xxx */
4181             if (flags & SCF_DO_SUBSTR) {
4182                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4183                 data->pos_min += trie->minlen;
4184                 data->pos_delta += (trie->maxlen - trie->minlen);
4185                 if (trie->maxlen != trie->minlen)
4186                     data->longest = &(data->longest_float);
4187             }
4188             if (trie->jump) /* no more substrings -- for now /grr*/
4189                 flags &= ~SCF_DO_SUBSTR;
4190         }
4191 #endif /* old or new */
4192 #endif /* TRIE_STUDY_OPT */
4193
4194         /* Else: zero-length, ignore. */
4195         scan = regnext(scan);
4196     }
4197     if (frame) {
4198         last = frame->last;
4199         scan = frame->next;
4200         stopparen = frame->stop;
4201         frame = frame->prev;
4202         goto fake_study_recurse;
4203     }
4204
4205   finish:
4206     assert(!frame);
4207     DEBUG_STUDYDATA("pre-fin:",data,depth);
4208
4209     *scanp = scan;
4210     *deltap = is_inf_internal ? I32_MAX : delta;
4211     if (flags & SCF_DO_SUBSTR && is_inf)
4212         data->pos_delta = I32_MAX - data->pos_min;
4213     if (is_par > (I32)U8_MAX)
4214         is_par = 0;
4215     if (is_par && pars==1 && data) {
4216         data->flags |= SF_IN_PAR;
4217         data->flags &= ~SF_HAS_PAR;
4218     }
4219     else if (pars && data) {
4220         data->flags |= SF_HAS_PAR;
4221         data->flags &= ~SF_IN_PAR;
4222     }
4223     if (flags & SCF_DO_STCLASS_OR)
4224         cl_and(data->start_class, and_withp);
4225     if (flags & SCF_TRIE_RESTUDY)
4226         data->flags |=  SCF_TRIE_RESTUDY;
4227
4228     DEBUG_STUDYDATA("post-fin:",data,depth);
4229
4230     return min < stopmin ? min : stopmin;
4231 }
4232
4233 STATIC U32
4234 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4235 {
4236     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4237
4238     PERL_ARGS_ASSERT_ADD_DATA;
4239
4240     Renewc(RExC_rxi->data,
4241            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4242            char, struct reg_data);
4243     if(count)
4244         Renew(RExC_rxi->data->what, count + n, U8);
4245     else
4246         Newx(RExC_rxi->data->what, n, U8);
4247     RExC_rxi->data->count = count + n;
4248     Copy(s, RExC_rxi->data->what + count, n, U8);
4249     return count;
4250 }
4251
4252 /*XXX: todo make this not included in a non debugging perl */
4253 #ifndef PERL_IN_XSUB_RE
4254 void
4255 Perl_reginitcolors(pTHX)
4256 {
4257     dVAR;
4258     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4259     if (s) {
4260         char *t = savepv(s);
4261         int i = 0;
4262         PL_colors[0] = t;
4263         while (++i < 6) {
4264             t = strchr(t, '\t');
4265             if (t) {
4266                 *t = '\0';
4267                 PL_colors[i] = ++t;
4268             }
4269             else
4270                 PL_colors[i] = t = (char *)"";
4271         }
4272     } else {
4273         int i = 0;
4274         while (i < 6)
4275             PL_colors[i++] = (char *)"";
4276     }
4277     PL_colorset = 1;
4278 }
4279 #endif
4280
4281
4282 #ifdef TRIE_STUDY_OPT
4283 #define CHECK_RESTUDY_GOTO                                  \
4284         if (                                                \
4285               (data.flags & SCF_TRIE_RESTUDY)               \
4286               && ! restudied++                              \
4287         )     goto reStudy
4288 #else
4289 #define CHECK_RESTUDY_GOTO
4290 #endif
4291
4292 /*
4293  - pregcomp - compile a regular expression into internal code
4294  *
4295  * We can't allocate space until we know how big the compiled form will be,
4296  * but we can't compile it (and thus know how big it is) until we've got a
4297  * place to put the code.  So we cheat:  we compile it twice, once with code
4298  * generation turned off and size counting turned on, and once "for real".
4299  * This also means that we don't allocate space until we are sure that the
4300  * thing really will compile successfully, and we never have to move the
4301  * code and thus invalidate pointers into it.  (Note that it has to be in
4302  * one piece because free() must be able to free it all.) [NB: not true in perl]
4303  *
4304  * Beware that the optimization-preparation code in here knows about some
4305  * of the structure of the compiled regexp.  [I'll say.]
4306  */
4307
4308
4309
4310 #ifndef PERL_IN_XSUB_RE
4311 #define RE_ENGINE_PTR &PL_core_reg_engine
4312 #else
4313 extern const struct regexp_engine my_reg_engine;
4314 #define RE_ENGINE_PTR &my_reg_engine
4315 #endif
4316
4317 #ifndef PERL_IN_XSUB_RE
4318 REGEXP *
4319 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4320 {
4321     dVAR;
4322     HV * const table = GvHV(PL_hintgv);
4323
4324     PERL_ARGS_ASSERT_PREGCOMP;
4325
4326     /* Dispatch a request to compile a regexp to correct
4327        regexp engine. */
4328     if (table) {
4329         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4330         GET_RE_DEBUG_FLAGS_DECL;
4331         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4332             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4333             DEBUG_COMPILE_r({
4334                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4335                     SvIV(*ptr));
4336             });
4337             return CALLREGCOMP_ENG(eng, pattern, flags);
4338         }
4339     }
4340     return Perl_re_compile(aTHX_ pattern, flags);
4341 }
4342 #endif
4343
4344 REGEXP *
4345 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4346 {
4347     dVAR;
4348     REGEXP *rx;
4349     struct regexp *r;
4350     register regexp_internal *ri;
4351     STRLEN plen;
4352     char  *exp;
4353     char* xend;
4354     regnode *scan;
4355     I32 flags;
4356     I32 minlen = 0;
4357
4358     /* these are all flags - maybe they should be turned
4359      * into a single int with different bit masks */
4360     I32 sawlookahead = 0;
4361     I32 sawplus = 0;
4362     I32 sawopen = 0;
4363
4364     U8 jump_ret = 0;
4365     dJMPENV;
4366     scan_data_t data;
4367     RExC_state_t RExC_state;
4368     RExC_state_t * const pRExC_state = &RExC_state;
4369 #ifdef TRIE_STUDY_OPT
4370     int restudied;
4371     RExC_state_t copyRExC_state;
4372 #endif
4373     GET_RE_DEBUG_FLAGS_DECL;
4374
4375     PERL_ARGS_ASSERT_RE_COMPILE;
4376
4377     DEBUG_r(if (!PL_colorset) reginitcolors());
4378
4379     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4380
4381
4382     /* Longjmp back to here if have to switch in midstream to utf8 */
4383     if (! RExC_orig_utf8) {
4384         JMPENV_PUSH(jump_ret);
4385     }
4386
4387     if (jump_ret == 0) {    /* First time through */
4388         exp = SvPV(pattern, plen);
4389         xend = exp + plen;
4390
4391         DEBUG_COMPILE_r({
4392             SV *dsv= sv_newmortal();
4393             RE_PV_QUOTED_DECL(s, RExC_utf8,
4394                 dsv, exp, plen, 60);
4395             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4396                            PL_colors[4],PL_colors[5],s);
4397         });
4398     }
4399     else {  /* longjumped back */
4400         STRLEN len = plen;
4401
4402         /* If the cause for the longjmp was other than changing to utf8, pop
4403          * our own setjmp, and longjmp to the correct handler */
4404         if (jump_ret != UTF8_LONGJMP) {
4405             JMPENV_POP;
4406             JMPENV_JUMP(jump_ret);
4407         }
4408
4409         GET_RE_DEBUG_FLAGS;
4410
4411         /* It's possible to write a regexp in ascii that represents Unicode
4412         codepoints outside of the byte range, such as via \x{100}. If we
4413         detect such a sequence we have to convert the entire pattern to utf8
4414         and then recompile, as our sizing calculation will have been based
4415         on 1 byte == 1 character, but we will need to use utf8 to encode
4416         at least some part of the pattern, and therefore must convert the whole
4417         thing.
4418         -- dmq */
4419         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4420             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4421         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4422         xend = exp + len;
4423         RExC_orig_utf8 = RExC_utf8 = 1;
4424         SAVEFREEPV(exp);
4425     }
4426
4427 #ifdef TRIE_STUDY_OPT
4428     restudied = 0;
4429 #endif
4430
4431     RExC_precomp = exp;
4432     RExC_flags = pm_flags;
4433     RExC_sawback = 0;
4434
4435     RExC_seen = 0;
4436     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4437     RExC_seen_evals = 0;
4438     RExC_extralen = 0;
4439
4440     /* First pass: determine size, legality. */
4441     RExC_parse = exp;
4442     RExC_start = exp;
4443     RExC_end = xend;
4444     RExC_naughty = 0;
4445     RExC_npar = 1;
4446     RExC_nestroot = 0;
4447     RExC_size = 0L;
4448     RExC_emit = &PL_regdummy;
4449     RExC_whilem_seen = 0;
4450     RExC_open_parens = NULL;
4451     RExC_close_parens = NULL;
4452     RExC_opend = NULL;
4453     RExC_paren_names = NULL;
4454 #ifdef DEBUGGING
4455     RExC_paren_name_list = NULL;
4456 #endif
4457     RExC_recurse = NULL;
4458     RExC_recurse_count = 0;
4459
4460 #if 0 /* REGC() is (currently) a NOP at the first pass.
4461        * Clever compilers notice this and complain. --jhi */
4462     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4463 #endif
4464     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4465     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4466         RExC_precomp = NULL;
4467         return(NULL);
4468     }
4469
4470     /* Here, finished first pass.  Get rid of our setjmp, which we added for
4471      * efficiency only if the passed-in string wasn't in utf8, as shown by
4472      * RExC_orig_utf8.  But if the first pass was redone, that variable will be
4473      * 1 here even though the original string wasn't utf8, but in this case
4474      * there will have been a long jump */
4475     if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4476         JMPENV_POP;
4477     }
4478     DEBUG_PARSE_r({
4479         PerlIO_printf(Perl_debug_log,
4480             "Required size %"IVdf" nodes\n"
4481             "Starting second pass (creation)\n",
4482             (IV)RExC_size);
4483         RExC_lastnum=0;
4484         RExC_lastparse=NULL;
4485     });
4486     /* Small enough for pointer-storage convention?
4487        If extralen==0, this means that we will not need long jumps. */
4488     if (RExC_size >= 0x10000L && RExC_extralen)
4489         RExC_size += RExC_extralen;
4490     else
4491         RExC_extralen = 0;
4492     if (RExC_whilem_seen > 15)
4493         RExC_whilem_seen = 15;
4494
4495     /* Allocate space and zero-initialize. Note, the two step process
4496        of zeroing when in debug mode, thus anything assigned has to
4497        happen after that */
4498     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4499     r = (struct regexp*)SvANY(rx);
4500     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4501          char, regexp_internal);
4502     if ( r == NULL || ri == NULL )
4503         FAIL("Regexp out of space");
4504 #ifdef DEBUGGING
4505     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4506     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4507 #else
4508     /* bulk initialize base fields with 0. */
4509     Zero(ri, sizeof(regexp_internal), char);
4510 #endif
4511
4512     /* non-zero initialization begins here */
4513     RXi_SET( r, ri );
4514     r->engine= RE_ENGINE_PTR;
4515     r->extflags = pm_flags;
4516     {
4517         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4518         bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
4519
4520         /* The caret is output if there are any defaults: if not all the STD
4521          * flags are set, or if no character set specifier is needed */
4522         bool has_default =
4523                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4524                     || ! has_charset);
4525         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4526         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4527                             >> RXf_PMf_STD_PMMOD_SHIFT);
4528         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4529         char *p;
4530         /* Allocate for the worst case, which is all the std flags are turned
4531          * on.  If more precision is desired, we could do a population count of
4532          * the flags set.  This could be done with a small lookup table, or by
4533          * shifting, masking and adding, or even, when available, assembly
4534          * language for a machine-language population count.
4535          * We never output a minus, as all those are defaults, so are
4536          * covered by the caret */
4537         const STRLEN wraplen = plen + has_p + has_runon
4538             + has_default       /* If needs a caret */
4539             + has_charset       /* If needs a character set specifier */
4540             + (sizeof(STD_PAT_MODS) - 1)
4541             + (sizeof("(?:)") - 1);
4542
4543         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4544         SvPOK_on(rx);
4545         SvFLAGS(rx) |= SvUTF8(pattern);
4546         *p++='('; *p++='?';
4547
4548         /* If a default, cover it using the caret */
4549         if (has_default) {
4550             *p++= DEFAULT_PAT_MOD;
4551         }
4552         if (has_charset) {
4553             if (r->extflags & RXf_PMf_LOCALE) {
4554                 *p++ = LOCALE_PAT_MOD;
4555             } else {
4556                 *p++ = UNICODE_PAT_MOD;
4557             }
4558         }
4559         if (has_p)
4560             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4561         {
4562             char ch;
4563             while((ch = *fptr++)) {
4564                 if(reganch & 1)
4565                     *p++ = ch;
4566                 reganch >>= 1;
4567             }
4568         }
4569
4570         *p++ = ':';
4571         Copy(RExC_precomp, p, plen, char);
4572         assert ((RX_WRAPPED(rx) - p) < 16);
4573         r->pre_prefix = p - RX_WRAPPED(rx);
4574         p += plen;
4575         if (has_runon)
4576             *p++ = '\n';
4577         *p++ = ')';
4578         *p = 0;
4579         SvCUR_set(rx, p - SvPVX_const(rx));
4580     }
4581
4582     r->intflags = 0;
4583     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4584
4585     if (RExC_seen & REG_SEEN_RECURSE) {
4586         Newxz(RExC_open_parens, RExC_npar,regnode *);
4587         SAVEFREEPV(RExC_open_parens);
4588         Newxz(RExC_close_parens,RExC_npar,regnode *);
4589         SAVEFREEPV(RExC_close_parens);
4590     }
4591
4592     /* Useful during FAIL. */
4593 #ifdef RE_TRACK_PATTERN_OFFSETS
4594     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4595     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4596                           "%s %"UVuf" bytes for offset annotations.\n",
4597                           ri->u.offsets ? "Got" : "Couldn't get",
4598                           (UV)((2*RExC_size+1) * sizeof(U32))));
4599 #endif
4600     SetProgLen(ri,RExC_size);
4601     RExC_rx_sv = rx;
4602     RExC_rx = r;
4603     RExC_rxi = ri;
4604
4605     /* Second pass: emit code. */
4606     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4607     RExC_parse = exp;
4608     RExC_end = xend;
4609     RExC_naughty = 0;
4610     RExC_npar = 1;
4611     RExC_emit_start = ri->program;
4612     RExC_emit = ri->program;
4613     RExC_emit_bound = ri->program + RExC_size + 1;
4614
4615     /* Store the count of eval-groups for security checks: */
4616     RExC_rx->seen_evals = RExC_seen_evals;
4617     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4618     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4619         ReREFCNT_dec(rx);
4620         return(NULL);
4621     }
4622     /* XXXX To minimize changes to RE engine we always allocate
4623        3-units-long substrs field. */
4624     Newx(r->substrs, 1, struct reg_substr_data);
4625     if (RExC_recurse_count) {
4626         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4627         SAVEFREEPV(RExC_recurse);
4628     }
4629
4630 reStudy:
4631     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4632     Zero(r->substrs, 1, struct reg_substr_data);
4633
4634 #ifdef TRIE_STUDY_OPT
4635     if (!restudied) {
4636         StructCopy(&zero_scan_data, &data, scan_data_t);
4637         copyRExC_state = RExC_state;
4638     } else {
4639         U32 seen=RExC_seen;
4640         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4641
4642         RExC_state = copyRExC_state;
4643         if (seen & REG_TOP_LEVEL_BRANCHES)
4644             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4645         else
4646             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4647         if (data.last_found) {
4648             SvREFCNT_dec(data.longest_fixed);
4649             SvREFCNT_dec(data.longest_float);
4650             SvREFCNT_dec(data.last_found);
4651         }
4652         StructCopy(&zero_scan_data, &data, scan_data_t);
4653     }
4654 #else
4655     StructCopy(&zero_scan_data, &data, scan_data_t);
4656 #endif
4657
4658     /* Dig out information for optimizations. */
4659     r->extflags = RExC_flags; /* was pm_op */
4660     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4661
4662     if (UTF)
4663         SvUTF8_on(rx);  /* Unicode in it? */
4664     ri->regstclass = NULL;
4665     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4666         r->intflags |= PREGf_NAUGHTY;
4667     scan = ri->program + 1;             /* First BRANCH. */
4668
4669     /* testing for BRANCH here tells us whether there is "must appear"
4670        data in the pattern. If there is then we can use it for optimisations */
4671     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4672         I32 fake;
4673         STRLEN longest_float_length, longest_fixed_length;
4674         struct regnode_charclass_class ch_class; /* pointed to by data */
4675         int stclass_flag;
4676         I32 last_close = 0; /* pointed to by data */
4677         regnode *first= scan;
4678         regnode *first_next= regnext(first);
4679         /*
4680          * Skip introductions and multiplicators >= 1
4681          * so that we can extract the 'meat' of the pattern that must
4682          * match in the large if() sequence following.
4683          * NOTE that EXACT is NOT covered here, as it is normally
4684          * picked up by the optimiser separately.
4685          *
4686          * This is unfortunate as the optimiser isnt handling lookahead
4687          * properly currently.
4688          *
4689          */
4690         while ((OP(first) == OPEN && (sawopen = 1)) ||
4691                /* An OR of *one* alternative - should not happen now. */
4692             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4693             /* for now we can't handle lookbehind IFMATCH*/
4694             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4695             (OP(first) == PLUS) ||
4696             (OP(first) == MINMOD) ||
4697                /* An {n,m} with n>0 */
4698             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4699             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4700         {
4701                 /*
4702                  * the only op that could be a regnode is PLUS, all the rest
4703                  * will be regnode_1 or regnode_2.
4704                  *
4705                  */
4706                 if (OP(first) == PLUS)
4707                     sawplus = 1;
4708                 else
4709                     first += regarglen[OP(first)];
4710
4711                 first = NEXTOPER(first);
4712                 first_next= regnext(first);
4713         }
4714
4715         /* Starting-point info. */
4716       again:
4717         DEBUG_PEEP("first:",first,0);
4718         /* Ignore EXACT as we deal with it later. */
4719         if (PL_regkind[OP(first)] == EXACT) {
4720             if (OP(first) == EXACT)
4721                 NOOP;   /* Empty, get anchored substr later. */
4722             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4723                 ri->regstclass = first;
4724         }
4725 #ifdef TRIE_STCLASS
4726         else if (PL_regkind[OP(first)] == TRIE &&
4727                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4728         {
4729             regnode *trie_op;
4730             /* this can happen only on restudy */
4731             if ( OP(first) == TRIE ) {
4732                 struct regnode_1 *trieop = (struct regnode_1 *)
4733                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4734                 StructCopy(first,trieop,struct regnode_1);
4735                 trie_op=(regnode *)trieop;
4736             } else {
4737                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4738                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4739                 StructCopy(first,trieop,struct regnode_charclass);
4740                 trie_op=(regnode *)trieop;
4741             }
4742             OP(trie_op)+=2;
4743             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4744             ri->regstclass = trie_op;
4745         }
4746 #endif
4747         else if (REGNODE_SIMPLE(OP(first)))
4748             ri->regstclass = first;
4749         else if (PL_regkind[OP(first)] == BOUND ||
4750                  PL_regkind[OP(first)] == NBOUND)
4751             ri->regstclass = first;
4752         else if (PL_regkind[OP(first)] == BOL) {
4753             r->extflags |= (OP(first) == MBOL
4754                            ? RXf_ANCH_MBOL
4755                            : (OP(first) == SBOL
4756                               ? RXf_ANCH_SBOL
4757                               : RXf_ANCH_BOL));
4758             first = NEXTOPER(first);
4759             goto again;
4760         }
4761         else if (OP(first) == GPOS) {
4762             r->extflags |= RXf_ANCH_GPOS;
4763             first = NEXTOPER(first);
4764             goto again;
4765         }
4766         else if ((!sawopen || !RExC_sawback) &&
4767             (OP(first) == STAR &&
4768             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4769             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4770         {
4771             /* turn .* into ^.* with an implied $*=1 */
4772             const int type =
4773                 (OP(NEXTOPER(first)) == REG_ANY)
4774                     ? RXf_ANCH_MBOL
4775                     : RXf_ANCH_SBOL;
4776             r->extflags |= type;
4777             r->intflags |= PREGf_IMPLICIT;
4778             first = NEXTOPER(first);
4779             goto again;
4780         }
4781         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4782             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4783             /* x+ must match at the 1st pos of run of x's */
4784             r->intflags |= PREGf_SKIP;
4785
4786         /* Scan is after the zeroth branch, first is atomic matcher. */
4787 #ifdef TRIE_STUDY_OPT
4788         DEBUG_PARSE_r(
4789             if (!restudied)
4790                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4791                               (IV)(first - scan + 1))
4792         );
4793 #else
4794         DEBUG_PARSE_r(
4795             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4796                 (IV)(first - scan + 1))
4797         );
4798 #endif
4799
4800
4801         /*
4802         * If there's something expensive in the r.e., find the
4803         * longest literal string that must appear and make it the
4804         * regmust.  Resolve ties in favor of later strings, since
4805         * the regstart check works with the beginning of the r.e.
4806         * and avoiding duplication strengthens checking.  Not a
4807         * strong reason, but sufficient in the absence of others.
4808         * [Now we resolve ties in favor of the earlier string if
4809         * it happens that c_offset_min has been invalidated, since the
4810         * earlier string may buy us something the later one won't.]
4811         */
4812
4813         data.longest_fixed = newSVpvs("");
4814         data.longest_float = newSVpvs("");
4815         data.last_found = newSVpvs("");
4816         data.longest = &(data.longest_fixed);
4817         first = scan;
4818         if (!ri->regstclass) {
4819             cl_init(pRExC_state, &ch_class);
4820             data.start_class = &ch_class;
4821             stclass_flag = SCF_DO_STCLASS_AND;
4822         } else                          /* XXXX Check for BOUND? */
4823             stclass_flag = 0;
4824         data.last_closep = &last_close;
4825
4826         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4827             &data, -1, NULL, NULL,
4828             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4829
4830
4831         CHECK_RESTUDY_GOTO;
4832
4833
4834         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4835              && data.last_start_min == 0 && data.last_end > 0
4836              && !RExC_seen_zerolen
4837              && !(RExC_seen & REG_SEEN_VERBARG)
4838              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4839             r->extflags |= RXf_CHECK_ALL;
4840         scan_commit(pRExC_state, &data,&minlen,0);
4841         SvREFCNT_dec(data.last_found);
4842
4843         /* Note that code very similar to this but for anchored string
4844            follows immediately below, changes may need to be made to both.
4845            Be careful.
4846          */
4847         longest_float_length = CHR_SVLEN(data.longest_float);
4848         if (longest_float_length
4849             || (data.flags & SF_FL_BEFORE_EOL
4850                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4851                     || (RExC_flags & RXf_PMf_MULTILINE))))
4852         {
4853             I32 t,ml;
4854
4855             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4856                 && data.offset_fixed == data.offset_float_min
4857                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4858                     goto remove_float;          /* As in (a)+. */
4859
4860             /* copy the information about the longest float from the reg_scan_data
4861                over to the program. */
4862             if (SvUTF8(data.longest_float)) {
4863                 r->float_utf8 = data.longest_float;
4864                 r->float_substr = NULL;
4865             } else {
4866                 r->float_substr = data.longest_float;
4867                 r->float_utf8 = NULL;
4868             }
4869             /* float_end_shift is how many chars that must be matched that
4870                follow this item. We calculate it ahead of time as once the
4871                lookbehind offset is added in we lose the ability to correctly
4872                calculate it.*/
4873             ml = data.minlen_float ? *(data.minlen_float)
4874                                    : (I32)longest_float_length;
4875             r->float_end_shift = ml - data.offset_float_min
4876                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4877                 + data.lookbehind_float;
4878             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4879             r->float_max_offset = data.offset_float_max;
4880             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4881                 r->float_max_offset -= data.lookbehind_float;
4882
4883             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4884                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4885                            || (RExC_flags & RXf_PMf_MULTILINE)));
4886             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4887         }
4888         else {
4889           remove_float:
4890             r->float_substr = r->float_utf8 = NULL;
4891             SvREFCNT_dec(data.longest_float);
4892             longest_float_length = 0;
4893         }
4894
4895         /* Note that code very similar to this but for floating string
4896            is immediately above, changes may need to be made to both.
4897            Be careful.
4898          */
4899         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4900         if (longest_fixed_length
4901             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4902                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4903                     || (RExC_flags & RXf_PMf_MULTILINE))))
4904         {
4905             I32 t,ml;
4906
4907             /* copy the information about the longest fixed
4908                from the reg_scan_data over to the program. */
4909             if (SvUTF8(data.longest_fixed)) {
4910                 r->anchored_utf8 = data.longest_fixed;
4911                 r->anchored_substr = NULL;
4912             } else {
4913                 r->anchored_substr = data.longest_fixed;
4914                 r->anchored_utf8 = NULL;
4915             }
4916             /* fixed_end_shift is how many chars that must be matched that
4917                follow this item. We calculate it ahead of time as once the
4918                lookbehind offset is added in we lose the ability to correctly
4919                calculate it.*/
4920             ml = data.minlen_fixed ? *(data.minlen_fixed)
4921                                    : (I32)longest_fixed_length;
4922             r->anchored_end_shift = ml - data.offset_fixed
4923                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4924                 + data.lookbehind_fixed;
4925             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4926
4927             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4928                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4929                      || (RExC_flags & RXf_PMf_MULTILINE)));
4930             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4931         }
4932         else {
4933             r->anchored_substr = r->anchored_utf8 = NULL;
4934             SvREFCNT_dec(data.longest_fixed);
4935             longest_fixed_length = 0;
4936         }
4937         if (ri->regstclass
4938             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4939             ri->regstclass = NULL;
4940         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4941             && stclass_flag
4942             && !(data.start_class->flags & ANYOF_EOS)
4943             && !cl_is_anything(data.start_class))
4944         {
4945             const U32 n = add_data(pRExC_state, 1, "f");
4946
4947             Newx(RExC_rxi->data->data[n], 1,
4948                 struct regnode_charclass_class);
4949             StructCopy(data.start_class,
4950                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4951                        struct regnode_charclass_class);
4952             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4953             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4954             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4955                       regprop(r, sv, (regnode*)data.start_class);
4956                       PerlIO_printf(Perl_debug_log,
4957                                     "synthetic stclass \"%s\".\n",
4958                                     SvPVX_const(sv));});
4959         }
4960
4961         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4962         if (longest_fixed_length > longest_float_length) {
4963             r->check_end_shift = r->anchored_end_shift;
4964             r->check_substr = r->anchored_substr;
4965             r->check_utf8 = r->anchored_utf8;
4966             r->check_offset_min = r->check_offset_max = r->anchored_offset;
4967             if (r->extflags & RXf_ANCH_SINGLE)
4968                 r->extflags |= RXf_NOSCAN;
4969         }
4970         else {
4971             r->check_end_shift = r->float_end_shift;
4972             r->check_substr = r->float_substr;
4973             r->check_utf8 = r->float_utf8;
4974             r->check_offset_min = r->float_min_offset;
4975             r->check_offset_max = r->float_max_offset;
4976         }
4977         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4978            This should be changed ASAP!  */
4979         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4980             r->extflags |= RXf_USE_INTUIT;
4981             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4982                 r->extflags |= RXf_INTUIT_TAIL;
4983         }
4984         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4985         if ( (STRLEN)minlen < longest_float_length )
4986             minlen= longest_float_length;
4987         if ( (STRLEN)minlen < longest_fixed_length )
4988             minlen= longest_fixed_length;
4989         */
4990     }
4991     else {
4992         /* Several toplevels. Best we can is to set minlen. */
4993         I32 fake;
4994         struct regnode_charclass_class ch_class;
4995         I32 last_close = 0;
4996
4997         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4998
4999         scan = ri->program + 1;
5000         cl_init(pRExC_state, &ch_class);
5001         data.start_class = &ch_class;
5002         data.last_closep = &last_close;
5003
5004
5005         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5006             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5007
5008         CHECK_RESTUDY_GOTO;
5009
5010         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5011                 = r->float_substr = r->float_utf8 = NULL;
5012         if (!(data.start_class->flags & ANYOF_EOS)
5013             && !cl_is_anything(data.start_class))
5014         {
5015             const U32 n = add_data(pRExC_state, 1, "f");
5016
5017             Newx(RExC_rxi->data->data[n], 1,
5018                 struct regnode_charclass_class);
5019             StructCopy(data.start_class,
5020                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5021                        struct regnode_charclass_class);
5022             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5023             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5024             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5025                       regprop(r, sv, (regnode*)data.start_class);
5026                       PerlIO_printf(Perl_debug_log,
5027                                     "synthetic stclass \"%s\".\n",
5028                                     SvPVX_const(sv));});
5029         }
5030     }
5031
5032     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5033        the "real" pattern. */
5034     DEBUG_OPTIMISE_r({
5035         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5036                       (IV)minlen, (IV)r->minlen);
5037     });
5038     r->minlenret = minlen;
5039     if (r->minlen < minlen)
5040         r->minlen = minlen;
5041
5042     if (RExC_seen & REG_SEEN_GPOS)
5043         r->extflags |= RXf_GPOS_SEEN;
5044     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5045         r->extflags |= RXf_LOOKBEHIND_SEEN;
5046     if (RExC_seen & REG_SEEN_EVAL)
5047         r->extflags |= RXf_EVAL_SEEN;
5048     if (RExC_seen & REG_SEEN_CANY)
5049         r->extflags |= RXf_CANY_SEEN;
5050     if (RExC_seen & REG_SEEN_VERBARG)
5051         r->intflags |= PREGf_VERBARG_SEEN;
5052     if (RExC_seen & REG_SEEN_CUTGROUP)
5053         r->intflags |= PREGf_CUTGROUP_SEEN;
5054     if (RExC_paren_names)
5055         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5056     else
5057         RXp_PAREN_NAMES(r) = NULL;
5058
5059 #ifdef STUPID_PATTERN_CHECKS
5060     if (RX_PRELEN(rx) == 0)
5061         r->extflags |= RXf_NULL;
5062     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5063         /* XXX: this should happen BEFORE we compile */
5064         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5065     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5066         r->extflags |= RXf_WHITE;
5067     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5068         r->extflags |= RXf_START_ONLY;
5069 #else
5070     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5071             /* XXX: this should happen BEFORE we compile */
5072             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5073     else {
5074         regnode *first = ri->program + 1;
5075         U8 fop = OP(first);
5076         U8 nop = OP(NEXTOPER(first));
5077
5078         if (PL_regkind[fop] == NOTHING && nop == END)
5079             r->extflags |= RXf_NULL;
5080         else if (PL_regkind[fop] == BOL && nop == END)
5081             r->extflags |= RXf_START_ONLY;
5082         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5083             r->extflags |= RXf_WHITE;
5084     }
5085 #endif
5086 #ifdef DEBUGGING
5087     if (RExC_paren_names) {
5088         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5089         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5090     } else
5091 #endif
5092         ri->name_list_idx = 0;
5093
5094     if (RExC_recurse_count) {
5095         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5096             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5097             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5098         }
5099     }
5100     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5101     /* assume we don't need to swap parens around before we match */
5102
5103     DEBUG_DUMP_r({
5104         PerlIO_printf(Perl_debug_log,"Final program:\n");
5105         regdump(r);
5106     });
5107 #ifdef RE_TRACK_PATTERN_OFFSETS
5108     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5109         const U32 len = ri->u.offsets[0];
5110         U32 i;
5111         GET_RE_DEBUG_FLAGS_DECL;
5112         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5113         for (i = 1; i <= len; i++) {
5114             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5115                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5116                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5117             }
5118         PerlIO_printf(Perl_debug_log, "\n");
5119     });
5120 #endif
5121     return rx;
5122 }
5123
5124 #undef RE_ENGINE_PTR
5125
5126
5127 SV*
5128 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5129                     const U32 flags)
5130 {
5131     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5132
5133     PERL_UNUSED_ARG(value);
5134
5135     if (flags & RXapif_FETCH) {
5136         return reg_named_buff_fetch(rx, key, flags);
5137     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5138         Perl_croak_no_modify(aTHX);
5139         return NULL;
5140     } else if (flags & RXapif_EXISTS) {
5141         return reg_named_buff_exists(rx, key, flags)
5142             ? &PL_sv_yes
5143             : &PL_sv_no;
5144     } else if (flags & RXapif_REGNAMES) {
5145         return reg_named_buff_all(rx, flags);
5146     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5147         return reg_named_buff_scalar(rx, flags);
5148     } else {
5149         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5150         return NULL;
5151     }
5152 }
5153
5154 SV*
5155 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5156                          const U32 flags)
5157 {
5158     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5159     PERL_UNUSED_ARG(lastkey);
5160
5161     if (flags & RXapif_FIRSTKEY)
5162         return reg_named_buff_firstkey(rx, flags);
5163     else if (flags & RXapif_NEXTKEY)
5164         return reg_named_buff_nextkey(rx, flags);
5165     else {
5166         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5167         return NULL;
5168     }
5169 }
5170
5171 SV*
5172 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5173                           const U32 flags)
5174 {
5175     AV *retarray = NULL;
5176     SV *ret;
5177     struct regexp *const rx = (struct regexp *)SvANY(r);
5178
5179     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5180
5181     if (flags & RXapif_ALL)
5182         retarray=newAV();
5183
5184     if (rx && RXp_PAREN_NAMES(rx)) {
5185         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5186         if (he_str) {
5187             IV i;
5188             SV* sv_dat=HeVAL(he_str);
5189             I32 *nums=(I32*)SvPVX(sv_dat);
5190             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5191                 if ((I32)(rx->nparens) >= nums[i]
5192                     && rx->offs[nums[i]].start != -1
5193                     && rx->offs[nums[i]].end != -1)
5194                 {
5195                     ret = newSVpvs("");
5196                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5197                     if (!retarray)
5198                         return ret;
5199                 } else {
5200                     ret = newSVsv(&PL_sv_undef);
5201                 }
5202                 if (retarray)
5203                     av_push(retarray, ret);
5204             }
5205             if (retarray)
5206                 return newRV_noinc(MUTABLE_SV(retarray));
5207         }
5208     }
5209     return NULL;
5210 }
5211
5212 bool
5213 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5214                            const U32 flags)
5215 {
5216     struct regexp *const rx = (struct regexp *)SvANY(r);
5217
5218     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5219
5220     if (rx && RXp_PAREN_NAMES(rx)) {
5221         if (flags & RXapif_ALL) {
5222             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5223         } else {
5224             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5225             if (sv) {
5226                 SvREFCNT_dec(sv);
5227                 return TRUE;
5228             } else {
5229                 return FALSE;
5230             }
5231         }
5232     } else {
5233         return FALSE;
5234     }
5235 }
5236
5237 SV*
5238 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5239 {
5240     struct regexp *const rx = (struct regexp *)SvANY(r);
5241
5242     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5243
5244     if ( rx && RXp_PAREN_NAMES(rx) ) {
5245         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5246
5247         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5248     } else {
5249         return FALSE;
5250     }
5251 }
5252
5253 SV*
5254 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5255 {
5256     struct regexp *const rx = (struct regexp *)SvANY(r);
5257     GET_RE_DEBUG_FLAGS_DECL;
5258
5259     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5260
5261     if (rx && RXp_PAREN_NAMES(rx)) {
5262         HV *hv = RXp_PAREN_NAMES(rx);
5263         HE *temphe;
5264         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5265             IV i;
5266             IV parno = 0;
5267             SV* sv_dat = HeVAL(temphe);
5268             I32 *nums = (I32*)SvPVX(sv_dat);
5269             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5270                 if ((I32)(rx->lastparen) >= nums[i] &&
5271                     rx->offs[nums[i]].start != -1 &&
5272                     rx->offs[nums[i]].end != -1)
5273                 {
5274                     parno = nums[i];
5275                     break;
5276                 }
5277             }
5278             if (parno || flags & RXapif_ALL) {
5279                 return newSVhek(HeKEY_hek(temphe));
5280             }
5281         }
5282     }
5283     return NULL;
5284 }
5285
5286 SV*
5287 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5288 {
5289     SV *ret;
5290     AV *av;
5291     I32 length;
5292     struct regexp *const rx = (struct regexp *)SvANY(r);
5293
5294     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5295
5296     if (rx && RXp_PAREN_NAMES(rx)) {
5297         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5298             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5299         } else if (flags & RXapif_ONE) {
5300             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5301             av = MUTABLE_AV(SvRV(ret));
5302             length = av_len(av);
5303             SvREFCNT_dec(ret);
5304             return newSViv(length + 1);
5305         } else {
5306             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5307             return NULL;
5308         }
5309     }
5310     return &PL_sv_undef;
5311 }
5312
5313 SV*
5314 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5315 {
5316     struct regexp *const rx = (struct regexp *)SvANY(r);
5317     AV *av = newAV();
5318
5319     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5320
5321     if (rx && RXp_PAREN_NAMES(rx)) {
5322         HV *hv= RXp_PAREN_NAMES(rx);
5323         HE *temphe;
5324         (void)hv_iterinit(hv);
5325         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5326             IV i;
5327             IV parno = 0;
5328             SV* sv_dat = HeVAL(temphe);
5329             I32 *nums = (I32*)SvPVX(sv_dat);
5330             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5331                 if ((I32)(rx->lastparen) >= nums[i] &&
5332                     rx->offs[nums[i]].start != -1 &&
5333                     rx->offs[nums[i]].end != -1)
5334                 {
5335                     parno = nums[i];
5336                     break;
5337                 }
5338             }
5339             if (parno || flags & RXapif_ALL) {
5340                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5341             }
5342         }
5343     }
5344
5345     return newRV_noinc(MUTABLE_SV(av));
5346 }
5347
5348 void
5349 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5350                              SV * const sv)
5351 {
5352     struct regexp *const rx = (struct regexp *)SvANY(r);
5353     char *s = NULL;
5354     I32 i = 0;
5355     I32 s1, t1;
5356
5357     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5358
5359     if (!rx->subbeg) {
5360         sv_setsv(sv,&PL_sv_undef);
5361         return;
5362     }
5363     else
5364     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5365         /* $` */
5366         i = rx->offs[0].start;
5367         s = rx->subbeg;
5368     }
5369     else
5370     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5371         /* $' */
5372         s = rx->subbeg + rx->offs[0].end;
5373         i = rx->sublen - rx->offs[0].end;
5374     }
5375     else
5376     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5377         (s1 = rx->offs[paren].start) != -1 &&
5378         (t1 = rx->offs[paren].end) != -1)
5379     {
5380         /* $& $1 ... */
5381         i = t1 - s1;
5382         s = rx->subbeg + s1;
5383     } else {
5384         sv_setsv(sv,&PL_sv_undef);
5385         return;
5386     }
5387     assert(rx->sublen >= (s - rx->subbeg) + i );
5388     if (i >= 0) {
5389         const int oldtainted = PL_tainted;
5390         TAINT_NOT;
5391         sv_setpvn(sv, s, i);
5392         PL_tainted = oldtainted;
5393         if ( (rx->extflags & RXf_CANY_SEEN)
5394             ? (RXp_MATCH_UTF8(rx)
5395                         && (!i || is_utf8_string((U8*)s, i)))
5396             : (RXp_MATCH_UTF8(rx)) )
5397         {
5398             SvUTF8_on(sv);
5399         }
5400         else
5401             SvUTF8_off(sv);
5402         if (PL_tainting) {
5403             if (RXp_MATCH_TAINTED(rx)) {
5404                 if (SvTYPE(sv) >= SVt_PVMG) {
5405                     MAGIC* const mg = SvMAGIC(sv);
5406                     MAGIC* mgt;
5407                     PL_tainted = 1;
5408                     SvMAGIC_set(sv, mg->mg_moremagic);
5409                     SvTAINT(sv);
5410                     if ((mgt = SvMAGIC(sv))) {
5411                         mg->mg_moremagic = mgt;
5412                         SvMAGIC_set(sv, mg);
5413                     }
5414                 } else {
5415                     PL_tainted = 1;
5416                     SvTAINT(sv);
5417                 }
5418             } else
5419                 SvTAINTED_off(sv);
5420         }
5421     } else {
5422         sv_setsv(sv,&PL_sv_undef);
5423         return;
5424     }
5425 }
5426
5427 void
5428 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5429                                                          SV const * const value)
5430 {
5431     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5432
5433     PERL_UNUSED_ARG(rx);
5434     PERL_UNUSED_ARG(paren);
5435     PERL_UNUSED_ARG(value);
5436
5437     if (!PL_localizing)
5438         Perl_croak_no_modify(aTHX);
5439 }
5440
5441 I32
5442 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5443                               const I32 paren)
5444 {
5445     struct regexp *const rx = (struct regexp *)SvANY(r);
5446     I32 i;
5447     I32 s1, t1;
5448
5449     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5450
5451     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5452         switch (paren) {
5453       /* $` / ${^PREMATCH} */
5454       case RX_BUFF_IDX_PREMATCH:
5455         if (rx->offs[0].start != -1) {
5456                         i = rx->offs[0].start;
5457                         if (i > 0) {
5458                                 s1 = 0;
5459                                 t1 = i;
5460                                 goto getlen;
5461                         }
5462             }
5463         return 0;
5464       /* $' / ${^POSTMATCH} */
5465       case RX_BUFF_IDX_POSTMATCH:
5466             if (rx->offs[0].end != -1) {
5467                         i = rx->sublen - rx->offs[0].end;
5468                         if (i > 0) {
5469                                 s1 = rx->offs[0].end;
5470                                 t1 = rx->sublen;
5471                                 goto getlen;
5472                         }
5473             }
5474         return 0;
5475       /* $& / ${^MATCH}, $1, $2, ... */
5476       default:
5477             if (paren <= (I32)rx->nparens &&
5478             (s1 = rx->offs[paren].start) != -1 &&
5479             (t1 = rx->offs[paren].end) != -1)
5480             {
5481             i = t1 - s1;
5482             goto getlen;
5483         } else {
5484             if (ckWARN(WARN_UNINITIALIZED))
5485                 report_uninit((const SV *)sv);
5486             return 0;
5487         }
5488     }
5489   getlen:
5490     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5491         const char * const s = rx->subbeg + s1;
5492         const U8 *ep;
5493         STRLEN el;
5494
5495         i = t1 - s1;
5496         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5497                         i = el;
5498     }
5499     return i;
5500 }
5501
5502 SV*
5503 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5504 {
5505     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5506         PERL_UNUSED_ARG(rx);
5507         if (0)
5508             return NULL;
5509         else
5510             return newSVpvs("Regexp");
5511 }
5512
5513 /* Scans the name of a named buffer from the pattern.
5514  * If flags is REG_RSN_RETURN_NULL returns null.
5515  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5516  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5517  * to the parsed name as looked up in the RExC_paren_names hash.
5518  * If there is an error throws a vFAIL().. type exception.
5519  */
5520
5521 #define REG_RSN_RETURN_NULL    0
5522 #define REG_RSN_RETURN_NAME    1
5523 #define REG_RSN_RETURN_DATA    2
5524
5525 STATIC SV*
5526 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5527 {
5528     char *name_start = RExC_parse;
5529
5530     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5531
5532     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5533          /* skip IDFIRST by using do...while */
5534         if (UTF)
5535             do {
5536                 RExC_parse += UTF8SKIP(RExC_parse);
5537             } while (isALNUM_utf8((U8*)RExC_parse));
5538         else
5539             do {
5540                 RExC_parse++;
5541             } while (isALNUM(*RExC_parse));
5542     }
5543
5544     if ( flags ) {
5545         SV* sv_name
5546             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5547                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5548         if ( flags == REG_RSN_RETURN_NAME)
5549             return sv_name;
5550         else if (flags==REG_RSN_RETURN_DATA) {
5551             HE *he_str = NULL;
5552             SV *sv_dat = NULL;
5553             if ( ! sv_name )      /* should not happen*/
5554                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5555             if (RExC_paren_names)
5556                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5557             if ( he_str )
5558                 sv_dat = HeVAL(he_str);
5559             if ( ! sv_dat )
5560                 vFAIL("Reference to nonexistent named group");
5561             return sv_dat;
5562         }
5563         else {
5564             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5565         }
5566         /* NOT REACHED */
5567     }
5568     return NULL;
5569 }
5570
5571 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5572     int rem=(int)(RExC_end - RExC_parse);                       \
5573     int cut;                                                    \
5574     int num;                                                    \
5575     int iscut=0;                                                \
5576     if (rem>10) {                                               \
5577         rem=10;                                                 \
5578         iscut=1;                                                \
5579     }                                                           \
5580     cut=10-rem;                                                 \
5581     if (RExC_lastparse!=RExC_parse)                             \
5582         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5583             rem, RExC_parse,                                    \
5584             cut + 4,                                            \
5585             iscut ? "..." : "<"                                 \
5586         );                                                      \
5587     else                                                        \
5588         PerlIO_printf(Perl_debug_log,"%16s","");                \
5589                                                                 \
5590     if (SIZE_ONLY)                                              \
5591        num = RExC_size + 1;                                     \
5592     else                                                        \
5593        num=REG_NODE_NUM(RExC_emit);                             \
5594     if (RExC_lastnum!=num)                                      \
5595        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5596     else                                                        \
5597        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5598     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5599         (int)((depth*2)), "",                                   \
5600         (funcname)                                              \
5601     );                                                          \
5602     RExC_lastnum=num;                                           \
5603     RExC_lastparse=RExC_parse;                                  \
5604 })
5605
5606
5607
5608 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5609     DEBUG_PARSE_MSG((funcname));                            \
5610     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5611 })
5612 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5613     DEBUG_PARSE_MSG((funcname));                            \
5614     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5615 })
5616 /*
5617  - reg - regular expression, i.e. main body or parenthesized thing
5618  *
5619  * Caller must absorb opening parenthesis.
5620  *
5621  * Combining parenthesis handling with the base level of regular expression
5622  * is a trifle forced, but the need to tie the tails of the branches to what
5623  * follows makes it hard to avoid.
5624  */
5625 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5626 #ifdef DEBUGGING
5627 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5628 #else
5629 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5630 #endif
5631
5632 STATIC regnode *
5633 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5634     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5635 {
5636     dVAR;
5637     register regnode *ret;              /* Will be the head of the group. */
5638     register regnode *br;
5639     register regnode *lastbr;
5640     register regnode *ender = NULL;
5641     register I32 parno = 0;
5642     I32 flags;
5643     U32 oregflags = RExC_flags;
5644     bool have_branch = 0;
5645     bool is_open = 0;
5646     I32 freeze_paren = 0;
5647     I32 after_freeze = 0;
5648
5649     /* for (?g), (?gc), and (?o) warnings; warning
5650        about (?c) will warn about (?g) -- japhy    */
5651
5652 #define WASTED_O  0x01
5653 #define WASTED_G  0x02
5654 #define WASTED_C  0x04
5655 #define WASTED_GC (0x02|0x04)
5656     I32 wastedflags = 0x00;
5657
5658     char * parse_start = RExC_parse; /* MJD */
5659     char * const oregcomp_parse = RExC_parse;
5660
5661     GET_RE_DEBUG_FLAGS_DECL;
5662
5663     PERL_ARGS_ASSERT_REG;
5664     DEBUG_PARSE("reg ");
5665
5666     *flagp = 0;                         /* Tentatively. */
5667
5668
5669     /* Make an OPEN node, if parenthesized. */
5670     if (paren) {
5671         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5672             char *start_verb = RExC_parse;
5673             STRLEN verb_len = 0;
5674             char *start_arg = NULL;
5675             unsigned char op = 0;
5676             int argok = 1;
5677             int internal_argval = 0; /* internal_argval is only useful if !argok */
5678             while ( *RExC_parse && *RExC_parse != ')' ) {
5679                 if ( *RExC_parse == ':' ) {
5680                     start_arg = RExC_parse + 1;
5681                     break;
5682                 }
5683                 RExC_parse++;
5684             }
5685             ++start_verb;
5686             verb_len = RExC_parse - start_verb;
5687             if ( start_arg ) {
5688                 RExC_parse++;
5689                 while ( *RExC_parse && *RExC_parse != ')' )
5690                     RExC_parse++;
5691                 if ( *RExC_parse != ')' )
5692                     vFAIL("Unterminated verb pattern argument");
5693                 if ( RExC_parse == start_arg )
5694                     start_arg = NULL;
5695             } else {
5696                 if ( *RExC_parse != ')' )
5697                     vFAIL("Unterminated verb pattern");
5698             }
5699
5700             switch ( *start_verb ) {
5701             case 'A':  /* (*ACCEPT) */
5702                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5703                     op = ACCEPT;
5704                     internal_argval = RExC_nestroot;
5705                 }
5706                 break;
5707             case 'C':  /* (*COMMIT) */
5708                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5709                     op = COMMIT;
5710                 break;
5711             case 'F':  /* (*FAIL) */
5712                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5713                     op = OPFAIL;
5714                     argok = 0;
5715                 }
5716                 break;
5717             case ':':  /* (*:NAME) */
5718             case 'M':  /* (*MARK:NAME) */
5719                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5720                     op = MARKPOINT;
5721                     argok = -1;
5722                 }
5723                 break;
5724             case 'P':  /* (*PRUNE) */
5725                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5726                     op = PRUNE;
5727                 break;
5728             case 'S':   /* (*SKIP) */
5729                 if ( memEQs(start_verb,verb_len,"SKIP") )
5730                     op = SKIP;
5731                 break;
5732             case 'T':  /* (*THEN) */
5733                 /* [19:06] <TimToady> :: is then */
5734                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5735                     op = CUTGROUP;
5736                     RExC_seen |= REG_SEEN_CUTGROUP;
5737                 }
5738                 break;
5739             }
5740             if ( ! op ) {
5741                 RExC_parse++;
5742                 vFAIL3("Unknown verb pattern '%.*s'",
5743                     verb_len, start_verb);
5744             }
5745             if ( argok ) {
5746                 if ( start_arg && internal_argval ) {
5747                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5748                         verb_len, start_verb);
5749                 } else if ( argok < 0 && !start_arg ) {
5750                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5751                         verb_len, start_verb);
5752                 } else {
5753                     ret = reganode(pRExC_state, op, internal_argval);
5754                     if ( ! internal_argval && ! SIZE_ONLY ) {
5755                         if (start_arg) {
5756                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5757                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5758                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5759                             ret->flags = 0;
5760                         } else {
5761                             ret->flags = 1;
5762                         }
5763                     }
5764                 }
5765                 if (!internal_argval)
5766                     RExC_seen |= REG_SEEN_VERBARG;
5767             } else if ( start_arg ) {
5768                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5769                         verb_len, start_verb);
5770             } else {
5771                 ret = reg_node(pRExC_state, op);
5772             }
5773             nextchar(pRExC_state);
5774             return ret;
5775         } else
5776         if (*RExC_parse == '?') { /* (?...) */
5777             bool is_logical = 0;
5778             const char * const seqstart = RExC_parse;
5779             bool has_use_defaults = FALSE;
5780
5781             RExC_parse++;
5782             paren = *RExC_parse++;
5783             ret = NULL;                 /* For look-ahead/behind. */
5784             switch (paren) {
5785
5786             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5787                 paren = *RExC_parse++;
5788                 if ( paren == '<')         /* (?P<...>) named capture */
5789                     goto named_capture;
5790                 else if (paren == '>') {   /* (?P>name) named recursion */
5791                     goto named_recursion;
5792                 }
5793                 else if (paren == '=') {   /* (?P=...)  named backref */
5794                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5795                        you change this make sure you change that */
5796                     char* name_start = RExC_parse;
5797                     U32 num = 0;
5798                     SV *sv_dat = reg_scan_name(pRExC_state,
5799                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5800                     if (RExC_parse == name_start || *RExC_parse != ')')
5801                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5802
5803                     if (!SIZE_ONLY) {
5804                         num = add_data( pRExC_state, 1, "S" );
5805                         RExC_rxi->data->data[num]=(void*)sv_dat;
5806                         SvREFCNT_inc_simple_void(sv_dat);
5807                     }
5808                     RExC_sawback = 1;
5809                     ret = reganode(pRExC_state,
5810                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5811                            num);
5812                     *flagp |= HASWIDTH;
5813
5814                     Set_Node_Offset(ret, parse_start+1);
5815                     Set_Node_Cur_Length(ret); /* MJD */
5816
5817                     nextchar(pRExC_state);
5818                     return ret;
5819                 }
5820                 RExC_parse++;
5821                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5822                 /*NOTREACHED*/
5823             case '<':           /* (?<...) */
5824                 if (*RExC_parse == '!')
5825                     paren = ',';
5826                 else if (*RExC_parse != '=')
5827               named_capture:
5828                 {               /* (?<...>) */
5829                     char *name_start;
5830                     SV *svname;
5831                     paren= '>';
5832             case '\'':          /* (?'...') */
5833                     name_start= RExC_parse;
5834                     svname = reg_scan_name(pRExC_state,
5835                         SIZE_ONLY ?  /* reverse test from the others */
5836                         REG_RSN_RETURN_NAME :
5837                         REG_RSN_RETURN_NULL);
5838                     if (RExC_parse == name_start) {
5839                         RExC_parse++;
5840                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5841                         /*NOTREACHED*/
5842                     }
5843                     if (*RExC_parse != paren)
5844                         vFAIL2("Sequence (?%c... not terminated",
5845                             paren=='>' ? '<' : paren);
5846                     if (SIZE_ONLY) {
5847                         HE *he_str;
5848                         SV *sv_dat = NULL;
5849                         if (!svname) /* shouldnt happen */
5850                             Perl_croak(aTHX_
5851                                 "panic: reg_scan_name returned NULL");
5852                         if (!RExC_paren_names) {
5853                             RExC_paren_names= newHV();
5854                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5855 #ifdef DEBUGGING
5856                             RExC_paren_name_list= newAV();
5857                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5858 #endif
5859                         }
5860                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5861                         if ( he_str )
5862                             sv_dat = HeVAL(he_str);
5863                         if ( ! sv_dat ) {
5864                             /* croak baby croak */
5865                             Perl_croak(aTHX_
5866                                 "panic: paren_name hash element allocation failed");
5867                         } else if ( SvPOK(sv_dat) ) {
5868                             /* (?|...) can mean we have dupes so scan to check
5869                                its already been stored. Maybe a flag indicating
5870                                we are inside such a construct would be useful,
5871                                but the arrays are likely to be quite small, so
5872                                for now we punt -- dmq */
5873                             IV count = SvIV(sv_dat);
5874                             I32 *pv = (I32*)SvPVX(sv_dat);
5875                             IV i;
5876                             for ( i = 0 ; i < count ; i++ ) {
5877                                 if ( pv[i] == RExC_npar ) {
5878                                     count = 0;
5879                                     break;
5880                                 }
5881                             }
5882                             if ( count ) {
5883                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5884                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5885                                 pv[count] = RExC_npar;
5886                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5887                             }
5888                         } else {
5889                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5890                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5891                             SvIOK_on(sv_dat);
5892                             SvIV_set(sv_dat, 1);
5893                         }
5894 #ifdef DEBUGGING
5895                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5896                             SvREFCNT_dec(svname);
5897 #endif
5898
5899                         /*sv_dump(sv_dat);*/
5900                     }
5901                     nextchar(pRExC_state);
5902                     paren = 1;
5903                     goto capturing_parens;
5904                 }
5905                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5906                 RExC_parse++;
5907             case '=':           /* (?=...) */
5908                 RExC_seen_zerolen++;
5909                 break;
5910             case '!':           /* (?!...) */
5911                 RExC_seen_zerolen++;
5912                 if (*RExC_parse == ')') {
5913                     ret=reg_node(pRExC_state, OPFAIL);
5914                     nextchar(pRExC_state);
5915                     return ret;
5916                 }
5917                 break;
5918             case '|':           /* (?|...) */
5919                 /* branch reset, behave like a (?:...) except that
5920                    buffers in alternations share the same numbers */
5921                 paren = ':';
5922                 after_freeze = freeze_paren = RExC_npar;
5923                 break;
5924             case ':':           /* (?:...) */
5925             case '>':           /* (?>...) */
5926                 break;
5927             case '$':           /* (?$...) */
5928             case '@':           /* (?@...) */
5929                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5930                 break;
5931             case '#':           /* (?#...) */
5932                 while (*RExC_parse && *RExC_parse != ')')
5933                     RExC_parse++;
5934                 if (*RExC_parse != ')')
5935                     FAIL("Sequence (?#... not terminated");
5936                 nextchar(pRExC_state);
5937                 *flagp = TRYAGAIN;
5938                 return NULL;
5939             case '0' :           /* (?0) */
5940             case 'R' :           /* (?R) */
5941                 if (*RExC_parse != ')')
5942                     FAIL("Sequence (?R) not terminated");
5943                 ret = reg_node(pRExC_state, GOSTART);
5944                 *flagp |= POSTPONED;
5945                 nextchar(pRExC_state);
5946                 return ret;
5947                 /*notreached*/
5948             { /* named and numeric backreferences */
5949                 I32 num;
5950             case '&':            /* (?&NAME) */
5951                 parse_start = RExC_parse - 1;
5952               named_recursion:
5953                 {
5954                     SV *sv_dat = reg_scan_name(pRExC_state,
5955                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5956                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5957                 }
5958                 goto gen_recurse_regop;
5959                 /* NOT REACHED */
5960             case '+':
5961                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5962                     RExC_parse++;
5963                     vFAIL("Illegal pattern");
5964                 }
5965                 goto parse_recursion;
5966                 /* NOT REACHED*/
5967             case '-': /* (?-1) */
5968                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5969                     RExC_parse--; /* rewind to let it be handled later */
5970                     goto parse_flags;
5971                 }
5972                 /*FALLTHROUGH */
5973             case '1': case '2': case '3': case '4': /* (?1) */
5974             case '5': case '6': case '7': case '8': case '9':
5975                 RExC_parse--;
5976               parse_recursion:
5977                 num = atoi(RExC_parse);
5978                 parse_start = RExC_parse - 1; /* MJD */
5979                 if (*RExC_parse == '-')
5980                     RExC_parse++;
5981                 while (isDIGIT(*RExC_parse))
5982                         RExC_parse++;
5983                 if (*RExC_parse!=')')
5984                     vFAIL("Expecting close bracket");
5985
5986               gen_recurse_regop:
5987                 if ( paren == '-' ) {
5988                     /*
5989                     Diagram of capture buffer numbering.
5990                     Top line is the normal capture buffer numbers
5991                     Bottom line is the negative indexing as from
5992                     the X (the (?-2))
5993
5994                     +   1 2    3 4 5 X          6 7
5995                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5996                     -   5 4    3 2 1 X          x x
5997
5998                     */
5999                     num = RExC_npar + num;
6000                     if (num < 1)  {
6001                         RExC_parse++;
6002                         vFAIL("Reference to nonexistent group");
6003                     }
6004                 } else if ( paren == '+' ) {
6005                     num = RExC_npar + num - 1;
6006                 }
6007
6008                 ret = reganode(pRExC_state, GOSUB, num);
6009                 if (!SIZE_ONLY) {
6010                     if (num > (I32)RExC_rx->nparens) {
6011                         RExC_parse++;
6012                         vFAIL("Reference to nonexistent group");
6013                     }
6014                     ARG2L_SET( ret, RExC_recurse_count++);
6015                     RExC_emit++;
6016                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6017                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6018                 } else {
6019                     RExC_size++;
6020                 }
6021                 RExC_seen |= REG_SEEN_RECURSE;
6022                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6023                 Set_Node_Offset(ret, parse_start); /* MJD */
6024
6025                 *flagp |= POSTPONED;
6026                 nextchar(pRExC_state);
6027                 return ret;
6028             } /* named and numeric backreferences */
6029             /* NOT REACHED */
6030
6031             case '?':           /* (??...) */
6032                 is_logical = 1;
6033                 if (*RExC_parse != '{') {
6034                     RExC_parse++;
6035                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6036                     /*NOTREACHED*/
6037                 }
6038                 *flagp |= POSTPONED;
6039                 paren = *RExC_parse++;
6040                 /* FALL THROUGH */
6041             case '{':           /* (?{...}) */
6042             {
6043                 I32 count = 1;
6044                 U32 n = 0;
6045                 char c;
6046                 char *s = RExC_parse;
6047
6048                 RExC_seen_zerolen++;
6049                 RExC_seen |= REG_SEEN_EVAL;
6050                 while (count && (c = *RExC_parse)) {
6051                     if (c == '\\') {
6052                         if (RExC_parse[1])
6053                             RExC_parse++;
6054                     }
6055                     else if (c == '{')
6056                         count++;
6057                     else if (c == '}')
6058                         count--;
6059                     RExC_parse++;
6060                 }
6061                 if (*RExC_parse != ')') {
6062                     RExC_parse = s;
6063                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6064                 }
6065                 if (!SIZE_ONLY) {
6066                     PAD *pad;
6067                     OP_4tree *sop, *rop;
6068                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6069
6070                     ENTER;
6071                     Perl_save_re_context(aTHX);
6072                     rop = sv_compile_2op(sv, &sop, "re", &pad);
6073                     sop->op_private |= OPpREFCOUNTED;
6074                     /* re_dup will OpREFCNT_inc */
6075                     OpREFCNT_set(sop, 1);
6076                     LEAVE;
6077
6078                     n = add_data(pRExC_state, 3, "nop");
6079                     RExC_rxi->data->data[n] = (void*)rop;
6080                     RExC_rxi->data->data[n+1] = (void*)sop;
6081                     RExC_rxi->data->data[n+2] = (void*)pad;
6082                     SvREFCNT_dec(sv);
6083                 }
6084                 else {                                          /* First pass */
6085                     if (PL_reginterp_cnt < ++RExC_seen_evals
6086                         && IN_PERL_RUNTIME)
6087                         /* No compiled RE interpolated, has runtime
6088                            components ===> unsafe.  */
6089                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6090                     if (PL_tainting && PL_tainted)
6091                         FAIL("Eval-group in insecure regular expression");
6092 #if PERL_VERSION > 8
6093                     if (IN_PERL_COMPILETIME)
6094                         PL_cv_has_eval = 1;
6095 #endif
6096                 }
6097
6098                 nextchar(pRExC_state);
6099                 if (is_logical) {
6100                     ret = reg_node(pRExC_state, LOGICAL);
6101                     if (!SIZE_ONLY)
6102                         ret->flags = 2;
6103                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6104                     /* deal with the length of this later - MJD */
6105                     return ret;
6106                 }
6107                 ret = reganode(pRExC_state, EVAL, n);
6108                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6109                 Set_Node_Offset(ret, parse_start);
6110                 return ret;
6111             }
6112             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6113             {
6114                 int is_define= 0;
6115                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6116                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6117                         || RExC_parse[1] == '<'
6118                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6119                         I32 flag;
6120
6121                         ret = reg_node(pRExC_state, LOGICAL);
6122                         if (!SIZE_ONLY)
6123                             ret->flags = 1;
6124                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6125                         goto insert_if;
6126                     }
6127                 }
6128                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6129                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6130                 {
6131                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6132                     char *name_start= RExC_parse++;
6133                     U32 num = 0;
6134                     SV *sv_dat=reg_scan_name(pRExC_state,
6135                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6136                     if (RExC_parse == name_start || *RExC_parse != ch)
6137                         vFAIL2("Sequence (?(%c... not terminated",
6138                             (ch == '>' ? '<' : ch));
6139                     RExC_parse++;
6140                     if (!SIZE_ONLY) {
6141                         num = add_data( pRExC_state, 1, "S" );
6142                         RExC_rxi->data->data[num]=(void*)sv_dat;
6143                         SvREFCNT_inc_simple_void(sv_dat);
6144                     }
6145                     ret = reganode(pRExC_state,NGROUPP,num);
6146                     goto insert_if_check_paren;
6147                 }
6148                 else if (RExC_parse[0] == 'D' &&
6149                          RExC_parse[1] == 'E' &&
6150                          RExC_parse[2] == 'F' &&
6151                          RExC_parse[3] == 'I' &&
6152                          RExC_parse[4] == 'N' &&
6153                          RExC_parse[5] == 'E')
6154                 {
6155                     ret = reganode(pRExC_state,DEFINEP,0);
6156                     RExC_parse +=6 ;
6157                     is_define = 1;
6158                     goto insert_if_check_paren;
6159                 }
6160                 else if (RExC_parse[0] == 'R') {
6161                     RExC_parse++;
6162                     parno = 0;
6163                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6164                         parno = atoi(RExC_parse++);
6165                         while (isDIGIT(*RExC_parse))
6166                             RExC_parse++;
6167                     } else if (RExC_parse[0] == '&') {
6168                         SV *sv_dat;
6169                         RExC_parse++;
6170                         sv_dat = reg_scan_name(pRExC_state,
6171                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6172                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6173                     }
6174                     ret = reganode(pRExC_state,INSUBP,parno);
6175                     goto insert_if_check_paren;
6176                 }
6177                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6178                     /* (?(1)...) */
6179                     char c;
6180                     parno = atoi(RExC_parse++);
6181
6182                     while (isDIGIT(*RExC_parse))
6183                         RExC_parse++;
6184                     ret = reganode(pRExC_state, GROUPP, parno);
6185
6186                  insert_if_check_paren:
6187                     if ((c = *nextchar(pRExC_state)) != ')')
6188                         vFAIL("Switch condition not recognized");
6189                   insert_if:
6190                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6191                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6192                     if (br == NULL)
6193                         br = reganode(pRExC_state, LONGJMP, 0);
6194                     else
6195                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6196                     c = *nextchar(pRExC_state);
6197                     if (flags&HASWIDTH)
6198                         *flagp |= HASWIDTH;
6199                     if (c == '|') {
6200                         if (is_define)
6201                             vFAIL("(?(DEFINE)....) does not allow branches");
6202                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6203                         regbranch(pRExC_state, &flags, 1,depth+1);
6204                         REGTAIL(pRExC_state, ret, lastbr);
6205                         if (flags&HASWIDTH)
6206                             *flagp |= HASWIDTH;
6207                         c = *nextchar(pRExC_state);
6208                     }
6209                     else
6210                         lastbr = NULL;
6211                     if (c != ')')
6212                         vFAIL("Switch (?(condition)... contains too many branches");
6213                     ender = reg_node(pRExC_state, TAIL);
6214                     REGTAIL(pRExC_state, br, ender);
6215                     if (lastbr) {
6216                         REGTAIL(pRExC_state, lastbr, ender);
6217                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6218                     }
6219                     else
6220                         REGTAIL(pRExC_state, ret, ender);
6221                     RExC_size++; /* XXX WHY do we need this?!!
6222                                     For large programs it seems to be required
6223                                     but I can't figure out why. -- dmq*/
6224                     return ret;
6225                 }
6226                 else {
6227                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6228                 }
6229             }
6230             case 0:
6231                 RExC_parse--; /* for vFAIL to print correctly */
6232                 vFAIL("Sequence (? incomplete");
6233                 break;
6234             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6235                                        that follow */
6236                 has_use_defaults = TRUE;
6237                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6238                 RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6239                 goto parse_flags;
6240             default:
6241                 --RExC_parse;
6242                 parse_flags:      /* (?i) */
6243             {
6244                 U32 posflags = 0, negflags = 0;
6245                 U32 *flagsp = &posflags;
6246                 bool has_charset_modifier = 0;
6247
6248                 while (*RExC_parse) {
6249                     /* && strchr("iogcmsx", *RExC_parse) */
6250                     /* (?g), (?gc) and (?o) are useless here
6251                        and must be globally applied -- japhy */
6252                     switch (*RExC_parse) {
6253                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6254                     case LOCALE_PAT_MOD:
6255                         if (has_charset_modifier || flagsp == &negflags) {
6256                             goto fail_modifiers;
6257                         }
6258                         posflags |= RXf_PMf_LOCALE;
6259                         negflags |= RXf_PMf_UNICODE;
6260                         has_charset_modifier = 1;
6261                         break;
6262                     case UNICODE_PAT_MOD:
6263                         if (has_charset_modifier || flagsp == &negflags) {
6264                             goto fail_modifiers;
6265                         }
6266                         posflags |= RXf_PMf_UNICODE;
6267                         negflags |= RXf_PMf_LOCALE;
6268                         has_charset_modifier = 1;
6269                         break;
6270                     case DUAL_PAT_MOD:
6271                         if (has_use_defaults
6272                             || has_charset_modifier
6273                             || flagsp == &negflags)
6274                         {
6275                             goto fail_modifiers;
6276                         }
6277                         negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6278                         has_charset_modifier = 1;
6279                         break;
6280                     case ONCE_PAT_MOD: /* 'o' */
6281                     case GLOBAL_PAT_MOD: /* 'g' */
6282                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6283                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6284                             if (! (wastedflags & wflagbit) ) {
6285                                 wastedflags |= wflagbit;
6286                                 vWARN5(
6287                                     RExC_parse + 1,
6288                                     "Useless (%s%c) - %suse /%c modifier",
6289                                     flagsp == &negflags ? "?-" : "?",
6290                                     *RExC_parse,
6291                                     flagsp == &negflags ? "don't " : "",
6292                                     *RExC_parse
6293                                 );
6294                             }
6295                         }
6296                         break;
6297
6298                     case CONTINUE_PAT_MOD: /* 'c' */
6299                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6300                             if (! (wastedflags & WASTED_C) ) {
6301                                 wastedflags |= WASTED_GC;
6302                                 vWARN3(
6303                                     RExC_parse + 1,
6304                                     "Useless (%sc) - %suse /gc modifier",
6305                                     flagsp == &negflags ? "?-" : "?",
6306                                     flagsp == &negflags ? "don't " : ""
6307                                 );
6308                             }
6309                         }
6310                         break;
6311                     case KEEPCOPY_PAT_MOD: /* 'p' */
6312                         if (flagsp == &negflags) {
6313                             if (SIZE_ONLY)
6314                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6315                         } else {
6316                             *flagsp |= RXf_PMf_KEEPCOPY;
6317                         }
6318                         break;
6319                     case '-':
6320                         /* A flag is a default iff it is following a minus, so
6321                          * if there is a minus, it means will be trying to
6322                          * re-specify a default which is an error */
6323                         if (has_use_defaults || flagsp == &negflags) {
6324             fail_modifiers:
6325                             RExC_parse++;
6326                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6327                             /*NOTREACHED*/
6328                         }
6329                         flagsp = &negflags;
6330                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6331                         break;
6332                     case ':':
6333                         paren = ':';
6334                         /*FALLTHROUGH*/
6335                     case ')':
6336                         RExC_flags |= posflags;
6337                         RExC_flags &= ~negflags;
6338                         if (paren != ':') {
6339                             oregflags |= posflags;
6340                             oregflags &= ~negflags;
6341                         }
6342                         nextchar(pRExC_state);
6343                         if (paren != ':') {
6344                             *flagp = TRYAGAIN;
6345                             return NULL;
6346                         } else {
6347                             ret = NULL;
6348                             goto parse_rest;
6349                         }
6350                         /*NOTREACHED*/
6351                     default:
6352                         RExC_parse++;
6353                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6354                         /*NOTREACHED*/
6355                     }
6356                     ++RExC_parse;
6357                 }
6358             }} /* one for the default block, one for the switch */
6359         }
6360         else {                  /* (...) */
6361           capturing_parens:
6362             parno = RExC_npar;
6363             RExC_npar++;
6364
6365             ret = reganode(pRExC_state, OPEN, parno);
6366             if (!SIZE_ONLY ){
6367                 if (!RExC_nestroot)
6368                     RExC_nestroot = parno;
6369                 if (RExC_seen & REG_SEEN_RECURSE
6370                     && !RExC_open_parens[parno-1])
6371                 {
6372                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6373                         "Setting open paren #%"IVdf" to %d\n",
6374                         (IV)parno, REG_NODE_NUM(ret)));
6375                     RExC_open_parens[parno-1]= ret;
6376                 }
6377             }
6378             Set_Node_Length(ret, 1); /* MJD */
6379             Set_Node_Offset(ret, RExC_parse); /* MJD */
6380             is_open = 1;
6381         }
6382     }
6383     else                        /* ! paren */
6384         ret = NULL;
6385
6386    parse_rest:
6387     /* Pick up the branches, linking them together. */
6388     parse_start = RExC_parse;   /* MJD */
6389     br = regbranch(pRExC_state, &flags, 1,depth+1);
6390
6391     if (freeze_paren) {
6392         if (RExC_npar > after_freeze)
6393             after_freeze = RExC_npar;
6394         RExC_npar = freeze_paren;
6395     }
6396
6397     /*     branch_len = (paren != 0); */
6398
6399     if (br == NULL)
6400         return(NULL);
6401     if (*RExC_parse == '|') {
6402         if (!SIZE_ONLY && RExC_extralen) {
6403             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6404         }
6405         else {                  /* MJD */
6406             reginsert(pRExC_state, BRANCH, br, depth+1);
6407             Set_Node_Length(br, paren != 0);
6408             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6409         }
6410         have_branch = 1;
6411         if (SIZE_ONLY)
6412             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6413     }
6414     else if (paren == ':') {
6415         *flagp |= flags&SIMPLE;
6416     }
6417     if (is_open) {                              /* Starts with OPEN. */
6418         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6419     }
6420     else if (paren != '?')              /* Not Conditional */
6421         ret = br;
6422     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6423     lastbr = br;
6424     while (*RExC_parse == '|') {
6425         if (!SIZE_ONLY && RExC_extralen) {
6426             ender = reganode(pRExC_state, LONGJMP,0);
6427             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6428         }
6429         if (SIZE_ONLY)
6430             RExC_extralen += 2;         /* Account for LONGJMP. */
6431         nextchar(pRExC_state);
6432         if (freeze_paren) {
6433             if (RExC_npar > after_freeze)
6434                 after_freeze = RExC_npar;
6435             RExC_npar = freeze_paren;
6436         }
6437         br = regbranch(pRExC_state, &flags, 0, depth+1);
6438
6439         if (br == NULL)
6440             return(NULL);
6441         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6442         lastbr = br;
6443         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6444     }
6445
6446     if (have_branch || paren != ':') {
6447         /* Make a closing node, and hook it on the end. */
6448         switch (paren) {
6449         case ':':
6450             ender = reg_node(pRExC_state, TAIL);
6451             break;
6452         case 1:
6453             ender = reganode(pRExC_state, CLOSE, parno);
6454             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6455                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6456                         "Setting close paren #%"IVdf" to %d\n",
6457                         (IV)parno, REG_NODE_NUM(ender)));
6458                 RExC_close_parens[parno-1]= ender;
6459                 if (RExC_nestroot == parno)
6460                     RExC_nestroot = 0;
6461             }
6462             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6463             Set_Node_Length(ender,1); /* MJD */
6464             break;
6465         case '<':
6466         case ',':
6467         case '=':
6468         case '!':
6469             *flagp &= ~HASWIDTH;
6470             /* FALL THROUGH */
6471         case '>':
6472             ender = reg_node(pRExC_state, SUCCEED);
6473             break;
6474         case 0:
6475             ender = reg_node(pRExC_state, END);
6476             if (!SIZE_ONLY) {
6477                 assert(!RExC_opend); /* there can only be one! */
6478                 RExC_opend = ender;
6479             }
6480             break;
6481         }
6482         REGTAIL(pRExC_state, lastbr, ender);
6483
6484         if (have_branch && !SIZE_ONLY) {
6485             if (depth==1)
6486                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6487
6488             /* Hook the tails of the branches to the closing node. */
6489             for (br = ret; br; br = regnext(br)) {
6490                 const U8 op = PL_regkind[OP(br)];
6491                 if (op == BRANCH) {
6492                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6493                 }
6494                 else if (op == BRANCHJ) {
6495                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6496                 }
6497             }
6498         }
6499     }
6500
6501     {
6502         const char *p;
6503         static const char parens[] = "=!<,>";
6504
6505         if (paren && (p = strchr(parens, paren))) {
6506             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6507             int flag = (p - parens) > 1;
6508
6509             if (paren == '>')
6510                 node = SUSPEND, flag = 0;
6511             reginsert(pRExC_state, node,ret, depth+1);
6512             Set_Node_Cur_Length(ret);
6513             Set_Node_Offset(ret, parse_start + 1);
6514             ret->flags = flag;
6515             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6516         }
6517     }
6518
6519     /* Check for proper termination. */
6520     if (paren) {
6521         RExC_flags = oregflags;
6522         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6523             RExC_parse = oregcomp_parse;
6524             vFAIL("Unmatched (");
6525         }
6526     }
6527     else if (!paren && RExC_parse < RExC_end) {
6528         if (*RExC_parse == ')') {
6529             RExC_parse++;
6530             vFAIL("Unmatched )");
6531         }
6532         else
6533             FAIL("Junk on end of regexp");      /* "Can't happen". */
6534         /* NOTREACHED */
6535     }
6536     if (after_freeze)
6537         RExC_npar = after_freeze;
6538     return(ret);
6539 }
6540
6541 /*
6542  - regbranch - one alternative of an | operator
6543  *
6544  * Implements the concatenation operator.
6545  */
6546 STATIC regnode *
6547 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6548 {
6549     dVAR;
6550     register regnode *ret;
6551     register regnode *chain = NULL;
6552     register regnode *latest;
6553     I32 flags = 0, c = 0;
6554     GET_RE_DEBUG_FLAGS_DECL;
6555
6556     PERL_ARGS_ASSERT_REGBRANCH;
6557
6558     DEBUG_PARSE("brnc");
6559
6560     if (first)
6561         ret = NULL;
6562     else {
6563         if (!SIZE_ONLY && RExC_extralen)
6564             ret = reganode(pRExC_state, BRANCHJ,0);
6565         else {
6566             ret = reg_node(pRExC_state, BRANCH);
6567             Set_Node_Length(ret, 1);
6568         }
6569     }
6570
6571     if (!first && SIZE_ONLY)
6572         RExC_extralen += 1;                     /* BRANCHJ */
6573
6574     *flagp = WORST;                     /* Tentatively. */
6575
6576     RExC_parse--;
6577     nextchar(pRExC_state);
6578     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6579         flags &= ~TRYAGAIN;
6580         latest = regpiece(pRExC_state, &flags,depth+1);
6581         if (latest == NULL) {
6582             if (flags & TRYAGAIN)
6583                 continue;
6584             return(NULL);
6585         }
6586         else if (ret == NULL)
6587             ret = latest;
6588         *flagp |= flags&(HASWIDTH|POSTPONED);
6589         if (chain == NULL)      /* First piece. */
6590             *flagp |= flags&SPSTART;
6591         else {
6592             RExC_naughty++;
6593             REGTAIL(pRExC_state, chain, latest);
6594         }
6595         chain = latest;
6596         c++;
6597     }
6598     if (chain == NULL) {        /* Loop ran zero times. */
6599         chain = reg_node(pRExC_state, NOTHING);
6600         if (ret == NULL)
6601             ret = chain;
6602     }
6603     if (c == 1) {
6604         *flagp |= flags&SIMPLE;
6605     }
6606
6607     return ret;
6608 }
6609
6610 /*
6611  - regpiece - something followed by possible [*+?]
6612  *
6613  * Note that the branching code sequences used for ? and the general cases
6614  * of * and + are somewhat optimized:  they use the same NOTHING node as
6615  * both the endmarker for their branch list and the body of the last branch.
6616  * It might seem that this node could be dispensed with entirely, but the
6617  * endmarker role is not redundant.
6618  */
6619 STATIC regnode *
6620 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6621 {
6622     dVAR;
6623     register regnode *ret;
6624     register char op;
6625     register char *next;
6626     I32 flags;
6627     const char * const origparse = RExC_parse;
6628     I32 min;
6629     I32 max = REG_INFTY;
6630     char *parse_start;
6631     const char *maxpos = NULL;
6632     GET_RE_DEBUG_FLAGS_DECL;
6633
6634     PERL_ARGS_ASSERT_REGPIECE;
6635
6636     DEBUG_PARSE("piec");
6637
6638     ret = regatom(pRExC_state, &flags,depth+1);
6639     if (ret == NULL) {
6640         if (flags & TRYAGAIN)
6641             *flagp |= TRYAGAIN;
6642         return(NULL);
6643     }
6644
6645     op = *RExC_parse;
6646
6647     if (op == '{' && regcurly(RExC_parse)) {
6648         maxpos = NULL;
6649         parse_start = RExC_parse; /* MJD */
6650         next = RExC_parse + 1;
6651         while (isDIGIT(*next) || *next == ',') {
6652             if (*next == ',') {
6653                 if (maxpos)
6654                     break;
6655                 else
6656                     maxpos = next;
6657             }
6658             next++;
6659         }
6660         if (*next == '}') {             /* got one */
6661             if (!maxpos)
6662                 maxpos = next;
6663             RExC_parse++;
6664             min = atoi(RExC_parse);
6665             if (*maxpos == ',')
6666                 maxpos++;
6667             else
6668                 maxpos = RExC_parse;
6669             max = atoi(maxpos);
6670             if (!max && *maxpos != '0')
6671                 max = REG_INFTY;                /* meaning "infinity" */
6672             else if (max >= REG_INFTY)
6673                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6674             RExC_parse = next;
6675             nextchar(pRExC_state);
6676
6677         do_curly:
6678             if ((flags&SIMPLE)) {
6679                 RExC_naughty += 2 + RExC_naughty / 2;
6680                 reginsert(pRExC_state, CURLY, ret, depth+1);
6681                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6682                 Set_Node_Cur_Length(ret);
6683             }
6684             else {
6685                 regnode * const w = reg_node(pRExC_state, WHILEM);
6686
6687                 w->flags = 0;
6688                 REGTAIL(pRExC_state, ret, w);
6689                 if (!SIZE_ONLY && RExC_extralen) {
6690                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6691                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6692                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6693                 }
6694                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6695                                 /* MJD hk */
6696                 Set_Node_Offset(ret, parse_start+1);
6697                 Set_Node_Length(ret,
6698                                 op == '{' ? (RExC_parse - parse_start) : 1);
6699
6700                 if (!SIZE_ONLY && RExC_extralen)
6701                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6702                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6703                 if (SIZE_ONLY)
6704                     RExC_whilem_seen++, RExC_extralen += 3;
6705                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6706             }
6707             ret->flags = 0;
6708
6709             if (min > 0)
6710                 *flagp = WORST;
6711             if (max > 0)
6712                 *flagp |= HASWIDTH;
6713             if (max < min)
6714                 vFAIL("Can't do {n,m} with n > m");
6715             if (!SIZE_ONLY) {
6716                 ARG1_SET(ret, (U16)min);
6717                 ARG2_SET(ret, (U16)max);
6718             }
6719
6720             goto nest_check;
6721         }
6722     }
6723
6724     if (!ISMULT1(op)) {
6725         *flagp = flags;
6726         return(ret);
6727     }
6728
6729 #if 0                           /* Now runtime fix should be reliable. */
6730
6731     /* if this is reinstated, don't forget to put this back into perldiag:
6732
6733             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6734
6735            (F) The part of the regexp subject to either the * or + quantifier
6736            could match an empty string. The {#} shows in the regular
6737            expression about where the problem was discovered.
6738
6739     */
6740
6741     if (!(flags&HASWIDTH) && op != '?')
6742       vFAIL("Regexp *+ operand could be empty");
6743 #endif
6744
6745     parse_start = RExC_parse;
6746     nextchar(pRExC_state);
6747
6748     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6749
6750     if (op == '*' && (flags&SIMPLE)) {
6751         reginsert(pRExC_state, STAR, ret, depth+1);
6752         ret->flags = 0;
6753         RExC_naughty += 4;
6754     }
6755     else if (op == '*') {
6756         min = 0;
6757         goto do_curly;
6758     }
6759     else if (op == '+' && (flags&SIMPLE)) {
6760         reginsert(pRExC_state, PLUS, ret, depth+1);
6761         ret->flags = 0;
6762         RExC_naughty += 3;
6763     }
6764     else if (op == '+') {
6765         min = 1;
6766         goto do_curly;
6767     }
6768     else if (op == '?') {
6769         min = 0; max = 1;
6770         goto do_curly;
6771     }
6772   nest_check:
6773     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6774         ckWARN3reg(RExC_parse,
6775                    "%.*s matches null string many times",
6776                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6777                    origparse);
6778     }
6779
6780     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6781         nextchar(pRExC_state);
6782         reginsert(pRExC_state, MINMOD, ret, depth+1);
6783         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6784     }
6785 #ifndef REG_ALLOW_MINMOD_SUSPEND
6786     else
6787 #endif
6788     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6789         regnode *ender;
6790         nextchar(pRExC_state);
6791         ender = reg_node(pRExC_state, SUCCEED);
6792         REGTAIL(pRExC_state, ret, ender);
6793         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6794         ret->flags = 0;
6795         ender = reg_node(pRExC_state, TAIL);
6796         REGTAIL(pRExC_state, ret, ender);
6797         /*ret= ender;*/
6798     }
6799
6800     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6801         RExC_parse++;
6802         vFAIL("Nested quantifiers");
6803     }
6804
6805     return(ret);
6806 }
6807
6808
6809 /* reg_namedseq(pRExC_state,UVp)
6810
6811    This is expected to be called by a parser routine that has
6812    recognized '\N' and needs to handle the rest. RExC_parse is
6813    expected to point at the first char following the N at the time
6814    of the call.
6815
6816    The \N may be inside (indicated by valuep not being NULL) or outside a
6817    character class.
6818
6819    \N may begin either a named sequence, or if outside a character class, mean
6820    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6821    attempted to decide which, and in the case of a named sequence converted it
6822    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6823    where c1... are the characters in the sequence.  For single-quoted regexes,
6824    the tokenizer passes the \N sequence through unchanged; this code will not
6825    attempt to determine this nor expand those.  The net effect is that if the
6826    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6827    signals that this \N occurrence means to match a non-newline.
6828
6829    Only the \N{U+...} form should occur in a character class, for the same
6830    reason that '.' inside a character class means to just match a period: it
6831    just doesn't make sense.
6832
6833    If valuep is non-null then it is assumed that we are parsing inside
6834    of a charclass definition and the first codepoint in the resolved
6835    string is returned via *valuep and the routine will return NULL.
6836    In this mode if a multichar string is returned from the charnames
6837    handler, a warning will be issued, and only the first char in the
6838    sequence will be examined. If the string returned is zero length
6839    then the value of *valuep is undefined and NON-NULL will
6840    be returned to indicate failure. (This will NOT be a valid pointer
6841    to a regnode.)
6842
6843    If valuep is null then it is assumed that we are parsing normal text and a
6844    new EXACT node is inserted into the program containing the resolved string,
6845    and a pointer to the new node is returned.  But if the string is zero length
6846    a NOTHING node is emitted instead.
6847
6848    On success RExC_parse is set to the char following the endbrace.
6849    Parsing failures will generate a fatal error via vFAIL(...)
6850  */
6851 STATIC regnode *
6852 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6853 {
6854     char * endbrace;    /* '}' following the name */
6855     regnode *ret = NULL;
6856 #ifdef DEBUGGING
6857     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6858 #endif
6859     char* p;
6860
6861     GET_RE_DEBUG_FLAGS_DECL;
6862
6863     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6864
6865     GET_RE_DEBUG_FLAGS;
6866
6867     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6868      * modifier.  The other meaning does not */
6869     p = (RExC_flags & RXf_PMf_EXTENDED)
6870         ? regwhite( pRExC_state, RExC_parse )
6871         : RExC_parse;
6872
6873     /* Disambiguate between \N meaning a named character versus \N meaning
6874      * [^\n].  The former is assumed when it can't be the latter. */
6875     if (*p != '{' || regcurly(p)) {
6876         RExC_parse = p;
6877         if (valuep) {
6878             /* no bare \N in a charclass */
6879             vFAIL("\\N in a character class must be a named character: \\N{...}");
6880         }
6881         nextchar(pRExC_state);
6882         ret = reg_node(pRExC_state, REG_ANY);
6883         *flagp |= HASWIDTH|SIMPLE;
6884         RExC_naughty++;
6885         RExC_parse--;
6886         Set_Node_Length(ret, 1); /* MJD */
6887         return ret;
6888     }
6889
6890     /* Here, we have decided it should be a named sequence */
6891
6892     /* The test above made sure that the next real character is a '{', but
6893      * under the /x modifier, it could be separated by space (or a comment and
6894      * \n) and this is not allowed (for consistency with \x{...} and the
6895      * tokenizer handling of \N{NAME}). */
6896     if (*RExC_parse != '{') {
6897         vFAIL("Missing braces on \\N{}");
6898     }
6899
6900     RExC_parse++;       /* Skip past the '{' */
6901
6902     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6903         || ! (endbrace == RExC_parse            /* nothing between the {} */
6904               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
6905                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6906     {
6907         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
6908         vFAIL("\\N{NAME} must be resolved by the lexer");
6909     }
6910
6911     if (endbrace == RExC_parse) {   /* empty: \N{} */
6912         if (! valuep) {
6913             RExC_parse = endbrace + 1;
6914             return reg_node(pRExC_state,NOTHING);
6915         }
6916
6917         if (SIZE_ONLY) {
6918             ckWARNreg(RExC_parse,
6919                     "Ignoring zero length \\N{} in character class"
6920             );
6921             RExC_parse = endbrace + 1;
6922         }
6923         *valuep = 0;
6924         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6925     }
6926
6927     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
6928     RExC_parse += 2;    /* Skip past the 'U+' */
6929
6930     if (valuep) {   /* In a bracketed char class */
6931         /* We only pay attention to the first char of
6932         multichar strings being returned. I kinda wonder
6933         if this makes sense as it does change the behaviour
6934         from earlier versions, OTOH that behaviour was broken
6935         as well. XXX Solution is to recharacterize as
6936         [rest-of-class]|multi1|multi2... */
6937
6938         STRLEN length_of_hex;
6939         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6940             | PERL_SCAN_DISALLOW_PREFIX
6941             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6942
6943         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6944         if (endchar < endbrace) {
6945             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6946         }
6947
6948         length_of_hex = (STRLEN)(endchar - RExC_parse);
6949         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6950
6951         /* The tokenizer should have guaranteed validity, but it's possible to
6952          * bypass it by using single quoting, so check */
6953         if (length_of_hex == 0
6954             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6955         {
6956             RExC_parse += length_of_hex;        /* Includes all the valid */
6957             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
6958                             ? UTF8SKIP(RExC_parse)
6959                             : 1;
6960             /* Guard against malformed utf8 */
6961             if (RExC_parse >= endchar) RExC_parse = endchar;
6962             vFAIL("Invalid hexadecimal number in \\N{U+...}");
6963         }
6964
6965         RExC_parse = endbrace + 1;
6966         if (endchar == endbrace) return NULL;
6967
6968         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
6969     }
6970     else {      /* Not a char class */
6971         char *s;            /* String to put in generated EXACT node */
6972         STRLEN len = 0;     /* Its current byte length */
6973         char *endchar;      /* Points to '.' or '}' ending cur char in the input
6974                                stream */
6975
6976         ret = reg_node(pRExC_state,
6977                         (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6978         s= STRING(ret);
6979
6980         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
6981          * the input which is of the form now 'c1.c2.c3...}' until find the
6982          * ending brace or exceed length 255.  The characters that exceed this
6983          * limit are dropped.  The limit could be relaxed should it become
6984          * desirable by reparsing this as (?:\N{NAME}), so could generate
6985          * multiple EXACT nodes, as is done for just regular input.  But this
6986          * is primarily a named character, and not intended to be a huge long
6987          * string, so 255 bytes should be good enough */
6988         while (1) {
6989             STRLEN length_of_hex;
6990             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6991                             | PERL_SCAN_DISALLOW_PREFIX
6992                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6993             UV cp;  /* Ord of current character */
6994
6995             /* Code points are separated by dots.  If none, there is only one
6996              * code point, and is terminated by the brace */
6997             endchar = RExC_parse + strcspn(RExC_parse, ".}");
6998
6999             /* The values are Unicode even on EBCDIC machines */
7000             length_of_hex = (STRLEN)(endchar - RExC_parse);
7001             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7002             if ( length_of_hex == 0
7003                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7004             {
7005                 RExC_parse += length_of_hex;        /* Includes all the valid */
7006                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7007                                 ? UTF8SKIP(RExC_parse)
7008                                 : 1;
7009                 /* Guard against malformed utf8 */
7010                 if (RExC_parse >= endchar) RExC_parse = endchar;
7011                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7012             }
7013
7014             if (! FOLD) {       /* Not folding, just append to the string */
7015                 STRLEN unilen;
7016
7017                 /* Quit before adding this character if would exceed limit */
7018                 if (len + UNISKIP(cp) > U8_MAX) break;
7019
7020                 unilen = reguni(pRExC_state, cp, s);
7021                 if (unilen > 0) {
7022                     s   += unilen;
7023                     len += unilen;
7024                 }
7025             } else {    /* Folding, output the folded equivalent */
7026                 STRLEN foldlen,numlen;
7027                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7028                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7029
7030                 /* Quit before exceeding size limit */
7031                 if (len + foldlen > U8_MAX) break;
7032
7033                 for (foldbuf = tmpbuf;
7034                     foldlen;
7035                     foldlen -= numlen)
7036                 {
7037                     cp = utf8_to_uvchr(foldbuf, &numlen);
7038                     if (numlen > 0) {
7039                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7040                         s       += unilen;
7041                         len     += unilen;
7042                         /* In EBCDIC the numlen and unilen can differ. */
7043                         foldbuf += numlen;
7044                         if (numlen >= foldlen)
7045                             break;
7046                     }
7047                     else
7048                         break; /* "Can't happen." */
7049                 }
7050             }
7051
7052             /* Point to the beginning of the next character in the sequence. */
7053             RExC_parse = endchar + 1;
7054
7055             /* Quit if no more characters */
7056             if (RExC_parse >= endbrace) break;
7057         }
7058
7059
7060         if (SIZE_ONLY) {
7061             if (RExC_parse < endbrace) {
7062                 ckWARNreg(RExC_parse - 1,
7063                           "Using just the first characters returned by \\N{}");
7064             }
7065
7066             RExC_size += STR_SZ(len);
7067         } else {
7068             STR_LEN(ret) = len;
7069             RExC_emit += STR_SZ(len);
7070         }
7071
7072         RExC_parse = endbrace + 1;
7073
7074         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7075                                with malformed in t/re/pat_advanced.t */
7076         RExC_parse --;
7077         Set_Node_Cur_Length(ret); /* MJD */
7078         nextchar(pRExC_state);
7079     }
7080
7081     return ret;
7082 }
7083
7084
7085 /*
7086  * reg_recode
7087  *
7088  * It returns the code point in utf8 for the value in *encp.
7089  *    value: a code value in the source encoding
7090  *    encp:  a pointer to an Encode object
7091  *
7092  * If the result from Encode is not a single character,
7093  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7094  */
7095 STATIC UV
7096 S_reg_recode(pTHX_ const char value, SV **encp)
7097 {
7098     STRLEN numlen = 1;
7099     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7100     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7101     const STRLEN newlen = SvCUR(sv);
7102     UV uv = UNICODE_REPLACEMENT;
7103
7104     PERL_ARGS_ASSERT_REG_RECODE;
7105
7106     if (newlen)
7107         uv = SvUTF8(sv)
7108              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7109              : *(U8*)s;
7110
7111     if (!newlen || numlen != newlen) {
7112         uv = UNICODE_REPLACEMENT;
7113         *encp = NULL;
7114     }
7115     return uv;
7116 }
7117
7118
7119 /*
7120  - regatom - the lowest level
7121
7122    Try to identify anything special at the start of the pattern. If there
7123    is, then handle it as required. This may involve generating a single regop,
7124    such as for an assertion; or it may involve recursing, such as to
7125    handle a () structure.
7126
7127    If the string doesn't start with something special then we gobble up
7128    as much literal text as we can.
7129
7130    Once we have been able to handle whatever type of thing started the
7131    sequence, we return.
7132
7133    Note: we have to be careful with escapes, as they can be both literal
7134    and special, and in the case of \10 and friends can either, depending
7135    on context. Specifically there are two seperate switches for handling
7136    escape sequences, with the one for handling literal escapes requiring
7137    a dummy entry for all of the special escapes that are actually handled
7138    by the other.
7139 */
7140
7141 STATIC regnode *
7142 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7143 {
7144     dVAR;
7145     register regnode *ret = NULL;
7146     I32 flags;
7147     char *parse_start = RExC_parse;
7148     GET_RE_DEBUG_FLAGS_DECL;
7149     DEBUG_PARSE("atom");
7150     *flagp = WORST;             /* Tentatively. */
7151
7152     PERL_ARGS_ASSERT_REGATOM;
7153
7154 tryagain:
7155     switch ((U8)*RExC_parse) {
7156     case '^':
7157         RExC_seen_zerolen++;
7158         nextchar(pRExC_state);
7159         if (RExC_flags & RXf_PMf_MULTILINE)
7160             ret = reg_node(pRExC_state, MBOL);
7161         else if (RExC_flags & RXf_PMf_SINGLELINE)
7162             ret = reg_node(pRExC_state, SBOL);
7163         else
7164             ret = reg_node(pRExC_state, BOL);
7165         Set_Node_Length(ret, 1); /* MJD */
7166         break;
7167     case '$':
7168         nextchar(pRExC_state);
7169         if (*RExC_parse)
7170             RExC_seen_zerolen++;
7171         if (RExC_flags & RXf_PMf_MULTILINE)
7172             ret = reg_node(pRExC_state, MEOL);
7173         else if (RExC_flags & RXf_PMf_SINGLELINE)
7174             ret = reg_node(pRExC_state, SEOL);
7175         else
7176             ret = reg_node(pRExC_state, EOL);
7177         Set_Node_Length(ret, 1); /* MJD */
7178         break;
7179     case '.':
7180         nextchar(pRExC_state);
7181         if (RExC_flags & RXf_PMf_SINGLELINE)
7182             ret = reg_node(pRExC_state, SANY);
7183         else
7184             ret = reg_node(pRExC_state, REG_ANY);
7185         *flagp |= HASWIDTH|SIMPLE;
7186         RExC_naughty++;
7187         Set_Node_Length(ret, 1); /* MJD */
7188         break;
7189     case '[':
7190     {
7191         char * const oregcomp_parse = ++RExC_parse;
7192         ret = regclass(pRExC_state,depth+1);
7193         if (*RExC_parse != ']') {
7194             RExC_parse = oregcomp_parse;
7195             vFAIL("Unmatched [");
7196         }
7197         nextchar(pRExC_state);
7198         *flagp |= HASWIDTH|SIMPLE;
7199         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7200         break;
7201     }
7202     case '(':
7203         nextchar(pRExC_state);
7204         ret = reg(pRExC_state, 1, &flags,depth+1);
7205         if (ret == NULL) {
7206                 if (flags & TRYAGAIN) {
7207                     if (RExC_parse == RExC_end) {
7208                          /* Make parent create an empty node if needed. */
7209                         *flagp |= TRYAGAIN;
7210                         return(NULL);
7211                     }
7212                     goto tryagain;
7213                 }
7214                 return(NULL);
7215         }
7216         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7217         break;
7218     case '|':
7219     case ')':
7220         if (flags & TRYAGAIN) {
7221             *flagp |= TRYAGAIN;
7222             return NULL;
7223         }
7224         vFAIL("Internal urp");
7225                                 /* Supposed to be caught earlier. */
7226         break;
7227     case '{':
7228         if (!regcurly(RExC_parse)) {
7229             RExC_parse++;
7230             goto defchar;
7231         }
7232         /* FALL THROUGH */
7233     case '?':
7234     case '+':
7235     case '*':
7236         RExC_parse++;
7237         vFAIL("Quantifier follows nothing");
7238         break;
7239     case 0xDF:
7240     case 0xC3:
7241     case 0xCE:
7242         do_foldchar:
7243         if (!LOC && FOLD) {
7244             U32 len,cp;
7245             len=0; /* silence a spurious compiler warning */
7246             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7247                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7248                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7249                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7250                 Set_Node_Length(ret, 1); /* MJD */
7251                 nextchar(pRExC_state); /* kill whitespace under /x */
7252                 return ret;
7253             }
7254         }
7255         goto outer_default;
7256     case '\\':
7257         /* Special Escapes
7258
7259            This switch handles escape sequences that resolve to some kind
7260            of special regop and not to literal text. Escape sequnces that
7261            resolve to literal text are handled below in the switch marked
7262            "Literal Escapes".
7263
7264            Every entry in this switch *must* have a corresponding entry
7265            in the literal escape switch. However, the opposite is not
7266            required, as the default for this switch is to jump to the
7267            literal text handling code.
7268         */
7269         switch ((U8)*++RExC_parse) {
7270         case 0xDF:
7271         case 0xC3:
7272         case 0xCE:
7273                    goto do_foldchar;
7274         /* Special Escapes */
7275         case 'A':
7276             RExC_seen_zerolen++;
7277             ret = reg_node(pRExC_state, SBOL);
7278             *flagp |= SIMPLE;
7279             goto finish_meta_pat;
7280         case 'G':
7281             ret = reg_node(pRExC_state, GPOS);
7282             RExC_seen |= REG_SEEN_GPOS;
7283             *flagp |= SIMPLE;
7284             goto finish_meta_pat;
7285         case 'K':
7286             RExC_seen_zerolen++;
7287             ret = reg_node(pRExC_state, KEEPS);
7288             *flagp |= SIMPLE;
7289             /* XXX:dmq : disabling in-place substitution seems to
7290              * be necessary here to avoid cases of memory corruption, as
7291              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7292              */
7293             RExC_seen |= REG_SEEN_LOOKBEHIND;
7294             goto finish_meta_pat;
7295         case 'Z':
7296             ret = reg_node(pRExC_state, SEOL);
7297             *flagp |= SIMPLE;
7298             RExC_seen_zerolen++;                /* Do not optimize RE away */
7299             goto finish_meta_pat;
7300         case 'z':
7301             ret = reg_node(pRExC_state, EOS);
7302             *flagp |= SIMPLE;
7303             RExC_seen_zerolen++;                /* Do not optimize RE away */
7304             goto finish_meta_pat;
7305         case 'C':
7306             ret = reg_node(pRExC_state, CANY);
7307             RExC_seen |= REG_SEEN_CANY;
7308             *flagp |= HASWIDTH|SIMPLE;
7309             goto finish_meta_pat;
7310         case 'X':
7311             ret = reg_node(pRExC_state, CLUMP);
7312             *flagp |= HASWIDTH;
7313             goto finish_meta_pat;
7314         case 'w':
7315             if (LOC) {
7316                 ret = reg_node(pRExC_state, (U8)(ALNUML));
7317             } else {
7318                 ret = reg_node(pRExC_state, (U8)(ALNUM));
7319                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7320             }
7321             *flagp |= HASWIDTH|SIMPLE;
7322             goto finish_meta_pat;
7323         case 'W':
7324             if (LOC) {
7325                 ret = reg_node(pRExC_state, (U8)(NALNUML));
7326             } else {
7327                 ret = reg_node(pRExC_state, (U8)(NALNUM));
7328                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7329             }
7330             *flagp |= HASWIDTH|SIMPLE;
7331             goto finish_meta_pat;
7332         case 'b':
7333             RExC_seen_zerolen++;
7334             RExC_seen |= REG_SEEN_LOOKBEHIND;
7335             if (LOC) {
7336                 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7337             } else {
7338                 ret = reg_node(pRExC_state, (U8)(BOUND));
7339                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7340             }
7341             *flagp |= SIMPLE;
7342             goto finish_meta_pat;
7343         case 'B':
7344             RExC_seen_zerolen++;
7345             RExC_seen |= REG_SEEN_LOOKBEHIND;
7346             if (LOC) {
7347                 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7348             } else {
7349                 ret = reg_node(pRExC_state, (U8)(NBOUND));
7350                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7351             }
7352             *flagp |= SIMPLE;
7353             goto finish_meta_pat;
7354         case 's':
7355             if (LOC) {
7356                 ret = reg_node(pRExC_state, (U8)(SPACEL));
7357             } else {
7358                 ret = reg_node(pRExC_state, (U8)(SPACE));
7359                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7360             }
7361             *flagp |= HASWIDTH|SIMPLE;
7362             goto finish_meta_pat;
7363         case 'S':
7364             if (LOC) {
7365                 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7366             } else {
7367                 ret = reg_node(pRExC_state, (U8)(NSPACE));
7368                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7369             }
7370             *flagp |= HASWIDTH|SIMPLE;
7371             goto finish_meta_pat;
7372         case 'd':
7373             ret = reg_node(pRExC_state, DIGIT);
7374             *flagp |= HASWIDTH|SIMPLE;
7375             goto finish_meta_pat;
7376         case 'D':
7377             ret = reg_node(pRExC_state, NDIGIT);
7378             *flagp |= HASWIDTH|SIMPLE;
7379             goto finish_meta_pat;
7380         case 'R':
7381             ret = reg_node(pRExC_state, LNBREAK);
7382             *flagp |= HASWIDTH|SIMPLE;
7383             goto finish_meta_pat;
7384         case 'h':
7385             ret = reg_node(pRExC_state, HORIZWS);
7386             *flagp |= HASWIDTH|SIMPLE;
7387             goto finish_meta_pat;
7388         case 'H':
7389             ret = reg_node(pRExC_state, NHORIZWS);
7390             *flagp |= HASWIDTH|SIMPLE;
7391             goto finish_meta_pat;
7392         case 'v':
7393             ret = reg_node(pRExC_state, VERTWS);
7394             *flagp |= HASWIDTH|SIMPLE;
7395             goto finish_meta_pat;
7396         case 'V':
7397             ret = reg_node(pRExC_state, NVERTWS);
7398             *flagp |= HASWIDTH|SIMPLE;
7399          finish_meta_pat:
7400             nextchar(pRExC_state);
7401             Set_Node_Length(ret, 2); /* MJD */
7402             break;
7403         case 'p':
7404         case 'P':
7405             {
7406                 char* const oldregxend = RExC_end;
7407 #ifdef DEBUGGING
7408                 char* parse_start = RExC_parse - 2;
7409 #endif
7410
7411                 if (RExC_parse[1] == '{') {
7412                   /* a lovely hack--pretend we saw [\pX] instead */
7413                     RExC_end = strchr(RExC_parse, '}');
7414                     if (!RExC_end) {
7415                         const U8 c = (U8)*RExC_parse;
7416                         RExC_parse += 2;
7417                         RExC_end = oldregxend;
7418                         vFAIL2("Missing right brace on \\%c{}", c);
7419                     }
7420                     RExC_end++;
7421                 }
7422                 else {
7423                     RExC_end = RExC_parse + 2;
7424                     if (RExC_end > oldregxend)
7425                         RExC_end = oldregxend;
7426                 }
7427                 RExC_parse--;
7428
7429                 ret = regclass(pRExC_state,depth+1);
7430
7431                 RExC_end = oldregxend;
7432                 RExC_parse--;
7433
7434                 Set_Node_Offset(ret, parse_start + 2);
7435                 Set_Node_Cur_Length(ret);
7436                 nextchar(pRExC_state);
7437                 *flagp |= HASWIDTH|SIMPLE;
7438             }
7439             break;
7440         case 'N':
7441             /* Handle \N and \N{NAME} here and not below because it can be
7442             multicharacter. join_exact() will join them up later on.
7443             Also this makes sure that things like /\N{BLAH}+/ and
7444             \N{BLAH} being multi char Just Happen. dmq*/
7445             ++RExC_parse;
7446             ret= reg_namedseq(pRExC_state, NULL, flagp);
7447             break;
7448         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7449         parse_named_seq:
7450         {
7451             char ch= RExC_parse[1];
7452             if (ch != '<' && ch != '\'' && ch != '{') {
7453                 RExC_parse++;
7454                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7455             } else {
7456                 /* this pretty much dupes the code for (?P=...) in reg(), if
7457                    you change this make sure you change that */
7458                 char* name_start = (RExC_parse += 2);
7459                 U32 num = 0;
7460                 SV *sv_dat = reg_scan_name(pRExC_state,
7461                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7462                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7463                 if (RExC_parse == name_start || *RExC_parse != ch)
7464                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7465
7466                 if (!SIZE_ONLY) {
7467                     num = add_data( pRExC_state, 1, "S" );
7468                     RExC_rxi->data->data[num]=(void*)sv_dat;
7469                     SvREFCNT_inc_simple_void(sv_dat);
7470                 }
7471
7472                 RExC_sawback = 1;
7473                 ret = reganode(pRExC_state,
7474                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7475                            num);
7476                 *flagp |= HASWIDTH;
7477
7478                 /* override incorrect value set in reganode MJD */
7479                 Set_Node_Offset(ret, parse_start+1);
7480                 Set_Node_Cur_Length(ret); /* MJD */
7481                 nextchar(pRExC_state);
7482
7483             }
7484             break;
7485         }
7486         case 'g':
7487         case '1': case '2': case '3': case '4':
7488         case '5': case '6': case '7': case '8': case '9':
7489             {
7490                 I32 num;
7491                 bool isg = *RExC_parse == 'g';
7492                 bool isrel = 0;
7493                 bool hasbrace = 0;
7494                 if (isg) {
7495                     RExC_parse++;
7496                     if (*RExC_parse == '{') {
7497                         RExC_parse++;
7498                         hasbrace = 1;
7499                     }
7500                     if (*RExC_parse == '-') {
7501                         RExC_parse++;
7502                         isrel = 1;
7503                     }
7504                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7505                         if (isrel) RExC_parse--;
7506                         RExC_parse -= 2;
7507                         goto parse_named_seq;
7508                 }   }
7509                 num = atoi(RExC_parse);
7510                 if (isg && num == 0)
7511                     vFAIL("Reference to invalid group 0");
7512                 if (isrel) {
7513                     num = RExC_npar - num;
7514                     if (num < 1)
7515                         vFAIL("Reference to nonexistent or unclosed group");
7516                 }
7517                 if (!isg && num > 9 && num >= RExC_npar)
7518                     goto defchar;
7519                 else {
7520                     char * const parse_start = RExC_parse - 1; /* MJD */
7521                     while (isDIGIT(*RExC_parse))
7522                         RExC_parse++;
7523                     if (parse_start == RExC_parse - 1)
7524                         vFAIL("Unterminated \\g... pattern");
7525                     if (hasbrace) {
7526                         if (*RExC_parse != '}')
7527                             vFAIL("Unterminated \\g{...} pattern");
7528                         RExC_parse++;
7529                     }
7530                     if (!SIZE_ONLY) {
7531                         if (num > (I32)RExC_rx->nparens)
7532                             vFAIL("Reference to nonexistent group");
7533                     }
7534                     RExC_sawback = 1;
7535                     ret = reganode(pRExC_state,
7536                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7537                                    num);
7538                     *flagp |= HASWIDTH;
7539
7540                     /* override incorrect value set in reganode MJD */
7541                     Set_Node_Offset(ret, parse_start+1);
7542                     Set_Node_Cur_Length(ret); /* MJD */
7543                     RExC_parse--;
7544                     nextchar(pRExC_state);
7545                 }
7546             }
7547             break;
7548         case '\0':
7549             if (RExC_parse >= RExC_end)
7550                 FAIL("Trailing \\");
7551             /* FALL THROUGH */
7552         default:
7553             /* Do not generate "unrecognized" warnings here, we fall
7554                back into the quick-grab loop below */
7555             parse_start--;
7556             goto defchar;
7557         }
7558         break;
7559
7560     case '#':
7561         if (RExC_flags & RXf_PMf_EXTENDED) {
7562             if ( reg_skipcomment( pRExC_state ) )
7563                 goto tryagain;
7564         }
7565         /* FALL THROUGH */
7566
7567     default:
7568         outer_default:{
7569             register STRLEN len;
7570             register UV ender;
7571             register char *p;
7572             char *s;
7573             STRLEN foldlen;
7574             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7575
7576             parse_start = RExC_parse - 1;
7577
7578             RExC_parse++;
7579
7580         defchar:
7581             ender = 0;
7582             ret = reg_node(pRExC_state,
7583                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7584             s = STRING(ret);
7585             for (len = 0, p = RExC_parse - 1;
7586               len < 127 && p < RExC_end;
7587               len++)
7588             {
7589                 char * const oldp = p;
7590
7591                 if (RExC_flags & RXf_PMf_EXTENDED)
7592                     p = regwhite( pRExC_state, p );
7593                 switch ((U8)*p) {
7594                 case 0xDF:
7595                 case 0xC3:
7596                 case 0xCE:
7597                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7598                                 goto normal_default;
7599                 case '^':
7600                 case '$':
7601                 case '.':
7602                 case '[':
7603                 case '(':
7604                 case ')':
7605                 case '|':
7606                     goto loopdone;
7607                 case '\\':
7608                     /* Literal Escapes Switch
7609
7610                        This switch is meant to handle escape sequences that
7611                        resolve to a literal character.
7612
7613                        Every escape sequence that represents something
7614                        else, like an assertion or a char class, is handled
7615                        in the switch marked 'Special Escapes' above in this
7616                        routine, but also has an entry here as anything that
7617                        isn't explicitly mentioned here will be treated as
7618                        an unescaped equivalent literal.
7619                     */
7620
7621                     switch ((U8)*++p) {
7622                     /* These are all the special escapes. */
7623                     case 0xDF:
7624                     case 0xC3:
7625                     case 0xCE:
7626                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7627                                 goto normal_default;
7628                     case 'A':             /* Start assertion */
7629                     case 'b': case 'B':   /* Word-boundary assertion*/
7630                     case 'C':             /* Single char !DANGEROUS! */
7631                     case 'd': case 'D':   /* digit class */
7632                     case 'g': case 'G':   /* generic-backref, pos assertion */
7633                     case 'h': case 'H':   /* HORIZWS */
7634                     case 'k': case 'K':   /* named backref, keep marker */
7635                     case 'N':             /* named char sequence */
7636                     case 'p': case 'P':   /* Unicode property */
7637                               case 'R':   /* LNBREAK */
7638                     case 's': case 'S':   /* space class */
7639                     case 'v': case 'V':   /* VERTWS */
7640                     case 'w': case 'W':   /* word class */
7641                     case 'X':             /* eXtended Unicode "combining character sequence" */
7642                     case 'z': case 'Z':   /* End of line/string assertion */
7643                         --p;
7644                         goto loopdone;
7645
7646                     /* Anything after here is an escape that resolves to a
7647                        literal. (Except digits, which may or may not)
7648                      */
7649                     case 'n':
7650                         ender = '\n';
7651                         p++;
7652                         break;
7653                     case 'r':
7654                         ender = '\r';
7655                         p++;
7656                         break;
7657                     case 't':
7658                         ender = '\t';
7659                         p++;
7660                         break;
7661                     case 'f':
7662                         ender = '\f';
7663                         p++;
7664                         break;
7665                     case 'e':
7666                           ender = ASCII_TO_NATIVE('\033');
7667                         p++;
7668                         break;
7669                     case 'a':
7670                           ender = ASCII_TO_NATIVE('\007');
7671                         p++;
7672                         break;
7673                     case 'o':
7674                         {
7675                             STRLEN brace_len = len;
7676                             UV result;
7677                             const char* error_msg;
7678
7679                             bool valid = grok_bslash_o(p,
7680                                                        &result,
7681                                                        &brace_len,
7682                                                        &error_msg,
7683                                                        1);
7684                             p += brace_len;
7685                             if (! valid) {
7686                                 RExC_parse = p; /* going to die anyway; point
7687                                                    to exact spot of failure */
7688                                 vFAIL(error_msg);
7689                             }
7690                             else
7691                             {
7692                                 ender = result;
7693                             }
7694                             if (PL_encoding && ender < 0x100) {
7695                                 goto recode_encoding;
7696                             }
7697                             if (ender > 0xff) {
7698                                 REQUIRE_UTF8;
7699                             }
7700                             break;
7701                         }
7702                     case 'x':
7703                         if (*++p == '{') {
7704                             char* const e = strchr(p, '}');
7705
7706                             if (!e) {
7707                                 RExC_parse = p + 1;
7708                                 vFAIL("Missing right brace on \\x{}");
7709                             }
7710                             else {
7711                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7712                                     | PERL_SCAN_DISALLOW_PREFIX;
7713                                 STRLEN numlen = e - p - 1;
7714                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7715                                 if (ender > 0xff)
7716                                     REQUIRE_UTF8;
7717                                 p = e + 1;
7718                             }
7719                         }
7720                         else {
7721                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7722                             STRLEN numlen = 2;
7723                             ender = grok_hex(p, &numlen, &flags, NULL);
7724                             p += numlen;
7725                         }
7726                         if (PL_encoding && ender < 0x100)
7727                             goto recode_encoding;
7728                         break;
7729                     case 'c':
7730                         p++;
7731                         ender = grok_bslash_c(*p++, SIZE_ONLY);
7732                         break;
7733                     case '0': case '1': case '2': case '3':case '4':
7734                     case '5': case '6': case '7': case '8':case '9':
7735                         if (*p == '0' ||
7736                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7737                         {
7738                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7739                             STRLEN numlen = 3;
7740                             ender = grok_oct(p, &numlen, &flags, NULL);
7741                             if (ender > 0xff) {
7742                                 REQUIRE_UTF8;
7743                             }
7744                             p += numlen;
7745                         }
7746                         else {
7747                             --p;
7748                             goto loopdone;
7749                         }
7750                         if (PL_encoding && ender < 0x100)
7751                             goto recode_encoding;
7752                         break;
7753                     recode_encoding:
7754                         {
7755                             SV* enc = PL_encoding;
7756                             ender = reg_recode((const char)(U8)ender, &enc);
7757                             if (!enc && SIZE_ONLY)
7758                                 ckWARNreg(p, "Invalid escape in the specified encoding");
7759                             REQUIRE_UTF8;
7760                         }
7761                         break;
7762                     case '\0':
7763                         if (p >= RExC_end)
7764                             FAIL("Trailing \\");
7765                         /* FALL THROUGH */
7766                     default:
7767                         if (!SIZE_ONLY&& isALPHA(*p))
7768                             ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7769                         goto normal_default;
7770                     }
7771                     break;
7772                 default:
7773                   normal_default:
7774                     if (UTF8_IS_START(*p) && UTF) {
7775                         STRLEN numlen;
7776                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7777                                                &numlen, UTF8_ALLOW_DEFAULT);
7778                         p += numlen;
7779                     }
7780                     else
7781                         ender = *p++;
7782                     break;
7783                 }
7784                 if ( RExC_flags & RXf_PMf_EXTENDED)
7785                     p = regwhite( pRExC_state, p );
7786                 if (UTF && FOLD) {
7787                     /* Prime the casefolded buffer. */
7788                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7789                 }
7790                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7791                     if (len)
7792                         p = oldp;
7793                     else if (UTF) {
7794                          if (FOLD) {
7795                               /* Emit all the Unicode characters. */
7796                               STRLEN numlen;
7797                               for (foldbuf = tmpbuf;
7798                                    foldlen;
7799                                    foldlen -= numlen) {
7800                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7801                                    if (numlen > 0) {
7802                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7803                                         s       += unilen;
7804                                         len     += unilen;
7805                                         /* In EBCDIC the numlen
7806                                          * and unilen can differ. */
7807                                         foldbuf += numlen;
7808                                         if (numlen >= foldlen)
7809                                              break;
7810                                    }
7811                                    else
7812                                         break; /* "Can't happen." */
7813                               }
7814                          }
7815                          else {
7816                               const STRLEN unilen = reguni(pRExC_state, ender, s);
7817                               if (unilen > 0) {
7818                                    s   += unilen;
7819                                    len += unilen;
7820                               }
7821                          }
7822                     }
7823                     else {
7824                         len++;
7825                         REGC((char)ender, s++);
7826                     }
7827                     break;
7828                 }
7829                 if (UTF) {
7830                      if (FOLD) {
7831                           /* Emit all the Unicode characters. */
7832                           STRLEN numlen;
7833                           for (foldbuf = tmpbuf;
7834                                foldlen;
7835                                foldlen -= numlen) {
7836                                ender = utf8_to_uvchr(foldbuf, &numlen);
7837                                if (numlen > 0) {
7838                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
7839                                     len     += unilen;
7840                                     s       += unilen;
7841                                     /* In EBCDIC the numlen
7842                                      * and unilen can differ. */
7843                                     foldbuf += numlen;
7844                                     if (numlen >= foldlen)
7845                                          break;
7846                                }
7847                                else
7848                                     break;
7849                           }
7850                      }
7851                      else {
7852                           const STRLEN unilen = reguni(pRExC_state, ender, s);
7853                           if (unilen > 0) {
7854                                s   += unilen;
7855                                len += unilen;
7856                           }
7857                      }
7858                      len--;
7859                 }
7860                 else
7861                     REGC((char)ender, s++);
7862             }
7863         loopdone:
7864             RExC_parse = p - 1;
7865             Set_Node_Cur_Length(ret); /* MJD */
7866             nextchar(pRExC_state);
7867             {
7868                 /* len is STRLEN which is unsigned, need to copy to signed */
7869                 IV iv = len;
7870                 if (iv < 0)
7871                     vFAIL("Internal disaster");
7872             }
7873             if (len > 0)
7874                 *flagp |= HASWIDTH;
7875             if (len == 1 && UNI_IS_INVARIANT(ender))
7876                 *flagp |= SIMPLE;
7877
7878             if (SIZE_ONLY)
7879                 RExC_size += STR_SZ(len);
7880             else {
7881                 STR_LEN(ret) = len;
7882                 RExC_emit += STR_SZ(len);
7883             }
7884         }
7885         break;
7886     }
7887
7888     return(ret);
7889 }
7890
7891 STATIC char *
7892 S_regwhite( RExC_state_t *pRExC_state, char *p )
7893 {
7894     const char *e = RExC_end;
7895
7896     PERL_ARGS_ASSERT_REGWHITE;
7897
7898     while (p < e) {
7899         if (isSPACE(*p))
7900             ++p;
7901         else if (*p == '#') {
7902             bool ended = 0;
7903             do {
7904                 if (*p++ == '\n') {
7905                     ended = 1;
7906                     break;
7907                 }
7908             } while (p < e);
7909             if (!ended)
7910                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7911         }
7912         else
7913             break;
7914     }
7915     return p;
7916 }
7917
7918 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7919    Character classes ([:foo:]) can also be negated ([:^foo:]).
7920    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7921    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7922    but trigger failures because they are currently unimplemented. */
7923
7924 #define POSIXCC_DONE(c)   ((c) == ':')
7925 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7926 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7927
7928 STATIC I32
7929 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7930 {
7931     dVAR;
7932     I32 namedclass = OOB_NAMEDCLASS;
7933
7934     PERL_ARGS_ASSERT_REGPPOSIXCC;
7935
7936     if (value == '[' && RExC_parse + 1 < RExC_end &&
7937         /* I smell either [: or [= or [. -- POSIX has been here, right? */
7938         POSIXCC(UCHARAT(RExC_parse))) {
7939         const char c = UCHARAT(RExC_parse);
7940         char* const s = RExC_parse++;
7941
7942         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7943             RExC_parse++;
7944         if (RExC_parse == RExC_end)
7945             /* Grandfather lone [:, [=, [. */
7946             RExC_parse = s;
7947         else {
7948             const char* const t = RExC_parse++; /* skip over the c */
7949             assert(*t == c);
7950
7951             if (UCHARAT(RExC_parse) == ']') {
7952                 const char *posixcc = s + 1;
7953                 RExC_parse++; /* skip over the ending ] */
7954
7955                 if (*s == ':') {
7956                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7957                     const I32 skip = t - posixcc;
7958
7959                     /* Initially switch on the length of the name.  */
7960                     switch (skip) {
7961                     case 4:
7962                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7963                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7964                         break;
7965                     case 5:
7966                         /* Names all of length 5.  */
7967                         /* alnum alpha ascii blank cntrl digit graph lower
7968                            print punct space upper  */
7969                         /* Offset 4 gives the best switch position.  */
7970                         switch (posixcc[4]) {
7971                         case 'a':
7972                             if (memEQ(posixcc, "alph", 4)) /* alpha */
7973                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7974                             break;
7975                         case 'e':
7976                             if (memEQ(posixcc, "spac", 4)) /* space */
7977                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7978                             break;
7979                         case 'h':
7980                             if (memEQ(posixcc, "grap", 4)) /* graph */
7981                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7982                             break;
7983                         case 'i':
7984                             if (memEQ(posixcc, "asci", 4)) /* ascii */
7985                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7986                             break;
7987                         case 'k':
7988                             if (memEQ(posixcc, "blan", 4)) /* blank */
7989                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7990                             break;
7991                         case 'l':
7992                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7993                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7994                             break;
7995                         case 'm':
7996                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
7997                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7998                             break;
7999                         case 'r':
8000                             if (memEQ(posixcc, "lowe", 4)) /* lower */
8001                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8002                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
8003                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8004                             break;
8005                         case 't':
8006                             if (memEQ(posixcc, "digi", 4)) /* digit */
8007                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8008                             else if (memEQ(posixcc, "prin", 4)) /* print */
8009                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8010                             else if (memEQ(posixcc, "punc", 4)) /* punct */
8011                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8012                             break;
8013                         }
8014                         break;
8015                     case 6:
8016                         if (memEQ(posixcc, "xdigit", 6))
8017                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8018                         break;
8019                     }
8020
8021                     if (namedclass == OOB_NAMEDCLASS)
8022                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8023                                       t - s - 1, s + 1);
8024                     assert (posixcc[skip] == ':');
8025                     assert (posixcc[skip+1] == ']');
8026                 } else if (!SIZE_ONLY) {
8027                     /* [[=foo=]] and [[.foo.]] are still future. */
8028
8029                     /* adjust RExC_parse so the warning shows after
8030                        the class closes */
8031                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8032                         RExC_parse++;
8033                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8034                 }
8035             } else {
8036                 /* Maternal grandfather:
8037                  * "[:" ending in ":" but not in ":]" */
8038                 RExC_parse = s;
8039             }
8040         }
8041     }
8042
8043     return namedclass;
8044 }
8045
8046 STATIC void
8047 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8048 {
8049     dVAR;
8050
8051     PERL_ARGS_ASSERT_CHECKPOSIXCC;
8052
8053     if (POSIXCC(UCHARAT(RExC_parse))) {
8054         const char *s = RExC_parse;
8055         const char  c = *s++;
8056
8057         while (isALNUM(*s))
8058             s++;
8059         if (*s && c == *s && s[1] == ']') {
8060             ckWARN3reg(s+2,
8061                        "POSIX syntax [%c %c] belongs inside character classes",
8062                        c, c);
8063
8064             /* [[=foo=]] and [[.foo.]] are still future. */
8065             if (POSIXCC_NOTYET(c)) {
8066                 /* adjust RExC_parse so the error shows after
8067                    the class closes */
8068                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8069                     NOOP;
8070                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8071             }
8072         }
8073     }
8074 }
8075
8076 /* No locale test */
8077 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
8078 ANYOF_##NAME:                                           \
8079         for (value = 0; value < 256; value++)           \
8080             if (TEST)                                   \
8081                 ANYOF_BITMAP_SET(ret, value);           \
8082     yesno = '+';                                        \
8083     what = WORD;                                        \
8084     break;                                              \
8085 case ANYOF_N##NAME:                                     \
8086         for (value = 0; value < 256; value++)           \
8087             if (!TEST)                                  \
8088                 ANYOF_BITMAP_SET(ret, value);           \
8089     yesno = '!';                                        \
8090     what = WORD;                                        \
8091     break
8092
8093 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8094  * there are two tests passed in, to use depending on that. There aren't any
8095  * cases where the label is different from the name, so no need for that
8096  * parameter */
8097 #define _C_C_T_(NAME,TEST_8,TEST_7,WORD)       \
8098 ANYOF_##NAME:                                           \
8099     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
8100     else if (UNI_SEMANTICS) {                           \
8101         for (value = 0; value < 256; value++) {         \
8102             if (TEST_8) ANYOF_BITMAP_SET(ret, value);   \
8103         }                                               \
8104     }                                                   \
8105     else {                                              \
8106         for (value = 0; value < 256; value++) {         \
8107             if (TEST_7) ANYOF_BITMAP_SET(ret, value);   \
8108         }                                               \
8109     }                                                   \
8110     yesno = '+';                                        \
8111     what = WORD;                                        \
8112     break;                                              \
8113 case ANYOF_N##NAME:                                     \
8114     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
8115     else if (UNI_SEMANTICS) {                           \
8116         for (value = 0; value < 256; value++) {         \
8117             if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
8118         }                                               \
8119     }                                                   \
8120     else {                                              \
8121         for (value = 0; value < 256; value++) {         \
8122             if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
8123         }                                               \
8124     }                                                   \
8125     yesno = '!';                                        \
8126     what = WORD;                                        \
8127     break
8128
8129 /*
8130    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8131    so that it is possible to override the option here without having to
8132    rebuild the entire core. as we are required to do if we change regcomp.h
8133    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8134 */
8135 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8136 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8137 #endif
8138
8139 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8140 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8141 #else
8142 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8143 #endif
8144
8145 /*
8146    parse a class specification and produce either an ANYOF node that
8147    matches the pattern or if the pattern matches a single char only and
8148    that char is < 256 and we are case insensitive then we produce an
8149    EXACT node instead.
8150 */
8151
8152 STATIC regnode *
8153 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8154 {
8155     dVAR;
8156     register UV nextvalue;
8157     register IV prevvalue = OOB_UNICODE;
8158     register IV range = 0;
8159     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8160     register regnode *ret;
8161     STRLEN numlen;
8162     IV namedclass;
8163     char *rangebegin = NULL;
8164     bool need_class = 0;
8165     SV *listsv = NULL;
8166     UV n;
8167     bool optimize_invert   = TRUE;
8168     AV* unicode_alternate  = NULL;
8169 #ifdef EBCDIC
8170     UV literal_endpoint = 0;
8171 #endif
8172     UV stored = 0;  /* 0, 1, or more than 1 chars stored in the class */
8173
8174     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8175         case we need to change the emitted regop to an EXACT. */
8176     const char * orig_parse = RExC_parse;
8177     GET_RE_DEBUG_FLAGS_DECL;
8178
8179     PERL_ARGS_ASSERT_REGCLASS;
8180 #ifndef DEBUGGING
8181     PERL_UNUSED_ARG(depth);
8182 #endif
8183
8184     DEBUG_PARSE("clas");
8185
8186     /* Assume we are going to generate an ANYOF node. */
8187     ret = reganode(pRExC_state, ANYOF, 0);
8188
8189     if (!SIZE_ONLY)
8190         ANYOF_FLAGS(ret) = 0;
8191
8192     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
8193         RExC_naughty++;
8194         RExC_parse++;
8195         if (!SIZE_ONLY)
8196             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8197     }
8198
8199     if (SIZE_ONLY) {
8200         RExC_size += ANYOF_SKIP;
8201         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8202     }
8203     else {
8204         RExC_emit += ANYOF_SKIP;
8205         if (FOLD)
8206             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8207         if (LOC)
8208             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8209         ANYOF_BITMAP_ZERO(ret);
8210         listsv = newSVpvs("# comment\n");
8211     }
8212
8213     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8214
8215     if (!SIZE_ONLY && POSIXCC(nextvalue))
8216         checkposixcc(pRExC_state);
8217
8218     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8219     if (UCHARAT(RExC_parse) == ']')
8220         goto charclassloop;
8221
8222 parseit:
8223     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8224
8225     charclassloop:
8226
8227         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8228
8229         if (!range)
8230             rangebegin = RExC_parse;
8231         if (UTF) {
8232             value = utf8n_to_uvchr((U8*)RExC_parse,
8233                                    RExC_end - RExC_parse,
8234                                    &numlen, UTF8_ALLOW_DEFAULT);
8235             RExC_parse += numlen;
8236         }
8237         else
8238             value = UCHARAT(RExC_parse++);
8239
8240         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8241         if (value == '[' && POSIXCC(nextvalue))
8242             namedclass = regpposixcc(pRExC_state, value);
8243         else if (value == '\\') {
8244             if (UTF) {
8245                 value = utf8n_to_uvchr((U8*)RExC_parse,
8246                                    RExC_end - RExC_parse,
8247                                    &numlen, UTF8_ALLOW_DEFAULT);
8248                 RExC_parse += numlen;
8249             }
8250             else
8251                 value = UCHARAT(RExC_parse++);
8252             /* Some compilers cannot handle switching on 64-bit integer
8253              * values, therefore value cannot be an UV.  Yes, this will
8254              * be a problem later if we want switch on Unicode.
8255              * A similar issue a little bit later when switching on
8256              * namedclass. --jhi */
8257             switch ((I32)value) {
8258             case 'w':   namedclass = ANYOF_ALNUM;       break;
8259             case 'W':   namedclass = ANYOF_NALNUM;      break;
8260             case 's':   namedclass = ANYOF_SPACE;       break;
8261             case 'S':   namedclass = ANYOF_NSPACE;      break;
8262             case 'd':   namedclass = ANYOF_DIGIT;       break;
8263             case 'D':   namedclass = ANYOF_NDIGIT;      break;
8264             case 'v':   namedclass = ANYOF_VERTWS;      break;
8265             case 'V':   namedclass = ANYOF_NVERTWS;     break;
8266             case 'h':   namedclass = ANYOF_HORIZWS;     break;
8267             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
8268             case 'N':  /* Handle \N{NAME} in class */
8269                 {
8270                     /* We only pay attention to the first char of
8271                     multichar strings being returned. I kinda wonder
8272                     if this makes sense as it does change the behaviour
8273                     from earlier versions, OTOH that behaviour was broken
8274                     as well. */
8275                     UV v; /* value is register so we cant & it /grrr */
8276                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8277                         goto parseit;
8278                     }
8279                     value= v;
8280                 }
8281                 break;
8282             case 'p':
8283             case 'P':
8284                 {
8285                 char *e;
8286                 if (RExC_parse >= RExC_end)
8287                     vFAIL2("Empty \\%c{}", (U8)value);
8288                 if (*RExC_parse == '{') {
8289                     const U8 c = (U8)value;
8290                     e = strchr(RExC_parse++, '}');
8291                     if (!e)
8292                         vFAIL2("Missing right brace on \\%c{}", c);
8293                     while (isSPACE(UCHARAT(RExC_parse)))
8294                         RExC_parse++;
8295                     if (e == RExC_parse)
8296                         vFAIL2("Empty \\%c{}", c);
8297                     n = e - RExC_parse;
8298                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8299                         n--;
8300                 }
8301                 else {
8302                     e = RExC_parse;
8303                     n = 1;
8304                 }
8305                 if (!SIZE_ONLY) {
8306                     if (UCHARAT(RExC_parse) == '^') {
8307                          RExC_parse++;
8308                          n--;
8309                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8310                          while (isSPACE(UCHARAT(RExC_parse))) {
8311                               RExC_parse++;
8312                               n--;
8313                          }
8314                     }
8315                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8316                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8317                 }
8318                 RExC_parse = e + 1;
8319                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8320                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8321                 }
8322                 break;
8323             case 'n':   value = '\n';                   break;
8324             case 'r':   value = '\r';                   break;
8325             case 't':   value = '\t';                   break;
8326             case 'f':   value = '\f';                   break;
8327             case 'b':   value = '\b';                   break;
8328             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8329             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8330             case 'o':
8331                 RExC_parse--;   /* function expects to be pointed at the 'o' */
8332                 {
8333                     const char* error_msg;
8334                     bool valid = grok_bslash_o(RExC_parse,
8335                                                &value,
8336                                                &numlen,
8337                                                &error_msg,
8338                                                SIZE_ONLY);
8339                     RExC_parse += numlen;
8340                     if (! valid) {
8341                         vFAIL(error_msg);
8342                     }
8343                 }
8344                 if (PL_encoding && value < 0x100) {
8345                     goto recode_encoding;
8346                 }
8347                 break;
8348             case 'x':
8349                 if (*RExC_parse == '{') {
8350                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8351                         | PERL_SCAN_DISALLOW_PREFIX;
8352                     char * const e = strchr(RExC_parse++, '}');
8353                     if (!e)
8354                         vFAIL("Missing right brace on \\x{}");
8355
8356                     numlen = e - RExC_parse;
8357                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8358                     RExC_parse = e + 1;
8359                 }
8360                 else {
8361                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8362                     numlen = 2;
8363                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8364                     RExC_parse += numlen;
8365                 }
8366                 if (PL_encoding && value < 0x100)
8367                     goto recode_encoding;
8368                 break;
8369             case 'c':
8370                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8371                 break;
8372             case '0': case '1': case '2': case '3': case '4':
8373             case '5': case '6': case '7':
8374                 {
8375                     /* Take 1-3 octal digits */
8376                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8377                     numlen = 3;
8378                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8379                     RExC_parse += numlen;
8380                     if (PL_encoding && value < 0x100)
8381                         goto recode_encoding;
8382                     break;
8383                 }
8384             recode_encoding:
8385                 {
8386                     SV* enc = PL_encoding;
8387                     value = reg_recode((const char)(U8)value, &enc);
8388                     if (!enc && SIZE_ONLY)
8389                         ckWARNreg(RExC_parse,
8390                                   "Invalid escape in the specified encoding");
8391                     break;
8392                 }
8393             default:
8394                 /* Allow \_ to not give an error */
8395                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8396                     ckWARN2reg(RExC_parse,
8397                                "Unrecognized escape \\%c in character class passed through",
8398                                (int)value);
8399                 }
8400                 break;
8401             }
8402         } /* end of \blah */
8403 #ifdef EBCDIC
8404         else
8405             literal_endpoint++;
8406 #endif
8407
8408         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8409
8410             /* What matches in a locale is not known until runtime, so need to
8411              * (one time per class) allocate extra space to pass to regexec.
8412              * The space will contain a bit for each named class that is to be
8413              * matched against.  This isn't needed for \p{} and pseudo-classes,
8414              * as they are not affected by locale, and hence are dealt with
8415              * separately */
8416             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8417                 need_class = 1;
8418                 if (SIZE_ONLY) {
8419                     RExC_size += ANYOF_CLASS_ADD_SKIP;
8420                 }
8421                 else {
8422                     RExC_emit += ANYOF_CLASS_ADD_SKIP;
8423                     ANYOF_CLASS_ZERO(ret);
8424                 }
8425                     ANYOF_FLAGS(ret) |= ANYOF_CLASS|ANYOF_LARGE;
8426             }
8427
8428             /* a bad range like a-\d, a-[:digit:] ? */
8429             if (range) {
8430                 if (!SIZE_ONLY) {
8431                     const int w =
8432                         RExC_parse >= rangebegin ?
8433                         RExC_parse - rangebegin : 0;
8434                     ckWARN4reg(RExC_parse,
8435                                "False [] range \"%*.*s\"",
8436                                w, w, rangebegin);
8437
8438                     if (prevvalue < 256) {
8439                         ANYOF_BITMAP_SET(ret, prevvalue);
8440                         ANYOF_BITMAP_SET(ret, '-');
8441                     }
8442                     else {
8443                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8444                         Perl_sv_catpvf(aTHX_ listsv,
8445                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8446                     }
8447                 }
8448
8449                 range = 0; /* this was not a true range */
8450             }
8451
8452
8453
8454             if (!SIZE_ONLY) {
8455                 const char *what = NULL;
8456                 char yesno = 0;
8457
8458                 if (namedclass > OOB_NAMEDCLASS)
8459                     optimize_invert = FALSE;
8460                 /* Possible truncation here but in some 64-bit environments
8461                  * the compiler gets heartburn about switch on 64-bit values.
8462                  * A similar issue a little earlier when switching on value.
8463                  * --jhi */
8464                 switch ((I32)namedclass) {
8465
8466                 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8467                 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8468                 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8469                 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8470                 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8471                 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8472                 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8473                 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8474                 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8475                 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
8476 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8477                 /* \s, \w match all unicode if utf8. */
8478                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8479                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8480 #else
8481                 /* \s, \w match ascii and locale only */
8482                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8483                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8484 #endif
8485                 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
8486                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8487                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8488                 case ANYOF_ASCII:
8489                     if (LOC)
8490                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8491                     else {
8492 #ifndef EBCDIC
8493                         for (value = 0; value < 128; value++)
8494                             ANYOF_BITMAP_SET(ret, value);
8495 #else  /* EBCDIC */
8496                         for (value = 0; value < 256; value++) {
8497                             if (isASCII(value))
8498                                 ANYOF_BITMAP_SET(ret, value);
8499                         }
8500 #endif /* EBCDIC */
8501                     }
8502                     yesno = '+';
8503                     what = "ASCII";
8504                     break;
8505                 case ANYOF_NASCII:
8506                     if (LOC)
8507                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8508                     else {
8509 #ifndef EBCDIC
8510                         for (value = 128; value < 256; value++)
8511                             ANYOF_BITMAP_SET(ret, value);
8512 #else  /* EBCDIC */
8513                         for (value = 0; value < 256; value++) {
8514                             if (!isASCII(value))
8515                                 ANYOF_BITMAP_SET(ret, value);
8516                         }
8517 #endif /* EBCDIC */
8518                     }
8519                     yesno = '!';
8520                     what = "ASCII";
8521                     break;
8522                 case ANYOF_DIGIT:
8523                     if (LOC)
8524                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8525                     else {
8526                         /* consecutive digits assumed */
8527                         for (value = '0'; value <= '9'; value++)
8528                             ANYOF_BITMAP_SET(ret, value);
8529                     }
8530                     yesno = '+';
8531                     what = POSIX_CC_UNI_NAME("Digit");
8532                     break;
8533                 case ANYOF_NDIGIT:
8534                     if (LOC)
8535                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8536                     else {
8537                         /* consecutive digits assumed */
8538                         for (value = 0; value < '0'; value++)
8539                             ANYOF_BITMAP_SET(ret, value);
8540                         for (value = '9' + 1; value < 256; value++)
8541                             ANYOF_BITMAP_SET(ret, value);
8542                     }
8543                     yesno = '!';
8544                     what = POSIX_CC_UNI_NAME("Digit");
8545                     break;
8546                 case ANYOF_MAX:
8547                     /* this is to handle \p and \P */
8548                     break;
8549                 default:
8550                     vFAIL("Invalid [::] class");
8551                     break;
8552                 }
8553                 if (what) {
8554                     /* Strings such as "+utf8::isWord\n" */
8555                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8556                 }
8557                 stored+=2; /* can't optimize this class */
8558                 continue;
8559             }
8560         } /* end of namedclass \blah */
8561
8562         if (range) {
8563             if (prevvalue > (IV)value) /* b-a */ {
8564                 const int w = RExC_parse - rangebegin;
8565                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8566                 range = 0; /* not a valid range */
8567             }
8568         }
8569         else {
8570             prevvalue = value; /* save the beginning of the range */
8571             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8572                 RExC_parse[1] != ']') {
8573                 RExC_parse++;
8574
8575                 /* a bad range like \w-, [:word:]- ? */
8576                 if (namedclass > OOB_NAMEDCLASS) {
8577                     if (ckWARN(WARN_REGEXP)) {
8578                         const int w =
8579                             RExC_parse >= rangebegin ?
8580                             RExC_parse - rangebegin : 0;
8581                         vWARN4(RExC_parse,
8582                                "False [] range \"%*.*s\"",
8583                                w, w, rangebegin);
8584                     }
8585                     if (!SIZE_ONLY)
8586                         ANYOF_BITMAP_SET(ret, '-');
8587                 } else
8588                     range = 1;  /* yeah, it's a range! */
8589                 continue;       /* but do it the next time */
8590             }
8591         }
8592
8593         /* now is the next time */
8594         /*stored += (value - prevvalue + 1);*/
8595         if (!SIZE_ONLY) {
8596             if (prevvalue < 256) {
8597                 const IV ceilvalue = value < 256 ? value : 255;
8598                 IV i;
8599 #ifdef EBCDIC
8600                 /* In EBCDIC [\x89-\x91] should include
8601                  * the \x8e but [i-j] should not. */
8602                 if (literal_endpoint == 2 &&
8603                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8604                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8605                 {
8606                     if (isLOWER(prevvalue)) {
8607                         for (i = prevvalue; i <= ceilvalue; i++)
8608                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8609                                 stored++;
8610                                 ANYOF_BITMAP_SET(ret, i);
8611                             }
8612                     } else {
8613                         for (i = prevvalue; i <= ceilvalue; i++)
8614                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8615                                 stored++;
8616                                 ANYOF_BITMAP_SET(ret, i);
8617                             }
8618                     }
8619                 }
8620                 else
8621 #endif
8622                       for (i = prevvalue; i <= ceilvalue; i++) {
8623                         if (!ANYOF_BITMAP_TEST(ret,i)) {
8624                             stored++;
8625                             ANYOF_BITMAP_SET(ret, i);
8626                         }
8627                       }
8628           }
8629           if (value > 255 || UTF) {
8630                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8631                 const UV natvalue      = NATIVE_TO_UNI(value);
8632                 stored+=2; /* can't optimize this class */
8633                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8634                 if (prevnatvalue < natvalue) { /* what about > ? */
8635                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8636                                    prevnatvalue, natvalue);
8637                 }
8638                 else if (prevnatvalue == natvalue) {
8639                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8640                     if (FOLD) {
8641                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8642                          STRLEN foldlen;
8643                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8644
8645 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8646                          if (RExC_precomp[0] == ':' &&
8647                              RExC_precomp[1] == '[' &&
8648                              (f == 0xDF || f == 0x92)) {
8649                              f = NATIVE_TO_UNI(f);
8650                         }
8651 #endif
8652                          /* If folding and foldable and a single
8653                           * character, insert also the folded version
8654                           * to the charclass. */
8655                          if (f != value) {
8656 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8657                              if ((RExC_precomp[0] == ':' &&
8658                                   RExC_precomp[1] == '[' &&
8659                                   (f == 0xA2 &&
8660                                    (value == 0xFB05 || value == 0xFB06))) ?
8661                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8662                                  foldlen == (STRLEN)UNISKIP(f) )
8663 #else
8664                               if (foldlen == (STRLEN)UNISKIP(f))
8665 #endif
8666                                   Perl_sv_catpvf(aTHX_ listsv,
8667                                                  "%04"UVxf"\n", f);
8668                               else {
8669                                   /* Any multicharacter foldings
8670                                    * require the following transform:
8671                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8672                                    * where E folds into "pq" and F folds
8673                                    * into "rst", all other characters
8674                                    * fold to single characters.  We save
8675                                    * away these multicharacter foldings,
8676                                    * to be later saved as part of the
8677                                    * additional "s" data. */
8678                                   SV *sv;
8679
8680                                   if (!unicode_alternate)
8681                                       unicode_alternate = newAV();
8682                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8683                                                      TRUE);
8684                                   av_push(unicode_alternate, sv);
8685                               }
8686                          }
8687
8688                          /* If folding and the value is one of the Greek
8689                           * sigmas insert a few more sigmas to make the
8690                           * folding rules of the sigmas to work right.
8691                           * Note that not all the possible combinations
8692                           * are handled here: some of them are handled
8693                           * by the standard folding rules, and some of
8694                           * them (literal or EXACTF cases) are handled
8695                           * during runtime in regexec.c:S_find_byclass(). */
8696                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8697                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8698                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8699                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8700                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8701                          }
8702                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8703                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8704                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8705                     }
8706                 }
8707             }
8708 #ifdef EBCDIC
8709             literal_endpoint = 0;
8710 #endif
8711         }
8712
8713         range = 0; /* this range (if it was one) is done now */
8714     }
8715
8716
8717
8718     if (SIZE_ONLY)
8719         return ret;
8720     /****** !SIZE_ONLY AFTER HERE *********/
8721
8722     if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8723         && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8724     ) {
8725         /* optimize single char class to an EXACT node
8726            but *only* when its not a UTF/high char  */
8727         const char * cur_parse= RExC_parse;
8728         RExC_emit = (regnode *)orig_emit;
8729         RExC_parse = (char *)orig_parse;
8730         ret = reg_node(pRExC_state,
8731                        (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8732         RExC_parse = (char *)cur_parse;
8733         *STRING(ret)= (char)value;
8734         STR_LEN(ret)= 1;
8735         RExC_emit += STR_SZ(1);
8736         SvREFCNT_dec(listsv);
8737         return ret;
8738     }
8739     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8740     if ( /* If the only flag is folding (plus possibly inversion). */
8741         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8742        ) {
8743         for (value = 0; value < 256; ++value) {
8744             if (ANYOF_BITMAP_TEST(ret, value)) {
8745                 UV fold = PL_fold[value];
8746
8747                 if (fold != value)
8748                     ANYOF_BITMAP_SET(ret, fold);
8749             }
8750         }
8751         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8752     }
8753
8754     /* optimize inverted simple patterns (e.g. [^a-z]) */
8755     if (optimize_invert &&
8756         /* If the only flag is inversion. */
8757         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8758         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8759             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8760         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8761     }
8762     {
8763         AV * const av = newAV();
8764         SV *rv;
8765         /* The 0th element stores the character class description
8766          * in its textual form: used later (regexec.c:Perl_regclass_swash())
8767          * to initialize the appropriate swash (which gets stored in
8768          * the 1st element), and also useful for dumping the regnode.
8769          * The 2nd element stores the multicharacter foldings,
8770          * used later (regexec.c:S_reginclass()). */
8771         av_store(av, 0, listsv);
8772         av_store(av, 1, NULL);
8773         av_store(av, 2, MUTABLE_SV(unicode_alternate));
8774         rv = newRV_noinc(MUTABLE_SV(av));
8775         n = add_data(pRExC_state, 1, "s");
8776         RExC_rxi->data->data[n] = (void*)rv;
8777         ARG_SET(ret, n);
8778     }
8779     return ret;
8780 }
8781 #undef _C_C_T_
8782
8783
8784 /* reg_skipcomment()
8785
8786    Absorbs an /x style # comments from the input stream.
8787    Returns true if there is more text remaining in the stream.
8788    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8789    terminates the pattern without including a newline.
8790
8791    Note its the callers responsibility to ensure that we are
8792    actually in /x mode
8793
8794 */
8795
8796 STATIC bool
8797 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8798 {
8799     bool ended = 0;
8800
8801     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8802
8803     while (RExC_parse < RExC_end)
8804         if (*RExC_parse++ == '\n') {
8805             ended = 1;
8806             break;
8807         }
8808     if (!ended) {
8809         /* we ran off the end of the pattern without ending
8810            the comment, so we have to add an \n when wrapping */
8811         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8812         return 0;
8813     } else
8814         return 1;
8815 }
8816
8817 /* nextchar()
8818
8819    Advances the parse position, and optionally absorbs
8820    "whitespace" from the inputstream.
8821
8822    Without /x "whitespace" means (?#...) style comments only,
8823    with /x this means (?#...) and # comments and whitespace proper.
8824
8825    Returns the RExC_parse point from BEFORE the scan occurs.
8826
8827    This is the /x friendly way of saying RExC_parse++.
8828 */
8829
8830 STATIC char*
8831 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8832 {
8833     char* const retval = RExC_parse++;
8834
8835     PERL_ARGS_ASSERT_NEXTCHAR;
8836
8837     for (;;) {
8838         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8839                 RExC_parse[2] == '#') {
8840             while (*RExC_parse != ')') {
8841                 if (RExC_parse == RExC_end)
8842                     FAIL("Sequence (?#... not terminated");
8843                 RExC_parse++;
8844             }
8845             RExC_parse++;
8846             continue;
8847         }
8848         if (RExC_flags & RXf_PMf_EXTENDED) {
8849             if (isSPACE(*RExC_parse)) {
8850                 RExC_parse++;
8851                 continue;
8852             }
8853             else if (*RExC_parse == '#') {
8854                 if ( reg_skipcomment( pRExC_state ) )
8855                     continue;
8856             }
8857         }
8858         return retval;
8859     }
8860 }
8861
8862 /*
8863 - reg_node - emit a node
8864 */
8865 STATIC regnode *                        /* Location. */
8866 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8867 {
8868     dVAR;
8869     register regnode *ptr;
8870     regnode * const ret = RExC_emit;
8871     GET_RE_DEBUG_FLAGS_DECL;
8872
8873     PERL_ARGS_ASSERT_REG_NODE;
8874
8875     if (SIZE_ONLY) {
8876         SIZE_ALIGN(RExC_size);
8877         RExC_size += 1;
8878         return(ret);
8879     }
8880     if (RExC_emit >= RExC_emit_bound)
8881         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8882
8883     NODE_ALIGN_FILL(ret);
8884     ptr = ret;
8885     FILL_ADVANCE_NODE(ptr, op);
8886 #ifdef RE_TRACK_PATTERN_OFFSETS
8887     if (RExC_offsets) {         /* MJD */
8888         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8889               "reg_node", __LINE__,
8890               PL_reg_name[op],
8891               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8892                 ? "Overwriting end of array!\n" : "OK",
8893               (UV)(RExC_emit - RExC_emit_start),
8894               (UV)(RExC_parse - RExC_start),
8895               (UV)RExC_offsets[0]));
8896         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8897     }
8898 #endif
8899     RExC_emit = ptr;
8900     return(ret);
8901 }
8902
8903 /*
8904 - reganode - emit a node with an argument
8905 */
8906 STATIC regnode *                        /* Location. */
8907 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8908 {
8909     dVAR;
8910     register regnode *ptr;
8911     regnode * const ret = RExC_emit;
8912     GET_RE_DEBUG_FLAGS_DECL;
8913
8914     PERL_ARGS_ASSERT_REGANODE;
8915
8916     if (SIZE_ONLY) {
8917         SIZE_ALIGN(RExC_size);
8918         RExC_size += 2;
8919         /*
8920            We can't do this:
8921
8922            assert(2==regarglen[op]+1);
8923
8924            Anything larger than this has to allocate the extra amount.
8925            If we changed this to be:
8926
8927            RExC_size += (1 + regarglen[op]);
8928
8929            then it wouldn't matter. Its not clear what side effect
8930            might come from that so its not done so far.
8931            -- dmq
8932         */
8933         return(ret);
8934     }
8935     if (RExC_emit >= RExC_emit_bound)
8936         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8937
8938     NODE_ALIGN_FILL(ret);
8939     ptr = ret;
8940     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8941 #ifdef RE_TRACK_PATTERN_OFFSETS
8942     if (RExC_offsets) {         /* MJD */
8943         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8944               "reganode",
8945               __LINE__,
8946               PL_reg_name[op],
8947               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8948               "Overwriting end of array!\n" : "OK",
8949               (UV)(RExC_emit - RExC_emit_start),
8950               (UV)(RExC_parse - RExC_start),
8951               (UV)RExC_offsets[0]));
8952         Set_Cur_Node_Offset;
8953     }
8954 #endif
8955     RExC_emit = ptr;
8956     return(ret);
8957 }
8958
8959 /*
8960 - reguni - emit (if appropriate) a Unicode character
8961 */
8962 STATIC STRLEN
8963 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8964 {
8965     dVAR;
8966
8967     PERL_ARGS_ASSERT_REGUNI;
8968
8969     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8970 }
8971
8972 /*
8973 - reginsert - insert an operator in front of already-emitted operand
8974 *
8975 * Means relocating the operand.
8976 */
8977 STATIC void
8978 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8979 {
8980     dVAR;
8981     register regnode *src;
8982     register regnode *dst;
8983     register regnode *place;
8984     const int offset = regarglen[(U8)op];
8985     const int size = NODE_STEP_REGNODE + offset;
8986     GET_RE_DEBUG_FLAGS_DECL;
8987
8988     PERL_ARGS_ASSERT_REGINSERT;
8989     PERL_UNUSED_ARG(depth);
8990 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8991     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8992     if (SIZE_ONLY) {
8993         RExC_size += size;
8994         return;
8995     }
8996
8997     src = RExC_emit;
8998     RExC_emit += size;
8999     dst = RExC_emit;
9000     if (RExC_open_parens) {
9001         int paren;
9002         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9003         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9004             if ( RExC_open_parens[paren] >= opnd ) {
9005                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9006                 RExC_open_parens[paren] += size;
9007             } else {
9008                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9009             }
9010             if ( RExC_close_parens[paren] >= opnd ) {
9011                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9012                 RExC_close_parens[paren] += size;
9013             } else {
9014                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9015             }
9016         }
9017     }
9018
9019     while (src > opnd) {
9020         StructCopy(--src, --dst, regnode);
9021 #ifdef RE_TRACK_PATTERN_OFFSETS
9022         if (RExC_offsets) {     /* MJD 20010112 */
9023             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9024                   "reg_insert",
9025                   __LINE__,
9026                   PL_reg_name[op],
9027                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9028                     ? "Overwriting end of array!\n" : "OK",
9029                   (UV)(src - RExC_emit_start),
9030                   (UV)(dst - RExC_emit_start),
9031                   (UV)RExC_offsets[0]));
9032             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9033             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9034         }
9035 #endif
9036     }
9037
9038
9039     place = opnd;               /* Op node, where operand used to be. */
9040 #ifdef RE_TRACK_PATTERN_OFFSETS
9041     if (RExC_offsets) {         /* MJD */
9042         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9043               "reginsert",
9044               __LINE__,
9045               PL_reg_name[op],
9046               (UV)(place - RExC_emit_start) > RExC_offsets[0]
9047               ? "Overwriting end of array!\n" : "OK",
9048               (UV)(place - RExC_emit_start),
9049               (UV)(RExC_parse - RExC_start),
9050               (UV)RExC_offsets[0]));
9051         Set_Node_Offset(place, RExC_parse);
9052         Set_Node_Length(place, 1);
9053     }
9054 #endif
9055     src = NEXTOPER(place);
9056     FILL_ADVANCE_NODE(place, op);
9057     Zero(src, offset, regnode);
9058 }
9059
9060 /*
9061 - regtail - set the next-pointer at the end of a node chain of p to val.
9062 - SEE ALSO: regtail_study
9063 */
9064 /* TODO: All three parms should be const */
9065 STATIC void
9066 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9067 {
9068     dVAR;
9069     register regnode *scan;
9070     GET_RE_DEBUG_FLAGS_DECL;
9071
9072     PERL_ARGS_ASSERT_REGTAIL;
9073 #ifndef DEBUGGING
9074     PERL_UNUSED_ARG(depth);
9075 #endif
9076
9077     if (SIZE_ONLY)
9078         return;
9079
9080     /* Find last node. */
9081     scan = p;
9082     for (;;) {
9083         regnode * const temp = regnext(scan);
9084         DEBUG_PARSE_r({
9085             SV * const mysv=sv_newmortal();
9086             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9087             regprop(RExC_rx, mysv, scan);
9088             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9089                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9090                     (temp == NULL ? "->" : ""),
9091                     (temp == NULL ? PL_reg_name[OP(val)] : "")
9092             );
9093         });
9094         if (temp == NULL)
9095             break;
9096         scan = temp;
9097     }
9098
9099     if (reg_off_by_arg[OP(scan)]) {
9100         ARG_SET(scan, val - scan);
9101     }
9102     else {
9103         NEXT_OFF(scan) = val - scan;
9104     }
9105 }
9106
9107 #ifdef DEBUGGING
9108 /*
9109 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9110 - Look for optimizable sequences at the same time.
9111 - currently only looks for EXACT chains.
9112
9113 This is expermental code. The idea is to use this routine to perform
9114 in place optimizations on branches and groups as they are constructed,
9115 with the long term intention of removing optimization from study_chunk so
9116 that it is purely analytical.
9117
9118 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9119 to control which is which.
9120
9121 */
9122 /* TODO: All four parms should be const */
9123
9124 STATIC U8
9125 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9126 {
9127     dVAR;
9128     register regnode *scan;
9129     U8 exact = PSEUDO;
9130 #ifdef EXPERIMENTAL_INPLACESCAN
9131     I32 min = 0;
9132 #endif
9133     GET_RE_DEBUG_FLAGS_DECL;
9134
9135     PERL_ARGS_ASSERT_REGTAIL_STUDY;
9136
9137
9138     if (SIZE_ONLY)
9139         return exact;
9140
9141     /* Find last node. */
9142
9143     scan = p;
9144     for (;;) {
9145         regnode * const temp = regnext(scan);
9146 #ifdef EXPERIMENTAL_INPLACESCAN
9147         if (PL_regkind[OP(scan)] == EXACT)
9148             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9149                 return EXACT;
9150 #endif
9151         if ( exact ) {
9152             switch (OP(scan)) {
9153                 case EXACT:
9154                 case EXACTF:
9155                 case EXACTFL:
9156                         if( exact == PSEUDO )
9157                             exact= OP(scan);
9158                         else if ( exact != OP(scan) )
9159                             exact= 0;
9160                 case NOTHING:
9161                     break;
9162                 default:
9163                     exact= 0;
9164             }
9165         }
9166         DEBUG_PARSE_r({
9167             SV * const mysv=sv_newmortal();
9168             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9169             regprop(RExC_rx, mysv, scan);
9170             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9171                 SvPV_nolen_const(mysv),
9172                 REG_NODE_NUM(scan),
9173                 PL_reg_name[exact]);
9174         });
9175         if (temp == NULL)
9176             break;
9177         scan = temp;
9178     }
9179     DEBUG_PARSE_r({
9180         SV * const mysv_val=sv_newmortal();
9181         DEBUG_PARSE_MSG("");
9182         regprop(RExC_rx, mysv_val, val);
9183         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9184                       SvPV_nolen_const(mysv_val),
9185                       (IV)REG_NODE_NUM(val),
9186                       (IV)(val - scan)
9187         );
9188     });
9189     if (reg_off_by_arg[OP(scan)]) {
9190         ARG_SET(scan, val - scan);
9191     }
9192     else {
9193         NEXT_OFF(scan) = val - scan;
9194     }
9195
9196     return exact;
9197 }
9198 #endif
9199
9200 /*
9201  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9202  */
9203 #ifdef DEBUGGING
9204 static void
9205 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9206 {
9207     int bit;
9208     int set=0;
9209
9210     for (bit=0; bit<32; bit++) {
9211         if (flags & (1<<bit)) {
9212             if (!set++ && lead)
9213                 PerlIO_printf(Perl_debug_log, "%s",lead);
9214             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9215         }
9216     }
9217     if (lead)  {
9218         if (set)
9219             PerlIO_printf(Perl_debug_log, "\n");
9220         else
9221             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9222     }
9223 }
9224 #endif
9225
9226 void
9227 Perl_regdump(pTHX_ const regexp *r)
9228 {
9229 #ifdef DEBUGGING
9230     dVAR;
9231     SV * const sv = sv_newmortal();
9232     SV *dsv= sv_newmortal();
9233     RXi_GET_DECL(r,ri);
9234     GET_RE_DEBUG_FLAGS_DECL;
9235
9236     PERL_ARGS_ASSERT_REGDUMP;
9237
9238     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9239
9240     /* Header fields of interest. */
9241     if (r->anchored_substr) {
9242         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9243             RE_SV_DUMPLEN(r->anchored_substr), 30);
9244         PerlIO_printf(Perl_debug_log,
9245                       "anchored %s%s at %"IVdf" ",
9246                       s, RE_SV_TAIL(r->anchored_substr),
9247                       (IV)r->anchored_offset);
9248     } else if (r->anchored_utf8) {
9249         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9250             RE_SV_DUMPLEN(r->anchored_utf8), 30);
9251         PerlIO_printf(Perl_debug_log,
9252                       "anchored utf8 %s%s at %"IVdf" ",
9253                       s, RE_SV_TAIL(r->anchored_utf8),
9254                       (IV)r->anchored_offset);
9255     }
9256     if (r->float_substr) {
9257         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9258             RE_SV_DUMPLEN(r->float_substr), 30);
9259         PerlIO_printf(Perl_debug_log,
9260                       "floating %s%s at %"IVdf"..%"UVuf" ",
9261                       s, RE_SV_TAIL(r->float_substr),
9262                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9263     } else if (r->float_utf8) {
9264         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9265             RE_SV_DUMPLEN(r->float_utf8), 30);
9266         PerlIO_printf(Perl_debug_log,
9267                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9268                       s, RE_SV_TAIL(r->float_utf8),
9269                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9270     }
9271     if (r->check_substr || r->check_utf8)
9272         PerlIO_printf(Perl_debug_log,
9273                       (const char *)
9274                       (r->check_substr == r->float_substr
9275                        && r->check_utf8 == r->float_utf8
9276                        ? "(checking floating" : "(checking anchored"));
9277     if (r->extflags & RXf_NOSCAN)
9278         PerlIO_printf(Perl_debug_log, " noscan");
9279     if (r->extflags & RXf_CHECK_ALL)
9280         PerlIO_printf(Perl_debug_log, " isall");
9281     if (r->check_substr || r->check_utf8)
9282         PerlIO_printf(Perl_debug_log, ") ");
9283
9284     if (ri->regstclass) {
9285         regprop(r, sv, ri->regstclass);
9286         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9287     }
9288     if (r->extflags & RXf_ANCH) {
9289         PerlIO_printf(Perl_debug_log, "anchored");
9290         if (r->extflags & RXf_ANCH_BOL)
9291             PerlIO_printf(Perl_debug_log, "(BOL)");
9292         if (r->extflags & RXf_ANCH_MBOL)
9293             PerlIO_printf(Perl_debug_log, "(MBOL)");
9294         if (r->extflags & RXf_ANCH_SBOL)
9295             PerlIO_printf(Perl_debug_log, "(SBOL)");
9296         if (r->extflags & RXf_ANCH_GPOS)
9297             PerlIO_printf(Perl_debug_log, "(GPOS)");
9298         PerlIO_putc(Perl_debug_log, ' ');
9299     }
9300     if (r->extflags & RXf_GPOS_SEEN)
9301         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9302     if (r->intflags & PREGf_SKIP)
9303         PerlIO_printf(Perl_debug_log, "plus ");
9304     if (r->intflags & PREGf_IMPLICIT)
9305         PerlIO_printf(Perl_debug_log, "implicit ");
9306     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9307     if (r->extflags & RXf_EVAL_SEEN)
9308         PerlIO_printf(Perl_debug_log, "with eval ");
9309     PerlIO_printf(Perl_debug_log, "\n");
9310     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9311 #else
9312     PERL_ARGS_ASSERT_REGDUMP;
9313     PERL_UNUSED_CONTEXT;
9314     PERL_UNUSED_ARG(r);
9315 #endif  /* DEBUGGING */
9316 }
9317
9318 /*
9319 - regprop - printable representation of opcode
9320 */
9321 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9322 STMT_START { \
9323         if (do_sep) {                           \
9324             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9325             if (flags & ANYOF_INVERT)           \
9326                 /*make sure the invert info is in each */ \
9327                 sv_catpvs(sv, "^");             \
9328             do_sep = 0;                         \
9329         }                                       \
9330 } STMT_END
9331
9332 void
9333 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9334 {
9335 #ifdef DEBUGGING
9336     dVAR;
9337     register int k;
9338     RXi_GET_DECL(prog,progi);
9339     GET_RE_DEBUG_FLAGS_DECL;
9340
9341     PERL_ARGS_ASSERT_REGPROP;
9342
9343     sv_setpvs(sv, "");
9344
9345     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9346         /* It would be nice to FAIL() here, but this may be called from
9347            regexec.c, and it would be hard to supply pRExC_state. */
9348         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9349     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9350
9351     k = PL_regkind[OP(o)];
9352
9353     if (k == EXACT) {
9354         sv_catpvs(sv, " ");
9355         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9356          * is a crude hack but it may be the best for now since
9357          * we have no flag "this EXACTish node was UTF-8"
9358          * --jhi */
9359         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9360                   PERL_PV_ESCAPE_UNI_DETECT |
9361                   PERL_PV_PRETTY_ELLIPSES   |
9362                   PERL_PV_PRETTY_LTGT       |
9363                   PERL_PV_PRETTY_NOCLEAR
9364                   );
9365     } else if (k == TRIE) {
9366         /* print the details of the trie in dumpuntil instead, as
9367          * progi->data isn't available here */
9368         const char op = OP(o);
9369         const U32 n = ARG(o);
9370         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9371                (reg_ac_data *)progi->data->data[n] :
9372                NULL;
9373         const reg_trie_data * const trie
9374             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9375
9376         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9377         DEBUG_TRIE_COMPILE_r(
9378             Perl_sv_catpvf(aTHX_ sv,
9379                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9380                 (UV)trie->startstate,
9381                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9382                 (UV)trie->wordcount,
9383                 (UV)trie->minlen,
9384                 (UV)trie->maxlen,
9385                 (UV)TRIE_CHARCOUNT(trie),
9386                 (UV)trie->uniquecharcount
9387             )
9388         );
9389         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9390             int i;
9391             int rangestart = -1;
9392             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9393             sv_catpvs(sv, "[");
9394             for (i = 0; i <= 256; i++) {
9395                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9396                     if (rangestart == -1)
9397                         rangestart = i;
9398                 } else if (rangestart != -1) {
9399                     if (i <= rangestart + 3)
9400                         for (; rangestart < i; rangestart++)
9401                             put_byte(sv, rangestart);
9402                     else {
9403                         put_byte(sv, rangestart);
9404                         sv_catpvs(sv, "-");
9405                         put_byte(sv, i - 1);
9406                     }
9407                     rangestart = -1;
9408                 }
9409             }
9410             sv_catpvs(sv, "]");
9411         }
9412
9413     } else if (k == CURLY) {
9414         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9415             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9416         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9417     }
9418     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9419         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9420     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9421         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9422         if ( RXp_PAREN_NAMES(prog) ) {
9423             if ( k != REF || OP(o) < NREF) {
9424                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9425                 SV **name= av_fetch(list, ARG(o), 0 );
9426                 if (name)
9427                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9428             }
9429             else {
9430                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9431                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9432                 I32 *nums=(I32*)SvPVX(sv_dat);
9433                 SV **name= av_fetch(list, nums[0], 0 );
9434                 I32 n;
9435                 if (name) {
9436                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9437                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9438                                     (n ? "," : ""), (IV)nums[n]);
9439                     }
9440                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9441                 }
9442             }
9443         }
9444     } else if (k == GOSUB)
9445         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9446     else if (k == VERB) {
9447         if (!o->flags)
9448             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9449                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9450     } else if (k == LOGICAL)
9451         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9452     else if (k == FOLDCHAR)
9453         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9454     else if (k == ANYOF) {
9455         int i, rangestart = -1;
9456         const U8 flags = ANYOF_FLAGS(o);
9457         int do_sep = 0;
9458
9459         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9460         static const char * const anyofs[] = {
9461             "\\w",
9462             "\\W",
9463             "\\s",
9464             "\\S",
9465             "\\d",
9466             "\\D",
9467             "[:alnum:]",
9468             "[:^alnum:]",
9469             "[:alpha:]",
9470             "[:^alpha:]",
9471             "[:ascii:]",
9472             "[:^ascii:]",
9473             "[:cntrl:]",
9474             "[:^cntrl:]",
9475             "[:graph:]",
9476             "[:^graph:]",
9477             "[:lower:]",
9478             "[:^lower:]",
9479             "[:print:]",
9480             "[:^print:]",
9481             "[:punct:]",
9482             "[:^punct:]",
9483             "[:upper:]",
9484             "[:^upper:]",
9485             "[:xdigit:]",
9486             "[:^xdigit:]",
9487             "[:space:]",
9488             "[:^space:]",
9489             "[:blank:]",
9490             "[:^blank:]"
9491         };
9492
9493         if (flags & ANYOF_LOCALE)
9494             sv_catpvs(sv, "{loc}");
9495         if (flags & ANYOF_FOLD)
9496             sv_catpvs(sv, "{i}");
9497         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9498         if (flags & ANYOF_INVERT)
9499             sv_catpvs(sv, "^");
9500
9501         /* output what the standard cp 0-255 bitmap matches */
9502         for (i = 0; i <= 256; i++) {
9503             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9504                 if (rangestart == -1)
9505                     rangestart = i;
9506             } else if (rangestart != -1) {
9507                 if (i <= rangestart + 3)
9508                     for (; rangestart < i; rangestart++)
9509                         put_byte(sv, rangestart);
9510                 else {
9511                     put_byte(sv, rangestart);
9512                     sv_catpvs(sv, "-");
9513                     put_byte(sv, i - 1);
9514                 }
9515                 do_sep = 1;
9516                 rangestart = -1;
9517             }
9518         }
9519
9520         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9521         /* output any special charclass tests (used mostly under use locale) */
9522         if (o->flags & ANYOF_CLASS)
9523             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9524                 if (ANYOF_CLASS_TEST(o,i)) {
9525                     sv_catpv(sv, anyofs[i]);
9526                     do_sep = 1;
9527                 }
9528
9529         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9530
9531         /* output information about the unicode matching */
9532         if (flags & ANYOF_UNICODE)
9533             sv_catpvs(sv, "{unicode}");
9534         else if (flags & ANYOF_UNICODE_ALL)
9535             sv_catpvs(sv, "{unicode_all}");
9536
9537         {
9538             SV *lv;
9539             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9540
9541             if (lv) {
9542                 if (sw) {
9543                     U8 s[UTF8_MAXBYTES_CASE+1];
9544
9545                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9546                         uvchr_to_utf8(s, i);
9547
9548                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9549                             if (rangestart == -1)
9550                                 rangestart = i;
9551                         } else if (rangestart != -1) {
9552                             if (i <= rangestart + 3)
9553                                 for (; rangestart < i; rangestart++) {
9554                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9555                                     U8 *p;
9556                                     for(p = s; p < e; p++)
9557                                         put_byte(sv, *p);
9558                                 }
9559                             else {
9560                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9561                                 U8 *p;
9562                                 for (p = s; p < e; p++)
9563                                     put_byte(sv, *p);
9564                                 sv_catpvs(sv, "-");
9565                                 e = uvchr_to_utf8(s, i-1);
9566                                 for (p = s; p < e; p++)
9567                                     put_byte(sv, *p);
9568                                 }
9569                                 rangestart = -1;
9570                             }
9571                         }
9572
9573                     sv_catpvs(sv, "..."); /* et cetera */
9574                 }
9575
9576                 {
9577                     char *s = savesvpv(lv);
9578                     char * const origs = s;
9579
9580                     while (*s && *s != '\n')
9581                         s++;
9582
9583                     if (*s == '\n') {
9584                         const char * const t = ++s;
9585
9586                         while (*s) {
9587                             if (*s == '\n')
9588                                 *s = ' ';
9589                             s++;
9590                         }
9591                         if (s[-1] == ' ')
9592                             s[-1] = 0;
9593
9594                         sv_catpv(sv, t);
9595                     }
9596
9597                     Safefree(origs);
9598                 }
9599             }
9600         }
9601
9602         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9603     }
9604     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9605         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9606 #else
9607     PERL_UNUSED_CONTEXT;
9608     PERL_UNUSED_ARG(sv);
9609     PERL_UNUSED_ARG(o);
9610     PERL_UNUSED_ARG(prog);
9611 #endif  /* DEBUGGING */
9612 }
9613
9614 SV *
9615 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9616 {                               /* Assume that RE_INTUIT is set */
9617     dVAR;
9618     struct regexp *const prog = (struct regexp *)SvANY(r);
9619     GET_RE_DEBUG_FLAGS_DECL;
9620
9621     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9622     PERL_UNUSED_CONTEXT;
9623
9624     DEBUG_COMPILE_r(
9625         {
9626             const char * const s = SvPV_nolen_const(prog->check_substr
9627                       ? prog->check_substr : prog->check_utf8);
9628
9629             if (!PL_colorset) reginitcolors();
9630             PerlIO_printf(Perl_debug_log,
9631                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9632                       PL_colors[4],
9633                       prog->check_substr ? "" : "utf8 ",
9634                       PL_colors[5],PL_colors[0],
9635                       s,
9636                       PL_colors[1],
9637                       (strlen(s) > 60 ? "..." : ""));
9638         } );
9639
9640     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9641 }
9642
9643 /*
9644    pregfree()
9645
9646    handles refcounting and freeing the perl core regexp structure. When
9647    it is necessary to actually free the structure the first thing it
9648    does is call the 'free' method of the regexp_engine associated to
9649    the regexp, allowing the handling of the void *pprivate; member
9650    first. (This routine is not overridable by extensions, which is why
9651    the extensions free is called first.)
9652
9653    See regdupe and regdupe_internal if you change anything here.
9654 */
9655 #ifndef PERL_IN_XSUB_RE
9656 void
9657 Perl_pregfree(pTHX_ REGEXP *r)
9658 {
9659     SvREFCNT_dec(r);
9660 }
9661
9662 void
9663 Perl_pregfree2(pTHX_ REGEXP *rx)
9664 {
9665     dVAR;
9666     struct regexp *const r = (struct regexp *)SvANY(rx);
9667     GET_RE_DEBUG_FLAGS_DECL;
9668
9669     PERL_ARGS_ASSERT_PREGFREE2;
9670
9671     if (r->mother_re) {
9672         ReREFCNT_dec(r->mother_re);
9673     } else {
9674         CALLREGFREE_PVT(rx); /* free the private data */
9675         SvREFCNT_dec(RXp_PAREN_NAMES(r));
9676     }
9677     if (r->substrs) {
9678         SvREFCNT_dec(r->anchored_substr);
9679         SvREFCNT_dec(r->anchored_utf8);
9680         SvREFCNT_dec(r->float_substr);
9681         SvREFCNT_dec(r->float_utf8);
9682         Safefree(r->substrs);
9683     }
9684     RX_MATCH_COPY_FREE(rx);
9685 #ifdef PERL_OLD_COPY_ON_WRITE
9686     SvREFCNT_dec(r->saved_copy);
9687 #endif
9688     Safefree(r->offs);
9689 }
9690
9691 /*  reg_temp_copy()
9692
9693     This is a hacky workaround to the structural issue of match results
9694     being stored in the regexp structure which is in turn stored in
9695     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9696     could be PL_curpm in multiple contexts, and could require multiple
9697     result sets being associated with the pattern simultaneously, such
9698     as when doing a recursive match with (??{$qr})
9699
9700     The solution is to make a lightweight copy of the regexp structure
9701     when a qr// is returned from the code executed by (??{$qr}) this
9702     lightweight copy doesnt actually own any of its data except for
9703     the starp/end and the actual regexp structure itself.
9704
9705 */
9706
9707
9708 REGEXP *
9709 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9710 {
9711     struct regexp *ret;
9712     struct regexp *const r = (struct regexp *)SvANY(rx);
9713     register const I32 npar = r->nparens+1;
9714
9715     PERL_ARGS_ASSERT_REG_TEMP_COPY;
9716
9717     if (!ret_x)
9718         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9719     ret = (struct regexp *)SvANY(ret_x);
9720
9721     (void)ReREFCNT_inc(rx);
9722     /* We can take advantage of the existing "copied buffer" mechanism in SVs
9723        by pointing directly at the buffer, but flagging that the allocated
9724        space in the copy is zero. As we've just done a struct copy, it's now
9725        a case of zero-ing that, rather than copying the current length.  */
9726     SvPV_set(ret_x, RX_WRAPPED(rx));
9727     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9728     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9729            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9730     SvLEN_set(ret_x, 0);
9731     SvSTASH_set(ret_x, NULL);
9732     SvMAGIC_set(ret_x, NULL);
9733     Newx(ret->offs, npar, regexp_paren_pair);
9734     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9735     if (r->substrs) {
9736         Newx(ret->substrs, 1, struct reg_substr_data);
9737         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9738
9739         SvREFCNT_inc_void(ret->anchored_substr);
9740         SvREFCNT_inc_void(ret->anchored_utf8);
9741         SvREFCNT_inc_void(ret->float_substr);
9742         SvREFCNT_inc_void(ret->float_utf8);
9743
9744         /* check_substr and check_utf8, if non-NULL, point to either their
9745            anchored or float namesakes, and don't hold a second reference.  */
9746     }
9747     RX_MATCH_COPIED_off(ret_x);
9748 #ifdef PERL_OLD_COPY_ON_WRITE
9749     ret->saved_copy = NULL;
9750 #endif
9751     ret->mother_re = rx;
9752
9753     return ret_x;
9754 }
9755 #endif
9756
9757 /* regfree_internal()
9758
9759    Free the private data in a regexp. This is overloadable by
9760    extensions. Perl takes care of the regexp structure in pregfree(),
9761    this covers the *pprivate pointer which technically perl doesn't
9762    know about, however of course we have to handle the
9763    regexp_internal structure when no extension is in use.
9764
9765    Note this is called before freeing anything in the regexp
9766    structure.
9767  */
9768
9769 void
9770 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9771 {
9772     dVAR;
9773     struct regexp *const r = (struct regexp *)SvANY(rx);
9774     RXi_GET_DECL(r,ri);
9775     GET_RE_DEBUG_FLAGS_DECL;
9776
9777     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9778
9779     DEBUG_COMPILE_r({
9780         if (!PL_colorset)
9781             reginitcolors();
9782         {
9783             SV *dsv= sv_newmortal();
9784             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9785                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9786             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9787                 PL_colors[4],PL_colors[5],s);
9788         }
9789     });
9790 #ifdef RE_TRACK_PATTERN_OFFSETS
9791     if (ri->u.offsets)
9792         Safefree(ri->u.offsets);             /* 20010421 MJD */
9793 #endif
9794     if (ri->data) {
9795         int n = ri->data->count;
9796         PAD* new_comppad = NULL;
9797         PAD* old_comppad;
9798         PADOFFSET refcnt;
9799
9800         while (--n >= 0) {
9801           /* If you add a ->what type here, update the comment in regcomp.h */
9802             switch (ri->data->what[n]) {
9803             case 'a':
9804             case 's':
9805             case 'S':
9806             case 'u':
9807                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9808                 break;
9809             case 'f':
9810                 Safefree(ri->data->data[n]);
9811                 break;
9812             case 'p':
9813                 new_comppad = MUTABLE_AV(ri->data->data[n]);
9814                 break;
9815             case 'o':
9816                 if (new_comppad == NULL)
9817                     Perl_croak(aTHX_ "panic: pregfree comppad");
9818                 PAD_SAVE_LOCAL(old_comppad,
9819                     /* Watch out for global destruction's random ordering. */
9820                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9821                 );
9822                 OP_REFCNT_LOCK;
9823                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9824                 OP_REFCNT_UNLOCK;
9825                 if (!refcnt)
9826                     op_free((OP_4tree*)ri->data->data[n]);
9827
9828                 PAD_RESTORE_LOCAL(old_comppad);
9829                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9830                 new_comppad = NULL;
9831                 break;
9832             case 'n':
9833                 break;
9834             case 'T':
9835                 { /* Aho Corasick add-on structure for a trie node.
9836                      Used in stclass optimization only */
9837                     U32 refcount;
9838                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9839                     OP_REFCNT_LOCK;
9840                     refcount = --aho->refcount;
9841                     OP_REFCNT_UNLOCK;
9842                     if ( !refcount ) {
9843                         PerlMemShared_free(aho->states);
9844                         PerlMemShared_free(aho->fail);
9845                          /* do this last!!!! */
9846                         PerlMemShared_free(ri->data->data[n]);
9847                         PerlMemShared_free(ri->regstclass);
9848                     }
9849                 }
9850                 break;
9851             case 't':
9852                 {
9853                     /* trie structure. */
9854                     U32 refcount;
9855                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9856                     OP_REFCNT_LOCK;
9857                     refcount = --trie->refcount;
9858                     OP_REFCNT_UNLOCK;
9859                     if ( !refcount ) {
9860                         PerlMemShared_free(trie->charmap);
9861                         PerlMemShared_free(trie->states);
9862                         PerlMemShared_free(trie->trans);
9863                         if (trie->bitmap)
9864                             PerlMemShared_free(trie->bitmap);
9865                         if (trie->jump)
9866                             PerlMemShared_free(trie->jump);
9867                         PerlMemShared_free(trie->wordinfo);
9868                         /* do this last!!!! */
9869                         PerlMemShared_free(ri->data->data[n]);
9870                     }
9871                 }
9872                 break;
9873             default:
9874                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9875             }
9876         }
9877         Safefree(ri->data->what);
9878         Safefree(ri->data);
9879     }
9880
9881     Safefree(ri);
9882 }
9883
9884 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
9885 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
9886 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
9887
9888 /*
9889    re_dup - duplicate a regexp.
9890
9891    This routine is expected to clone a given regexp structure. It is only
9892    compiled under USE_ITHREADS.
9893
9894    After all of the core data stored in struct regexp is duplicated
9895    the regexp_engine.dupe method is used to copy any private data
9896    stored in the *pprivate pointer. This allows extensions to handle
9897    any duplication it needs to do.
9898
9899    See pregfree() and regfree_internal() if you change anything here.
9900 */
9901 #if defined(USE_ITHREADS)
9902 #ifndef PERL_IN_XSUB_RE
9903 void
9904 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9905 {
9906     dVAR;
9907     I32 npar;
9908     const struct regexp *r = (const struct regexp *)SvANY(sstr);
9909     struct regexp *ret = (struct regexp *)SvANY(dstr);
9910
9911     PERL_ARGS_ASSERT_RE_DUP_GUTS;
9912
9913     npar = r->nparens+1;
9914     Newx(ret->offs, npar, regexp_paren_pair);
9915     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9916     if(ret->swap) {
9917         /* no need to copy these */
9918         Newx(ret->swap, npar, regexp_paren_pair);
9919     }
9920
9921     if (ret->substrs) {
9922         /* Do it this way to avoid reading from *r after the StructCopy().
9923            That way, if any of the sv_dup_inc()s dislodge *r from the L1
9924            cache, it doesn't matter.  */
9925         const bool anchored = r->check_substr
9926             ? r->check_substr == r->anchored_substr
9927             : r->check_utf8 == r->anchored_utf8;
9928         Newx(ret->substrs, 1, struct reg_substr_data);
9929         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9930
9931         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9932         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9933         ret->float_substr = sv_dup_inc(ret->float_substr, param);
9934         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9935
9936         /* check_substr and check_utf8, if non-NULL, point to either their
9937            anchored or float namesakes, and don't hold a second reference.  */
9938
9939         if (ret->check_substr) {
9940             if (anchored) {
9941                 assert(r->check_utf8 == r->anchored_utf8);
9942                 ret->check_substr = ret->anchored_substr;
9943                 ret->check_utf8 = ret->anchored_utf8;
9944             } else {
9945                 assert(r->check_substr == r->float_substr);
9946                 assert(r->check_utf8 == r->float_utf8);
9947                 ret->check_substr = ret->float_substr;
9948                 ret->check_utf8 = ret->float_utf8;
9949             }
9950         } else if (ret->check_utf8) {
9951             if (anchored) {
9952                 ret->check_utf8 = ret->anchored_utf8;
9953             } else {
9954                 ret->check_utf8 = ret->float_utf8;
9955             }
9956         }
9957     }
9958
9959     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9960
9961     if (ret->pprivate)
9962         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9963
9964     if (RX_MATCH_COPIED(dstr))
9965         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
9966     else
9967         ret->subbeg = NULL;
9968 #ifdef PERL_OLD_COPY_ON_WRITE
9969     ret->saved_copy = NULL;
9970 #endif
9971
9972     if (ret->mother_re) {
9973         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9974             /* Our storage points directly to our mother regexp, but that's
9975                1: a buffer in a different thread
9976                2: something we no longer hold a reference on
9977                so we need to copy it locally.  */
9978             /* Note we need to sue SvCUR() on our mother_re, because it, in
9979                turn, may well be pointing to its own mother_re.  */
9980             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9981                                    SvCUR(ret->mother_re)+1));
9982             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9983         }
9984         ret->mother_re      = NULL;
9985     }
9986     ret->gofs = 0;
9987 }
9988 #endif /* PERL_IN_XSUB_RE */
9989
9990 /*
9991    regdupe_internal()
9992
9993    This is the internal complement to regdupe() which is used to copy
9994    the structure pointed to by the *pprivate pointer in the regexp.
9995    This is the core version of the extension overridable cloning hook.
9996    The regexp structure being duplicated will be copied by perl prior
9997    to this and will be provided as the regexp *r argument, however
9998    with the /old/ structures pprivate pointer value. Thus this routine
9999    may override any copying normally done by perl.
10000
10001    It returns a pointer to the new regexp_internal structure.
10002 */
10003
10004 void *
10005 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10006 {
10007     dVAR;
10008     struct regexp *const r = (struct regexp *)SvANY(rx);
10009     regexp_internal *reti;
10010     int len, npar;
10011     RXi_GET_DECL(r,ri);
10012
10013     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10014
10015     npar = r->nparens+1;
10016     len = ProgLen(ri);
10017
10018     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10019     Copy(ri->program, reti->program, len+1, regnode);
10020
10021
10022     reti->regstclass = NULL;
10023
10024     if (ri->data) {
10025         struct reg_data *d;
10026         const int count = ri->data->count;
10027         int i;
10028
10029         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10030                 char, struct reg_data);
10031         Newx(d->what, count, U8);
10032
10033         d->count = count;
10034         for (i = 0; i < count; i++) {
10035             d->what[i] = ri->data->what[i];
10036             switch (d->what[i]) {
10037                 /* legal options are one of: sSfpontTua
10038                    see also regcomp.h and pregfree() */
10039             case 'a': /* actually an AV, but the dup function is identical.  */
10040             case 's':
10041             case 'S':
10042             case 'p': /* actually an AV, but the dup function is identical.  */
10043             case 'u': /* actually an HV, but the dup function is identical.  */
10044                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10045                 break;
10046             case 'f':
10047                 /* This is cheating. */
10048                 Newx(d->data[i], 1, struct regnode_charclass_class);
10049                 StructCopy(ri->data->data[i], d->data[i],
10050                             struct regnode_charclass_class);
10051                 reti->regstclass = (regnode*)d->data[i];
10052                 break;
10053             case 'o':
10054                 /* Compiled op trees are readonly and in shared memory,
10055                    and can thus be shared without duplication. */
10056                 OP_REFCNT_LOCK;
10057                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10058                 OP_REFCNT_UNLOCK;
10059                 break;
10060             case 'T':
10061                 /* Trie stclasses are readonly and can thus be shared
10062                  * without duplication. We free the stclass in pregfree
10063                  * when the corresponding reg_ac_data struct is freed.
10064                  */
10065                 reti->regstclass= ri->regstclass;
10066                 /* Fall through */
10067             case 't':
10068                 OP_REFCNT_LOCK;
10069                 ((reg_trie_data*)ri->data->data[i])->refcount++;
10070                 OP_REFCNT_UNLOCK;
10071                 /* Fall through */
10072             case 'n':
10073                 d->data[i] = ri->data->data[i];
10074                 break;
10075             default:
10076                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10077             }
10078         }
10079
10080         reti->data = d;
10081     }
10082     else
10083         reti->data = NULL;
10084
10085     reti->name_list_idx = ri->name_list_idx;
10086
10087 #ifdef RE_TRACK_PATTERN_OFFSETS
10088     if (ri->u.offsets) {
10089         Newx(reti->u.offsets, 2*len+1, U32);
10090         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10091     }
10092 #else
10093     SetProgLen(reti,len);
10094 #endif
10095
10096     return (void*)reti;
10097 }
10098
10099 #endif    /* USE_ITHREADS */
10100
10101 #ifndef PERL_IN_XSUB_RE
10102
10103 /*
10104  - regnext - dig the "next" pointer out of a node
10105  */
10106 regnode *
10107 Perl_regnext(pTHX_ register regnode *p)
10108 {
10109     dVAR;
10110     register I32 offset;
10111
10112     if (!p)
10113         return(NULL);
10114
10115     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
10116         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10117     }
10118
10119     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10120     if (offset == 0)
10121         return(NULL);
10122
10123     return(p+offset);
10124 }
10125 #endif
10126
10127 STATIC void
10128 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10129 {
10130     va_list args;
10131     STRLEN l1 = strlen(pat1);
10132     STRLEN l2 = strlen(pat2);
10133     char buf[512];
10134     SV *msv;
10135     const char *message;
10136
10137     PERL_ARGS_ASSERT_RE_CROAK2;
10138
10139     if (l1 > 510)
10140         l1 = 510;
10141     if (l1 + l2 > 510)
10142         l2 = 510 - l1;
10143     Copy(pat1, buf, l1 , char);
10144     Copy(pat2, buf + l1, l2 , char);
10145     buf[l1 + l2] = '\n';
10146     buf[l1 + l2 + 1] = '\0';
10147 #ifdef I_STDARG
10148     /* ANSI variant takes additional second argument */
10149     va_start(args, pat2);
10150 #else
10151     va_start(args);
10152 #endif
10153     msv = vmess(buf, &args);
10154     va_end(args);
10155     message = SvPV_const(msv,l1);
10156     if (l1 > 512)
10157         l1 = 512;
10158     Copy(message, buf, l1 , char);
10159     buf[l1-1] = '\0';                   /* Overwrite \n */
10160     Perl_croak(aTHX_ "%s", buf);
10161 }
10162
10163 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
10164
10165 #ifndef PERL_IN_XSUB_RE
10166 void
10167 Perl_save_re_context(pTHX)
10168 {
10169     dVAR;
10170
10171     struct re_save_state *state;
10172
10173     SAVEVPTR(PL_curcop);
10174     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10175
10176     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10177     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10178     SSPUSHUV(SAVEt_RE_STATE);
10179
10180     Copy(&PL_reg_state, state, 1, struct re_save_state);
10181
10182     PL_reg_start_tmp = 0;
10183     PL_reg_start_tmpl = 0;
10184     PL_reg_oldsaved = NULL;
10185     PL_reg_oldsavedlen = 0;
10186     PL_reg_maxiter = 0;
10187     PL_reg_leftiter = 0;
10188     PL_reg_poscache = NULL;
10189     PL_reg_poscache_size = 0;
10190 #ifdef PERL_OLD_COPY_ON_WRITE
10191     PL_nrs = NULL;
10192 #endif
10193
10194     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10195     if (PL_curpm) {
10196         const REGEXP * const rx = PM_GETRE(PL_curpm);
10197         if (rx) {
10198             U32 i;
10199             for (i = 1; i <= RX_NPARENS(rx); i++) {
10200                 char digits[TYPE_CHARS(long)];
10201                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10202                 GV *const *const gvp
10203                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10204
10205                 if (gvp) {
10206                     GV * const gv = *gvp;
10207                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10208                         save_scalar(gv);
10209                 }
10210             }
10211         }
10212     }
10213 }
10214 #endif
10215
10216 static void
10217 clear_re(pTHX_ void *r)
10218 {
10219     dVAR;
10220     ReREFCNT_dec((REGEXP *)r);
10221 }
10222
10223 #ifdef DEBUGGING
10224
10225 STATIC void
10226 S_put_byte(pTHX_ SV *sv, int c)
10227 {
10228     PERL_ARGS_ASSERT_PUT_BYTE;
10229
10230     /* Our definition of isPRINT() ignores locales, so only bytes that are
10231        not part of UTF-8 are considered printable. I assume that the same
10232        holds for UTF-EBCDIC.
10233        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10234        which Wikipedia says:
10235
10236        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10237        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10238        identical, to the ASCII delete (DEL) or rubout control character.
10239        ) So the old condition can be simplified to !isPRINT(c)  */
10240     if (!isPRINT(c))
10241         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
10242     else {
10243         const char string = c;
10244         if (c == '-' || c == ']' || c == '\\' || c == '^')
10245             sv_catpvs(sv, "\\");
10246         sv_catpvn(sv, &string, 1);
10247     }
10248 }
10249
10250
10251 #define CLEAR_OPTSTART \
10252     if (optstart) STMT_START { \
10253             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10254             optstart=NULL; \
10255     } STMT_END
10256
10257 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10258
10259 STATIC const regnode *
10260 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10261             const regnode *last, const regnode *plast,
10262             SV* sv, I32 indent, U32 depth)
10263 {
10264     dVAR;
10265     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
10266     register const regnode *next;
10267     const regnode *optstart= NULL;
10268
10269     RXi_GET_DECL(r,ri);
10270     GET_RE_DEBUG_FLAGS_DECL;
10271
10272     PERL_ARGS_ASSERT_DUMPUNTIL;
10273
10274 #ifdef DEBUG_DUMPUNTIL
10275     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10276         last ? last-start : 0,plast ? plast-start : 0);
10277 #endif
10278
10279     if (plast && plast < last)
10280         last= plast;
10281
10282     while (PL_regkind[op] != END && (!last || node < last)) {
10283         /* While that wasn't END last time... */
10284         NODE_ALIGN(node);
10285         op = OP(node);
10286         if (op == CLOSE || op == WHILEM)
10287             indent--;
10288         next = regnext((regnode *)node);
10289
10290         /* Where, what. */
10291         if (OP(node) == OPTIMIZED) {
10292             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10293                 optstart = node;
10294             else
10295                 goto after_print;
10296         } else
10297             CLEAR_OPTSTART;
10298
10299         regprop(r, sv, node);
10300         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10301                       (int)(2*indent + 1), "", SvPVX_const(sv));
10302
10303         if (OP(node) != OPTIMIZED) {
10304             if (next == NULL)           /* Next ptr. */
10305                 PerlIO_printf(Perl_debug_log, " (0)");
10306             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10307                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10308             else
10309                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10310             (void)PerlIO_putc(Perl_debug_log, '\n');
10311         }
10312
10313       after_print:
10314         if (PL_regkind[(U8)op] == BRANCHJ) {
10315             assert(next);
10316             {
10317                 register const regnode *nnode = (OP(next) == LONGJMP
10318                                              ? regnext((regnode *)next)
10319                                              : next);
10320                 if (last && nnode > last)
10321                     nnode = last;
10322                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10323             }
10324         }
10325         else if (PL_regkind[(U8)op] == BRANCH) {
10326             assert(next);
10327             DUMPUNTIL(NEXTOPER(node), next);
10328         }
10329         else if ( PL_regkind[(U8)op]  == TRIE ) {
10330             const regnode *this_trie = node;
10331             const char op = OP(node);
10332             const U32 n = ARG(node);
10333             const reg_ac_data * const ac = op>=AHOCORASICK ?
10334                (reg_ac_data *)ri->data->data[n] :
10335                NULL;
10336             const reg_trie_data * const trie =
10337                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10338 #ifdef DEBUGGING
10339             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10340 #endif
10341             const regnode *nextbranch= NULL;
10342             I32 word_idx;
10343             sv_setpvs(sv, "");
10344             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10345                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10346
10347                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10348                    (int)(2*(indent+3)), "",
10349                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10350                             PL_colors[0], PL_colors[1],
10351                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10352                             PERL_PV_PRETTY_ELLIPSES    |
10353                             PERL_PV_PRETTY_LTGT
10354                             )
10355                             : "???"
10356                 );
10357                 if (trie->jump) {
10358                     U16 dist= trie->jump[word_idx+1];
10359                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10360                                   (UV)((dist ? this_trie + dist : next) - start));
10361                     if (dist) {
10362                         if (!nextbranch)
10363                             nextbranch= this_trie + trie->jump[0];
10364                         DUMPUNTIL(this_trie + dist, nextbranch);
10365                     }
10366                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10367                         nextbranch= regnext((regnode *)nextbranch);
10368                 } else {
10369                     PerlIO_printf(Perl_debug_log, "\n");
10370                 }
10371             }
10372             if (last && next > last)
10373                 node= last;
10374             else
10375                 node= next;
10376         }
10377         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10378             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10379                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10380         }
10381         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10382             assert(next);
10383             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10384         }
10385         else if ( op == PLUS || op == STAR) {
10386             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10387         }
10388         else if (op == ANYOF) {
10389             /* arglen 1 + class block */
10390             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10391                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10392             node = NEXTOPER(node);
10393         }
10394         else if (PL_regkind[(U8)op] == EXACT) {
10395             /* Literal string, where present. */
10396             node += NODE_SZ_STR(node) - 1;
10397             node = NEXTOPER(node);
10398         }
10399         else {
10400             node = NEXTOPER(node);
10401             node += regarglen[(U8)op];
10402         }
10403         if (op == CURLYX || op == OPEN)
10404             indent++;
10405     }
10406     CLEAR_OPTSTART;
10407 #ifdef DEBUG_DUMPUNTIL
10408     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10409 #endif
10410     return node;
10411 }
10412
10413 #endif  /* DEBUGGING */
10414
10415 /*
10416  * Local variables:
10417  * c-indentation-style: bsd
10418  * c-basic-offset: 4
10419  * indent-tabs-mode: t
10420  * End:
10421  *
10422  * ex: set ts=8 sts=4 sw=4 noet:
10423  */