src/5013008/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     HV          *paren_names;           /* Paren names */
 138
 139     regnode     **recurse;              /* Recurse regops */
 140     I32         recurse_count;          /* Number of recurse regops */
 141 #if ADD_TO_REGEXEC
 142     char        *starttry;              /* -Dr: where regtry was called. */
 143 #define RExC_starttry   (pRExC_state->starttry)
 144 #endif
 145 #ifdef DEBUGGING
 146     const char  *lastparse;
 147     I32         lastnum;
 148     AV          *paren_name_list;       /* idx -> name */
 149 #define RExC_lastparse  (pRExC_state->lastparse)
 150 #define RExC_lastnum    (pRExC_state->lastnum)
 151 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 152 #endif
 153 } RExC_state_t;
 154
 155 #define RExC_flags      (pRExC_state->flags)
 156 #define RExC_precomp    (pRExC_state->precomp)
 157 #define RExC_rx_sv      (pRExC_state->rx_sv)
 158 #define RExC_rx         (pRExC_state->rx)
 159 #define RExC_rxi        (pRExC_state->rxi)
 160 #define RExC_start      (pRExC_state->start)
 161 #define RExC_end        (pRExC_state->end)
 162 #define RExC_parse      (pRExC_state->parse)
 163 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 164 #ifdef RE_TRACK_PATTERN_OFFSETS
 165 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 166 #endif
 167 #define RExC_emit       (pRExC_state->emit)
 168 #define RExC_emit_start (pRExC_state->emit_start)
 169 #define RExC_emit_bound (pRExC_state->emit_bound)
 170 #define RExC_naughty    (pRExC_state->naughty)
 171 #define RExC_sawback    (pRExC_state->sawback)
 172 #define RExC_seen       (pRExC_state->seen)
 173 #define RExC_size       (pRExC_state->size)
 174 #define RExC_npar       (pRExC_state->npar)
 175 #define RExC_nestroot   (pRExC_state->nestroot)
 176 #define RExC_extralen   (pRExC_state->extralen)
 177 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 178 #define RExC_seen_evals (pRExC_state->seen_evals)
 179 #define RExC_utf8       (pRExC_state->utf8)
 180 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 181 #define RExC_open_parens        (pRExC_state->open_parens)
 182 #define RExC_close_parens       (pRExC_state->close_parens)
 183 #define RExC_opend      (pRExC_state->opend)
 184 #define RExC_paren_names        (pRExC_state->paren_names)
 185 #define RExC_recurse    (pRExC_state->recurse)
 186 #define RExC_recurse_count      (pRExC_state->recurse_count)
 187
 188
 189 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 190 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 191         ((*s) == '{' && regcurly(s)))
 192
 193 #ifdef SPSTART
 194 #undef SPSTART          /* dratted cpp namespace... */
 195 #endif
 196 /*
 197  * Flags to be passed up and down.
 198  */
 199 #define WORST           0       /* Worst case. */
 200 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 201
 202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 203  * character, and if utf8, must be invariant. */
 204 #define SIMPLE          0x02
 205 #define SPSTART         0x04    /* Starts with * or +. */
 206 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 207 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 208
 209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 210
 211 /* whether trie related optimizations are enabled */
 212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 213 #define TRIE_STUDY_OPT
 214 #define FULL_TRIE_STUDY
 215 #define TRIE_STCLASS
 216 #endif
 217
 218
 219
 220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 221 #define PBITVAL(paren) (1 << ((paren) & 7))
 222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 225
 226 /* If not already in utf8, do a longjmp back to the beginning */
 227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 228 #define REQUIRE_UTF8    STMT_START {                                       \
 229                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 230                         } STMT_END
 231
 232 /* About scan_data_t.
 233
 234   During optimisation we recurse through the regexp program performing
 235   various inplace (keyhole style) optimisations. In addition study_chunk
 236   and scan_commit populate this data structure with information about
 237   what strings MUST appear in the pattern. We look for the longest
 238   string that must appear at a fixed location, and we look for the
 239   longest string that may appear at a floating location. So for instance
 240   in the pattern:
 241
 242     /FOO[xX]A.*B[xX]BAR/
 243
 244   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 245   strings (because they follow a .* construct). study_chunk will identify
 246   both FOO and BAR as being the longest fixed and floating strings respectively.
 247
 248   The strings can be composites, for instance
 249
 250      /(f)(o)(o)/
 251
 252   will result in a composite fixed substring 'foo'.
 253
 254   For each string some basic information is maintained:
 255
 256   - offset or min_offset
 257     This is the position the string must appear at, or not before.
 258     It also implicitly (when combined with minlenp) tells us how many
 259     characters must match before the string we are searching for.
 260     Likewise when combined with minlenp and the length of the string it
 261     tells us how many characters must appear after the string we have
 262     found.
 263
 264   - max_offset
 265     Only used for floating strings. This is the rightmost point that
 266     the string can appear at. If set to I32 max it indicates that the
 267     string can occur infinitely far to the right.
 268
 269   - minlenp
 270     A pointer to the minimum length of the pattern that the string
 271     was found inside. This is important as in the case of positive
 272     lookahead or positive lookbehind we can have multiple patterns
 273     involved. Consider
 274
 275     /(?=FOO).*F/
 276
 277     The minimum length of the pattern overall is 3, the minimum length
 278     of the lookahead part is 3, but the minimum length of the part that
 279     will actually match is 1. So 'FOO's minimum length is 3, but the
 280     minimum length for the F is 1. This is important as the minimum length
 281     is used to determine offsets in front of and behind the string being
 282     looked for.  Since strings can be composites this is the length of the
 283     pattern at the time it was commited with a scan_commit. Note that
 284     the length is calculated by study_chunk, so that the minimum lengths
 285     are not known until the full pattern has been compiled, thus the
 286     pointer to the value.
 287
 288   - lookbehind
 289
 290     In the case of lookbehind the string being searched for can be
 291     offset past the start point of the final matching string.
 292     If this value was just blithely removed from the min_offset it would
 293     invalidate some of the calculations for how many chars must match
 294     before or after (as they are derived from min_offset and minlen and
 295     the length of the string being searched for).
 296     When the final pattern is compiled and the data is moved from the
 297     scan_data_t structure into the regexp structure the information
 298     about lookbehind is factored in, with the information that would
 299     have been lost precalculated in the end_shift field for the
 300     associated string.
 301
 302   The fields pos_min and pos_delta are used to store the minimum offset
 303   and the delta to the maximum offset at the current point in the pattern.
 304
 305 */
 306
 307 typedef struct scan_data_t {
 308     /*I32 len_min;      unused */
 309     /*I32 len_delta;    unused */
 310     I32 pos_min;
 311     I32 pos_delta;
 312     SV *last_found;
 313     I32 last_end;           /* min value, <0 unless valid. */
 314     I32 last_start_min;
 315     I32 last_start_max;
 316     SV **longest;           /* Either &l_fixed, or &l_float. */
 317     SV *longest_fixed;      /* longest fixed string found in pattern */
 318     I32 offset_fixed;       /* offset where it starts */
 319     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 320     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 321     SV *longest_float;      /* longest floating string found in pattern */
 322     I32 offset_float_min;   /* earliest point in string it can appear */
 323     I32 offset_float_max;   /* latest point in string it can appear */
 324     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 325     I32 lookbehind_float;   /* is the position of the string modified by LB */
 326     I32 flags;
 327     I32 whilem_c;
 328     I32 *last_closep;
 329     struct regnode_charclass_class *start_class;
 330 } scan_data_t;
 331
 332 /*
 333  * Forward declarations for pregcomp()'s friends.
 334  */
 335
 336 static const scan_data_t zero_scan_data =
 337   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 338
 339 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 340 #define SF_BEFORE_SEOL          0x0001
 341 #define SF_BEFORE_MEOL          0x0002
 342 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 343 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 344
 345 #ifdef NO_UNARY_PLUS
 346 #  define SF_FIX_SHIFT_EOL      (0+2)
 347 #  define SF_FL_SHIFT_EOL               (0+4)
 348 #else
 349 #  define SF_FIX_SHIFT_EOL      (+2)
 350 #  define SF_FL_SHIFT_EOL               (+4)
 351 #endif
 352
 353 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 354 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 355
 356 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 357 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 358 #define SF_IS_INF               0x0040
 359 #define SF_HAS_PAR              0x0080
 360 #define SF_IN_PAR               0x0100
 361 #define SF_HAS_EVAL             0x0200
 362 #define SCF_DO_SUBSTR           0x0400
 363 #define SCF_DO_STCLASS_AND      0x0800
 364 #define SCF_DO_STCLASS_OR       0x1000
 365 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 366 #define SCF_WHILEM_VISITED_POS  0x2000
 367
 368 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 369 #define SCF_SEEN_ACCEPT         0x8000
 370
 371 #define UTF cBOOL(RExC_utf8)
 372 #define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
 373 #define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
 374 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 375
 376 #define OOB_UNICODE             12345678
 377 #define OOB_NAMEDCLASS          -1
 378
 379 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 380 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 381
 382
 383 /* length of regex to show in messages that don't mark a position within */
 384 #define RegexLengthToShowInErrorMessages 127
 385
 386 /*
 387  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 388  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 389  * op/pragma/warn/regcomp.
 390  */
 391 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 392 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 393
 394 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 395
 396 /*
 397  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 398  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 399  * "...".
 400  */
 401 #define _FAIL(code) STMT_START {                                        \
 402     const char *ellipses = "";                                          \
 403     IV len = RExC_end - RExC_precomp;                                   \
 404                                                                         \
 405     if (!SIZE_ONLY)                                                     \
 406         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 407     if (len > RegexLengthToShowInErrorMessages) {                       \
 408         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 409         len = RegexLengthToShowInErrorMessages - 10;                    \
 410         ellipses = "...";                                               \
 411     }                                                                   \
 412     code;                                                               \
 413 } STMT_END
 414
 415 #define FAIL(msg) _FAIL(                            \
 416     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 417             msg, (int)len, RExC_precomp, ellipses))
 418
 419 #define FAIL2(msg,arg) _FAIL(                       \
 420     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 421             arg, (int)len, RExC_precomp, ellipses))
 422
 423 /*
 424  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 425  */
 426 #define Simple_vFAIL(m) STMT_START {                                    \
 427     const IV offset = RExC_parse - RExC_precomp;                        \
 428     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 429             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 430 } STMT_END
 431
 432 /*
 433  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 434  */
 435 #define vFAIL(m) STMT_START {                           \
 436     if (!SIZE_ONLY)                                     \
 437         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 438     Simple_vFAIL(m);                                    \
 439 } STMT_END
 440
 441 /*
 442  * Like Simple_vFAIL(), but accepts two arguments.
 443  */
 444 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 445     const IV offset = RExC_parse - RExC_precomp;                        \
 446     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 447             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 448 } STMT_END
 449
 450 /*
 451  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 452  */
 453 #define vFAIL2(m,a1) STMT_START {                       \
 454     if (!SIZE_ONLY)                                     \
 455         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 456     Simple_vFAIL2(m, a1);                               \
 457 } STMT_END
 458
 459
 460 /*
 461  * Like Simple_vFAIL(), but accepts three arguments.
 462  */
 463 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 464     const IV offset = RExC_parse - RExC_precomp;                \
 465     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 466             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 467 } STMT_END
 468
 469 /*
 470  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 471  */
 472 #define vFAIL3(m,a1,a2) STMT_START {                    \
 473     if (!SIZE_ONLY)                                     \
 474         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 475     Simple_vFAIL3(m, a1, a2);                           \
 476 } STMT_END
 477
 478 /*
 479  * Like Simple_vFAIL(), but accepts four arguments.
 480  */
 481 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 482     const IV offset = RExC_parse - RExC_precomp;                \
 483     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 484             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 485 } STMT_END
 486
 487 #define ckWARNreg(loc,m) STMT_START {                                   \
 488     const IV offset = loc - RExC_precomp;                               \
 489     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 490             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 491 } STMT_END
 492
 493 #define ckWARNregdep(loc,m) STMT_START {                                \
 494     const IV offset = loc - RExC_precomp;                               \
 495     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 496             m REPORT_LOCATION,                                          \
 497             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 498 } STMT_END
 499
 500 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 501     const IV offset = loc - RExC_precomp;                               \
 502     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 503             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 504 } STMT_END
 505
 506 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 507     const IV offset = loc - RExC_precomp;                               \
 508     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 509             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 510 } STMT_END
 511
 512 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 515             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 516 } STMT_END
 517
 518 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 519     const IV offset = loc - RExC_precomp;                               \
 520     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 521             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 522 } STMT_END
 523
 524 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 525     const IV offset = loc - RExC_precomp;                               \
 526     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 527             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 528 } STMT_END
 529
 530 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 531     const IV offset = loc - RExC_precomp;                               \
 532     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 533             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 534 } STMT_END
 535
 536
 537 /* Allow for side effects in s */
 538 #define REGC(c,s) STMT_START {                  \
 539     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 540 } STMT_END
 541
 542 /* Macros for recording node offsets.   20001227 mjd@plover.com
 543  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 544  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 545  * Element 0 holds the number n.
 546  * Position is 1 indexed.
 547  */
 548 #ifndef RE_TRACK_PATTERN_OFFSETS
 549 #define Set_Node_Offset_To_R(node,byte)
 550 #define Set_Node_Offset(node,byte)
 551 #define Set_Cur_Node_Offset
 552 #define Set_Node_Length_To_R(node,len)
 553 #define Set_Node_Length(node,len)
 554 #define Set_Node_Cur_Length(node)
 555 #define Node_Offset(n)
 556 #define Node_Length(n)
 557 #define Set_Node_Offset_Length(node,offset,len)
 558 #define ProgLen(ri) ri->u.proglen
 559 #define SetProgLen(ri,x) ri->u.proglen = x
 560 #else
 561 #define ProgLen(ri) ri->u.offsets[0]
 562 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 563 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 564     if (! SIZE_ONLY) {                                                  \
 565         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 566                     __LINE__, (int)(node), (int)(byte)));               \
 567         if((node) < 0) {                                                \
 568             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 569         } else {                                                        \
 570             RExC_offsets[2*(node)-1] = (byte);                          \
 571         }                                                               \
 572     }                                                                   \
 573 } STMT_END
 574
 575 #define Set_Node_Offset(node,byte) \
 576     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 577 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 578
 579 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 580     if (! SIZE_ONLY) {                                                  \
 581         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 582                 __LINE__, (int)(node), (int)(len)));                    \
 583         if((node) < 0) {                                                \
 584             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 585         } else {                                                        \
 586             RExC_offsets[2*(node)] = (len);                             \
 587         }                                                               \
 588     }                                                                   \
 589 } STMT_END
 590
 591 #define Set_Node_Length(node,len) \
 592     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 593 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 594 #define Set_Node_Cur_Length(node) \
 595     Set_Node_Length(node, RExC_parse - parse_start)
 596
 597 /* Get offsets and lengths */
 598 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 599 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 600
 601 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 602     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 603     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 604 } STMT_END
 605 #endif
 606
 607 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 608 #define EXPERIMENTAL_INPLACESCAN
 609 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 610
 611 #define DEBUG_STUDYDATA(str,data,depth)                              \
 612 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 613     PerlIO_printf(Perl_debug_log,                                    \
 614         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 615         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 616         (int)(depth)*2, "",                                          \
 617         (IV)((data)->pos_min),                                       \
 618         (IV)((data)->pos_delta),                                     \
 619         (UV)((data)->flags),                                         \
 620         (IV)((data)->whilem_c),                                      \
 621         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 622         is_inf ? "INF " : ""                                         \
 623     );                                                               \
 624     if ((data)->last_found)                                          \
 625         PerlIO_printf(Perl_debug_log,                                \
 626             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 627             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 628             SvPVX_const((data)->last_found),                         \
 629             (IV)((data)->last_end),                                  \
 630             (IV)((data)->last_start_min),                            \
 631             (IV)((data)->last_start_max),                            \
 632             ((data)->longest &&                                      \
 633              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 634             SvPVX_const((data)->longest_fixed),                      \
 635             (IV)((data)->offset_fixed),                              \
 636             ((data)->longest &&                                      \
 637              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 638             SvPVX_const((data)->longest_float),                      \
 639             (IV)((data)->offset_float_min),                          \
 640             (IV)((data)->offset_float_max)                           \
 641         );                                                           \
 642     PerlIO_printf(Perl_debug_log,"\n");                              \
 643 });
 644
 645 static void clear_re(pTHX_ void *r);
 646
 647 /* Mark that we cannot extend a found fixed substring at this point.
 648    Update the longest found anchored substring and the longest found
 649    floating substrings if needed. */
 650
 651 STATIC void
 652 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 653 {
 654     const STRLEN l = CHR_SVLEN(data->last_found);
 655     const STRLEN old_l = CHR_SVLEN(*data->longest);
 656     GET_RE_DEBUG_FLAGS_DECL;
 657
 658     PERL_ARGS_ASSERT_SCAN_COMMIT;
 659
 660     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 661         SvSetMagicSV(*data->longest, data->last_found);
 662         if (*data->longest == data->longest_fixed) {
 663             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 664             if (data->flags & SF_BEFORE_EOL)
 665                 data->flags
 666                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 667             else
 668                 data->flags &= ~SF_FIX_BEFORE_EOL;
 669             data->minlen_fixed=minlenp;
 670             data->lookbehind_fixed=0;
 671         }
 672         else { /* *data->longest == data->longest_float */
 673             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 674             data->offset_float_max = (l
 675                                       ? data->last_start_max
 676                                       : data->pos_min + data->pos_delta);
 677             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 678                 data->offset_float_max = I32_MAX;
 679             if (data->flags & SF_BEFORE_EOL)
 680                 data->flags
 681                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 682             else
 683                 data->flags &= ~SF_FL_BEFORE_EOL;
 684             data->minlen_float=minlenp;
 685             data->lookbehind_float=0;
 686         }
 687     }
 688     SvCUR_set(data->last_found, 0);
 689     {
 690         SV * const sv = data->last_found;
 691         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 692             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 693             if (mg)
 694                 mg->mg_len = 0;
 695         }
 696     }
 697     data->last_end = -1;
 698     data->flags &= ~SF_BEFORE_EOL;
 699     DEBUG_STUDYDATA("commit: ",data,0);
 700 }
 701
 702 /* Can match anything (initialization) */
 703 STATIC void
 704 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 705 {
 706     PERL_ARGS_ASSERT_CL_ANYTHING;
 707
 708     ANYOF_CLASS_ZERO(cl);
 709     ANYOF_BITMAP_SETALL(cl);
 710     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 711     if (LOC)
 712         cl->flags |= ANYOF_LOCALE;
 713     cl->flags |= ANYOF_FOLD;
 714 }
 715
 716 /* Can match anything (initialization) */
 717 STATIC int
 718 S_cl_is_anything(const struct regnode_charclass_class *cl)
 719 {
 720     int value;
 721
 722     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 723
 724     for (value = 0; value <= ANYOF_MAX; value += 2)
 725         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 726             return 1;
 727     if (!(cl->flags & ANYOF_UNICODE_ALL))
 728         return 0;
 729     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 730         return 0;
 731     return 1;
 732 }
 733
 734 /* Can match anything (initialization) */
 735 STATIC void
 736 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 737 {
 738     PERL_ARGS_ASSERT_CL_INIT;
 739
 740     Zero(cl, 1, struct regnode_charclass_class);
 741     cl->type = ANYOF;
 742     cl_anything(pRExC_state, cl);
 743 }
 744
 745 STATIC void
 746 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 747 {
 748     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 749
 750     Zero(cl, 1, struct regnode_charclass_class);
 751     cl->type = ANYOF;
 752     cl_anything(pRExC_state, cl);
 753     if (LOC)
 754         cl->flags |= ANYOF_LOCALE;
 755 }
 756
 757 /* 'And' a given class with another one.  Can create false positives */
 758 /* We assume that cl is not inverted */
 759 STATIC void
 760 S_cl_and(struct regnode_charclass_class *cl,
 761         const struct regnode_charclass_class *and_with)
 762 {
 763     PERL_ARGS_ASSERT_CL_AND;
 764
 765     assert(and_with->type == ANYOF);
 766
 767     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 768         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 769         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 770         && !(and_with->flags & ANYOF_FOLD)
 771         && !(cl->flags & ANYOF_FOLD)) {
 772         int i;
 773
 774         if (and_with->flags & ANYOF_INVERT)
 775             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 776                 cl->bitmap[i] &= ~and_with->bitmap[i];
 777         else
 778             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 779                 cl->bitmap[i] &= and_with->bitmap[i];
 780     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 781     if (!(and_with->flags & ANYOF_EOS))
 782         cl->flags &= ~ANYOF_EOS;
 783
 784     if (!(and_with->flags & ANYOF_FOLD))
 785         cl->flags &= ~ANYOF_FOLD;
 786
 787     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
 788         !(and_with->flags & ANYOF_INVERT)) {
 789         cl->flags &= ~ANYOF_UNICODE_ALL;
 790         cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
 791                                                            only the one(s)
 792                                                            actually set */
 793         ARG_SET(cl, ARG(and_with));
 794     }
 795     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 796         !(and_with->flags & ANYOF_INVERT))
 797         cl->flags &= ~ANYOF_UNICODE_ALL;
 798     if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
 799         !(and_with->flags & ANYOF_INVERT))
 800         cl->flags &= ~ANYOF_NONBITMAP;
 801 }
 802
 803 /* 'OR' a given class with another one.  Can create false positives */
 804 /* We assume that cl is not inverted */
 805 STATIC void
 806 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 807 {
 808     PERL_ARGS_ASSERT_CL_OR;
 809
 810     if (or_with->flags & ANYOF_INVERT) {
 811         /* We do not use
 812          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 813          *   <= (B1 | !B2) | (CL1 | !CL2)
 814          * which is wasteful if CL2 is small, but we ignore CL2:
 815          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 816          * XXXX Can we handle case-fold?  Unclear:
 817          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 818          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 819          */
 820         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 821              && !(or_with->flags & ANYOF_FOLD)
 822              && !(cl->flags & ANYOF_FOLD) ) {
 823             int i;
 824
 825             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 826                 cl->bitmap[i] |= ~or_with->bitmap[i];
 827         } /* XXXX: logic is complicated otherwise */
 828         else {
 829             cl_anything(pRExC_state, cl);
 830         }
 831     } else {
 832         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 833         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 834              && (!(or_with->flags & ANYOF_FOLD)
 835                  || (cl->flags & ANYOF_FOLD)) ) {
 836             int i;
 837
 838             /* OR char bitmap and class bitmap separately */
 839             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 840                 cl->bitmap[i] |= or_with->bitmap[i];
 841             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 842                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 843                     cl->classflags[i] |= or_with->classflags[i];
 844                 cl->flags |= ANYOF_CLASS;
 845             }
 846         }
 847         else { /* XXXX: logic is complicated, leave it along for a moment. */
 848             cl_anything(pRExC_state, cl);
 849         }
 850     }
 851     if (or_with->flags & ANYOF_EOS)
 852         cl->flags |= ANYOF_EOS;
 853
 854     if (or_with->flags & ANYOF_FOLD)
 855         cl->flags |= ANYOF_FOLD;
 856
 857     /* If both nodes match something outside the bitmap, but what they match
 858      * outside is not the same pointer, and hence not easily compared, give up
 859      * and allow the start class to match everything outside the bitmap */
 860     if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
 861         ARG(cl) != ARG(or_with)) {
 862         cl->flags |= ANYOF_UNICODE_ALL;
 863     }
 864
 865     if (or_with->flags & ANYOF_UNICODE_ALL) {
 866         cl->flags |= ANYOF_UNICODE_ALL;
 867     }
 868 }
 869
 870 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 871 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 872 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 873 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 874
 875
 876 #ifdef DEBUGGING
 877 /*
 878    dump_trie(trie,widecharmap,revcharmap)
 879    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 880    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 881
 882    These routines dump out a trie in a somewhat readable format.
 883    The _interim_ variants are used for debugging the interim
 884    tables that are used to generate the final compressed
 885    representation which is what dump_trie expects.
 886
 887    Part of the reason for their existance is to provide a form
 888    of documentation as to how the different representations function.
 889
 890 */
 891
 892 /*
 893   Dumps the final compressed table form of the trie to Perl_debug_log.
 894   Used for debugging make_trie().
 895 */
 896
 897 STATIC void
 898 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 899             AV *revcharmap, U32 depth)
 900 {
 901     U32 state;
 902     SV *sv=sv_newmortal();
 903     int colwidth= widecharmap ? 6 : 4;
 904     U16 word;
 905     GET_RE_DEBUG_FLAGS_DECL;
 906
 907     PERL_ARGS_ASSERT_DUMP_TRIE;
 908
 909     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 910         (int)depth * 2 + 2,"",
 911         "Match","Base","Ofs" );
 912
 913     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 914         SV ** const tmp = av_fetch( revcharmap, state, 0);
 915         if ( tmp ) {
 916             PerlIO_printf( Perl_debug_log, "%*s",
 917                 colwidth,
 918                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 919                             PL_colors[0], PL_colors[1],
 920                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 921                             PERL_PV_ESCAPE_FIRSTCHAR
 922                 )
 923             );
 924         }
 925     }
 926     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 927         (int)depth * 2 + 2,"");
 928
 929     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 930         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 931     PerlIO_printf( Perl_debug_log, "\n");
 932
 933     for( state = 1 ; state < trie->statecount ; state++ ) {
 934         const U32 base = trie->states[ state ].trans.base;
 935
 936         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 937
 938         if ( trie->states[ state ].wordnum ) {
 939             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 940         } else {
 941             PerlIO_printf( Perl_debug_log, "%6s", "" );
 942         }
 943
 944         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 945
 946         if ( base ) {
 947             U32 ofs = 0;
 948
 949             while( ( base + ofs  < trie->uniquecharcount ) ||
 950                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 951                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 952                     ofs++;
 953
 954             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 955
 956             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 957                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 958                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 959                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 960                 {
 961                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 962                     colwidth,
 963                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 964                 } else {
 965                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 966                 }
 967             }
 968
 969             PerlIO_printf( Perl_debug_log, "]");
 970
 971         }
 972         PerlIO_printf( Perl_debug_log, "\n" );
 973     }
 974     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 975     for (word=1; word <= trie->wordcount; word++) {
 976         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 977             (int)word, (int)(trie->wordinfo[word].prev),
 978             (int)(trie->wordinfo[word].len));
 979     }
 980     PerlIO_printf(Perl_debug_log, "\n" );
 981 }
 982 /*
 983   Dumps a fully constructed but uncompressed trie in list form.
 984   List tries normally only are used for construction when the number of
 985   possible chars (trie->uniquecharcount) is very high.
 986   Used for debugging make_trie().
 987 */
 988 STATIC void
 989 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 990                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
 991                          U32 depth)
 992 {
 993     U32 state;
 994     SV *sv=sv_newmortal();
 995     int colwidth= widecharmap ? 6 : 4;
 996     GET_RE_DEBUG_FLAGS_DECL;
 997
 998     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 999
1000     /* print out the table precompression.  */
1001     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1002         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1003         "------:-----+-----------------\n" );
1004
1005     for( state=1 ; state < next_alloc ; state ++ ) {
1006         U16 charid;
1007
1008         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1009             (int)depth * 2 + 2,"", (UV)state  );
1010         if ( ! trie->states[ state ].wordnum ) {
1011             PerlIO_printf( Perl_debug_log, "%5s| ","");
1012         } else {
1013             PerlIO_printf( Perl_debug_log, "W%4x| ",
1014                 trie->states[ state ].wordnum
1015             );
1016         }
1017         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1018             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1019             if ( tmp ) {
1020                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1021                     colwidth,
1022                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1023                             PL_colors[0], PL_colors[1],
1024                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1025                             PERL_PV_ESCAPE_FIRSTCHAR
1026                     ) ,
1027                     TRIE_LIST_ITEM(state,charid).forid,
1028                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1029                 );
1030                 if (!(charid % 10))
1031                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1032                         (int)((depth * 2) + 14), "");
1033             }
1034         }
1035         PerlIO_printf( Perl_debug_log, "\n");
1036     }
1037 }
1038
1039 /*
1040   Dumps a fully constructed but uncompressed trie in table form.
1041   This is the normal DFA style state transition table, with a few
1042   twists to facilitate compression later.
1043   Used for debugging make_trie().
1044 */
1045 STATIC void
1046 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1047                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1048                           U32 depth)
1049 {
1050     U32 state;
1051     U16 charid;
1052     SV *sv=sv_newmortal();
1053     int colwidth= widecharmap ? 6 : 4;
1054     GET_RE_DEBUG_FLAGS_DECL;
1055
1056     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1057
1058     /*
1059        print out the table precompression so that we can do a visual check
1060        that they are identical.
1061      */
1062
1063     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1064
1065     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1066         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1067         if ( tmp ) {
1068             PerlIO_printf( Perl_debug_log, "%*s",
1069                 colwidth,
1070                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1071                             PL_colors[0], PL_colors[1],
1072                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1073                             PERL_PV_ESCAPE_FIRSTCHAR
1074                 )
1075             );
1076         }
1077     }
1078
1079     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1080
1081     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1082         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1083     }
1084
1085     PerlIO_printf( Perl_debug_log, "\n" );
1086
1087     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1088
1089         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1090             (int)depth * 2 + 2,"",
1091             (UV)TRIE_NODENUM( state ) );
1092
1093         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1094             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1095             if (v)
1096                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1097             else
1098                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1099         }
1100         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1101             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1102         } else {
1103             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1104             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1105         }
1106     }
1107 }
1108
1109 #endif
1110
1111
1112 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1113   startbranch: the first branch in the whole branch sequence
1114   first      : start branch of sequence of branch-exact nodes.
1115                May be the same as startbranch
1116   last       : Thing following the last branch.
1117                May be the same as tail.
1118   tail       : item following the branch sequence
1119   count      : words in the sequence
1120   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1121   depth      : indent depth
1122
1123 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1124
1125 A trie is an N'ary tree where the branches are determined by digital
1126 decomposition of the key. IE, at the root node you look up the 1st character and
1127 follow that branch repeat until you find the end of the branches. Nodes can be
1128 marked as "accepting" meaning they represent a complete word. Eg:
1129
1130   /he|she|his|hers/
1131
1132 would convert into the following structure. Numbers represent states, letters
1133 following numbers represent valid transitions on the letter from that state, if
1134 the number is in square brackets it represents an accepting state, otherwise it
1135 will be in parenthesis.
1136
1137       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1138       |    |
1139       |   (2)
1140       |    |
1141      (1)   +-i->(6)-+-s->[7]
1142       |
1143       +-s->(3)-+-h->(4)-+-e->[5]
1144
1145       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1146
1147 This shows that when matching against the string 'hers' we will begin at state 1
1148 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1149 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1150 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1151 single traverse. We store a mapping from accepting to state to which word was
1152 matched, and then when we have multiple possibilities we try to complete the
1153 rest of the regex in the order in which they occured in the alternation.
1154
1155 The only prior NFA like behaviour that would be changed by the TRIE support is
1156 the silent ignoring of duplicate alternations which are of the form:
1157
1158  / (DUPE|DUPE) X? (?{ ... }) Y /x
1159
1160 Thus EVAL blocks following a trie may be called a different number of times with
1161 and without the optimisation. With the optimisations dupes will be silently
1162 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1163 the following demonstrates:
1164
1165  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1166
1167 which prints out 'word' three times, but
1168
1169  'words'=~/(word|word|word)(?{ print $1 })S/
1170
1171 which doesnt print it out at all. This is due to other optimisations kicking in.
1172
1173 Example of what happens on a structural level:
1174
1175 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1176
1177    1: CURLYM[1] {1,32767}(18)
1178    5:   BRANCH(8)
1179    6:     EXACT <ac>(16)
1180    8:   BRANCH(11)
1181    9:     EXACT <ad>(16)
1182   11:   BRANCH(14)
1183   12:     EXACT <ab>(16)
1184   16:   SUCCEED(0)
1185   17:   NOTHING(18)
1186   18: END(0)
1187
1188 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1189 and should turn into:
1190
1191    1: CURLYM[1] {1,32767}(18)
1192    5:   TRIE(16)
1193         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1194           <ac>
1195           <ad>
1196           <ab>
1197   16:   SUCCEED(0)
1198   17:   NOTHING(18)
1199   18: END(0)
1200
1201 Cases where tail != last would be like /(?foo|bar)baz/:
1202
1203    1: BRANCH(4)
1204    2:   EXACT <foo>(8)
1205    4: BRANCH(7)
1206    5:   EXACT <bar>(8)
1207    7: TAIL(8)
1208    8: EXACT <baz>(10)
1209   10: END(0)
1210
1211 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1212 and would end up looking like:
1213
1214     1: TRIE(8)
1215       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1216         <foo>
1217         <bar>
1218    7: TAIL(8)
1219    8: EXACT <baz>(10)
1220   10: END(0)
1221
1222     d = uvuni_to_utf8_flags(d, uv, 0);
1223
1224 is the recommended Unicode-aware way of saying
1225
1226     *(d++) = uv;
1227 */
1228
1229 #define TRIE_STORE_REVCHAR                                                 \
1230     STMT_START {                                                           \
1231         if (UTF) {                                                         \
1232             SV *zlopp = newSV(2);                                          \
1233             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1234             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1235             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1236             SvPOK_on(zlopp);                                               \
1237             SvUTF8_on(zlopp);                                              \
1238             av_push(revcharmap, zlopp);                                    \
1239         } else {                                                           \
1240             char ooooff = (char)uvc;                                               \
1241             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1242         }                                                                  \
1243         } STMT_END
1244
1245 #define TRIE_READ_CHAR STMT_START {                                           \
1246     wordlen++;                                                                \
1247     if ( UTF ) {                                                              \
1248         if ( folder ) {                                                       \
1249             if ( foldlen > 0 ) {                                              \
1250                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1251                foldlen -= len;                                                \
1252                scan += len;                                                   \
1253                len = 0;                                                       \
1254             } else {                                                          \
1255                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1256                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1257                 foldlen -= UNISKIP( uvc );                                    \
1258                 scan = foldbuf + UNISKIP( uvc );                              \
1259             }                                                                 \
1260         } else {                                                              \
1261             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1262         }                                                                     \
1263     } else {                                                                  \
1264         uvc = (U32)*uc;                                                       \
1265         len = 1;                                                              \
1266     }                                                                         \
1267 } STMT_END
1268
1269
1270
1271 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1272     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1273         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1274         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1275     }                                                           \
1276     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1277     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1278     TRIE_LIST_CUR( state )++;                                   \
1279 } STMT_END
1280
1281 #define TRIE_LIST_NEW(state) STMT_START {                       \
1282     Newxz( trie->states[ state ].trans.list,               \
1283         4, reg_trie_trans_le );                                 \
1284      TRIE_LIST_CUR( state ) = 1;                                \
1285      TRIE_LIST_LEN( state ) = 4;                                \
1286 } STMT_END
1287
1288 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1289     U16 dupe= trie->states[ state ].wordnum;                    \
1290     regnode * const noper_next = regnext( noper );              \
1291                                                                 \
1292     DEBUG_r({                                                   \
1293         /* store the word for dumping */                        \
1294         SV* tmp;                                                \
1295         if (OP(noper) != NOTHING)                               \
1296             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1297         else                                                    \
1298             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1299         av_push( trie_words, tmp );                             \
1300     });                                                         \
1301                                                                 \
1302     curword++;                                                  \
1303     trie->wordinfo[curword].prev   = 0;                         \
1304     trie->wordinfo[curword].len    = wordlen;                   \
1305     trie->wordinfo[curword].accept = state;                     \
1306                                                                 \
1307     if ( noper_next < tail ) {                                  \
1308         if (!trie->jump)                                        \
1309             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1310         trie->jump[curword] = (U16)(noper_next - convert);      \
1311         if (!jumper)                                            \
1312             jumper = noper_next;                                \
1313         if (!nextbranch)                                        \
1314             nextbranch= regnext(cur);                           \
1315     }                                                           \
1316                                                                 \
1317     if ( dupe ) {                                               \
1318         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1319         /* chain, so that when the bits of chain are later    */\
1320         /* linked together, the dups appear in the chain      */\
1321         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1322         trie->wordinfo[dupe].prev = curword;                    \
1323     } else {                                                    \
1324         /* we haven't inserted this word yet.                */ \
1325         trie->states[ state ].wordnum = curword;                \
1326     }                                                           \
1327 } STMT_END
1328
1329
1330 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1331      ( ( base + charid >=  ucharcount                                   \
1332          && base + charid < ubound                                      \
1333          && state == trie->trans[ base - ucharcount + charid ].check    \
1334          && trie->trans[ base - ucharcount + charid ].next )            \
1335            ? trie->trans[ base - ucharcount + charid ].next             \
1336            : ( state==1 ? special : 0 )                                 \
1337       )
1338
1339 #define MADE_TRIE       1
1340 #define MADE_JUMP_TRIE  2
1341 #define MADE_EXACT_TRIE 4
1342
1343 STATIC I32
1344 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1345 {
1346     dVAR;
1347     /* first pass, loop through and scan words */
1348     reg_trie_data *trie;
1349     HV *widecharmap = NULL;
1350     AV *revcharmap = newAV();
1351     regnode *cur;
1352     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1353     STRLEN len = 0;
1354     UV uvc = 0;
1355     U16 curword = 0;
1356     U32 next_alloc = 0;
1357     regnode *jumper = NULL;
1358     regnode *nextbranch = NULL;
1359     regnode *convert = NULL;
1360     U32 *prev_states; /* temp array mapping each state to previous one */
1361     /* we just use folder as a flag in utf8 */
1362     const U8 * folder = NULL;
1363
1364 #ifdef DEBUGGING
1365     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1366     AV *trie_words = NULL;
1367     /* along with revcharmap, this only used during construction but both are
1368      * useful during debugging so we store them in the struct when debugging.
1369      */
1370 #else
1371     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1372     STRLEN trie_charcount=0;
1373 #endif
1374     SV *re_trie_maxbuff;
1375     GET_RE_DEBUG_FLAGS_DECL;
1376
1377     PERL_ARGS_ASSERT_MAKE_TRIE;
1378 #ifndef DEBUGGING
1379     PERL_UNUSED_ARG(depth);
1380 #endif
1381
1382     switch (flags) {
1383         case EXACTFU: folder = PL_fold_latin1; break;
1384         case EXACTF:  folder = PL_fold; break;
1385         case EXACTFL: folder = PL_fold_locale; break;
1386     }
1387
1388     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1389     trie->refcount = 1;
1390     trie->startstate = 1;
1391     trie->wordcount = word_count;
1392     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1393     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1394     if (!(UTF && folder))
1395         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1396     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1397                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1398
1399     DEBUG_r({
1400         trie_words = newAV();
1401     });
1402
1403     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1404     if (!SvIOK(re_trie_maxbuff)) {
1405         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1406     }
1407     DEBUG_OPTIMISE_r({
1408                 PerlIO_printf( Perl_debug_log,
1409                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1410                   (int)depth * 2 + 2, "",
1411                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1412                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1413                   (int)depth);
1414     });
1415
1416    /* Find the node we are going to overwrite */
1417     if ( first == startbranch && OP( last ) != BRANCH ) {
1418         /* whole branch chain */
1419         convert = first;
1420     } else {
1421         /* branch sub-chain */
1422         convert = NEXTOPER( first );
1423     }
1424
1425     /*  -- First loop and Setup --
1426
1427        We first traverse the branches and scan each word to determine if it
1428        contains widechars, and how many unique chars there are, this is
1429        important as we have to build a table with at least as many columns as we
1430        have unique chars.
1431
1432        We use an array of integers to represent the character codes 0..255
1433        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1434        native representation of the character value as the key and IV's for the
1435        coded index.
1436
1437        *TODO* If we keep track of how many times each character is used we can
1438        remap the columns so that the table compression later on is more
1439        efficient in terms of memory by ensuring the most common value is in the
1440        middle and the least common are on the outside.  IMO this would be better
1441        than a most to least common mapping as theres a decent chance the most
1442        common letter will share a node with the least common, meaning the node
1443        will not be compressable. With a middle is most common approach the worst
1444        case is when we have the least common nodes twice.
1445
1446      */
1447
1448     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1449         regnode * const noper = NEXTOPER( cur );
1450         const U8 *uc = (U8*)STRING( noper );
1451         const U8 * const e  = uc + STR_LEN( noper );
1452         STRLEN foldlen = 0;
1453         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1454         const U8 *scan = (U8*)NULL;
1455         U32 wordlen      = 0;         /* required init */
1456         STRLEN chars = 0;
1457         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1458
1459         if (OP(noper) == NOTHING) {
1460             trie->minlen= 0;
1461             continue;
1462         }
1463         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1464             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1465                                           regardless of encoding */
1466
1467         for ( ; uc < e ; uc += len ) {
1468             TRIE_CHARCOUNT(trie)++;
1469             TRIE_READ_CHAR;
1470             chars++;
1471             if ( uvc < 256 ) {
1472                 if ( !trie->charmap[ uvc ] ) {
1473                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1474                     if ( folder )
1475                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1476                     TRIE_STORE_REVCHAR;
1477                 }
1478                 if ( set_bit ) {
1479                     /* store the codepoint in the bitmap, and its folded
1480                      * equivalent. */
1481                     TRIE_BITMAP_SET(trie,uvc);
1482
1483                     /* store the folded codepoint */
1484                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1485
1486                     if ( !UTF ) {
1487                         /* store first byte of utf8 representation of
1488                            variant codepoints */
1489                         if (! UNI_IS_INVARIANT(uvc)) {
1490                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1491                         }
1492                     }
1493                     set_bit = 0; /* We've done our bit :-) */
1494                 }
1495             } else {
1496                 SV** svpp;
1497                 if ( !widecharmap )
1498                     widecharmap = newHV();
1499
1500                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1501
1502                 if ( !svpp )
1503                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1504
1505                 if ( !SvTRUE( *svpp ) ) {
1506                     sv_setiv( *svpp, ++trie->uniquecharcount );
1507                     TRIE_STORE_REVCHAR;
1508                 }
1509             }
1510         }
1511         if( cur == first ) {
1512             trie->minlen=chars;
1513             trie->maxlen=chars;
1514         } else if (chars < trie->minlen) {
1515             trie->minlen=chars;
1516         } else if (chars > trie->maxlen) {
1517             trie->maxlen=chars;
1518         }
1519
1520     } /* end first pass */
1521     DEBUG_TRIE_COMPILE_r(
1522         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1523                 (int)depth * 2 + 2,"",
1524                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1525                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1526                 (int)trie->minlen, (int)trie->maxlen )
1527     );
1528
1529     /*
1530         We now know what we are dealing with in terms of unique chars and
1531         string sizes so we can calculate how much memory a naive
1532         representation using a flat table  will take. If it's over a reasonable
1533         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1534         conservative but potentially much slower representation using an array
1535         of lists.
1536
1537         At the end we convert both representations into the same compressed
1538         form that will be used in regexec.c for matching with. The latter
1539         is a form that cannot be used to construct with but has memory
1540         properties similar to the list form and access properties similar
1541         to the table form making it both suitable for fast searches and
1542         small enough that its feasable to store for the duration of a program.
1543
1544         See the comment in the code where the compressed table is produced
1545         inplace from the flat tabe representation for an explanation of how
1546         the compression works.
1547
1548     */
1549
1550
1551     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1552     prev_states[1] = 0;
1553
1554     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1555         /*
1556             Second Pass -- Array Of Lists Representation
1557
1558             Each state will be represented by a list of charid:state records
1559             (reg_trie_trans_le) the first such element holds the CUR and LEN
1560             points of the allocated array. (See defines above).
1561
1562             We build the initial structure using the lists, and then convert
1563             it into the compressed table form which allows faster lookups
1564             (but cant be modified once converted).
1565         */
1566
1567         STRLEN transcount = 1;
1568
1569         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1570             "%*sCompiling trie using list compiler\n",
1571             (int)depth * 2 + 2, ""));
1572
1573         trie->states = (reg_trie_state *)
1574             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1575                                   sizeof(reg_trie_state) );
1576         TRIE_LIST_NEW(1);
1577         next_alloc = 2;
1578
1579         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1580
1581             regnode * const noper = NEXTOPER( cur );
1582             U8 *uc           = (U8*)STRING( noper );
1583             const U8 * const e = uc + STR_LEN( noper );
1584             U32 state        = 1;         /* required init */
1585             U16 charid       = 0;         /* sanity init */
1586             U8 *scan         = (U8*)NULL; /* sanity init */
1587             STRLEN foldlen   = 0;         /* required init */
1588             U32 wordlen      = 0;         /* required init */
1589             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1590
1591             if (OP(noper) != NOTHING) {
1592                 for ( ; uc < e ; uc += len ) {
1593
1594                     TRIE_READ_CHAR;
1595
1596                     if ( uvc < 256 ) {
1597                         charid = trie->charmap[ uvc ];
1598                     } else {
1599                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1600                         if ( !svpp ) {
1601                             charid = 0;
1602                         } else {
1603                             charid=(U16)SvIV( *svpp );
1604                         }
1605                     }
1606                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1607                     if ( charid ) {
1608
1609                         U16 check;
1610                         U32 newstate = 0;
1611
1612                         charid--;
1613                         if ( !trie->states[ state ].trans.list ) {
1614                             TRIE_LIST_NEW( state );
1615                         }
1616                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1617                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1618                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1619                                 break;
1620                             }
1621                         }
1622                         if ( ! newstate ) {
1623                             newstate = next_alloc++;
1624                             prev_states[newstate] = state;
1625                             TRIE_LIST_PUSH( state, charid, newstate );
1626                             transcount++;
1627                         }
1628                         state = newstate;
1629                     } else {
1630                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1631                     }
1632                 }
1633             }
1634             TRIE_HANDLE_WORD(state);
1635
1636         } /* end second pass */
1637
1638         /* next alloc is the NEXT state to be allocated */
1639         trie->statecount = next_alloc;
1640         trie->states = (reg_trie_state *)
1641             PerlMemShared_realloc( trie->states,
1642                                    next_alloc
1643                                    * sizeof(reg_trie_state) );
1644
1645         /* and now dump it out before we compress it */
1646         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1647                                                          revcharmap, next_alloc,
1648                                                          depth+1)
1649         );
1650
1651         trie->trans = (reg_trie_trans *)
1652             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1653         {
1654             U32 state;
1655             U32 tp = 0;
1656             U32 zp = 0;
1657
1658
1659             for( state=1 ; state < next_alloc ; state ++ ) {
1660                 U32 base=0;
1661
1662                 /*
1663                 DEBUG_TRIE_COMPILE_MORE_r(
1664                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1665                 );
1666                 */
1667
1668                 if (trie->states[state].trans.list) {
1669                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1670                     U16 maxid=minid;
1671                     U16 idx;
1672
1673                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1674                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1675                         if ( forid < minid ) {
1676                             minid=forid;
1677                         } else if ( forid > maxid ) {
1678                             maxid=forid;
1679                         }
1680                     }
1681                     if ( transcount < tp + maxid - minid + 1) {
1682                         transcount *= 2;
1683                         trie->trans = (reg_trie_trans *)
1684                             PerlMemShared_realloc( trie->trans,
1685                                                      transcount
1686                                                      * sizeof(reg_trie_trans) );
1687                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1688                     }
1689                     base = trie->uniquecharcount + tp - minid;
1690                     if ( maxid == minid ) {
1691                         U32 set = 0;
1692                         for ( ; zp < tp ; zp++ ) {
1693                             if ( ! trie->trans[ zp ].next ) {
1694                                 base = trie->uniquecharcount + zp - minid;
1695                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1696                                 trie->trans[ zp ].check = state;
1697                                 set = 1;
1698                                 break;
1699                             }
1700                         }
1701                         if ( !set ) {
1702                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1703                             trie->trans[ tp ].check = state;
1704                             tp++;
1705                             zp = tp;
1706                         }
1707                     } else {
1708                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1709                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1710                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1711                             trie->trans[ tid ].check = state;
1712                         }
1713                         tp += ( maxid - minid + 1 );
1714                     }
1715                     Safefree(trie->states[ state ].trans.list);
1716                 }
1717                 /*
1718                 DEBUG_TRIE_COMPILE_MORE_r(
1719                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1720                 );
1721                 */
1722                 trie->states[ state ].trans.base=base;
1723             }
1724             trie->lasttrans = tp + 1;
1725         }
1726     } else {
1727         /*
1728            Second Pass -- Flat Table Representation.
1729
1730            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1731            We know that we will need Charcount+1 trans at most to store the data
1732            (one row per char at worst case) So we preallocate both structures
1733            assuming worst case.
1734
1735            We then construct the trie using only the .next slots of the entry
1736            structs.
1737
1738            We use the .check field of the first entry of the node temporarily to
1739            make compression both faster and easier by keeping track of how many non
1740            zero fields are in the node.
1741
1742            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1743            transition.
1744
1745            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1746            number representing the first entry of the node, and state as a
1747            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1748            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1749            are 2 entrys per node. eg:
1750
1751              A B       A B
1752           1. 2 4    1. 3 7
1753           2. 0 3    3. 0 5
1754           3. 0 0    5. 0 0
1755           4. 0 0    7. 0 0
1756
1757            The table is internally in the right hand, idx form. However as we also
1758            have to deal with the states array which is indexed by nodenum we have to
1759            use TRIE_NODENUM() to convert.
1760
1761         */
1762         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1763             "%*sCompiling trie using table compiler\n",
1764             (int)depth * 2 + 2, ""));
1765
1766         trie->trans = (reg_trie_trans *)
1767             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1768                                   * trie->uniquecharcount + 1,
1769                                   sizeof(reg_trie_trans) );
1770         trie->states = (reg_trie_state *)
1771             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1772                                   sizeof(reg_trie_state) );
1773         next_alloc = trie->uniquecharcount + 1;
1774
1775
1776         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1777
1778             regnode * const noper   = NEXTOPER( cur );
1779             const U8 *uc     = (U8*)STRING( noper );
1780             const U8 * const e = uc + STR_LEN( noper );
1781
1782             U32 state        = 1;         /* required init */
1783
1784             U16 charid       = 0;         /* sanity init */
1785             U32 accept_state = 0;         /* sanity init */
1786             U8 *scan         = (U8*)NULL; /* sanity init */
1787
1788             STRLEN foldlen   = 0;         /* required init */
1789             U32 wordlen      = 0;         /* required init */
1790             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1791
1792             if ( OP(noper) != NOTHING ) {
1793                 for ( ; uc < e ; uc += len ) {
1794
1795                     TRIE_READ_CHAR;
1796
1797                     if ( uvc < 256 ) {
1798                         charid = trie->charmap[ uvc ];
1799                     } else {
1800                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1801                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1802                     }
1803                     if ( charid ) {
1804                         charid--;
1805                         if ( !trie->trans[ state + charid ].next ) {
1806                             trie->trans[ state + charid ].next = next_alloc;
1807                             trie->trans[ state ].check++;
1808                             prev_states[TRIE_NODENUM(next_alloc)]
1809                                     = TRIE_NODENUM(state);
1810                             next_alloc += trie->uniquecharcount;
1811                         }
1812                         state = trie->trans[ state + charid ].next;
1813                     } else {
1814                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1815                     }
1816                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1817                 }
1818             }
1819             accept_state = TRIE_NODENUM( state );
1820             TRIE_HANDLE_WORD(accept_state);
1821
1822         } /* end second pass */
1823
1824         /* and now dump it out before we compress it */
1825         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1826                                                           revcharmap,
1827                                                           next_alloc, depth+1));
1828
1829         {
1830         /*
1831            * Inplace compress the table.*
1832
1833            For sparse data sets the table constructed by the trie algorithm will
1834            be mostly 0/FAIL transitions or to put it another way mostly empty.
1835            (Note that leaf nodes will not contain any transitions.)
1836
1837            This algorithm compresses the tables by eliminating most such
1838            transitions, at the cost of a modest bit of extra work during lookup:
1839
1840            - Each states[] entry contains a .base field which indicates the
1841            index in the state[] array wheres its transition data is stored.
1842
1843            - If .base is 0 there are no valid transitions from that node.
1844
1845            - If .base is nonzero then charid is added to it to find an entry in
1846            the trans array.
1847
1848            -If trans[states[state].base+charid].check!=state then the
1849            transition is taken to be a 0/Fail transition. Thus if there are fail
1850            transitions at the front of the node then the .base offset will point
1851            somewhere inside the previous nodes data (or maybe even into a node
1852            even earlier), but the .check field determines if the transition is
1853            valid.
1854
1855            XXX - wrong maybe?
1856            The following process inplace converts the table to the compressed
1857            table: We first do not compress the root node 1,and mark all its
1858            .check pointers as 1 and set its .base pointer as 1 as well. This
1859            allows us to do a DFA construction from the compressed table later,
1860            and ensures that any .base pointers we calculate later are greater
1861            than 0.
1862
1863            - We set 'pos' to indicate the first entry of the second node.
1864
1865            - We then iterate over the columns of the node, finding the first and
1866            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1867            and set the .check pointers accordingly, and advance pos
1868            appropriately and repreat for the next node. Note that when we copy
1869            the next pointers we have to convert them from the original
1870            NODEIDX form to NODENUM form as the former is not valid post
1871            compression.
1872
1873            - If a node has no transitions used we mark its base as 0 and do not
1874            advance the pos pointer.
1875
1876            - If a node only has one transition we use a second pointer into the
1877            structure to fill in allocated fail transitions from other states.
1878            This pointer is independent of the main pointer and scans forward
1879            looking for null transitions that are allocated to a state. When it
1880            finds one it writes the single transition into the "hole".  If the
1881            pointer doesnt find one the single transition is appended as normal.
1882
1883            - Once compressed we can Renew/realloc the structures to release the
1884            excess space.
1885
1886            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1887            specifically Fig 3.47 and the associated pseudocode.
1888
1889            demq
1890         */
1891         const U32 laststate = TRIE_NODENUM( next_alloc );
1892         U32 state, charid;
1893         U32 pos = 0, zp=0;
1894         trie->statecount = laststate;
1895
1896         for ( state = 1 ; state < laststate ; state++ ) {
1897             U8 flag = 0;
1898             const U32 stateidx = TRIE_NODEIDX( state );
1899             const U32 o_used = trie->trans[ stateidx ].check;
1900             U32 used = trie->trans[ stateidx ].check;
1901             trie->trans[ stateidx ].check = 0;
1902
1903             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1904                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1905                     if ( trie->trans[ stateidx + charid ].next ) {
1906                         if (o_used == 1) {
1907                             for ( ; zp < pos ; zp++ ) {
1908                                 if ( ! trie->trans[ zp ].next ) {
1909                                     break;
1910                                 }
1911                             }
1912                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1913                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1914                             trie->trans[ zp ].check = state;
1915                             if ( ++zp > pos ) pos = zp;
1916                             break;
1917                         }
1918                         used--;
1919                     }
1920                     if ( !flag ) {
1921                         flag = 1;
1922                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1923                     }
1924                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1925                     trie->trans[ pos ].check = state;
1926                     pos++;
1927                 }
1928             }
1929         }
1930         trie->lasttrans = pos + 1;
1931         trie->states = (reg_trie_state *)
1932             PerlMemShared_realloc( trie->states, laststate
1933                                    * sizeof(reg_trie_state) );
1934         DEBUG_TRIE_COMPILE_MORE_r(
1935                 PerlIO_printf( Perl_debug_log,
1936                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1937                     (int)depth * 2 + 2,"",
1938                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1939                     (IV)next_alloc,
1940                     (IV)pos,
1941                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1942             );
1943
1944         } /* end table compress */
1945     }
1946     DEBUG_TRIE_COMPILE_MORE_r(
1947             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1948                 (int)depth * 2 + 2, "",
1949                 (UV)trie->statecount,
1950                 (UV)trie->lasttrans)
1951     );
1952     /* resize the trans array to remove unused space */
1953     trie->trans = (reg_trie_trans *)
1954         PerlMemShared_realloc( trie->trans, trie->lasttrans
1955                                * sizeof(reg_trie_trans) );
1956
1957     {   /* Modify the program and insert the new TRIE node */
1958         U8 nodetype =(U8)(flags & 0xFF);
1959         char *str=NULL;
1960
1961 #ifdef DEBUGGING
1962         regnode *optimize = NULL;
1963 #ifdef RE_TRACK_PATTERN_OFFSETS
1964
1965         U32 mjd_offset = 0;
1966         U32 mjd_nodelen = 0;
1967 #endif /* RE_TRACK_PATTERN_OFFSETS */
1968 #endif /* DEBUGGING */
1969         /*
1970            This means we convert either the first branch or the first Exact,
1971            depending on whether the thing following (in 'last') is a branch
1972            or not and whther first is the startbranch (ie is it a sub part of
1973            the alternation or is it the whole thing.)
1974            Assuming its a sub part we convert the EXACT otherwise we convert
1975            the whole branch sequence, including the first.
1976          */
1977         /* Find the node we are going to overwrite */
1978         if ( first != startbranch || OP( last ) == BRANCH ) {
1979             /* branch sub-chain */
1980             NEXT_OFF( first ) = (U16)(last - first);
1981 #ifdef RE_TRACK_PATTERN_OFFSETS
1982             DEBUG_r({
1983                 mjd_offset= Node_Offset((convert));
1984                 mjd_nodelen= Node_Length((convert));
1985             });
1986 #endif
1987             /* whole branch chain */
1988         }
1989 #ifdef RE_TRACK_PATTERN_OFFSETS
1990         else {
1991             DEBUG_r({
1992                 const  regnode *nop = NEXTOPER( convert );
1993                 mjd_offset= Node_Offset((nop));
1994                 mjd_nodelen= Node_Length((nop));
1995             });
1996         }
1997         DEBUG_OPTIMISE_r(
1998             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1999                 (int)depth * 2 + 2, "",
2000                 (UV)mjd_offset, (UV)mjd_nodelen)
2001         );
2002 #endif
2003         /* But first we check to see if there is a common prefix we can
2004            split out as an EXACT and put in front of the TRIE node.  */
2005         trie->startstate= 1;
2006         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2007             U32 state;
2008             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2009                 U32 ofs = 0;
2010                 I32 idx = -1;
2011                 U32 count = 0;
2012                 const U32 base = trie->states[ state ].trans.base;
2013
2014                 if ( trie->states[state].wordnum )
2015                         count = 1;
2016
2017                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2018                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2019                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2020                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2021                     {
2022                         if ( ++count > 1 ) {
2023                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2024                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2025                             if ( state == 1 ) break;
2026                             if ( count == 2 ) {
2027                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2028                                 DEBUG_OPTIMISE_r(
2029                                     PerlIO_printf(Perl_debug_log,
2030                                         "%*sNew Start State=%"UVuf" Class: [",
2031                                         (int)depth * 2 + 2, "",
2032                                         (UV)state));
2033                                 if (idx >= 0) {
2034                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2035                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2036
2037                                     TRIE_BITMAP_SET(trie,*ch);
2038                                     if ( folder )
2039                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2040                                     DEBUG_OPTIMISE_r(
2041                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2042                                     );
2043                                 }
2044                             }
2045                             TRIE_BITMAP_SET(trie,*ch);
2046                             if ( folder )
2047                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2048                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2049                         }
2050                         idx = ofs;
2051                     }
2052                 }
2053                 if ( count == 1 ) {
2054                     SV **tmp = av_fetch( revcharmap, idx, 0);
2055                     STRLEN len;
2056                     char *ch = SvPV( *tmp, len );
2057                     DEBUG_OPTIMISE_r({
2058                         SV *sv=sv_newmortal();
2059                         PerlIO_printf( Perl_debug_log,
2060                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2061                             (int)depth * 2 + 2, "",
2062                             (UV)state, (UV)idx,
2063                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2064                                 PL_colors[0], PL_colors[1],
2065                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2066                                 PERL_PV_ESCAPE_FIRSTCHAR
2067                             )
2068                         );
2069                     });
2070                     if ( state==1 ) {
2071                         OP( convert ) = nodetype;
2072                         str=STRING(convert);
2073                         STR_LEN(convert)=0;
2074                     }
2075                     STR_LEN(convert) += len;
2076                     while (len--)
2077                         *str++ = *ch++;
2078                 } else {
2079 #ifdef DEBUGGING
2080                     if (state>1)
2081                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2082 #endif
2083                     break;
2084                 }
2085             }
2086             trie->prefixlen = (state-1);
2087             if (str) {
2088                 regnode *n = convert+NODE_SZ_STR(convert);
2089                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2090                 trie->startstate = state;
2091                 trie->minlen -= (state - 1);
2092                 trie->maxlen -= (state - 1);
2093 #ifdef DEBUGGING
2094                /* At least the UNICOS C compiler choked on this
2095                 * being argument to DEBUG_r(), so let's just have
2096                 * it right here. */
2097                if (
2098 #ifdef PERL_EXT_RE_BUILD
2099                    1
2100 #else
2101                    DEBUG_r_TEST
2102 #endif
2103                    ) {
2104                    regnode *fix = convert;
2105                    U32 word = trie->wordcount;
2106                    mjd_nodelen++;
2107                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2108                    while( ++fix < n ) {
2109                        Set_Node_Offset_Length(fix, 0, 0);
2110                    }
2111                    while (word--) {
2112                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2113                        if (tmp) {
2114                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2115                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2116                            else
2117                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2118                        }
2119                    }
2120                }
2121 #endif
2122                 if (trie->maxlen) {
2123                     convert = n;
2124                 } else {
2125                     NEXT_OFF(convert) = (U16)(tail - convert);
2126                     DEBUG_r(optimize= n);
2127                 }
2128             }
2129         }
2130         if (!jumper)
2131             jumper = last;
2132         if ( trie->maxlen ) {
2133             NEXT_OFF( convert ) = (U16)(tail - convert);
2134             ARG_SET( convert, data_slot );
2135             /* Store the offset to the first unabsorbed branch in
2136                jump[0], which is otherwise unused by the jump logic.
2137                We use this when dumping a trie and during optimisation. */
2138             if (trie->jump)
2139                 trie->jump[0] = (U16)(nextbranch - convert);
2140
2141             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2142              *   and there is a bitmap
2143              *   and the first "jump target" node we found leaves enough room
2144              * then convert the TRIE node into a TRIEC node, with the bitmap
2145              * embedded inline in the opcode - this is hypothetically faster.
2146              */
2147             if ( !trie->states[trie->startstate].wordnum
2148                  && trie->bitmap
2149                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2150             {
2151                 OP( convert ) = TRIEC;
2152                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2153                 PerlMemShared_free(trie->bitmap);
2154                 trie->bitmap= NULL;
2155             } else
2156                 OP( convert ) = TRIE;
2157
2158             /* store the type in the flags */
2159             convert->flags = nodetype;
2160             DEBUG_r({
2161             optimize = convert
2162                       + NODE_STEP_REGNODE
2163                       + regarglen[ OP( convert ) ];
2164             });
2165             /* XXX We really should free up the resource in trie now,
2166                    as we won't use them - (which resources?) dmq */
2167         }
2168         /* needed for dumping*/
2169         DEBUG_r(if (optimize) {
2170             regnode *opt = convert;
2171
2172             while ( ++opt < optimize) {
2173                 Set_Node_Offset_Length(opt,0,0);
2174             }
2175             /*
2176                 Try to clean up some of the debris left after the
2177                 optimisation.
2178              */
2179             while( optimize < jumper ) {
2180                 mjd_nodelen += Node_Length((optimize));
2181                 OP( optimize ) = OPTIMIZED;
2182                 Set_Node_Offset_Length(optimize,0,0);
2183                 optimize++;
2184             }
2185             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2186         });
2187     } /* end node insert */
2188     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2189
2190     /*  Finish populating the prev field of the wordinfo array.  Walk back
2191      *  from each accept state until we find another accept state, and if
2192      *  so, point the first word's .prev field at the second word. If the
2193      *  second already has a .prev field set, stop now. This will be the
2194      *  case either if we've already processed that word's accept state,
2195      *  or that state had multiple words, and the overspill words were
2196      *  already linked up earlier.
2197      */
2198     {
2199         U16 word;
2200         U32 state;
2201         U16 prev;
2202
2203         for (word=1; word <= trie->wordcount; word++) {
2204             prev = 0;
2205             if (trie->wordinfo[word].prev)
2206                 continue;
2207             state = trie->wordinfo[word].accept;
2208             while (state) {
2209                 state = prev_states[state];
2210                 if (!state)
2211                     break;
2212                 prev = trie->states[state].wordnum;
2213                 if (prev)
2214                     break;
2215             }
2216             trie->wordinfo[word].prev = prev;
2217         }
2218         Safefree(prev_states);
2219     }
2220
2221
2222     /* and now dump out the compressed format */
2223     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2224
2225     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2226 #ifdef DEBUGGING
2227     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2228     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2229 #else
2230     SvREFCNT_dec(revcharmap);
2231 #endif
2232     return trie->jump
2233            ? MADE_JUMP_TRIE
2234            : trie->startstate>1
2235              ? MADE_EXACT_TRIE
2236              : MADE_TRIE;
2237 }
2238
2239 STATIC void
2240 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2241 {
2242 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2243
2244    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2245    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2246    ISBN 0-201-10088-6
2247
2248    We find the fail state for each state in the trie, this state is the longest proper
2249    suffix of the current state's 'word' that is also a proper prefix of another word in our
2250    trie. State 1 represents the word '' and is thus the default fail state. This allows
2251    the DFA not to have to restart after its tried and failed a word at a given point, it
2252    simply continues as though it had been matching the other word in the first place.
2253    Consider
2254       'abcdgu'=~/abcdefg|cdgu/
2255    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2256    fail, which would bring us to the state representing 'd' in the second word where we would
2257    try 'g' and succeed, proceeding to match 'cdgu'.
2258  */
2259  /* add a fail transition */
2260     const U32 trie_offset = ARG(source);
2261     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2262     U32 *q;
2263     const U32 ucharcount = trie->uniquecharcount;
2264     const U32 numstates = trie->statecount;
2265     const U32 ubound = trie->lasttrans + ucharcount;
2266     U32 q_read = 0;
2267     U32 q_write = 0;
2268     U32 charid;
2269     U32 base = trie->states[ 1 ].trans.base;
2270     U32 *fail;
2271     reg_ac_data *aho;
2272     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2273     GET_RE_DEBUG_FLAGS_DECL;
2274
2275     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2276 #ifndef DEBUGGING
2277     PERL_UNUSED_ARG(depth);
2278 #endif
2279
2280
2281     ARG_SET( stclass, data_slot );
2282     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2283     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2284     aho->trie=trie_offset;
2285     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2286     Copy( trie->states, aho->states, numstates, reg_trie_state );
2287     Newxz( q, numstates, U32);
2288     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2289     aho->refcount = 1;
2290     fail = aho->fail;
2291     /* initialize fail[0..1] to be 1 so that we always have
2292        a valid final fail state */
2293     fail[ 0 ] = fail[ 1 ] = 1;
2294
2295     for ( charid = 0; charid < ucharcount ; charid++ ) {
2296         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2297         if ( newstate ) {
2298             q[ q_write ] = newstate;
2299             /* set to point at the root */
2300             fail[ q[ q_write++ ] ]=1;
2301         }
2302     }
2303     while ( q_read < q_write) {
2304         const U32 cur = q[ q_read++ % numstates ];
2305         base = trie->states[ cur ].trans.base;
2306
2307         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2308             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2309             if (ch_state) {
2310                 U32 fail_state = cur;
2311                 U32 fail_base;
2312                 do {
2313                     fail_state = fail[ fail_state ];
2314                     fail_base = aho->states[ fail_state ].trans.base;
2315                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2316
2317                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2318                 fail[ ch_state ] = fail_state;
2319                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2320                 {
2321                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2322                 }
2323                 q[ q_write++ % numstates] = ch_state;
2324             }
2325         }
2326     }
2327     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2328        when we fail in state 1, this allows us to use the
2329        charclass scan to find a valid start char. This is based on the principle
2330        that theres a good chance the string being searched contains lots of stuff
2331        that cant be a start char.
2332      */
2333     fail[ 0 ] = fail[ 1 ] = 0;
2334     DEBUG_TRIE_COMPILE_r({
2335         PerlIO_printf(Perl_debug_log,
2336                       "%*sStclass Failtable (%"UVuf" states): 0",
2337                       (int)(depth * 2), "", (UV)numstates
2338         );
2339         for( q_read=1; q_read<numstates; q_read++ ) {
2340             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2341         }
2342         PerlIO_printf(Perl_debug_log, "\n");
2343     });
2344     Safefree(q);
2345     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2346 }
2347
2348
2349 /*
2350  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2351  * These need to be revisited when a newer toolchain becomes available.
2352  */
2353 #if defined(__sparc64__) && defined(__GNUC__)
2354 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2355 #       undef  SPARC64_GCC_WORKAROUND
2356 #       define SPARC64_GCC_WORKAROUND 1
2357 #   endif
2358 #endif
2359
2360 #define DEBUG_PEEP(str,scan,depth) \
2361     DEBUG_OPTIMISE_r({if (scan){ \
2362        SV * const mysv=sv_newmortal(); \
2363        regnode *Next = regnext(scan); \
2364        regprop(RExC_rx, mysv, scan); \
2365        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2366        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2367        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2368    }});
2369
2370
2371
2372
2373
2374 #define JOIN_EXACT(scan,min,flags) \
2375     if (PL_regkind[OP(scan)] == EXACT) \
2376         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2377
2378 STATIC U32
2379 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2380     /* Merge several consecutive EXACTish nodes into one. */
2381     regnode *n = regnext(scan);
2382     U32 stringok = 1;
2383     regnode *next = scan + NODE_SZ_STR(scan);
2384     U32 merged = 0;
2385     U32 stopnow = 0;
2386 #ifdef DEBUGGING
2387     regnode *stop = scan;
2388     GET_RE_DEBUG_FLAGS_DECL;
2389 #else
2390     PERL_UNUSED_ARG(depth);
2391 #endif
2392
2393     PERL_ARGS_ASSERT_JOIN_EXACT;
2394 #ifndef EXPERIMENTAL_INPLACESCAN
2395     PERL_UNUSED_ARG(flags);
2396     PERL_UNUSED_ARG(val);
2397 #endif
2398     DEBUG_PEEP("join",scan,depth);
2399
2400     /* Skip NOTHING, merge EXACT*. */
2401     while (n &&
2402            ( PL_regkind[OP(n)] == NOTHING ||
2403              (stringok && (OP(n) == OP(scan))))
2404            && NEXT_OFF(n)
2405            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2406
2407         if (OP(n) == TAIL || n > next)
2408             stringok = 0;
2409         if (PL_regkind[OP(n)] == NOTHING) {
2410             DEBUG_PEEP("skip:",n,depth);
2411             NEXT_OFF(scan) += NEXT_OFF(n);
2412             next = n + NODE_STEP_REGNODE;
2413 #ifdef DEBUGGING
2414             if (stringok)
2415                 stop = n;
2416 #endif
2417             n = regnext(n);
2418         }
2419         else if (stringok) {
2420             const unsigned int oldl = STR_LEN(scan);
2421             regnode * const nnext = regnext(n);
2422
2423             DEBUG_PEEP("merg",n,depth);
2424
2425             merged++;
2426             if (oldl + STR_LEN(n) > U8_MAX)
2427                 break;
2428             NEXT_OFF(scan) += NEXT_OFF(n);
2429             STR_LEN(scan) += STR_LEN(n);
2430             next = n + NODE_SZ_STR(n);
2431             /* Now we can overwrite *n : */
2432             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2433 #ifdef DEBUGGING
2434             stop = next - 1;
2435 #endif
2436             n = nnext;
2437             if (stopnow) break;
2438         }
2439
2440 #ifdef EXPERIMENTAL_INPLACESCAN
2441         if (flags && !NEXT_OFF(n)) {
2442             DEBUG_PEEP("atch", val, depth);
2443             if (reg_off_by_arg[OP(n)]) {
2444                 ARG_SET(n, val - n);
2445             }
2446             else {
2447                 NEXT_OFF(n) = val - n;
2448             }
2449             stopnow = 1;
2450         }
2451 #endif
2452     }
2453 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2454 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2455 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2456 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2457
2458     if (UTF
2459         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2460         && ( STR_LEN(scan) >= 6 ) )
2461     {
2462     /*
2463     Two problematic code points in Unicode casefolding of EXACT nodes:
2464
2465     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2466     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2467
2468     which casefold to
2469
2470     Unicode                      UTF-8
2471
2472     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2473     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2474
2475     This means that in case-insensitive matching (or "loose matching",
2476     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2477     length of the above casefolded versions) can match a target string
2478     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2479     This would rather mess up the minimum length computation.
2480
2481     What we'll do is to look for the tail four bytes, and then peek
2482     at the preceding two bytes to see whether we need to decrease
2483     the minimum length by four (six minus two).
2484
2485     Thanks to the design of UTF-8, there cannot be false matches:
2486     A sequence of valid UTF-8 bytes cannot be a subsequence of
2487     another valid sequence of UTF-8 bytes.
2488
2489     */
2490          char * const s0 = STRING(scan), *s, *t;
2491          char * const s1 = s0 + STR_LEN(scan) - 1;
2492          char * const s2 = s1 - 4;
2493 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2494          const char t0[] = "\xaf\x49\xaf\x42";
2495 #else
2496          const char t0[] = "\xcc\x88\xcc\x81";
2497 #endif
2498          const char * const t1 = t0 + 3;
2499
2500          for (s = s0 + 2;
2501               s < s2 && (t = ninstr(s, s1, t0, t1));
2502               s = t + 4) {
2503 #ifdef EBCDIC
2504               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2505                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2506 #else
2507               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2508                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2509 #endif
2510                    *min -= 4;
2511          }
2512     }
2513
2514 #ifdef DEBUGGING
2515     /* Allow dumping */
2516     n = scan + NODE_SZ_STR(scan);
2517     while (n <= stop) {
2518         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2519             OP(n) = OPTIMIZED;
2520             NEXT_OFF(n) = 0;
2521         }
2522         n++;
2523     }
2524 #endif
2525     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2526     return stopnow;
2527 }
2528
2529 /* REx optimizer.  Converts nodes into quickier variants "in place".
2530    Finds fixed substrings.  */
2531
2532 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2533    to the position after last scanned or to NULL. */
2534
2535 #define INIT_AND_WITHP \
2536     assert(!and_withp); \
2537     Newx(and_withp,1,struct regnode_charclass_class); \
2538     SAVEFREEPV(and_withp)
2539
2540 /* this is a chain of data about sub patterns we are processing that
2541    need to be handled seperately/specially in study_chunk. Its so
2542    we can simulate recursion without losing state.  */
2543 struct scan_frame;
2544 typedef struct scan_frame {
2545     regnode *last;  /* last node to process in this frame */
2546     regnode *next;  /* next node to process when last is reached */
2547     struct scan_frame *prev; /*previous frame*/
2548     I32 stop; /* what stopparen do we use */
2549 } scan_frame;
2550
2551
2552 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2553
2554 #define CASE_SYNST_FNC(nAmE)                                       \
2555 case nAmE:                                                         \
2556     if (flags & SCF_DO_STCLASS_AND) {                              \
2557             for (value = 0; value < 256; value++)                  \
2558                 if (!is_ ## nAmE ## _cp(value))                       \
2559                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2560     }                                                              \
2561     else {                                                         \
2562             for (value = 0; value < 256; value++)                  \
2563                 if (is_ ## nAmE ## _cp(value))                        \
2564                     ANYOF_BITMAP_SET(data->start_class, value);    \
2565     }                                                              \
2566     break;                                                         \
2567 case N ## nAmE:                                                    \
2568     if (flags & SCF_DO_STCLASS_AND) {                              \
2569             for (value = 0; value < 256; value++)                   \
2570                 if (is_ ## nAmE ## _cp(value))                         \
2571                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2572     }                                                               \
2573     else {                                                          \
2574             for (value = 0; value < 256; value++)                   \
2575                 if (!is_ ## nAmE ## _cp(value))                        \
2576                     ANYOF_BITMAP_SET(data->start_class, value);     \
2577     }                                                               \
2578     break
2579
2580
2581
2582 STATIC I32
2583 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2584                         I32 *minlenp, I32 *deltap,
2585                         regnode *last,
2586                         scan_data_t *data,
2587                         I32 stopparen,
2588                         U8* recursed,
2589                         struct regnode_charclass_class *and_withp,
2590                         U32 flags, U32 depth)
2591                         /* scanp: Start here (read-write). */
2592                         /* deltap: Write maxlen-minlen here. */
2593                         /* last: Stop before this one. */
2594                         /* data: string data about the pattern */
2595                         /* stopparen: treat close N as END */
2596                         /* recursed: which subroutines have we recursed into */
2597                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2598 {
2599     dVAR;
2600     I32 min = 0, pars = 0, code;
2601     regnode *scan = *scanp, *next;
2602     I32 delta = 0;
2603     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2604     int is_inf_internal = 0;            /* The studied chunk is infinite */
2605     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2606     scan_data_t data_fake;
2607     SV *re_trie_maxbuff = NULL;
2608     regnode *first_non_open = scan;
2609     I32 stopmin = I32_MAX;
2610     scan_frame *frame = NULL;
2611     GET_RE_DEBUG_FLAGS_DECL;
2612
2613     PERL_ARGS_ASSERT_STUDY_CHUNK;
2614
2615 #ifdef DEBUGGING
2616     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2617 #endif
2618
2619     if ( depth == 0 ) {
2620         while (first_non_open && OP(first_non_open) == OPEN)
2621             first_non_open=regnext(first_non_open);
2622     }
2623
2624
2625   fake_study_recurse:
2626     while ( scan && OP(scan) != END && scan < last ){
2627         /* Peephole optimizer: */
2628         DEBUG_STUDYDATA("Peep:", data,depth);
2629         DEBUG_PEEP("Peep",scan,depth);
2630         JOIN_EXACT(scan,&min,0);
2631
2632         /* Follow the next-chain of the current node and optimize
2633            away all the NOTHINGs from it.  */
2634         if (OP(scan) != CURLYX) {
2635             const int max = (reg_off_by_arg[OP(scan)]
2636                        ? I32_MAX
2637                        /* I32 may be smaller than U16 on CRAYs! */
2638                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2639             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2640             int noff;
2641             regnode *n = scan;
2642
2643             /* Skip NOTHING and LONGJMP. */
2644             while ((n = regnext(n))
2645                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2646                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2647                    && off + noff < max)
2648                 off += noff;
2649             if (reg_off_by_arg[OP(scan)])
2650                 ARG(scan) = off;
2651             else
2652                 NEXT_OFF(scan) = off;
2653         }
2654
2655
2656
2657         /* The principal pseudo-switch.  Cannot be a switch, since we
2658            look into several different things.  */
2659         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2660                    || OP(scan) == IFTHEN) {
2661             next = regnext(scan);
2662             code = OP(scan);
2663             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2664
2665             if (OP(next) == code || code == IFTHEN) {
2666                 /* NOTE - There is similar code to this block below for handling
2667                    TRIE nodes on a re-study.  If you change stuff here check there
2668                    too. */
2669                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2670                 struct regnode_charclass_class accum;
2671                 regnode * const startbranch=scan;
2672
2673                 if (flags & SCF_DO_SUBSTR)
2674                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2675                 if (flags & SCF_DO_STCLASS)
2676                     cl_init_zero(pRExC_state, &accum);
2677
2678                 while (OP(scan) == code) {
2679                     I32 deltanext, minnext, f = 0, fake;
2680                     struct regnode_charclass_class this_class;
2681
2682                     num++;
2683                     data_fake.flags = 0;
2684                     if (data) {
2685                         data_fake.whilem_c = data->whilem_c;
2686                         data_fake.last_closep = data->last_closep;
2687                     }
2688                     else
2689                         data_fake.last_closep = &fake;
2690
2691                     data_fake.pos_delta = delta;
2692                     next = regnext(scan);
2693                     scan = NEXTOPER(scan);
2694                     if (code != BRANCH)
2695                         scan = NEXTOPER(scan);
2696                     if (flags & SCF_DO_STCLASS) {
2697                         cl_init(pRExC_state, &this_class);
2698                         data_fake.start_class = &this_class;
2699                         f = SCF_DO_STCLASS_AND;
2700                     }
2701                     if (flags & SCF_WHILEM_VISITED_POS)
2702                         f |= SCF_WHILEM_VISITED_POS;
2703
2704                     /* we suppose the run is continuous, last=next...*/
2705                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2706                                           next, &data_fake,
2707                                           stopparen, recursed, NULL, f,depth+1);
2708                     if (min1 > minnext)
2709                         min1 = minnext;
2710                     if (max1 < minnext + deltanext)
2711                         max1 = minnext + deltanext;
2712                     if (deltanext == I32_MAX)
2713                         is_inf = is_inf_internal = 1;
2714                     scan = next;
2715                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2716                         pars++;
2717                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2718                         if ( stopmin > minnext)
2719                             stopmin = min + min1;
2720                         flags &= ~SCF_DO_SUBSTR;
2721                         if (data)
2722                             data->flags |= SCF_SEEN_ACCEPT;
2723                     }
2724                     if (data) {
2725                         if (data_fake.flags & SF_HAS_EVAL)
2726                             data->flags |= SF_HAS_EVAL;
2727                         data->whilem_c = data_fake.whilem_c;
2728                     }
2729                     if (flags & SCF_DO_STCLASS)
2730                         cl_or(pRExC_state, &accum, &this_class);
2731                 }
2732                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2733                     min1 = 0;
2734                 if (flags & SCF_DO_SUBSTR) {
2735                     data->pos_min += min1;
2736                     data->pos_delta += max1 - min1;
2737                     if (max1 != min1 || is_inf)
2738                         data->longest = &(data->longest_float);
2739                 }
2740                 min += min1;
2741                 delta += max1 - min1;
2742                 if (flags & SCF_DO_STCLASS_OR) {
2743                     cl_or(pRExC_state, data->start_class, &accum);
2744                     if (min1) {
2745                         cl_and(data->start_class, and_withp);
2746                         flags &= ~SCF_DO_STCLASS;
2747                     }
2748                 }
2749                 else if (flags & SCF_DO_STCLASS_AND) {
2750                     if (min1) {
2751                         cl_and(data->start_class, &accum);
2752                         flags &= ~SCF_DO_STCLASS;
2753                     }
2754                     else {
2755                         /* Switch to OR mode: cache the old value of
2756                          * data->start_class */
2757                         INIT_AND_WITHP;
2758                         StructCopy(data->start_class, and_withp,
2759                                    struct regnode_charclass_class);
2760                         flags &= ~SCF_DO_STCLASS_AND;
2761                         StructCopy(&accum, data->start_class,
2762                                    struct regnode_charclass_class);
2763                         flags |= SCF_DO_STCLASS_OR;
2764                         data->start_class->flags |= ANYOF_EOS;
2765                     }
2766                 }
2767
2768                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2769                 /* demq.
2770
2771                    Assuming this was/is a branch we are dealing with: 'scan' now
2772                    points at the item that follows the branch sequence, whatever
2773                    it is. We now start at the beginning of the sequence and look
2774                    for subsequences of
2775
2776                    BRANCH->EXACT=>x1
2777                    BRANCH->EXACT=>x2
2778                    tail
2779
2780                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2781
2782                    If we can find such a subseqence we need to turn the first
2783                    element into a trie and then add the subsequent branch exact
2784                    strings to the trie.
2785
2786                    We have two cases
2787
2788                      1. patterns where the whole set of branches can be converted.
2789
2790                      2. patterns where only a subset can be converted.
2791
2792                    In case 1 we can replace the whole set with a single regop
2793                    for the trie. In case 2 we need to keep the start and end
2794                    branches so
2795
2796                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2797                      becomes BRANCH TRIE; BRANCH X;
2798
2799                   There is an additional case, that being where there is a
2800                   common prefix, which gets split out into an EXACT like node
2801                   preceding the TRIE node.
2802
2803                   If x(1..n)==tail then we can do a simple trie, if not we make
2804                   a "jump" trie, such that when we match the appropriate word
2805                   we "jump" to the appopriate tail node. Essentailly we turn
2806                   a nested if into a case structure of sorts.
2807
2808                 */
2809
2810                     int made=0;
2811                     if (!re_trie_maxbuff) {
2812                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2813                         if (!SvIOK(re_trie_maxbuff))
2814                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2815                     }
2816                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2817                         regnode *cur;
2818                         regnode *first = (regnode *)NULL;
2819                         regnode *last = (regnode *)NULL;
2820                         regnode *tail = scan;
2821                         U8 optype = 0;
2822                         U32 count=0;
2823
2824 #ifdef DEBUGGING
2825                         SV * const mysv = sv_newmortal();       /* for dumping */
2826 #endif
2827                         /* var tail is used because there may be a TAIL
2828                            regop in the way. Ie, the exacts will point to the
2829                            thing following the TAIL, but the last branch will
2830                            point at the TAIL. So we advance tail. If we
2831                            have nested (?:) we may have to move through several
2832                            tails.
2833                          */
2834
2835                         while ( OP( tail ) == TAIL ) {
2836                             /* this is the TAIL generated by (?:) */
2837                             tail = regnext( tail );
2838                         }
2839
2840
2841                         DEBUG_OPTIMISE_r({
2842                             regprop(RExC_rx, mysv, tail );
2843                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2844                                 (int)depth * 2 + 2, "",
2845                                 "Looking for TRIE'able sequences. Tail node is: ",
2846                                 SvPV_nolen_const( mysv )
2847                             );
2848                         });
2849
2850                         /*
2851
2852                            step through the branches, cur represents each
2853                            branch, noper is the first thing to be matched
2854                            as part of that branch and noper_next is the
2855                            regnext() of that node. if noper is an EXACT
2856                            and noper_next is the same as scan (our current
2857                            position in the regex) then the EXACT branch is
2858                            a possible optimization target. Once we have
2859                            two or more consequetive such branches we can
2860                            create a trie of the EXACT's contents and stich
2861                            it in place. If the sequence represents all of
2862                            the branches we eliminate the whole thing and
2863                            replace it with a single TRIE. If it is a
2864                            subsequence then we need to stitch it in. This
2865                            means the first branch has to remain, and needs
2866                            to be repointed at the item on the branch chain
2867                            following the last branch optimized. This could
2868                            be either a BRANCH, in which case the
2869                            subsequence is internal, or it could be the
2870                            item following the branch sequence in which
2871                            case the subsequence is at the end.
2872
2873                         */
2874
2875                         /* dont use tail as the end marker for this traverse */
2876                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2877                             regnode * const noper = NEXTOPER( cur );
2878 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2879                             regnode * const noper_next = regnext( noper );
2880 #endif
2881
2882                             DEBUG_OPTIMISE_r({
2883                                 regprop(RExC_rx, mysv, cur);
2884                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2885                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2886
2887                                 regprop(RExC_rx, mysv, noper);
2888                                 PerlIO_printf( Perl_debug_log, " -> %s",
2889                                     SvPV_nolen_const(mysv));
2890
2891                                 if ( noper_next ) {
2892                                   regprop(RExC_rx, mysv, noper_next );
2893                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2894                                     SvPV_nolen_const(mysv));
2895                                 }
2896                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2897                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2898                             });
2899                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2900                                          : PL_regkind[ OP( noper ) ] == EXACT )
2901                                   || OP(noper) == NOTHING )
2902 #ifdef NOJUMPTRIE
2903                                   && noper_next == tail
2904 #endif
2905                                   && count < U16_MAX)
2906                             {
2907                                 count++;
2908                                 if ( !first || optype == NOTHING ) {
2909                                     if (!first) first = cur;
2910                                     optype = OP( noper );
2911                                 } else {
2912                                     last = cur;
2913                                 }
2914                             } else {
2915 /*
2916     Currently we do not believe that the trie logic can
2917     handle case insensitive matching properly when the
2918     pattern is not unicode (thus forcing unicode semantics).
2919
2920     If/when this is fixed the following define can be swapped
2921     in below to fully enable trie logic.
2922
2923 #define TRIE_TYPE_IS_SAFE 1
2924
2925 */
2926 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2927
2928                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2929                                     make_trie( pRExC_state,
2930                                             startbranch, first, cur, tail, count,
2931                                             optype, depth+1 );
2932                                 }
2933                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2934 #ifdef NOJUMPTRIE
2935                                      && noper_next == tail
2936 #endif
2937                                 ){
2938                                     count = 1;
2939                                     first = cur;
2940                                     optype = OP( noper );
2941                                 } else {
2942                                     count = 0;
2943                                     first = NULL;
2944                                     optype = 0;
2945                                 }
2946                                 last = NULL;
2947                             }
2948                         }
2949                         DEBUG_OPTIMISE_r({
2950                             regprop(RExC_rx, mysv, cur);
2951                             PerlIO_printf( Perl_debug_log,
2952                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2953                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2954
2955                         });
2956
2957                         if ( last && TRIE_TYPE_IS_SAFE ) {
2958                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2959 #ifdef TRIE_STUDY_OPT
2960                             if ( ((made == MADE_EXACT_TRIE &&
2961                                  startbranch == first)
2962                                  || ( first_non_open == first )) &&
2963                                  depth==0 ) {
2964                                 flags |= SCF_TRIE_RESTUDY;
2965                                 if ( startbranch == first
2966                                      && scan == tail )
2967                                 {
2968                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2969                                 }
2970                             }
2971 #endif
2972                         }
2973                     }
2974
2975                 } /* do trie */
2976
2977             }
2978             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2979                 scan = NEXTOPER(NEXTOPER(scan));
2980             } else                      /* single branch is optimized. */
2981                 scan = NEXTOPER(scan);
2982             continue;
2983         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2984             scan_frame *newframe = NULL;
2985             I32 paren;
2986             regnode *start;
2987             regnode *end;
2988
2989             if (OP(scan) != SUSPEND) {
2990             /* set the pointer */
2991                 if (OP(scan) == GOSUB) {
2992                     paren = ARG(scan);
2993                     RExC_recurse[ARG2L(scan)] = scan;
2994                     start = RExC_open_parens[paren-1];
2995                     end   = RExC_close_parens[paren-1];
2996                 } else {
2997                     paren = 0;
2998                     start = RExC_rxi->program + 1;
2999                     end   = RExC_opend;
3000                 }
3001                 if (!recursed) {
3002                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3003                     SAVEFREEPV(recursed);
3004                 }
3005                 if (!PAREN_TEST(recursed,paren+1)) {
3006                     PAREN_SET(recursed,paren+1);
3007                     Newx(newframe,1,scan_frame);
3008                 } else {
3009                     if (flags & SCF_DO_SUBSTR) {
3010                         SCAN_COMMIT(pRExC_state,data,minlenp);
3011                         data->longest = &(data->longest_float);
3012                     }
3013                     is_inf = is_inf_internal = 1;
3014                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3015                         cl_anything(pRExC_state, data->start_class);
3016                     flags &= ~SCF_DO_STCLASS;
3017                 }
3018             } else {
3019                 Newx(newframe,1,scan_frame);
3020                 paren = stopparen;
3021                 start = scan+2;
3022                 end = regnext(scan);
3023             }
3024             if (newframe) {
3025                 assert(start);
3026                 assert(end);
3027                 SAVEFREEPV(newframe);
3028                 newframe->next = regnext(scan);
3029                 newframe->last = last;
3030                 newframe->stop = stopparen;
3031                 newframe->prev = frame;
3032
3033                 frame = newframe;
3034                 scan =  start;
3035                 stopparen = paren;
3036                 last = end;
3037
3038                 continue;
3039             }
3040         }
3041         else if (OP(scan) == EXACT) {
3042             I32 l = STR_LEN(scan);
3043             UV uc;
3044             if (UTF) {
3045                 const U8 * const s = (U8*)STRING(scan);
3046                 l = utf8_length(s, s + l);
3047                 uc = utf8_to_uvchr(s, NULL);
3048             } else {
3049                 uc = *((U8*)STRING(scan));
3050             }
3051             min += l;
3052             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3053                 /* The code below prefers earlier match for fixed
3054                    offset, later match for variable offset.  */
3055                 if (data->last_end == -1) { /* Update the start info. */
3056                     data->last_start_min = data->pos_min;
3057                     data->last_start_max = is_inf
3058                         ? I32_MAX : data->pos_min + data->pos_delta;
3059                 }
3060                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3061                 if (UTF)
3062                     SvUTF8_on(data->last_found);
3063                 {
3064                     SV * const sv = data->last_found;
3065                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3066                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3067                     if (mg && mg->mg_len >= 0)
3068                         mg->mg_len += utf8_length((U8*)STRING(scan),
3069                                                   (U8*)STRING(scan)+STR_LEN(scan));
3070                 }
3071                 data->last_end = data->pos_min + l;
3072                 data->pos_min += l; /* As in the first entry. */
3073                 data->flags &= ~SF_BEFORE_EOL;
3074             }
3075             if (flags & SCF_DO_STCLASS_AND) {
3076                 /* Check whether it is compatible with what we know already! */
3077                 int compat = 1;
3078
3079
3080                 /* If compatibile, we or it in below.  It is compatible if is
3081                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3082                  * it's for a locale.  Even if there isn't unicode semantics
3083                  * here, at runtime there may be because of matching against a
3084                  * utf8 string, so accept a possible false positive for
3085                  * latin1-range folds */
3086                 if (uc >= 0x100 ||
3087                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3088                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3089                     && (!(data->start_class->flags & ANYOF_FOLD)
3090                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3091                     )
3092                     compat = 0;
3093                 ANYOF_CLASS_ZERO(data->start_class);
3094                 ANYOF_BITMAP_ZERO(data->start_class);
3095                 if (compat)
3096                     ANYOF_BITMAP_SET(data->start_class, uc);
3097                 data->start_class->flags &= ~ANYOF_EOS;
3098                 if (uc < 0x100)
3099                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3100             }
3101             else if (flags & SCF_DO_STCLASS_OR) {
3102                 /* false positive possible if the class is case-folded */
3103                 if (uc < 0x100)
3104                     ANYOF_BITMAP_SET(data->start_class, uc);
3105                 else
3106                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3107                 data->start_class->flags &= ~ANYOF_EOS;
3108                 cl_and(data->start_class, and_withp);
3109             }
3110             flags &= ~SCF_DO_STCLASS;
3111         }
3112         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3113             I32 l = STR_LEN(scan);
3114             UV uc = *((U8*)STRING(scan));
3115
3116             /* Search for fixed substrings supports EXACT only. */
3117             if (flags & SCF_DO_SUBSTR) {
3118                 assert(data);
3119                 SCAN_COMMIT(pRExC_state, data, minlenp);
3120             }
3121             if (UTF) {
3122                 const U8 * const s = (U8 *)STRING(scan);
3123                 l = utf8_length(s, s + l);
3124                 uc = utf8_to_uvchr(s, NULL);
3125             }
3126             min += l;
3127             if (flags & SCF_DO_SUBSTR)
3128                 data->pos_min += l;
3129             if (flags & SCF_DO_STCLASS_AND) {
3130                 /* Check whether it is compatible with what we know already! */
3131                 int compat = 1;
3132                 if (uc >= 0x100 ||
3133                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3134                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3135                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3136                 {
3137                     compat = 0;
3138                 }
3139                 ANYOF_CLASS_ZERO(data->start_class);
3140                 ANYOF_BITMAP_ZERO(data->start_class);
3141                 if (compat) {
3142                     ANYOF_BITMAP_SET(data->start_class, uc);
3143                     data->start_class->flags &= ~ANYOF_EOS;
3144                     data->start_class->flags |= ANYOF_FOLD;
3145                     if (OP(scan) == EXACTFL) {
3146                         data->start_class->flags |= ANYOF_LOCALE;
3147                     }
3148                     else {
3149
3150                         /* Also set the other member of the fold pair.  In case
3151                          * that unicode semantics is called for at runtime, use
3152                          * the full latin1 fold.  (Can't do this for locale,
3153                          * because not known until runtime */
3154                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3155                     }
3156                 }
3157             }
3158             else if (flags & SCF_DO_STCLASS_OR) {
3159                 if (data->start_class->flags & ANYOF_FOLD) {
3160                     /* false positive possible if the class is case-folded.
3161                        Assume that the locale settings are the same... */
3162                     if (uc < 0x100) {
3163                         ANYOF_BITMAP_SET(data->start_class, uc);
3164                         if (OP(scan) != EXACTFL) {
3165
3166                             /* And set the other member of the fold pair, but
3167                              * can't do that in locale because not known until
3168                              * run-time */
3169                             ANYOF_BITMAP_SET(data->start_class,
3170                                              PL_fold_latin1[uc]);
3171                         }
3172                     }
3173                     data->start_class->flags &= ~ANYOF_EOS;
3174                 }
3175                 cl_and(data->start_class, and_withp);
3176             }
3177             flags &= ~SCF_DO_STCLASS;
3178         }
3179         else if (REGNODE_VARIES(OP(scan))) {
3180             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3181             I32 f = flags, pos_before = 0;
3182             regnode * const oscan = scan;
3183             struct regnode_charclass_class this_class;
3184             struct regnode_charclass_class *oclass = NULL;
3185             I32 next_is_eval = 0;
3186
3187             switch (PL_regkind[OP(scan)]) {
3188             case WHILEM:                /* End of (?:...)* . */
3189                 scan = NEXTOPER(scan);
3190                 goto finish;
3191             case PLUS:
3192                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3193                     next = NEXTOPER(scan);
3194                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3195                         mincount = 1;
3196                         maxcount = REG_INFTY;
3197                         next = regnext(scan);
3198                         scan = NEXTOPER(scan);
3199                         goto do_curly;
3200                     }
3201                 }
3202                 if (flags & SCF_DO_SUBSTR)
3203                     data->pos_min++;
3204                 min++;
3205                 /* Fall through. */
3206             case STAR:
3207                 if (flags & SCF_DO_STCLASS) {
3208                     mincount = 0;
3209                     maxcount = REG_INFTY;
3210                     next = regnext(scan);
3211                     scan = NEXTOPER(scan);
3212                     goto do_curly;
3213                 }
3214                 is_inf = is_inf_internal = 1;
3215                 scan = regnext(scan);
3216                 if (flags & SCF_DO_SUBSTR) {
3217                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3218                     data->longest = &(data->longest_float);
3219                 }
3220                 goto optimize_curly_tail;
3221             case CURLY:
3222                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3223                     && (scan->flags == stopparen))
3224                 {
3225                     mincount = 1;
3226                     maxcount = 1;
3227                 } else {
3228                     mincount = ARG1(scan);
3229                     maxcount = ARG2(scan);
3230                 }
3231                 next = regnext(scan);
3232                 if (OP(scan) == CURLYX) {
3233                     I32 lp = (data ? *(data->last_closep) : 0);
3234                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3235                 }
3236                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3237                 next_is_eval = (OP(scan) == EVAL);
3238               do_curly:
3239                 if (flags & SCF_DO_SUBSTR) {
3240                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3241                     pos_before = data->pos_min;
3242                 }
3243                 if (data) {
3244                     fl = data->flags;
3245                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3246                     if (is_inf)
3247                         data->flags |= SF_IS_INF;
3248                 }
3249                 if (flags & SCF_DO_STCLASS) {
3250                     cl_init(pRExC_state, &this_class);
3251                     oclass = data->start_class;
3252                     data->start_class = &this_class;
3253                     f |= SCF_DO_STCLASS_AND;
3254                     f &= ~SCF_DO_STCLASS_OR;
3255                 }
3256                 /* Exclude from super-linear cache processing any {n,m}
3257                    regops for which the combination of input pos and regex
3258                    pos is not enough information to determine if a match
3259                    will be possible.
3260
3261                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3262                    regex pos at the \s*, the prospects for a match depend not
3263                    only on the input position but also on how many (bar\s*)
3264                    repeats into the {4,8} we are. */
3265                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3266                     f &= ~SCF_WHILEM_VISITED_POS;
3267
3268                 /* This will finish on WHILEM, setting scan, or on NULL: */
3269                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3270                                       last, data, stopparen, recursed, NULL,
3271                                       (mincount == 0
3272                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3273
3274                 if (flags & SCF_DO_STCLASS)
3275                     data->start_class = oclass;
3276                 if (mincount == 0 || minnext == 0) {
3277                     if (flags & SCF_DO_STCLASS_OR) {
3278                         cl_or(pRExC_state, data->start_class, &this_class);
3279                     }
3280                     else if (flags & SCF_DO_STCLASS_AND) {
3281                         /* Switch to OR mode: cache the old value of
3282                          * data->start_class */
3283                         INIT_AND_WITHP;
3284                         StructCopy(data->start_class, and_withp,
3285                                    struct regnode_charclass_class);
3286                         flags &= ~SCF_DO_STCLASS_AND;
3287                         StructCopy(&this_class, data->start_class,
3288                                    struct regnode_charclass_class);
3289                         flags |= SCF_DO_STCLASS_OR;
3290                         data->start_class->flags |= ANYOF_EOS;
3291                     }
3292                 } else {                /* Non-zero len */
3293                     if (flags & SCF_DO_STCLASS_OR) {
3294                         cl_or(pRExC_state, data->start_class, &this_class);
3295                         cl_and(data->start_class, and_withp);
3296                     }
3297                     else if (flags & SCF_DO_STCLASS_AND)
3298                         cl_and(data->start_class, &this_class);
3299                     flags &= ~SCF_DO_STCLASS;
3300                 }
3301                 if (!scan)              /* It was not CURLYX, but CURLY. */
3302                     scan = next;
3303                 if ( /* ? quantifier ok, except for (?{ ... }) */
3304                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3305                     && (minnext == 0) && (deltanext == 0)
3306                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3307                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3308                 {
3309                     ckWARNreg(RExC_parse,
3310                               "Quantifier unexpected on zero-length expression");
3311                 }
3312
3313                 min += minnext * mincount;
3314                 is_inf_internal |= ((maxcount == REG_INFTY
3315                                      && (minnext + deltanext) > 0)
3316                                     || deltanext == I32_MAX);
3317                 is_inf |= is_inf_internal;
3318                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3319
3320                 /* Try powerful optimization CURLYX => CURLYN. */
3321                 if (  OP(oscan) == CURLYX && data
3322                       && data->flags & SF_IN_PAR
3323                       && !(data->flags & SF_HAS_EVAL)
3324                       && !deltanext && minnext == 1 ) {
3325                     /* Try to optimize to CURLYN.  */
3326                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3327                     regnode * const nxt1 = nxt;
3328 #ifdef DEBUGGING
3329                     regnode *nxt2;
3330 #endif
3331
3332                     /* Skip open. */
3333                     nxt = regnext(nxt);
3334                     if (!REGNODE_SIMPLE(OP(nxt))
3335                         && !(PL_regkind[OP(nxt)] == EXACT
3336                              && STR_LEN(nxt) == 1))
3337                         goto nogo;
3338 #ifdef DEBUGGING
3339                     nxt2 = nxt;
3340 #endif
3341                     nxt = regnext(nxt);
3342                     if (OP(nxt) != CLOSE)
3343                         goto nogo;
3344                     if (RExC_open_parens) {
3345                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3346                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3347                     }
3348                     /* Now we know that nxt2 is the only contents: */
3349                     oscan->flags = (U8)ARG(nxt);
3350                     OP(oscan) = CURLYN;
3351                     OP(nxt1) = NOTHING; /* was OPEN. */
3352
3353 #ifdef DEBUGGING
3354                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3355                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3356                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3357                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3358                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3359                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3360 #endif
3361                 }
3362               nogo:
3363
3364                 /* Try optimization CURLYX => CURLYM. */
3365                 if (  OP(oscan) == CURLYX && data
3366                       && !(data->flags & SF_HAS_PAR)
3367                       && !(data->flags & SF_HAS_EVAL)
3368                       && !deltanext     /* atom is fixed width */
3369                       && minnext != 0   /* CURLYM can't handle zero width */
3370                 ) {
3371                     /* XXXX How to optimize if data == 0? */
3372                     /* Optimize to a simpler form.  */
3373                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3374                     regnode *nxt2;
3375
3376                     OP(oscan) = CURLYM;
3377                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3378                             && (OP(nxt2) != WHILEM))
3379                         nxt = nxt2;
3380                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3381                     /* Need to optimize away parenths. */
3382                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3383                         /* Set the parenth number.  */
3384                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3385
3386                         oscan->flags = (U8)ARG(nxt);
3387                         if (RExC_open_parens) {
3388                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3389                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3390                         }
3391                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3392                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3393
3394 #ifdef DEBUGGING
3395                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3396                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3397                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3398                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3399 #endif
3400 #if 0
3401                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3402                             regnode *nnxt = regnext(nxt1);
3403                             if (nnxt == nxt) {
3404                                 if (reg_off_by_arg[OP(nxt1)])
3405                                     ARG_SET(nxt1, nxt2 - nxt1);
3406                                 else if (nxt2 - nxt1 < U16_MAX)
3407                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3408                                 else
3409                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3410                             }
3411                             nxt1 = nnxt;
3412                         }
3413 #endif
3414                         /* Optimize again: */
3415                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3416                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3417                     }
3418                     else
3419                         oscan->flags = 0;
3420                 }
3421                 else if ((OP(oscan) == CURLYX)
3422                          && (flags & SCF_WHILEM_VISITED_POS)
3423                          /* See the comment on a similar expression above.
3424                             However, this time it's not a subexpression
3425                             we care about, but the expression itself. */
3426                          && (maxcount == REG_INFTY)
3427                          && data && ++data->whilem_c < 16) {
3428                     /* This stays as CURLYX, we can put the count/of pair. */
3429                     /* Find WHILEM (as in regexec.c) */
3430                     regnode *nxt = oscan + NEXT_OFF(oscan);
3431
3432                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3433                         nxt += ARG(nxt);
3434                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3435                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3436                 }
3437                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3438                     pars++;
3439                 if (flags & SCF_DO_SUBSTR) {
3440                     SV *last_str = NULL;
3441                     int counted = mincount != 0;
3442
3443                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3444 #if defined(SPARC64_GCC_WORKAROUND)
3445                         I32 b = 0;
3446                         STRLEN l = 0;
3447                         const char *s = NULL;
3448                         I32 old = 0;
3449
3450                         if (pos_before >= data->last_start_min)
3451                             b = pos_before;
3452                         else
3453                             b = data->last_start_min;
3454
3455                         l = 0;
3456                         s = SvPV_const(data->last_found, l);
3457                         old = b - data->last_start_min;
3458
3459 #else
3460                         I32 b = pos_before >= data->last_start_min
3461                             ? pos_before : data->last_start_min;
3462                         STRLEN l;
3463                         const char * const s = SvPV_const(data->last_found, l);
3464                         I32 old = b - data->last_start_min;
3465 #endif
3466
3467                         if (UTF)
3468                             old = utf8_hop((U8*)s, old) - (U8*)s;
3469                         l -= old;
3470                         /* Get the added string: */
3471                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3472                         if (deltanext == 0 && pos_before == b) {
3473                             /* What was added is a constant string */
3474                             if (mincount > 1) {
3475                                 SvGROW(last_str, (mincount * l) + 1);
3476                                 repeatcpy(SvPVX(last_str) + l,
3477                                           SvPVX_const(last_str), l, mincount - 1);
3478                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3479                                 /* Add additional parts. */
3480                                 SvCUR_set(data->last_found,
3481                                           SvCUR(data->last_found) - l);
3482                                 sv_catsv(data->last_found, last_str);
3483                                 {
3484                                     SV * sv = data->last_found;
3485                                     MAGIC *mg =
3486                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3487                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3488                                     if (mg && mg->mg_len >= 0)
3489                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3490                                 }
3491                                 data->last_end += l * (mincount - 1);
3492                             }
3493                         } else {
3494                             /* start offset must point into the last copy */
3495                             data->last_start_min += minnext * (mincount - 1);
3496                             data->last_start_max += is_inf ? I32_MAX
3497                                 : (maxcount - 1) * (minnext + data->pos_delta);
3498                         }
3499                     }
3500                     /* It is counted once already... */
3501                     data->pos_min += minnext * (mincount - counted);
3502                     data->pos_delta += - counted * deltanext +
3503                         (minnext + deltanext) * maxcount - minnext * mincount;
3504                     if (mincount != maxcount) {
3505                          /* Cannot extend fixed substrings found inside
3506                             the group.  */
3507                         SCAN_COMMIT(pRExC_state,data,minlenp);
3508                         if (mincount && last_str) {
3509                             SV * const sv = data->last_found;
3510                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3511                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3512
3513                             if (mg)
3514                                 mg->mg_len = -1;
3515                             sv_setsv(sv, last_str);
3516                             data->last_end = data->pos_min;
3517                             data->last_start_min =
3518                                 data->pos_min - CHR_SVLEN(last_str);
3519                             data->last_start_max = is_inf
3520                                 ? I32_MAX
3521                                 : data->pos_min + data->pos_delta
3522                                 - CHR_SVLEN(last_str);
3523                         }
3524                         data->longest = &(data->longest_float);
3525                     }
3526                     SvREFCNT_dec(last_str);
3527                 }
3528                 if (data && (fl & SF_HAS_EVAL))
3529                     data->flags |= SF_HAS_EVAL;
3530               optimize_curly_tail:
3531                 if (OP(oscan) != CURLYX) {
3532                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3533                            && NEXT_OFF(next))
3534                         NEXT_OFF(oscan) += NEXT_OFF(next);
3535                 }
3536                 continue;
3537             default:                    /* REF and CLUMP only? */
3538                 if (flags & SCF_DO_SUBSTR) {
3539                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3540                     data->longest = &(data->longest_float);
3541                 }
3542                 is_inf = is_inf_internal = 1;
3543                 if (flags & SCF_DO_STCLASS_OR)
3544                     cl_anything(pRExC_state, data->start_class);
3545                 flags &= ~SCF_DO_STCLASS;
3546                 break;
3547             }
3548         }
3549         else if (OP(scan) == LNBREAK) {
3550             if (flags & SCF_DO_STCLASS) {
3551                 int value = 0;
3552                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3553                 if (flags & SCF_DO_STCLASS_AND) {
3554                     for (value = 0; value < 256; value++)
3555                         if (!is_VERTWS_cp(value))
3556                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3557                 }
3558                 else {
3559                     for (value = 0; value < 256; value++)
3560                         if (is_VERTWS_cp(value))
3561                             ANYOF_BITMAP_SET(data->start_class, value);
3562                 }
3563                 if (flags & SCF_DO_STCLASS_OR)
3564                     cl_and(data->start_class, and_withp);
3565                 flags &= ~SCF_DO_STCLASS;
3566             }
3567             min += 1;
3568             delta += 1;
3569             if (flags & SCF_DO_SUBSTR) {
3570                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3571                 data->pos_min += 1;
3572                 data->pos_delta += 1;
3573                 data->longest = &(data->longest_float);
3574             }
3575         }
3576         else if (OP(scan) == FOLDCHAR) {
3577             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3578             flags &= ~SCF_DO_STCLASS;
3579             min += 1;
3580             delta += d;
3581             if (flags & SCF_DO_SUBSTR) {
3582                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3583                 data->pos_min += 1;
3584                 data->pos_delta += d;
3585                 data->longest = &(data->longest_float);
3586             }
3587         }
3588         else if (REGNODE_SIMPLE(OP(scan))) {
3589             int value = 0;
3590
3591             if (flags & SCF_DO_SUBSTR) {
3592                 SCAN_COMMIT(pRExC_state,data,minlenp);
3593                 data->pos_min++;
3594             }
3595             min++;
3596             if (flags & SCF_DO_STCLASS) {
3597                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3598
3599                 /* Some of the logic below assumes that switching
3600                    locale on will only add false positives. */
3601                 switch (PL_regkind[OP(scan)]) {
3602                 case SANY:
3603                 default:
3604                   do_default:
3605                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3606                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3607                         cl_anything(pRExC_state, data->start_class);
3608                     break;
3609                 case REG_ANY:
3610                     if (OP(scan) == SANY)
3611                         goto do_default;
3612                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3613                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3614                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3615                         cl_anything(pRExC_state, data->start_class);
3616                     }
3617                     if (flags & SCF_DO_STCLASS_AND || !value)
3618                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3619                     break;
3620                 case ANYOF:
3621                     if (flags & SCF_DO_STCLASS_AND)
3622                         cl_and(data->start_class,
3623                                (struct regnode_charclass_class*)scan);
3624                     else
3625                         cl_or(pRExC_state, data->start_class,
3626                               (struct regnode_charclass_class*)scan);
3627                     break;
3628                 case ALNUM:
3629                     if (flags & SCF_DO_STCLASS_AND) {
3630                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3631                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3632                             if (FLAGS(scan) & USE_UNI) {
3633                                 for (value = 0; value < 256; value++) {
3634                                     if (!isWORDCHAR_L1(value)) {
3635                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3636                                     }
3637                                 }
3638                             } else {
3639                                 for (value = 0; value < 256; value++) {
3640                                     if (!isALNUM(value)) {
3641                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3642                                     }
3643                                 }
3644                             }
3645                         }
3646                     }
3647                     else {
3648                         if (data->start_class->flags & ANYOF_LOCALE)
3649                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3650                         else if (FLAGS(scan) & USE_UNI) {
3651                             for (value = 0; value < 256; value++) {
3652                                 if (isWORDCHAR_L1(value)) {
3653                                     ANYOF_BITMAP_SET(data->start_class, value);
3654                                 }
3655                             }
3656                         } else {
3657                             for (value = 0; value < 256; value++) {
3658                                 if (isALNUM(value)) {
3659                                     ANYOF_BITMAP_SET(data->start_class, value);
3660                                 }
3661                             }
3662                         }
3663                     }
3664                     break;
3665                 case ALNUML:
3666                     if (flags & SCF_DO_STCLASS_AND) {
3667                         if (data->start_class->flags & ANYOF_LOCALE)
3668                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3669                     }
3670                     else {
3671                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3672                         data->start_class->flags |= ANYOF_LOCALE;
3673                     }
3674                     break;
3675                 case NALNUM:
3676                     if (flags & SCF_DO_STCLASS_AND) {
3677                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3678                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3679                             if (FLAGS(scan) & USE_UNI) {
3680                                 for (value = 0; value < 256; value++) {
3681                                     if (isWORDCHAR_L1(value)) {
3682                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3683                                     }
3684                                 }
3685                             } else {
3686                                 for (value = 0; value < 256; value++) {
3687                                     if (isALNUM(value)) {
3688                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3689                                     }
3690                                 }
3691                             }
3692                         }
3693                     }
3694                     else {
3695                         if (data->start_class->flags & ANYOF_LOCALE)
3696                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3697                         else {
3698                             for (value = 0; value < 256; value++)
3699                                 if (!isALNUM(value))
3700                                     ANYOF_BITMAP_SET(data->start_class, value);
3701                         }
3702                     }
3703                     break;
3704                 case NALNUML:
3705                     if (flags & SCF_DO_STCLASS_AND) {
3706                         if (data->start_class->flags & ANYOF_LOCALE)
3707                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3708                     }
3709                     else {
3710                         data->start_class->flags |= ANYOF_LOCALE;
3711                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3712                     }
3713                     break;
3714                 case SPACE:
3715                     if (flags & SCF_DO_STCLASS_AND) {
3716                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3717                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3718                             if (FLAGS(scan) & USE_UNI) {
3719                                 for (value = 0; value < 256; value++) {
3720                                     if (!isSPACE_L1(value)) {
3721                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3722                                     }
3723                                 }
3724                             } else {
3725                                 for (value = 0; value < 256; value++) {
3726                                     if (!isSPACE(value)) {
3727                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3728                                     }
3729                                 }
3730                             }
3731                         }
3732                     }
3733                     else {
3734                         if (data->start_class->flags & ANYOF_LOCALE) {
3735                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3736                         }
3737                         else if (FLAGS(scan) & USE_UNI) {
3738                             for (value = 0; value < 256; value++) {
3739                                 if (isSPACE_L1(value)) {
3740                                     ANYOF_BITMAP_SET(data->start_class, value);
3741                                 }
3742                             }
3743                         } else {
3744                             for (value = 0; value < 256; value++) {
3745                                 if (isSPACE(value)) {
3746                                     ANYOF_BITMAP_SET(data->start_class, value);
3747                                 }
3748                             }
3749                         }
3750                     }
3751                     break;
3752                 case SPACEL:
3753                     if (flags & SCF_DO_STCLASS_AND) {
3754                         if (data->start_class->flags & ANYOF_LOCALE)
3755                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3756                     }
3757                     else {
3758                         data->start_class->flags |= ANYOF_LOCALE;
3759                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3760                     }
3761                     break;
3762                 case NSPACE:
3763                     if (flags & SCF_DO_STCLASS_AND) {
3764                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3765                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3766                             if (FLAGS(scan) & USE_UNI) {
3767                                 for (value = 0; value < 256; value++) {
3768                                     if (isSPACE_L1(value)) {
3769                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3770                                     }
3771                                 }
3772                             } else {
3773                                 for (value = 0; value < 256; value++) {
3774                                     if (isSPACE(value)) {
3775                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3776                                     }
3777                                 }
3778                             }
3779                         }
3780                     }
3781                     else {
3782                         if (data->start_class->flags & ANYOF_LOCALE)
3783                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3784                         else if (FLAGS(scan) & USE_UNI) {
3785                             for (value = 0; value < 256; value++) {
3786                                 if (!isSPACE_L1(value)) {
3787                                     ANYOF_BITMAP_SET(data->start_class, value);
3788                                 }
3789                             }
3790                         }
3791                         else {
3792                             for (value = 0; value < 256; value++) {
3793                                 if (!isSPACE(value)) {
3794                                     ANYOF_BITMAP_SET(data->start_class, value);
3795                                 }
3796                             }
3797                         }
3798                     }
3799                     break;
3800                 case NSPACEL:
3801                     if (flags & SCF_DO_STCLASS_AND) {
3802                         if (data->start_class->flags & ANYOF_LOCALE) {
3803                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3804                             for (value = 0; value < 256; value++)
3805                                 if (!isSPACE(value))
3806                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3807                         }
3808                     }
3809                     else {
3810                         data->start_class->flags |= ANYOF_LOCALE;
3811                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3812                     }
3813                     break;
3814                 case DIGIT:
3815                     if (flags & SCF_DO_STCLASS_AND) {
3816                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3817                         for (value = 0; value < 256; value++)
3818                             if (!isDIGIT(value))
3819                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3820                     }
3821                     else {
3822                         if (data->start_class->flags & ANYOF_LOCALE)
3823                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3824                         else {
3825                             for (value = 0; value < 256; value++)
3826                                 if (isDIGIT(value))
3827                                     ANYOF_BITMAP_SET(data->start_class, value);
3828                         }
3829                     }
3830                     break;
3831                 case NDIGIT:
3832                     if (flags & SCF_DO_STCLASS_AND) {
3833                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3834                         for (value = 0; value < 256; value++)
3835                             if (isDIGIT(value))
3836                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3837                     }
3838                     else {
3839                         if (data->start_class->flags & ANYOF_LOCALE)
3840                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3841                         else {
3842                             for (value = 0; value < 256; value++)
3843                                 if (!isDIGIT(value))
3844                                     ANYOF_BITMAP_SET(data->start_class, value);
3845                         }
3846                     }
3847                     break;
3848                 CASE_SYNST_FNC(VERTWS);
3849                 CASE_SYNST_FNC(HORIZWS);
3850
3851                 }
3852                 if (flags & SCF_DO_STCLASS_OR)
3853                     cl_and(data->start_class, and_withp);
3854                 flags &= ~SCF_DO_STCLASS;
3855             }
3856         }
3857         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3858             data->flags |= (OP(scan) == MEOL
3859                             ? SF_BEFORE_MEOL
3860                             : SF_BEFORE_SEOL);
3861         }
3862         else if (  PL_regkind[OP(scan)] == BRANCHJ
3863                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3864                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3865                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3866             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3867                 || OP(scan) == UNLESSM )
3868             {
3869                 /* Negative Lookahead/lookbehind
3870                    In this case we can't do fixed string optimisation.
3871                 */
3872
3873                 I32 deltanext, minnext, fake = 0;
3874                 regnode *nscan;
3875                 struct regnode_charclass_class intrnl;
3876                 int f = 0;
3877
3878                 data_fake.flags = 0;
3879                 if (data) {
3880                     data_fake.whilem_c = data->whilem_c;
3881                     data_fake.last_closep = data->last_closep;
3882                 }
3883                 else
3884                     data_fake.last_closep = &fake;
3885                 data_fake.pos_delta = delta;
3886                 if ( flags & SCF_DO_STCLASS && !scan->flags
3887                      && OP(scan) == IFMATCH ) { /* Lookahead */
3888                     cl_init(pRExC_state, &intrnl);
3889                     data_fake.start_class = &intrnl;
3890                     f |= SCF_DO_STCLASS_AND;
3891                 }
3892                 if (flags & SCF_WHILEM_VISITED_POS)
3893                     f |= SCF_WHILEM_VISITED_POS;
3894                 next = regnext(scan);
3895                 nscan = NEXTOPER(NEXTOPER(scan));
3896                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3897                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3898                 if (scan->flags) {
3899                     if (deltanext) {
3900                         FAIL("Variable length lookbehind not implemented");
3901                     }
3902                     else if (minnext > (I32)U8_MAX) {
3903                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3904                     }
3905                     scan->flags = (U8)minnext;
3906                 }
3907                 if (data) {
3908                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3909                         pars++;
3910                     if (data_fake.flags & SF_HAS_EVAL)
3911                         data->flags |= SF_HAS_EVAL;
3912                     data->whilem_c = data_fake.whilem_c;
3913                 }
3914                 if (f & SCF_DO_STCLASS_AND) {
3915                     if (flags & SCF_DO_STCLASS_OR) {
3916                         /* OR before, AND after: ideally we would recurse with
3917                          * data_fake to get the AND applied by study of the
3918                          * remainder of the pattern, and then derecurse;
3919                          * *** HACK *** for now just treat as "no information".
3920                          * See [perl #56690].
3921                          */
3922                         cl_init(pRExC_state, data->start_class);
3923                     }  else {
3924                         /* AND before and after: combine and continue */
3925                         const int was = (data->start_class->flags & ANYOF_EOS);
3926
3927                         cl_and(data->start_class, &intrnl);
3928                         if (was)
3929                             data->start_class->flags |= ANYOF_EOS;
3930                     }
3931                 }
3932             }
3933 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3934             else {
3935                 /* Positive Lookahead/lookbehind
3936                    In this case we can do fixed string optimisation,
3937                    but we must be careful about it. Note in the case of
3938                    lookbehind the positions will be offset by the minimum
3939                    length of the pattern, something we won't know about
3940                    until after the recurse.
3941                 */
3942                 I32 deltanext, fake = 0;
3943                 regnode *nscan;
3944                 struct regnode_charclass_class intrnl;
3945                 int f = 0;
3946                 /* We use SAVEFREEPV so that when the full compile
3947                     is finished perl will clean up the allocated
3948                     minlens when it's all done. This way we don't
3949                     have to worry about freeing them when we know
3950                     they wont be used, which would be a pain.
3951                  */
3952                 I32 *minnextp;
3953                 Newx( minnextp, 1, I32 );
3954                 SAVEFREEPV(minnextp);
3955
3956                 if (data) {
3957                     StructCopy(data, &data_fake, scan_data_t);
3958                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3959                         f |= SCF_DO_SUBSTR;
3960                         if (scan->flags)
3961                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3962                         data_fake.last_found=newSVsv(data->last_found);
3963                     }
3964                 }
3965                 else
3966                     data_fake.last_closep = &fake;
3967                 data_fake.flags = 0;
3968                 data_fake.pos_delta = delta;
3969                 if (is_inf)
3970                     data_fake.flags |= SF_IS_INF;
3971                 if ( flags & SCF_DO_STCLASS && !scan->flags
3972                      && OP(scan) == IFMATCH ) { /* Lookahead */
3973                     cl_init(pRExC_state, &intrnl);
3974                     data_fake.start_class = &intrnl;
3975                     f |= SCF_DO_STCLASS_AND;
3976                 }
3977                 if (flags & SCF_WHILEM_VISITED_POS)
3978                     f |= SCF_WHILEM_VISITED_POS;
3979                 next = regnext(scan);
3980                 nscan = NEXTOPER(NEXTOPER(scan));
3981
3982                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3983                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3984                 if (scan->flags) {
3985                     if (deltanext) {
3986                         FAIL("Variable length lookbehind not implemented");
3987                     }
3988                     else if (*minnextp > (I32)U8_MAX) {
3989                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3990                     }
3991                     scan->flags = (U8)*minnextp;
3992                 }
3993
3994                 *minnextp += min;
3995
3996                 if (f & SCF_DO_STCLASS_AND) {
3997                     const int was = (data->start_class->flags & ANYOF_EOS);
3998
3999                     cl_and(data->start_class, &intrnl);
4000                     if (was)
4001                         data->start_class->flags |= ANYOF_EOS;
4002                 }
4003                 if (data) {
4004                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4005                         pars++;
4006                     if (data_fake.flags & SF_HAS_EVAL)
4007                         data->flags |= SF_HAS_EVAL;
4008                     data->whilem_c = data_fake.whilem_c;
4009                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4010                         if (RExC_rx->minlen<*minnextp)
4011                             RExC_rx->minlen=*minnextp;
4012                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4013                         SvREFCNT_dec(data_fake.last_found);
4014
4015                         if ( data_fake.minlen_fixed != minlenp )
4016                         {
4017                             data->offset_fixed= data_fake.offset_fixed;
4018                             data->minlen_fixed= data_fake.minlen_fixed;
4019                             data->lookbehind_fixed+= scan->flags;
4020                         }
4021                         if ( data_fake.minlen_float != minlenp )
4022                         {
4023                             data->minlen_float= data_fake.minlen_float;
4024                             data->offset_float_min=data_fake.offset_float_min;
4025                             data->offset_float_max=data_fake.offset_float_max;
4026                             data->lookbehind_float+= scan->flags;
4027                         }
4028                     }
4029                 }
4030
4031
4032             }
4033 #endif
4034         }
4035         else if (OP(scan) == OPEN) {
4036             if (stopparen != (I32)ARG(scan))
4037                 pars++;
4038         }
4039         else if (OP(scan) == CLOSE) {
4040             if (stopparen == (I32)ARG(scan)) {
4041                 break;
4042             }
4043             if ((I32)ARG(scan) == is_par) {
4044                 next = regnext(scan);
4045
4046                 if ( next && (OP(next) != WHILEM) && next < last)
4047                     is_par = 0;         /* Disable optimization */
4048             }
4049             if (data)
4050                 *(data->last_closep) = ARG(scan);
4051         }
4052         else if (OP(scan) == EVAL) {
4053                 if (data)
4054                     data->flags |= SF_HAS_EVAL;
4055         }
4056         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4057             if (flags & SCF_DO_SUBSTR) {
4058                 SCAN_COMMIT(pRExC_state,data,minlenp);
4059                 flags &= ~SCF_DO_SUBSTR;
4060             }
4061             if (data && OP(scan)==ACCEPT) {
4062                 data->flags |= SCF_SEEN_ACCEPT;
4063                 if (stopmin > min)
4064                     stopmin = min;
4065             }
4066         }
4067         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4068         {
4069                 if (flags & SCF_DO_SUBSTR) {
4070                     SCAN_COMMIT(pRExC_state,data,minlenp);
4071                     data->longest = &(data->longest_float);
4072                 }
4073                 is_inf = is_inf_internal = 1;
4074                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4075                     cl_anything(pRExC_state, data->start_class);
4076                 flags &= ~SCF_DO_STCLASS;
4077         }
4078         else if (OP(scan) == GPOS) {
4079             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4080                 !(delta || is_inf || (data && data->pos_delta)))
4081             {
4082                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4083                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4084                 if (RExC_rx->gofs < (U32)min)
4085                     RExC_rx->gofs = min;
4086             } else {
4087                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4088                 RExC_rx->gofs = 0;
4089             }
4090         }
4091 #ifdef TRIE_STUDY_OPT
4092 #ifdef FULL_TRIE_STUDY
4093         else if (PL_regkind[OP(scan)] == TRIE) {
4094             /* NOTE - There is similar code to this block above for handling
4095                BRANCH nodes on the initial study.  If you change stuff here
4096                check there too. */
4097             regnode *trie_node= scan;
4098             regnode *tail= regnext(scan);
4099             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4100             I32 max1 = 0, min1 = I32_MAX;
4101             struct regnode_charclass_class accum;
4102
4103             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4104                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4105             if (flags & SCF_DO_STCLASS)
4106                 cl_init_zero(pRExC_state, &accum);
4107
4108             if (!trie->jump) {
4109                 min1= trie->minlen;
4110                 max1= trie->maxlen;
4111             } else {
4112                 const regnode *nextbranch= NULL;
4113                 U32 word;
4114
4115                 for ( word=1 ; word <= trie->wordcount ; word++)
4116                 {
4117                     I32 deltanext=0, minnext=0, f = 0, fake;
4118                     struct regnode_charclass_class this_class;
4119
4120                     data_fake.flags = 0;
4121                     if (data) {
4122                         data_fake.whilem_c = data->whilem_c;
4123                         data_fake.last_closep = data->last_closep;
4124                     }
4125                     else
4126                         data_fake.last_closep = &fake;
4127                     data_fake.pos_delta = delta;
4128                     if (flags & SCF_DO_STCLASS) {
4129                         cl_init(pRExC_state, &this_class);
4130                         data_fake.start_class = &this_class;
4131                         f = SCF_DO_STCLASS_AND;
4132                     }
4133                     if (flags & SCF_WHILEM_VISITED_POS)
4134                         f |= SCF_WHILEM_VISITED_POS;
4135
4136                     if (trie->jump[word]) {
4137                         if (!nextbranch)
4138                             nextbranch = trie_node + trie->jump[0];
4139                         scan= trie_node + trie->jump[word];
4140                         /* We go from the jump point to the branch that follows
4141                            it. Note this means we need the vestigal unused branches
4142                            even though they arent otherwise used.
4143                          */
4144                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4145                             &deltanext, (regnode *)nextbranch, &data_fake,
4146                             stopparen, recursed, NULL, f,depth+1);
4147                     }
4148                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4149                         nextbranch= regnext((regnode*)nextbranch);
4150
4151                     if (min1 > (I32)(minnext + trie->minlen))
4152                         min1 = minnext + trie->minlen;
4153                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4154                         max1 = minnext + deltanext + trie->maxlen;
4155                     if (deltanext == I32_MAX)
4156                         is_inf = is_inf_internal = 1;
4157
4158                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4159                         pars++;
4160                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4161                         if ( stopmin > min + min1)
4162                             stopmin = min + min1;
4163                         flags &= ~SCF_DO_SUBSTR;
4164                         if (data)
4165                             data->flags |= SCF_SEEN_ACCEPT;
4166                     }
4167                     if (data) {
4168                         if (data_fake.flags & SF_HAS_EVAL)
4169                             data->flags |= SF_HAS_EVAL;
4170                         data->whilem_c = data_fake.whilem_c;
4171                     }
4172                     if (flags & SCF_DO_STCLASS)
4173                         cl_or(pRExC_state, &accum, &this_class);
4174                 }
4175             }
4176             if (flags & SCF_DO_SUBSTR) {
4177                 data->pos_min += min1;
4178                 data->pos_delta += max1 - min1;
4179                 if (max1 != min1 || is_inf)
4180                     data->longest = &(data->longest_float);
4181             }
4182             min += min1;
4183             delta += max1 - min1;
4184             if (flags & SCF_DO_STCLASS_OR) {
4185                 cl_or(pRExC_state, data->start_class, &accum);
4186                 if (min1) {
4187                     cl_and(data->start_class, and_withp);
4188                     flags &= ~SCF_DO_STCLASS;
4189                 }
4190             }
4191             else if (flags & SCF_DO_STCLASS_AND) {
4192                 if (min1) {
4193                     cl_and(data->start_class, &accum);
4194                     flags &= ~SCF_DO_STCLASS;
4195                 }
4196                 else {
4197                     /* Switch to OR mode: cache the old value of
4198                      * data->start_class */
4199                     INIT_AND_WITHP;
4200                     StructCopy(data->start_class, and_withp,
4201                                struct regnode_charclass_class);
4202                     flags &= ~SCF_DO_STCLASS_AND;
4203                     StructCopy(&accum, data->start_class,
4204                                struct regnode_charclass_class);
4205                     flags |= SCF_DO_STCLASS_OR;
4206                     data->start_class->flags |= ANYOF_EOS;
4207                 }
4208             }
4209             scan= tail;
4210             continue;
4211         }
4212 #else
4213         else if (PL_regkind[OP(scan)] == TRIE) {
4214             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4215             U8*bang=NULL;
4216
4217             min += trie->minlen;
4218             delta += (trie->maxlen - trie->minlen);
4219             flags &= ~SCF_DO_STCLASS; /* xxx */
4220             if (flags & SCF_DO_SUBSTR) {
4221                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4222                 data->pos_min += trie->minlen;
4223                 data->pos_delta += (trie->maxlen - trie->minlen);
4224                 if (trie->maxlen != trie->minlen)
4225                     data->longest = &(data->longest_float);
4226             }
4227             if (trie->jump) /* no more substrings -- for now /grr*/
4228                 flags &= ~SCF_DO_SUBSTR;
4229         }
4230 #endif /* old or new */
4231 #endif /* TRIE_STUDY_OPT */
4232
4233         /* Else: zero-length, ignore. */
4234         scan = regnext(scan);
4235     }
4236     if (frame) {
4237         last = frame->last;
4238         scan = frame->next;
4239         stopparen = frame->stop;
4240         frame = frame->prev;
4241         goto fake_study_recurse;
4242     }
4243
4244   finish:
4245     assert(!frame);
4246     DEBUG_STUDYDATA("pre-fin:",data,depth);
4247
4248     *scanp = scan;
4249     *deltap = is_inf_internal ? I32_MAX : delta;
4250     if (flags & SCF_DO_SUBSTR && is_inf)
4251         data->pos_delta = I32_MAX - data->pos_min;
4252     if (is_par > (I32)U8_MAX)
4253         is_par = 0;
4254     if (is_par && pars==1 && data) {
4255         data->flags |= SF_IN_PAR;
4256         data->flags &= ~SF_HAS_PAR;
4257     }
4258     else if (pars && data) {
4259         data->flags |= SF_HAS_PAR;
4260         data->flags &= ~SF_IN_PAR;
4261     }
4262     if (flags & SCF_DO_STCLASS_OR)
4263         cl_and(data->start_class, and_withp);
4264     if (flags & SCF_TRIE_RESTUDY)
4265         data->flags |=  SCF_TRIE_RESTUDY;
4266
4267     DEBUG_STUDYDATA("post-fin:",data,depth);
4268
4269     return min < stopmin ? min : stopmin;
4270 }
4271
4272 STATIC U32
4273 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4274 {
4275     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4276
4277     PERL_ARGS_ASSERT_ADD_DATA;
4278
4279     Renewc(RExC_rxi->data,
4280            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4281            char, struct reg_data);
4282     if(count)
4283         Renew(RExC_rxi->data->what, count + n, U8);
4284     else
4285         Newx(RExC_rxi->data->what, n, U8);
4286     RExC_rxi->data->count = count + n;
4287     Copy(s, RExC_rxi->data->what + count, n, U8);
4288     return count;
4289 }
4290
4291 /*XXX: todo make this not included in a non debugging perl */
4292 #ifndef PERL_IN_XSUB_RE
4293 void
4294 Perl_reginitcolors(pTHX)
4295 {
4296     dVAR;
4297     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4298     if (s) {
4299         char *t = savepv(s);
4300         int i = 0;
4301         PL_colors[0] = t;
4302         while (++i < 6) {
4303             t = strchr(t, '\t');
4304             if (t) {
4305                 *t = '\0';
4306                 PL_colors[i] = ++t;
4307             }
4308             else
4309                 PL_colors[i] = t = (char *)"";
4310         }
4311     } else {
4312         int i = 0;
4313         while (i < 6)
4314             PL_colors[i++] = (char *)"";
4315     }
4316     PL_colorset = 1;
4317 }
4318 #endif
4319
4320
4321 #ifdef TRIE_STUDY_OPT
4322 #define CHECK_RESTUDY_GOTO                                  \
4323         if (                                                \
4324               (data.flags & SCF_TRIE_RESTUDY)               \
4325               && ! restudied++                              \
4326         )     goto reStudy
4327 #else
4328 #define CHECK_RESTUDY_GOTO
4329 #endif
4330
4331 /*
4332  - pregcomp - compile a regular expression into internal code
4333  *
4334  * We can't allocate space until we know how big the compiled form will be,
4335  * but we can't compile it (and thus know how big it is) until we've got a
4336  * place to put the code.  So we cheat:  we compile it twice, once with code
4337  * generation turned off and size counting turned on, and once "for real".
4338  * This also means that we don't allocate space until we are sure that the
4339  * thing really will compile successfully, and we never have to move the
4340  * code and thus invalidate pointers into it.  (Note that it has to be in
4341  * one piece because free() must be able to free it all.) [NB: not true in perl]
4342  *
4343  * Beware that the optimization-preparation code in here knows about some
4344  * of the structure of the compiled regexp.  [I'll say.]
4345  */
4346
4347
4348
4349 #ifndef PERL_IN_XSUB_RE
4350 #define RE_ENGINE_PTR &reh_regexp_engine
4351 #else
4352 extern const struct regexp_engine my_reg_engine;
4353 #define RE_ENGINE_PTR &my_reg_engine
4354 #endif
4355
4356 #ifndef PERL_IN_XSUB_RE
4357 REGEXP *
4358 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4359 {
4360     dVAR;
4361     HV * const table = GvHV(PL_hintgv);
4362
4363     PERL_ARGS_ASSERT_PREGCOMP;
4364
4365     /* Dispatch a request to compile a regexp to correct
4366        regexp engine. */
4367     if (table) {
4368         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4369         GET_RE_DEBUG_FLAGS_DECL;
4370         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4371             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4372             DEBUG_COMPILE_r({
4373                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4374                     SvIV(*ptr));
4375             });
4376             return CALLREGCOMP_ENG(eng, pattern, flags);
4377         }
4378     }
4379     return Perl_re_compile(aTHX_ pattern, flags);
4380 }
4381 #endif
4382
4383 REGEXP *
4384 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4385 {
4386     dVAR;
4387     REGEXP *rx;
4388     struct regexp *r;
4389     register regexp_internal *ri;
4390     STRLEN plen;
4391     char  *exp;
4392     char* xend;
4393     regnode *scan;
4394     I32 flags;
4395     I32 minlen = 0;
4396     U32 pm_flags;
4397
4398     /* these are all flags - maybe they should be turned
4399      * into a single int with different bit masks */
4400     I32 sawlookahead = 0;
4401     I32 sawplus = 0;
4402     I32 sawopen = 0;
4403     bool used_setjump = FALSE;
4404
4405     U8 jump_ret = 0;
4406     dJMPENV;
4407     scan_data_t data;
4408     RExC_state_t RExC_state;
4409     RExC_state_t * const pRExC_state = &RExC_state;
4410 #ifdef TRIE_STUDY_OPT
4411     int restudied;
4412     RExC_state_t copyRExC_state;
4413 #endif
4414     GET_RE_DEBUG_FLAGS_DECL;
4415
4416     PERL_ARGS_ASSERT_RE_COMPILE;
4417
4418     DEBUG_r(if (!PL_colorset) reginitcolors());
4419
4420     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4421
4422     /****************** LONG JUMP TARGET HERE***********************/
4423     /* Longjmp back to here if have to switch in midstream to utf8 */
4424     if (! RExC_orig_utf8) {
4425         JMPENV_PUSH(jump_ret);
4426         used_setjump = TRUE;
4427     }
4428
4429     if (jump_ret == 0) {    /* First time through */
4430         exp = SvPV(pattern, plen);
4431         xend = exp + plen;
4432         /* ignore the utf8ness if the pattern is 0 length */
4433         if (plen == 0) {
4434             RExC_utf8 = RExC_orig_utf8 = 0;
4435         }
4436
4437         DEBUG_COMPILE_r({
4438             SV *dsv= sv_newmortal();
4439             RE_PV_QUOTED_DECL(s, RExC_utf8,
4440                 dsv, exp, plen, 60);
4441             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4442                            PL_colors[4],PL_colors[5],s);
4443         });
4444     }
4445     else {  /* longjumped back */
4446         STRLEN len = plen;
4447
4448         /* If the cause for the longjmp was other than changing to utf8, pop
4449          * our own setjmp, and longjmp to the correct handler */
4450         if (jump_ret != UTF8_LONGJMP) {
4451             JMPENV_POP;
4452             JMPENV_JUMP(jump_ret);
4453         }
4454
4455         GET_RE_DEBUG_FLAGS;
4456
4457         /* It's possible to write a regexp in ascii that represents Unicode
4458         codepoints outside of the byte range, such as via \x{100}. If we
4459         detect such a sequence we have to convert the entire pattern to utf8
4460         and then recompile, as our sizing calculation will have been based
4461         on 1 byte == 1 character, but we will need to use utf8 to encode
4462         at least some part of the pattern, and therefore must convert the whole
4463         thing.
4464         -- dmq */
4465         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4466             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4467         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4468         xend = exp + len;
4469         RExC_orig_utf8 = RExC_utf8 = 1;
4470         SAVEFREEPV(exp);
4471     }
4472
4473 #ifdef TRIE_STUDY_OPT
4474     restudied = 0;
4475 #endif
4476
4477     /* Set to use unicode semantics if the pattern is in utf8 and has the
4478      * 'dual' charset specified, as it means unicode when utf8  */
4479     pm_flags = orig_pm_flags;
4480     if (RExC_utf8  && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
4481         pm_flags |= RXf_PMf_UNICODE;
4482     }
4483
4484     RExC_precomp = exp;
4485     RExC_flags = pm_flags;
4486     RExC_sawback = 0;
4487
4488     RExC_seen = 0;
4489     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4490     RExC_seen_evals = 0;
4491     RExC_extralen = 0;
4492
4493     /* First pass: determine size, legality. */
4494     RExC_parse = exp;
4495     RExC_start = exp;
4496     RExC_end = xend;
4497     RExC_naughty = 0;
4498     RExC_npar = 1;
4499     RExC_nestroot = 0;
4500     RExC_size = 0L;
4501     RExC_emit = &PL_regdummy;
4502     RExC_whilem_seen = 0;
4503     RExC_open_parens = NULL;
4504     RExC_close_parens = NULL;
4505     RExC_opend = NULL;
4506     RExC_paren_names = NULL;
4507 #ifdef DEBUGGING
4508     RExC_paren_name_list = NULL;
4509 #endif
4510     RExC_recurse = NULL;
4511     RExC_recurse_count = 0;
4512
4513 #if 0 /* REGC() is (currently) a NOP at the first pass.
4514        * Clever compilers notice this and complain. --jhi */
4515     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4516 #endif
4517     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4518     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4519         RExC_precomp = NULL;
4520         return(NULL);
4521     }
4522
4523     /* Here, finished first pass.  Get rid of any added setjmp */
4524     if (used_setjump) {
4525         JMPENV_POP;
4526     }
4527     DEBUG_PARSE_r({
4528         PerlIO_printf(Perl_debug_log,
4529             "Required size %"IVdf" nodes\n"
4530             "Starting second pass (creation)\n",
4531             (IV)RExC_size);
4532         RExC_lastnum=0;
4533         RExC_lastparse=NULL;
4534     });
4535     /* Small enough for pointer-storage convention?
4536        If extralen==0, this means that we will not need long jumps. */
4537     if (RExC_size >= 0x10000L && RExC_extralen)
4538         RExC_size += RExC_extralen;
4539     else
4540         RExC_extralen = 0;
4541     if (RExC_whilem_seen > 15)
4542         RExC_whilem_seen = 15;
4543
4544     /* Allocate space and zero-initialize. Note, the two step process
4545        of zeroing when in debug mode, thus anything assigned has to
4546        happen after that */
4547     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4548     r = (struct regexp*)SvANY(rx);
4549     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4550          char, regexp_internal);
4551     if ( r == NULL || ri == NULL )
4552         FAIL("Regexp out of space");
4553 #ifdef DEBUGGING
4554     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4555     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4556 #else
4557     /* bulk initialize base fields with 0. */
4558     Zero(ri, sizeof(regexp_internal), char);
4559 #endif
4560
4561     /* non-zero initialization begins here */
4562     RXi_SET( r, ri );
4563     r->engine= RE_ENGINE_PTR;
4564     r->extflags = pm_flags;
4565     {
4566         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4567         bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
4568
4569         /* The caret is output if there are any defaults: if not all the STD
4570          * flags are set, or if no character set specifier is needed */
4571         bool has_default =
4572                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4573                     || ! has_charset);
4574         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4575         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4576                             >> RXf_PMf_STD_PMMOD_SHIFT);
4577         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4578         char *p;
4579         /* Allocate for the worst case, which is all the std flags are turned
4580          * on.  If more precision is desired, we could do a population count of
4581          * the flags set.  This could be done with a small lookup table, or by
4582          * shifting, masking and adding, or even, when available, assembly
4583          * language for a machine-language population count.
4584          * We never output a minus, as all those are defaults, so are
4585          * covered by the caret */
4586         const STRLEN wraplen = plen + has_p + has_runon
4587             + has_default       /* If needs a caret */
4588             + has_charset       /* If needs a character set specifier */
4589             + (sizeof(STD_PAT_MODS) - 1)
4590             + (sizeof("(?:)") - 1);
4591
4592         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4593         SvPOK_on(rx);
4594         SvFLAGS(rx) |= SvUTF8(pattern);
4595         *p++='('; *p++='?';
4596
4597         /* If a default, cover it using the caret */
4598         if (has_default) {
4599             *p++= DEFAULT_PAT_MOD;
4600         }
4601         if (has_charset) {
4602             if (r->extflags & RXf_PMf_LOCALE) {
4603                 *p++ = LOCALE_PAT_MOD;
4604             } else {
4605                 *p++ = UNICODE_PAT_MOD;
4606             }
4607         }
4608         if (has_p)
4609             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4610         {
4611             char ch;
4612             while((ch = *fptr++)) {
4613                 if(reganch & 1)
4614                     *p++ = ch;
4615                 reganch >>= 1;
4616             }
4617         }
4618
4619         *p++ = ':';
4620         Copy(RExC_precomp, p, plen, char);
4621         assert ((RX_WRAPPED(rx) - p) < 16);
4622         r->pre_prefix = p - RX_WRAPPED(rx);
4623         p += plen;
4624         if (has_runon)
4625             *p++ = '\n';
4626         *p++ = ')';
4627         *p = 0;
4628         SvCUR_set(rx, p - SvPVX_const(rx));
4629     }
4630
4631     r->intflags = 0;
4632     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4633
4634     if (RExC_seen & REG_SEEN_RECURSE) {
4635         Newxz(RExC_open_parens, RExC_npar,regnode *);
4636         SAVEFREEPV(RExC_open_parens);
4637         Newxz(RExC_close_parens,RExC_npar,regnode *);
4638         SAVEFREEPV(RExC_close_parens);
4639     }
4640
4641     /* Useful during FAIL. */
4642 #ifdef RE_TRACK_PATTERN_OFFSETS
4643     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4644     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4645                           "%s %"UVuf" bytes for offset annotations.\n",
4646                           ri->u.offsets ? "Got" : "Couldn't get",
4647                           (UV)((2*RExC_size+1) * sizeof(U32))));
4648 #endif
4649     SetProgLen(ri,RExC_size);
4650     RExC_rx_sv = rx;
4651     RExC_rx = r;
4652     RExC_rxi = ri;
4653     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4654
4655     /* Second pass: emit code. */
4656     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4657     RExC_parse = exp;
4658     RExC_end = xend;
4659     RExC_naughty = 0;
4660     RExC_npar = 1;
4661     RExC_emit_start = ri->program;
4662     RExC_emit = ri->program;
4663     RExC_emit_bound = ri->program + RExC_size + 1;
4664
4665     /* Store the count of eval-groups for security checks: */
4666     RExC_rx->seen_evals = RExC_seen_evals;
4667     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4668     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4669         ReREFCNT_dec(rx);
4670         return(NULL);
4671     }
4672     /* XXXX To minimize changes to RE engine we always allocate
4673        3-units-long substrs field. */
4674     Newx(r->substrs, 1, struct reg_substr_data);
4675     if (RExC_recurse_count) {
4676         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4677         SAVEFREEPV(RExC_recurse);
4678     }
4679
4680 reStudy:
4681     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4682     Zero(r->substrs, 1, struct reg_substr_data);
4683
4684 #ifdef TRIE_STUDY_OPT
4685     if (!restudied) {
4686         StructCopy(&zero_scan_data, &data, scan_data_t);
4687         copyRExC_state = RExC_state;
4688     } else {
4689         U32 seen=RExC_seen;
4690         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4691
4692         RExC_state = copyRExC_state;
4693         if (seen & REG_TOP_LEVEL_BRANCHES)
4694             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4695         else
4696             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4697         if (data.last_found) {
4698             SvREFCNT_dec(data.longest_fixed);
4699             SvREFCNT_dec(data.longest_float);
4700             SvREFCNT_dec(data.last_found);
4701         }
4702         StructCopy(&zero_scan_data, &data, scan_data_t);
4703     }
4704 #else
4705     StructCopy(&zero_scan_data, &data, scan_data_t);
4706 #endif
4707
4708     /* Dig out information for optimizations. */
4709     r->extflags = RExC_flags; /* was pm_op */
4710     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4711
4712     if (UTF)
4713         SvUTF8_on(rx);  /* Unicode in it? */
4714     ri->regstclass = NULL;
4715     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4716         r->intflags |= PREGf_NAUGHTY;
4717     scan = ri->program + 1;             /* First BRANCH. */
4718
4719     /* testing for BRANCH here tells us whether there is "must appear"
4720        data in the pattern. If there is then we can use it for optimisations */
4721     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4722         I32 fake;
4723         STRLEN longest_float_length, longest_fixed_length;
4724         struct regnode_charclass_class ch_class; /* pointed to by data */
4725         int stclass_flag;
4726         I32 last_close = 0; /* pointed to by data */
4727         regnode *first= scan;
4728         regnode *first_next= regnext(first);
4729         /*
4730          * Skip introductions and multiplicators >= 1
4731          * so that we can extract the 'meat' of the pattern that must
4732          * match in the large if() sequence following.
4733          * NOTE that EXACT is NOT covered here, as it is normally
4734          * picked up by the optimiser separately.
4735          *
4736          * This is unfortunate as the optimiser isnt handling lookahead
4737          * properly currently.
4738          *
4739          */
4740         while ((OP(first) == OPEN && (sawopen = 1)) ||
4741                /* An OR of *one* alternative - should not happen now. */
4742             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4743             /* for now we can't handle lookbehind IFMATCH*/
4744             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4745             (OP(first) == PLUS) ||
4746             (OP(first) == MINMOD) ||
4747                /* An {n,m} with n>0 */
4748             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4749             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4750         {
4751                 /*
4752                  * the only op that could be a regnode is PLUS, all the rest
4753                  * will be regnode_1 or regnode_2.
4754                  *
4755                  */
4756                 if (OP(first) == PLUS)
4757                     sawplus = 1;
4758                 else
4759                     first += regarglen[OP(first)];
4760
4761                 first = NEXTOPER(first);
4762                 first_next= regnext(first);
4763         }
4764
4765         /* Starting-point info. */
4766       again:
4767         DEBUG_PEEP("first:",first,0);
4768         /* Ignore EXACT as we deal with it later. */
4769         if (PL_regkind[OP(first)] == EXACT) {
4770             if (OP(first) == EXACT)
4771                 NOOP;   /* Empty, get anchored substr later. */
4772             else
4773                 ri->regstclass = first;
4774         }
4775 #ifdef TRIE_STCLASS
4776         else if (PL_regkind[OP(first)] == TRIE &&
4777                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4778         {
4779             regnode *trie_op;
4780             /* this can happen only on restudy */
4781             if ( OP(first) == TRIE ) {
4782                 struct regnode_1 *trieop = (struct regnode_1 *)
4783                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4784                 StructCopy(first,trieop,struct regnode_1);
4785                 trie_op=(regnode *)trieop;
4786             } else {
4787                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4788                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4789                 StructCopy(first,trieop,struct regnode_charclass);
4790                 trie_op=(regnode *)trieop;
4791             }
4792             OP(trie_op)+=2;
4793             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4794             ri->regstclass = trie_op;
4795         }
4796 #endif
4797         else if (REGNODE_SIMPLE(OP(first)))
4798             ri->regstclass = first;
4799         else if (PL_regkind[OP(first)] == BOUND ||
4800                  PL_regkind[OP(first)] == NBOUND)
4801             ri->regstclass = first;
4802         else if (PL_regkind[OP(first)] == BOL) {
4803             r->extflags |= (OP(first) == MBOL
4804                            ? RXf_ANCH_MBOL
4805                            : (OP(first) == SBOL
4806                               ? RXf_ANCH_SBOL
4807                               : RXf_ANCH_BOL));
4808             first = NEXTOPER(first);
4809             goto again;
4810         }
4811         else if (OP(first) == GPOS) {
4812             r->extflags |= RXf_ANCH_GPOS;
4813             first = NEXTOPER(first);
4814             goto again;
4815         }
4816         else if ((!sawopen || !RExC_sawback) &&
4817             (OP(first) == STAR &&
4818             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4819             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4820         {
4821             /* turn .* into ^.* with an implied $*=1 */
4822             const int type =
4823                 (OP(NEXTOPER(first)) == REG_ANY)
4824                     ? RXf_ANCH_MBOL
4825                     : RXf_ANCH_SBOL;
4826             r->extflags |= type;
4827             r->intflags |= PREGf_IMPLICIT;
4828             first = NEXTOPER(first);
4829             goto again;
4830         }
4831         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4832             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4833             /* x+ must match at the 1st pos of run of x's */
4834             r->intflags |= PREGf_SKIP;
4835
4836         /* Scan is after the zeroth branch, first is atomic matcher. */
4837 #ifdef TRIE_STUDY_OPT
4838         DEBUG_PARSE_r(
4839             if (!restudied)
4840                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4841                               (IV)(first - scan + 1))
4842         );
4843 #else
4844         DEBUG_PARSE_r(
4845             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4846                 (IV)(first - scan + 1))
4847         );
4848 #endif
4849
4850
4851         /*
4852         * If there's something expensive in the r.e., find the
4853         * longest literal string that must appear and make it the
4854         * regmust.  Resolve ties in favor of later strings, since
4855         * the regstart check works with the beginning of the r.e.
4856         * and avoiding duplication strengthens checking.  Not a
4857         * strong reason, but sufficient in the absence of others.
4858         * [Now we resolve ties in favor of the earlier string if
4859         * it happens that c_offset_min has been invalidated, since the
4860         * earlier string may buy us something the later one won't.]
4861         */
4862
4863         data.longest_fixed = newSVpvs("");
4864         data.longest_float = newSVpvs("");
4865         data.last_found = newSVpvs("");
4866         data.longest = &(data.longest_fixed);
4867         first = scan;
4868         if (!ri->regstclass) {
4869             cl_init(pRExC_state, &ch_class);
4870             data.start_class = &ch_class;
4871             stclass_flag = SCF_DO_STCLASS_AND;
4872         } else                          /* XXXX Check for BOUND? */
4873             stclass_flag = 0;
4874         data.last_closep = &last_close;
4875
4876         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4877             &data, -1, NULL, NULL,
4878             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4879
4880
4881         CHECK_RESTUDY_GOTO;
4882
4883
4884         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4885              && data.last_start_min == 0 && data.last_end > 0
4886              && !RExC_seen_zerolen
4887              && !(RExC_seen & REG_SEEN_VERBARG)
4888              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4889             r->extflags |= RXf_CHECK_ALL;
4890         scan_commit(pRExC_state, &data,&minlen,0);
4891         SvREFCNT_dec(data.last_found);
4892
4893         /* Note that code very similar to this but for anchored string
4894            follows immediately below, changes may need to be made to both.
4895            Be careful.
4896          */
4897         longest_float_length = CHR_SVLEN(data.longest_float);
4898         if (longest_float_length
4899             || (data.flags & SF_FL_BEFORE_EOL
4900                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4901                     || (RExC_flags & RXf_PMf_MULTILINE))))
4902         {
4903             I32 t,ml;
4904
4905             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4906                 && data.offset_fixed == data.offset_float_min
4907                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4908                     goto remove_float;          /* As in (a)+. */
4909
4910             /* copy the information about the longest float from the reg_scan_data
4911                over to the program. */
4912             if (SvUTF8(data.longest_float)) {
4913                 r->float_utf8 = data.longest_float;
4914                 r->float_substr = NULL;
4915             } else {
4916                 r->float_substr = data.longest_float;
4917                 r->float_utf8 = NULL;
4918             }
4919             /* float_end_shift is how many chars that must be matched that
4920                follow this item. We calculate it ahead of time as once the
4921                lookbehind offset is added in we lose the ability to correctly
4922                calculate it.*/
4923             ml = data.minlen_float ? *(data.minlen_float)
4924                                    : (I32)longest_float_length;
4925             r->float_end_shift = ml - data.offset_float_min
4926                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4927                 + data.lookbehind_float;
4928             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4929             r->float_max_offset = data.offset_float_max;
4930             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4931                 r->float_max_offset -= data.lookbehind_float;
4932
4933             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4934                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4935                            || (RExC_flags & RXf_PMf_MULTILINE)));
4936             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4937         }
4938         else {
4939           remove_float:
4940             r->float_substr = r->float_utf8 = NULL;
4941             SvREFCNT_dec(data.longest_float);
4942             longest_float_length = 0;
4943         }
4944
4945         /* Note that code very similar to this but for floating string
4946            is immediately above, changes may need to be made to both.
4947            Be careful.
4948          */
4949         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4950         if (longest_fixed_length
4951             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4952                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4953                     || (RExC_flags & RXf_PMf_MULTILINE))))
4954         {
4955             I32 t,ml;
4956
4957             /* copy the information about the longest fixed
4958                from the reg_scan_data over to the program. */
4959             if (SvUTF8(data.longest_fixed)) {
4960                 r->anchored_utf8 = data.longest_fixed;
4961                 r->anchored_substr = NULL;
4962             } else {
4963                 r->anchored_substr = data.longest_fixed;
4964                 r->anchored_utf8 = NULL;
4965             }
4966             /* fixed_end_shift is how many chars that must be matched that
4967                follow this item. We calculate it ahead of time as once the
4968                lookbehind offset is added in we lose the ability to correctly
4969                calculate it.*/
4970             ml = data.minlen_fixed ? *(data.minlen_fixed)
4971                                    : (I32)longest_fixed_length;
4972             r->anchored_end_shift = ml - data.offset_fixed
4973                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4974                 + data.lookbehind_fixed;
4975             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4976
4977             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4978                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4979                      || (RExC_flags & RXf_PMf_MULTILINE)));
4980             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4981         }
4982         else {
4983             r->anchored_substr = r->anchored_utf8 = NULL;
4984             SvREFCNT_dec(data.longest_fixed);
4985             longest_fixed_length = 0;
4986         }
4987         if (ri->regstclass
4988             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4989             ri->regstclass = NULL;
4990         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4991             && stclass_flag
4992             && !(data.start_class->flags & ANYOF_EOS)
4993             && !cl_is_anything(data.start_class))
4994         {
4995             const U32 n = add_data(pRExC_state, 1, "f");
4996
4997             Newx(RExC_rxi->data->data[n], 1,
4998                 struct regnode_charclass_class);
4999             StructCopy(data.start_class,
5000                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5001                        struct regnode_charclass_class);
5002             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5003             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5004             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5005                       regprop(r, sv, (regnode*)data.start_class);
5006                       PerlIO_printf(Perl_debug_log,
5007                                     "synthetic stclass \"%s\".\n",
5008                                     SvPVX_const(sv));});
5009         }
5010
5011         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5012         if (longest_fixed_length > longest_float_length) {
5013             r->check_end_shift = r->anchored_end_shift;
5014             r->check_substr = r->anchored_substr;
5015             r->check_utf8 = r->anchored_utf8;
5016             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5017             if (r->extflags & RXf_ANCH_SINGLE)
5018                 r->extflags |= RXf_NOSCAN;
5019         }
5020         else {
5021             r->check_end_shift = r->float_end_shift;
5022             r->check_substr = r->float_substr;
5023             r->check_utf8 = r->float_utf8;
5024             r->check_offset_min = r->float_min_offset;
5025             r->check_offset_max = r->float_max_offset;
5026         }
5027         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5028            This should be changed ASAP!  */
5029         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5030             r->extflags |= RXf_USE_INTUIT;
5031             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5032                 r->extflags |= RXf_INTUIT_TAIL;
5033         }
5034         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5035         if ( (STRLEN)minlen < longest_float_length )
5036             minlen= longest_float_length;
5037         if ( (STRLEN)minlen < longest_fixed_length )
5038             minlen= longest_fixed_length;
5039         */
5040     }
5041     else {
5042         /* Several toplevels. Best we can is to set minlen. */
5043         I32 fake;
5044         struct regnode_charclass_class ch_class;
5045         I32 last_close = 0;
5046
5047         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5048
5049         scan = ri->program + 1;
5050         cl_init(pRExC_state, &ch_class);
5051         data.start_class = &ch_class;
5052         data.last_closep = &last_close;
5053
5054
5055         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5056             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5057
5058         CHECK_RESTUDY_GOTO;
5059
5060         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5061                 = r->float_substr = r->float_utf8 = NULL;
5062         if (!(data.start_class->flags & ANYOF_EOS)
5063             && !cl_is_anything(data.start_class))
5064         {
5065             const U32 n = add_data(pRExC_state, 1, "f");
5066
5067             Newx(RExC_rxi->data->data[n], 1,
5068                 struct regnode_charclass_class);
5069             StructCopy(data.start_class,
5070                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5071                        struct regnode_charclass_class);
5072             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5073             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5074             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5075                       regprop(r, sv, (regnode*)data.start_class);
5076                       PerlIO_printf(Perl_debug_log,
5077                                     "synthetic stclass \"%s\".\n",
5078                                     SvPVX_const(sv));});
5079         }
5080     }
5081
5082     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5083        the "real" pattern. */
5084     DEBUG_OPTIMISE_r({
5085         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5086                       (IV)minlen, (IV)r->minlen);
5087     });
5088     r->minlenret = minlen;
5089     if (r->minlen < minlen)
5090         r->minlen = minlen;
5091
5092     if (RExC_seen & REG_SEEN_GPOS)
5093         r->extflags |= RXf_GPOS_SEEN;
5094     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5095         r->extflags |= RXf_LOOKBEHIND_SEEN;
5096     if (RExC_seen & REG_SEEN_EVAL)
5097         r->extflags |= RXf_EVAL_SEEN;
5098     if (RExC_seen & REG_SEEN_CANY)
5099         r->extflags |= RXf_CANY_SEEN;
5100     if (RExC_seen & REG_SEEN_VERBARG)
5101         r->intflags |= PREGf_VERBARG_SEEN;
5102     if (RExC_seen & REG_SEEN_CUTGROUP)
5103         r->intflags |= PREGf_CUTGROUP_SEEN;
5104     if (RExC_paren_names)
5105         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5106     else
5107         RXp_PAREN_NAMES(r) = NULL;
5108
5109 #ifdef STUPID_PATTERN_CHECKS
5110     if (RX_PRELEN(rx) == 0)
5111         r->extflags |= RXf_NULL;
5112     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5113         /* XXX: this should happen BEFORE we compile */
5114         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5115     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5116         r->extflags |= RXf_WHITE;
5117     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5118         r->extflags |= RXf_START_ONLY;
5119 #else
5120     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5121             /* XXX: this should happen BEFORE we compile */
5122             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5123     else {
5124         regnode *first = ri->program + 1;
5125         U8 fop = OP(first);
5126         U8 nop = OP(NEXTOPER(first));
5127
5128         if (PL_regkind[fop] == NOTHING && nop == END)
5129             r->extflags |= RXf_NULL;
5130         else if (PL_regkind[fop] == BOL && nop == END)
5131             r->extflags |= RXf_START_ONLY;
5132         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5133             r->extflags |= RXf_WHITE;
5134     }
5135 #endif
5136 #ifdef DEBUGGING
5137     if (RExC_paren_names) {
5138         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5139         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5140     } else
5141 #endif
5142         ri->name_list_idx = 0;
5143
5144     if (RExC_recurse_count) {
5145         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5146             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5147             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5148         }
5149     }
5150     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5151     /* assume we don't need to swap parens around before we match */
5152
5153     DEBUG_DUMP_r({
5154         PerlIO_printf(Perl_debug_log,"Final program:\n");
5155         regdump(r);
5156     });
5157 #ifdef RE_TRACK_PATTERN_OFFSETS
5158     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5159         const U32 len = ri->u.offsets[0];
5160         U32 i;
5161         GET_RE_DEBUG_FLAGS_DECL;
5162         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5163         for (i = 1; i <= len; i++) {
5164             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5165                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5166                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5167             }
5168         PerlIO_printf(Perl_debug_log, "\n");
5169     });
5170 #endif
5171     return rx;
5172 }
5173
5174 #undef RE_ENGINE_PTR
5175
5176
5177 SV*
5178 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5179                     const U32 flags)
5180 {
5181     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5182
5183     PERL_UNUSED_ARG(value);
5184
5185     if (flags & RXapif_FETCH) {
5186         return reg_named_buff_fetch(rx, key, flags);
5187     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5188         Perl_croak_no_modify(aTHX);
5189         return NULL;
5190     } else if (flags & RXapif_EXISTS) {
5191         return reg_named_buff_exists(rx, key, flags)
5192             ? &PL_sv_yes
5193             : &PL_sv_no;
5194     } else if (flags & RXapif_REGNAMES) {
5195         return reg_named_buff_all(rx, flags);
5196     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5197         return reg_named_buff_scalar(rx, flags);
5198     } else {
5199         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5200         return NULL;
5201     }
5202 }
5203
5204 SV*
5205 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5206                          const U32 flags)
5207 {
5208     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5209     PERL_UNUSED_ARG(lastkey);
5210
5211     if (flags & RXapif_FIRSTKEY)
5212         return reg_named_buff_firstkey(rx, flags);
5213     else if (flags & RXapif_NEXTKEY)
5214         return reg_named_buff_nextkey(rx, flags);
5215     else {
5216         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5217         return NULL;
5218     }
5219 }
5220
5221 SV*
5222 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5223                           const U32 flags)
5224 {
5225     AV *retarray = NULL;
5226     SV *ret;
5227     struct regexp *const rx = (struct regexp *)SvANY(r);
5228
5229     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5230
5231     if (flags & RXapif_ALL)
5232         retarray=newAV();
5233
5234     if (rx && RXp_PAREN_NAMES(rx)) {
5235         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5236         if (he_str) {
5237             IV i;
5238             SV* sv_dat=HeVAL(he_str);
5239             I32 *nums=(I32*)SvPVX(sv_dat);
5240             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5241                 if ((I32)(rx->nparens) >= nums[i]
5242                     && rx->offs[nums[i]].start != -1
5243                     && rx->offs[nums[i]].end != -1)
5244                 {
5245                     ret = newSVpvs("");
5246                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5247                     if (!retarray)
5248                         return ret;
5249                 } else {
5250                     ret = newSVsv(&PL_sv_undef);
5251                 }
5252                 if (retarray)
5253                     av_push(retarray, ret);
5254             }
5255             if (retarray)
5256                 return newRV_noinc(MUTABLE_SV(retarray));
5257         }
5258     }
5259     return NULL;
5260 }
5261
5262 bool
5263 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5264                            const U32 flags)
5265 {
5266     struct regexp *const rx = (struct regexp *)SvANY(r);
5267
5268     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5269
5270     if (rx && RXp_PAREN_NAMES(rx)) {
5271         if (flags & RXapif_ALL) {
5272             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5273         } else {
5274             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5275             if (sv) {
5276                 SvREFCNT_dec(sv);
5277                 return TRUE;
5278             } else {
5279                 return FALSE;
5280             }
5281         }
5282     } else {
5283         return FALSE;
5284     }
5285 }
5286
5287 SV*
5288 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5289 {
5290     struct regexp *const rx = (struct regexp *)SvANY(r);
5291
5292     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5293
5294     if ( rx && RXp_PAREN_NAMES(rx) ) {
5295         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5296
5297         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5298     } else {
5299         return FALSE;
5300     }
5301 }
5302
5303 SV*
5304 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5305 {
5306     struct regexp *const rx = (struct regexp *)SvANY(r);
5307     GET_RE_DEBUG_FLAGS_DECL;
5308
5309     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5310
5311     if (rx && RXp_PAREN_NAMES(rx)) {
5312         HV *hv = RXp_PAREN_NAMES(rx);
5313         HE *temphe;
5314         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5315             IV i;
5316             IV parno = 0;
5317             SV* sv_dat = HeVAL(temphe);
5318             I32 *nums = (I32*)SvPVX(sv_dat);
5319             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5320                 if ((I32)(rx->lastparen) >= nums[i] &&
5321                     rx->offs[nums[i]].start != -1 &&
5322                     rx->offs[nums[i]].end != -1)
5323                 {
5324                     parno = nums[i];
5325                     break;
5326                 }
5327             }
5328             if (parno || flags & RXapif_ALL) {
5329                 return newSVhek(HeKEY_hek(temphe));
5330             }
5331         }
5332     }
5333     return NULL;
5334 }
5335
5336 SV*
5337 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5338 {
5339     SV *ret;
5340     AV *av;
5341     I32 length;
5342     struct regexp *const rx = (struct regexp *)SvANY(r);
5343
5344     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5345
5346     if (rx && RXp_PAREN_NAMES(rx)) {
5347         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5348             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5349         } else if (flags & RXapif_ONE) {
5350             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5351             av = MUTABLE_AV(SvRV(ret));
5352             length = av_len(av);
5353             SvREFCNT_dec(ret);
5354             return newSViv(length + 1);
5355         } else {
5356             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5357             return NULL;
5358         }
5359     }
5360     return &PL_sv_undef;
5361 }
5362
5363 SV*
5364 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5365 {
5366     struct regexp *const rx = (struct regexp *)SvANY(r);
5367     AV *av = newAV();
5368
5369     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5370
5371     if (rx && RXp_PAREN_NAMES(rx)) {
5372         HV *hv= RXp_PAREN_NAMES(rx);
5373         HE *temphe;
5374         (void)hv_iterinit(hv);
5375         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5376             IV i;
5377             IV parno = 0;
5378             SV* sv_dat = HeVAL(temphe);
5379             I32 *nums = (I32*)SvPVX(sv_dat);
5380             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5381                 if ((I32)(rx->lastparen) >= nums[i] &&
5382                     rx->offs[nums[i]].start != -1 &&
5383                     rx->offs[nums[i]].end != -1)
5384                 {
5385                     parno = nums[i];
5386                     break;
5387                 }
5388             }
5389             if (parno || flags & RXapif_ALL) {
5390                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5391             }
5392         }
5393     }
5394
5395     return newRV_noinc(MUTABLE_SV(av));
5396 }
5397
5398 void
5399 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5400                              SV * const sv)
5401 {
5402     struct regexp *const rx = (struct regexp *)SvANY(r);
5403     char *s = NULL;
5404     I32 i = 0;
5405     I32 s1, t1;
5406
5407     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5408
5409     if (!rx->subbeg) {
5410         sv_setsv(sv,&PL_sv_undef);
5411         return;
5412     }
5413     else
5414     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5415         /* $` */
5416         i = rx->offs[0].start;
5417         s = rx->subbeg;
5418     }
5419     else
5420     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5421         /* $' */
5422         s = rx->subbeg + rx->offs[0].end;
5423         i = rx->sublen - rx->offs[0].end;
5424     }
5425     else
5426     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5427         (s1 = rx->offs[paren].start) != -1 &&
5428         (t1 = rx->offs[paren].end) != -1)
5429     {
5430         /* $& $1 ... */
5431         i = t1 - s1;
5432         s = rx->subbeg + s1;
5433     } else {
5434         sv_setsv(sv,&PL_sv_undef);
5435         return;
5436     }
5437     assert(rx->sublen >= (s - rx->subbeg) + i );
5438     if (i >= 0) {
5439         const int oldtainted = PL_tainted;
5440         TAINT_NOT;
5441         sv_setpvn(sv, s, i);
5442         PL_tainted = oldtainted;
5443         if ( (rx->extflags & RXf_CANY_SEEN)
5444             ? (RXp_MATCH_UTF8(rx)
5445                         && (!i || is_utf8_string((U8*)s, i)))
5446             : (RXp_MATCH_UTF8(rx)) )
5447         {
5448             SvUTF8_on(sv);
5449         }
5450         else
5451             SvUTF8_off(sv);
5452         if (PL_tainting) {
5453             if (RXp_MATCH_TAINTED(rx)) {
5454                 if (SvTYPE(sv) >= SVt_PVMG) {
5455                     MAGIC* const mg = SvMAGIC(sv);
5456                     MAGIC* mgt;
5457                     PL_tainted = 1;
5458                     SvMAGIC_set(sv, mg->mg_moremagic);
5459                     SvTAINT(sv);
5460                     if ((mgt = SvMAGIC(sv))) {
5461                         mg->mg_moremagic = mgt;
5462                         SvMAGIC_set(sv, mg);
5463                     }
5464                 } else {
5465                     PL_tainted = 1;
5466                     SvTAINT(sv);
5467                 }
5468             } else
5469                 SvTAINTED_off(sv);
5470         }
5471     } else {
5472         sv_setsv(sv,&PL_sv_undef);
5473         return;
5474     }
5475 }
5476
5477 void
5478 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5479                                                          SV const * const value)
5480 {
5481     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5482
5483     PERL_UNUSED_ARG(rx);
5484     PERL_UNUSED_ARG(paren);
5485     PERL_UNUSED_ARG(value);
5486
5487     if (!PL_localizing)
5488         Perl_croak_no_modify(aTHX);
5489 }
5490
5491 I32
5492 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5493                               const I32 paren)
5494 {
5495     struct regexp *const rx = (struct regexp *)SvANY(r);
5496     I32 i;
5497     I32 s1, t1;
5498
5499     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5500
5501     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5502         switch (paren) {
5503       /* $` / ${^PREMATCH} */
5504       case RX_BUFF_IDX_PREMATCH:
5505         if (rx->offs[0].start != -1) {
5506                         i = rx->offs[0].start;
5507                         if (i > 0) {
5508                                 s1 = 0;
5509                                 t1 = i;
5510                                 goto getlen;
5511                         }
5512             }
5513         return 0;
5514       /* $' / ${^POSTMATCH} */
5515       case RX_BUFF_IDX_POSTMATCH:
5516             if (rx->offs[0].end != -1) {
5517                         i = rx->sublen - rx->offs[0].end;
5518                         if (i > 0) {
5519                                 s1 = rx->offs[0].end;
5520                                 t1 = rx->sublen;
5521                                 goto getlen;
5522                         }
5523             }
5524         return 0;
5525       /* $& / ${^MATCH}, $1, $2, ... */
5526       default:
5527             if (paren <= (I32)rx->nparens &&
5528             (s1 = rx->offs[paren].start) != -1 &&
5529             (t1 = rx->offs[paren].end) != -1)
5530             {
5531             i = t1 - s1;
5532             goto getlen;
5533         } else {
5534             if (ckWARN(WARN_UNINITIALIZED))
5535                 report_uninit((const SV *)sv);
5536             return 0;
5537         }
5538     }
5539   getlen:
5540     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5541         const char * const s = rx->subbeg + s1;
5542         const U8 *ep;
5543         STRLEN el;
5544
5545         i = t1 - s1;
5546         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5547                         i = el;
5548     }
5549     return i;
5550 }
5551
5552 SV*
5553 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5554 {
5555     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5556         PERL_UNUSED_ARG(rx);
5557         if (0)
5558             return NULL;
5559         else
5560             return newSVpvs("Regexp");
5561 }
5562
5563 /* Scans the name of a named buffer from the pattern.
5564  * If flags is REG_RSN_RETURN_NULL returns null.
5565  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5566  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5567  * to the parsed name as looked up in the RExC_paren_names hash.
5568  * If there is an error throws a vFAIL().. type exception.
5569  */
5570
5571 #define REG_RSN_RETURN_NULL    0
5572 #define REG_RSN_RETURN_NAME    1
5573 #define REG_RSN_RETURN_DATA    2
5574
5575 STATIC SV*
5576 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5577 {
5578     char *name_start = RExC_parse;
5579
5580     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5581
5582     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5583          /* skip IDFIRST by using do...while */
5584         if (UTF)
5585             do {
5586                 RExC_parse += UTF8SKIP(RExC_parse);
5587             } while (isALNUM_utf8((U8*)RExC_parse));
5588         else
5589             do {
5590                 RExC_parse++;
5591             } while (isALNUM(*RExC_parse));
5592     }
5593
5594     if ( flags ) {
5595         SV* sv_name
5596             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5597                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5598         if ( flags == REG_RSN_RETURN_NAME)
5599             return sv_name;
5600         else if (flags==REG_RSN_RETURN_DATA) {
5601             HE *he_str = NULL;
5602             SV *sv_dat = NULL;
5603             if ( ! sv_name )      /* should not happen*/
5604                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5605             if (RExC_paren_names)
5606                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5607             if ( he_str )
5608                 sv_dat = HeVAL(he_str);
5609             if ( ! sv_dat )
5610                 vFAIL("Reference to nonexistent named group");
5611             return sv_dat;
5612         }
5613         else {
5614             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5615         }
5616         /* NOT REACHED */
5617     }
5618     return NULL;
5619 }
5620
5621 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5622     int rem=(int)(RExC_end - RExC_parse);                       \
5623     int cut;                                                    \
5624     int num;                                                    \
5625     int iscut=0;                                                \
5626     if (rem>10) {                                               \
5627         rem=10;                                                 \
5628         iscut=1;                                                \
5629     }                                                           \
5630     cut=10-rem;                                                 \
5631     if (RExC_lastparse!=RExC_parse)                             \
5632         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5633             rem, RExC_parse,                                    \
5634             cut + 4,                                            \
5635             iscut ? "..." : "<"                                 \
5636         );                                                      \
5637     else                                                        \
5638         PerlIO_printf(Perl_debug_log,"%16s","");                \
5639                                                                 \
5640     if (SIZE_ONLY)                                              \
5641        num = RExC_size + 1;                                     \
5642     else                                                        \
5643        num=REG_NODE_NUM(RExC_emit);                             \
5644     if (RExC_lastnum!=num)                                      \
5645        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5646     else                                                        \
5647        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5648     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5649         (int)((depth*2)), "",                                   \
5650         (funcname)                                              \
5651     );                                                          \
5652     RExC_lastnum=num;                                           \
5653     RExC_lastparse=RExC_parse;                                  \
5654 })
5655
5656
5657
5658 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5659     DEBUG_PARSE_MSG((funcname));                            \
5660     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5661 })
5662 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5663     DEBUG_PARSE_MSG((funcname));                            \
5664     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5665 })
5666 /*
5667  - reg - regular expression, i.e. main body or parenthesized thing
5668  *
5669  * Caller must absorb opening parenthesis.
5670  *
5671  * Combining parenthesis handling with the base level of regular expression
5672  * is a trifle forced, but the need to tie the tails of the branches to what
5673  * follows makes it hard to avoid.
5674  */
5675 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5676 #ifdef DEBUGGING
5677 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5678 #else
5679 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5680 #endif
5681
5682 STATIC regnode *
5683 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5684     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5685 {
5686     dVAR;
5687     register regnode *ret;              /* Will be the head of the group. */
5688     register regnode *br;
5689     register regnode *lastbr;
5690     register regnode *ender = NULL;
5691     register I32 parno = 0;
5692     I32 flags;
5693     U32 oregflags = RExC_flags;
5694     bool have_branch = 0;
5695     bool is_open = 0;
5696     I32 freeze_paren = 0;
5697     I32 after_freeze = 0;
5698
5699     /* for (?g), (?gc), and (?o) warnings; warning
5700        about (?c) will warn about (?g) -- japhy    */
5701
5702 #define WASTED_O  0x01
5703 #define WASTED_G  0x02
5704 #define WASTED_C  0x04
5705 #define WASTED_GC (0x02|0x04)
5706     I32 wastedflags = 0x00;
5707
5708     char * parse_start = RExC_parse; /* MJD */
5709     char * const oregcomp_parse = RExC_parse;
5710
5711     GET_RE_DEBUG_FLAGS_DECL;
5712
5713     PERL_ARGS_ASSERT_REG;
5714     DEBUG_PARSE("reg ");
5715
5716     *flagp = 0;                         /* Tentatively. */
5717
5718
5719     /* Make an OPEN node, if parenthesized. */
5720     if (paren) {
5721         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5722             char *start_verb = RExC_parse;
5723             STRLEN verb_len = 0;
5724             char *start_arg = NULL;
5725             unsigned char op = 0;
5726             int argok = 1;
5727             int internal_argval = 0; /* internal_argval is only useful if !argok */
5728             while ( *RExC_parse && *RExC_parse != ')' ) {
5729                 if ( *RExC_parse == ':' ) {
5730                     start_arg = RExC_parse + 1;
5731                     break;
5732                 }
5733                 RExC_parse++;
5734             }
5735             ++start_verb;
5736             verb_len = RExC_parse - start_verb;
5737             if ( start_arg ) {
5738                 RExC_parse++;
5739                 while ( *RExC_parse && *RExC_parse != ')' )
5740                     RExC_parse++;
5741                 if ( *RExC_parse != ')' )
5742                     vFAIL("Unterminated verb pattern argument");
5743                 if ( RExC_parse == start_arg )
5744                     start_arg = NULL;
5745             } else {
5746                 if ( *RExC_parse != ')' )
5747                     vFAIL("Unterminated verb pattern");
5748             }
5749
5750             switch ( *start_verb ) {
5751             case 'A':  /* (*ACCEPT) */
5752                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5753                     op = ACCEPT;
5754                     internal_argval = RExC_nestroot;
5755                 }
5756                 break;
5757             case 'C':  /* (*COMMIT) */
5758                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5759                     op = COMMIT;
5760                 break;
5761             case 'F':  /* (*FAIL) */
5762                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5763                     op = OPFAIL;
5764                     argok = 0;
5765                 }
5766                 break;
5767             case ':':  /* (*:NAME) */
5768             case 'M':  /* (*MARK:NAME) */
5769                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5770                     op = MARKPOINT;
5771                     argok = -1;
5772                 }
5773                 break;
5774             case 'P':  /* (*PRUNE) */
5775                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5776                     op = PRUNE;
5777                 break;
5778             case 'S':   /* (*SKIP) */
5779                 if ( memEQs(start_verb,verb_len,"SKIP") )
5780                     op = SKIP;
5781                 break;
5782             case 'T':  /* (*THEN) */
5783                 /* [19:06] <TimToady> :: is then */
5784                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5785                     op = CUTGROUP;
5786                     RExC_seen |= REG_SEEN_CUTGROUP;
5787                 }
5788                 break;
5789             }
5790             if ( ! op ) {
5791                 RExC_parse++;
5792                 vFAIL3("Unknown verb pattern '%.*s'",
5793                     verb_len, start_verb);
5794             }
5795             if ( argok ) {
5796                 if ( start_arg && internal_argval ) {
5797                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5798                         verb_len, start_verb);
5799                 } else if ( argok < 0 && !start_arg ) {
5800                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5801                         verb_len, start_verb);
5802                 } else {
5803                     ret = reganode(pRExC_state, op, internal_argval);
5804                     if ( ! internal_argval && ! SIZE_ONLY ) {
5805                         if (start_arg) {
5806                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5807                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5808                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5809                             ret->flags = 0;
5810                         } else {
5811                             ret->flags = 1;
5812                         }
5813                     }
5814                 }
5815                 if (!internal_argval)
5816                     RExC_seen |= REG_SEEN_VERBARG;
5817             } else if ( start_arg ) {
5818                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5819                         verb_len, start_verb);
5820             } else {
5821                 ret = reg_node(pRExC_state, op);
5822             }
5823             nextchar(pRExC_state);
5824             return ret;
5825         } else
5826         if (*RExC_parse == '?') { /* (?...) */
5827             bool is_logical = 0;
5828             const char * const seqstart = RExC_parse;
5829             bool has_use_defaults = FALSE;
5830
5831             RExC_parse++;
5832             paren = *RExC_parse++;
5833             ret = NULL;                 /* For look-ahead/behind. */
5834             switch (paren) {
5835
5836             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5837                 paren = *RExC_parse++;
5838                 if ( paren == '<')         /* (?P<...>) named capture */
5839                     goto named_capture;
5840                 else if (paren == '>') {   /* (?P>name) named recursion */
5841                     goto named_recursion;
5842                 }
5843                 else if (paren == '=') {   /* (?P=...)  named backref */
5844                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5845                        you change this make sure you change that */
5846                     char* name_start = RExC_parse;
5847                     U32 num = 0;
5848                     SV *sv_dat = reg_scan_name(pRExC_state,
5849                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5850                     if (RExC_parse == name_start || *RExC_parse != ')')
5851                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5852
5853                     if (!SIZE_ONLY) {
5854                         num = add_data( pRExC_state, 1, "S" );
5855                         RExC_rxi->data->data[num]=(void*)sv_dat;
5856                         SvREFCNT_inc_simple_void(sv_dat);
5857                     }
5858                     RExC_sawback = 1;
5859                     ret = reganode(pRExC_state,
5860                                    ((! FOLD)
5861                                      ? NREF
5862                                      : (UNI_SEMANTICS)
5863                                        ? NREFFU
5864                                        : (LOC)
5865                                          ? NREFFL
5866                                          : NREFF),
5867                                     num);
5868                     *flagp |= HASWIDTH;
5869
5870                     Set_Node_Offset(ret, parse_start+1);
5871                     Set_Node_Cur_Length(ret); /* MJD */
5872
5873                     nextchar(pRExC_state);
5874                     return ret;
5875                 }
5876                 RExC_parse++;
5877                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5878                 /*NOTREACHED*/
5879             case '<':           /* (?<...) */
5880                 if (*RExC_parse == '!')
5881                     paren = ',';
5882                 else if (*RExC_parse != '=')
5883               named_capture:
5884                 {               /* (?<...>) */
5885                     char *name_start;
5886                     SV *svname;
5887                     paren= '>';
5888             case '\'':          /* (?'...') */
5889                     name_start= RExC_parse;
5890                     svname = reg_scan_name(pRExC_state,
5891                         SIZE_ONLY ?  /* reverse test from the others */
5892                         REG_RSN_RETURN_NAME :
5893                         REG_RSN_RETURN_NULL);
5894                     if (RExC_parse == name_start) {
5895                         RExC_parse++;
5896                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5897                         /*NOTREACHED*/
5898                     }
5899                     if (*RExC_parse != paren)
5900                         vFAIL2("Sequence (?%c... not terminated",
5901                             paren=='>' ? '<' : paren);
5902                     if (SIZE_ONLY) {
5903                         HE *he_str;
5904                         SV *sv_dat = NULL;
5905                         if (!svname) /* shouldnt happen */
5906                             Perl_croak(aTHX_
5907                                 "panic: reg_scan_name returned NULL");
5908                         if (!RExC_paren_names) {
5909                             RExC_paren_names= newHV();
5910                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5911 #ifdef DEBUGGING
5912                             RExC_paren_name_list= newAV();
5913                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5914 #endif
5915                         }
5916                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5917                         if ( he_str )
5918                             sv_dat = HeVAL(he_str);
5919                         if ( ! sv_dat ) {
5920                             /* croak baby croak */
5921                             Perl_croak(aTHX_
5922                                 "panic: paren_name hash element allocation failed");
5923                         } else if ( SvPOK(sv_dat) ) {
5924                             /* (?|...) can mean we have dupes so scan to check
5925                                its already been stored. Maybe a flag indicating
5926                                we are inside such a construct would be useful,
5927                                but the arrays are likely to be quite small, so
5928                                for now we punt -- dmq */
5929                             IV count = SvIV(sv_dat);
5930                             I32 *pv = (I32*)SvPVX(sv_dat);
5931                             IV i;
5932                             for ( i = 0 ; i < count ; i++ ) {
5933                                 if ( pv[i] == RExC_npar ) {
5934                                     count = 0;
5935                                     break;
5936                                 }
5937                             }
5938                             if ( count ) {
5939                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5940                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5941                                 pv[count] = RExC_npar;
5942                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5943                             }
5944                         } else {
5945                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5946                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5947                             SvIOK_on(sv_dat);
5948                             SvIV_set(sv_dat, 1);
5949                         }
5950 #ifdef DEBUGGING
5951                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5952                             SvREFCNT_dec(svname);
5953 #endif
5954
5955                         /*sv_dump(sv_dat);*/
5956                     }
5957                     nextchar(pRExC_state);
5958                     paren = 1;
5959                     goto capturing_parens;
5960                 }
5961                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5962                 RExC_parse++;
5963             case '=':           /* (?=...) */
5964                 RExC_seen_zerolen++;
5965                 break;
5966             case '!':           /* (?!...) */
5967                 RExC_seen_zerolen++;
5968                 if (*RExC_parse == ')') {
5969                     ret=reg_node(pRExC_state, OPFAIL);
5970                     nextchar(pRExC_state);
5971                     return ret;
5972                 }
5973                 break;
5974             case '|':           /* (?|...) */
5975                 /* branch reset, behave like a (?:...) except that
5976                    buffers in alternations share the same numbers */
5977                 paren = ':';
5978                 after_freeze = freeze_paren = RExC_npar;
5979                 break;
5980             case ':':           /* (?:...) */
5981             case '>':           /* (?>...) */
5982                 break;
5983             case '$':           /* (?$...) */
5984             case '@':           /* (?@...) */
5985                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5986                 break;
5987             case '#':           /* (?#...) */
5988                 while (*RExC_parse && *RExC_parse != ')')
5989                     RExC_parse++;
5990                 if (*RExC_parse != ')')
5991                     FAIL("Sequence (?#... not terminated");
5992                 nextchar(pRExC_state);
5993                 *flagp = TRYAGAIN;
5994                 return NULL;
5995             case '0' :           /* (?0) */
5996             case 'R' :           /* (?R) */
5997                 if (*RExC_parse != ')')
5998                     FAIL("Sequence (?R) not terminated");
5999                 ret = reg_node(pRExC_state, GOSTART);
6000                 *flagp |= POSTPONED;
6001                 nextchar(pRExC_state);
6002                 return ret;
6003                 /*notreached*/
6004             { /* named and numeric backreferences */
6005                 I32 num;
6006             case '&':            /* (?&NAME) */
6007                 parse_start = RExC_parse - 1;
6008               named_recursion:
6009                 {
6010                     SV *sv_dat = reg_scan_name(pRExC_state,
6011                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6012                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6013                 }
6014                 goto gen_recurse_regop;
6015                 /* NOT REACHED */
6016             case '+':
6017                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6018                     RExC_parse++;
6019                     vFAIL("Illegal pattern");
6020                 }
6021                 goto parse_recursion;
6022                 /* NOT REACHED*/
6023             case '-': /* (?-1) */
6024                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6025                     RExC_parse--; /* rewind to let it be handled later */
6026                     goto parse_flags;
6027                 }
6028                 /*FALLTHROUGH */
6029             case '1': case '2': case '3': case '4': /* (?1) */
6030             case '5': case '6': case '7': case '8': case '9':
6031                 RExC_parse--;
6032               parse_recursion:
6033                 num = atoi(RExC_parse);
6034                 parse_start = RExC_parse - 1; /* MJD */
6035                 if (*RExC_parse == '-')
6036                     RExC_parse++;
6037                 while (isDIGIT(*RExC_parse))
6038                         RExC_parse++;
6039                 if (*RExC_parse!=')')
6040                     vFAIL("Expecting close bracket");
6041
6042               gen_recurse_regop:
6043                 if ( paren == '-' ) {
6044                     /*
6045                     Diagram of capture buffer numbering.
6046                     Top line is the normal capture buffer numbers
6047                     Bottom line is the negative indexing as from
6048                     the X (the (?-2))
6049
6050                     +   1 2    3 4 5 X          6 7
6051                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6052                     -   5 4    3 2 1 X          x x
6053
6054                     */
6055                     num = RExC_npar + num;
6056                     if (num < 1)  {
6057                         RExC_parse++;
6058                         vFAIL("Reference to nonexistent group");
6059                     }
6060                 } else if ( paren == '+' ) {
6061                     num = RExC_npar + num - 1;
6062                 }
6063
6064                 ret = reganode(pRExC_state, GOSUB, num);
6065                 if (!SIZE_ONLY) {
6066                     if (num > (I32)RExC_rx->nparens) {
6067                         RExC_parse++;
6068                         vFAIL("Reference to nonexistent group");
6069                     }
6070                     ARG2L_SET( ret, RExC_recurse_count++);
6071                     RExC_emit++;
6072                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6073                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6074                 } else {
6075                     RExC_size++;
6076                 }
6077                 RExC_seen |= REG_SEEN_RECURSE;
6078                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6079                 Set_Node_Offset(ret, parse_start); /* MJD */
6080
6081                 *flagp |= POSTPONED;
6082                 nextchar(pRExC_state);
6083                 return ret;
6084             } /* named and numeric backreferences */
6085             /* NOT REACHED */
6086
6087             case '?':           /* (??...) */
6088                 is_logical = 1;
6089                 if (*RExC_parse != '{') {
6090                     RExC_parse++;
6091                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6092                     /*NOTREACHED*/
6093                 }
6094                 *flagp |= POSTPONED;
6095                 paren = *RExC_parse++;
6096                 /* FALL THROUGH */
6097             case '{':           /* (?{...}) */
6098             {
6099                 I32 count = 1;
6100                 U32 n = 0;
6101                 char c;
6102                 char *s = RExC_parse;
6103
6104                 RExC_seen_zerolen++;
6105                 RExC_seen |= REG_SEEN_EVAL;
6106                 while (count && (c = *RExC_parse)) {
6107                     if (c == '\\') {
6108                         if (RExC_parse[1])
6109                             RExC_parse++;
6110                     }
6111                     else if (c == '{')
6112                         count++;
6113                     else if (c == '}')
6114                         count--;
6115                     RExC_parse++;
6116                 }
6117                 if (*RExC_parse != ')') {
6118                     RExC_parse = s;
6119                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6120                 }
6121                 if (!SIZE_ONLY) {
6122                     PAD *pad;
6123                     OP_4tree *sop, *rop;
6124                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6125
6126                     ENTER;
6127                     Perl_save_re_context(aTHX);
6128                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6129                     sop->op_private |= OPpREFCOUNTED;
6130                     /* re_dup will OpREFCNT_inc */
6131                     OpREFCNT_set(sop, 1);
6132                     LEAVE;
6133
6134                     n = add_data(pRExC_state, 3, "nop");
6135                     RExC_rxi->data->data[n] = (void*)rop;
6136                     RExC_rxi->data->data[n+1] = (void*)sop;
6137                     RExC_rxi->data->data[n+2] = (void*)pad;
6138                     SvREFCNT_dec(sv);
6139                 }
6140                 else {                                          /* First pass */
6141                     if (PL_reginterp_cnt < ++RExC_seen_evals
6142                         && IN_PERL_RUNTIME)
6143                         /* No compiled RE interpolated, has runtime
6144                            components ===> unsafe.  */
6145                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6146                     if (PL_tainting && PL_tainted)
6147                         FAIL("Eval-group in insecure regular expression");
6148 #if PERL_VERSION > 8
6149                     if (IN_PERL_COMPILETIME)
6150                         PL_cv_has_eval = 1;
6151 #endif
6152                 }
6153
6154                 nextchar(pRExC_state);
6155                 if (is_logical) {
6156                     ret = reg_node(pRExC_state, LOGICAL);
6157                     if (!SIZE_ONLY)
6158                         ret->flags = 2;
6159                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6160                     /* deal with the length of this later - MJD */
6161                     return ret;
6162                 }
6163                 ret = reganode(pRExC_state, EVAL, n);
6164                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6165                 Set_Node_Offset(ret, parse_start);
6166                 return ret;
6167             }
6168             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6169             {
6170                 int is_define= 0;
6171                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6172                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6173                         || RExC_parse[1] == '<'
6174                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6175                         I32 flag;
6176
6177                         ret = reg_node(pRExC_state, LOGICAL);
6178                         if (!SIZE_ONLY)
6179                             ret->flags = 1;
6180                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6181                         goto insert_if;
6182                     }
6183                 }
6184                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6185                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6186                 {
6187                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6188                     char *name_start= RExC_parse++;
6189                     U32 num = 0;
6190                     SV *sv_dat=reg_scan_name(pRExC_state,
6191                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6192                     if (RExC_parse == name_start || *RExC_parse != ch)
6193                         vFAIL2("Sequence (?(%c... not terminated",
6194                             (ch == '>' ? '<' : ch));
6195                     RExC_parse++;
6196                     if (!SIZE_ONLY) {
6197                         num = add_data( pRExC_state, 1, "S" );
6198                         RExC_rxi->data->data[num]=(void*)sv_dat;
6199                         SvREFCNT_inc_simple_void(sv_dat);
6200                     }
6201                     ret = reganode(pRExC_state,NGROUPP,num);
6202                     goto insert_if_check_paren;
6203                 }
6204                 else if (RExC_parse[0] == 'D' &&
6205                          RExC_parse[1] == 'E' &&
6206                          RExC_parse[2] == 'F' &&
6207                          RExC_parse[3] == 'I' &&
6208                          RExC_parse[4] == 'N' &&
6209                          RExC_parse[5] == 'E')
6210                 {
6211                     ret = reganode(pRExC_state,DEFINEP,0);
6212                     RExC_parse +=6 ;
6213                     is_define = 1;
6214                     goto insert_if_check_paren;
6215                 }
6216                 else if (RExC_parse[0] == 'R') {
6217                     RExC_parse++;
6218                     parno = 0;
6219                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6220                         parno = atoi(RExC_parse++);
6221                         while (isDIGIT(*RExC_parse))
6222                             RExC_parse++;
6223                     } else if (RExC_parse[0] == '&') {
6224                         SV *sv_dat;
6225                         RExC_parse++;
6226                         sv_dat = reg_scan_name(pRExC_state,
6227                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6228                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6229                     }
6230                     ret = reganode(pRExC_state,INSUBP,parno);
6231                     goto insert_if_check_paren;
6232                 }
6233                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6234                     /* (?(1)...) */
6235                     char c;
6236                     parno = atoi(RExC_parse++);
6237
6238                     while (isDIGIT(*RExC_parse))
6239                         RExC_parse++;
6240                     ret = reganode(pRExC_state, GROUPP, parno);
6241
6242                  insert_if_check_paren:
6243                     if ((c = *nextchar(pRExC_state)) != ')')
6244                         vFAIL("Switch condition not recognized");
6245                   insert_if:
6246                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6247                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6248                     if (br == NULL)
6249                         br = reganode(pRExC_state, LONGJMP, 0);
6250                     else
6251                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6252                     c = *nextchar(pRExC_state);
6253                     if (flags&HASWIDTH)
6254                         *flagp |= HASWIDTH;
6255                     if (c == '|') {
6256                         if (is_define)
6257                             vFAIL("(?(DEFINE)....) does not allow branches");
6258                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6259                         regbranch(pRExC_state, &flags, 1,depth+1);
6260                         REGTAIL(pRExC_state, ret, lastbr);
6261                         if (flags&HASWIDTH)
6262                             *flagp |= HASWIDTH;
6263                         c = *nextchar(pRExC_state);
6264                     }
6265                     else
6266                         lastbr = NULL;
6267                     if (c != ')')
6268                         vFAIL("Switch (?(condition)... contains too many branches");
6269                     ender = reg_node(pRExC_state, TAIL);
6270                     REGTAIL(pRExC_state, br, ender);
6271                     if (lastbr) {
6272                         REGTAIL(pRExC_state, lastbr, ender);
6273                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6274                     }
6275                     else
6276                         REGTAIL(pRExC_state, ret, ender);
6277                     RExC_size++; /* XXX WHY do we need this?!!
6278                                     For large programs it seems to be required
6279                                     but I can't figure out why. -- dmq*/
6280                     return ret;
6281                 }
6282                 else {
6283                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6284                 }
6285             }
6286             case 0:
6287                 RExC_parse--; /* for vFAIL to print correctly */
6288                 vFAIL("Sequence (? incomplete");
6289                 break;
6290             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6291                                        that follow */
6292                 has_use_defaults = TRUE;
6293                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6294                 if (RExC_utf8) {    /* But the default for a utf8 pattern is
6295                                        unicode semantics */
6296                     RExC_flags |= RXf_PMf_UNICODE;
6297                 }
6298                 goto parse_flags;
6299             default:
6300                 --RExC_parse;
6301                 parse_flags:      /* (?i) */
6302             {
6303                 U32 posflags = 0, negflags = 0;
6304                 U32 *flagsp = &posflags;
6305                 bool has_charset_modifier = 0;
6306
6307                 while (*RExC_parse) {
6308                     /* && strchr("iogcmsx", *RExC_parse) */
6309                     /* (?g), (?gc) and (?o) are useless here
6310                        and must be globally applied -- japhy */
6311                     switch (*RExC_parse) {
6312                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6313                     case LOCALE_PAT_MOD:
6314                         if (has_charset_modifier || flagsp == &negflags) {
6315                             goto fail_modifiers;
6316                         }
6317                         posflags |= RXf_PMf_LOCALE;
6318                         negflags |= RXf_PMf_UNICODE;
6319                         has_charset_modifier = 1;
6320                         break;
6321                     case UNICODE_PAT_MOD:
6322                         if (has_charset_modifier || flagsp == &negflags) {
6323                             goto fail_modifiers;
6324                         }
6325                         posflags |= RXf_PMf_UNICODE;
6326                         negflags |= RXf_PMf_LOCALE;
6327                         has_charset_modifier = 1;
6328                         break;
6329                     case DUAL_PAT_MOD:
6330                         if (has_use_defaults
6331                             || has_charset_modifier
6332                             || flagsp == &negflags)
6333                         {
6334                             goto fail_modifiers;
6335                         }
6336
6337                         /* The dual charset means unicode semantics if the
6338                          * pattern (or target, not known until runtime) are
6339                          * utf8 */
6340                         if (RExC_utf8) {
6341                             posflags |= RXf_PMf_UNICODE;
6342                             negflags |= RXf_PMf_LOCALE;
6343                         }
6344                         else {
6345                             negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6346                         }
6347                         has_charset_modifier = 1;
6348                         break;
6349                     case ONCE_PAT_MOD: /* 'o' */
6350                     case GLOBAL_PAT_MOD: /* 'g' */
6351                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6352                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6353                             if (! (wastedflags & wflagbit) ) {
6354                                 wastedflags |= wflagbit;
6355                                 vWARN5(
6356                                     RExC_parse + 1,
6357                                     "Useless (%s%c) - %suse /%c modifier",
6358                                     flagsp == &negflags ? "?-" : "?",
6359                                     *RExC_parse,
6360                                     flagsp == &negflags ? "don't " : "",
6361                                     *RExC_parse
6362                                 );
6363                             }
6364                         }
6365                         break;
6366
6367                     case CONTINUE_PAT_MOD: /* 'c' */
6368                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6369                             if (! (wastedflags & WASTED_C) ) {
6370                                 wastedflags |= WASTED_GC;
6371                                 vWARN3(
6372                                     RExC_parse + 1,
6373                                     "Useless (%sc) - %suse /gc modifier",
6374                                     flagsp == &negflags ? "?-" : "?",
6375                                     flagsp == &negflags ? "don't " : ""
6376                                 );
6377                             }
6378                         }
6379                         break;
6380                     case KEEPCOPY_PAT_MOD: /* 'p' */
6381                         if (flagsp == &negflags) {
6382                             if (SIZE_ONLY)
6383                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6384                         } else {
6385                             *flagsp |= RXf_PMf_KEEPCOPY;
6386                         }
6387                         break;
6388                     case '-':
6389                         /* A flag is a default iff it is following a minus, so
6390                          * if there is a minus, it means will be trying to
6391                          * re-specify a default which is an error */
6392                         if (has_use_defaults || flagsp == &negflags) {
6393             fail_modifiers:
6394                             RExC_parse++;
6395                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6396                             /*NOTREACHED*/
6397                         }
6398                         flagsp = &negflags;
6399                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6400                         break;
6401                     case ':':
6402                         paren = ':';
6403                         /*FALLTHROUGH*/
6404                     case ')':
6405                         RExC_flags |= posflags;
6406                         RExC_flags &= ~negflags;
6407                         if (paren != ':') {
6408                             oregflags |= posflags;
6409                             oregflags &= ~negflags;
6410                         }
6411                         nextchar(pRExC_state);
6412                         if (paren != ':') {
6413                             *flagp = TRYAGAIN;
6414                             return NULL;
6415                         } else {
6416                             ret = NULL;
6417                             goto parse_rest;
6418                         }
6419                         /*NOTREACHED*/
6420                     default:
6421                         RExC_parse++;
6422                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6423                         /*NOTREACHED*/
6424                     }
6425                     ++RExC_parse;
6426                 }
6427             }} /* one for the default block, one for the switch */
6428         }
6429         else {                  /* (...) */
6430           capturing_parens:
6431             parno = RExC_npar;
6432             RExC_npar++;
6433
6434             ret = reganode(pRExC_state, OPEN, parno);
6435             if (!SIZE_ONLY ){
6436                 if (!RExC_nestroot)
6437                     RExC_nestroot = parno;
6438                 if (RExC_seen & REG_SEEN_RECURSE
6439                     && !RExC_open_parens[parno-1])
6440                 {
6441                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6442                         "Setting open paren #%"IVdf" to %d\n",
6443                         (IV)parno, REG_NODE_NUM(ret)));
6444                     RExC_open_parens[parno-1]= ret;
6445                 }
6446             }
6447             Set_Node_Length(ret, 1); /* MJD */
6448             Set_Node_Offset(ret, RExC_parse); /* MJD */
6449             is_open = 1;
6450         }
6451     }
6452     else                        /* ! paren */
6453         ret = NULL;
6454
6455    parse_rest:
6456     /* Pick up the branches, linking them together. */
6457     parse_start = RExC_parse;   /* MJD */
6458     br = regbranch(pRExC_state, &flags, 1,depth+1);
6459
6460     if (freeze_paren) {
6461         if (RExC_npar > after_freeze)
6462             after_freeze = RExC_npar;
6463         RExC_npar = freeze_paren;
6464     }
6465
6466     /*     branch_len = (paren != 0); */
6467
6468     if (br == NULL)
6469         return(NULL);
6470     if (*RExC_parse == '|') {
6471         if (!SIZE_ONLY && RExC_extralen) {
6472             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6473         }
6474         else {                  /* MJD */
6475             reginsert(pRExC_state, BRANCH, br, depth+1);
6476             Set_Node_Length(br, paren != 0);
6477             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6478         }
6479         have_branch = 1;
6480         if (SIZE_ONLY)
6481             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6482     }
6483     else if (paren == ':') {
6484         *flagp |= flags&SIMPLE;
6485     }
6486     if (is_open) {                              /* Starts with OPEN. */
6487         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6488     }
6489     else if (paren != '?')              /* Not Conditional */
6490         ret = br;
6491     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6492     lastbr = br;
6493     while (*RExC_parse == '|') {
6494         if (!SIZE_ONLY && RExC_extralen) {
6495             ender = reganode(pRExC_state, LONGJMP,0);
6496             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6497         }
6498         if (SIZE_ONLY)
6499             RExC_extralen += 2;         /* Account for LONGJMP. */
6500         nextchar(pRExC_state);
6501         if (freeze_paren) {
6502             if (RExC_npar > after_freeze)
6503                 after_freeze = RExC_npar;
6504             RExC_npar = freeze_paren;
6505         }
6506         br = regbranch(pRExC_state, &flags, 0, depth+1);
6507
6508         if (br == NULL)
6509             return(NULL);
6510         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6511         lastbr = br;
6512         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6513     }
6514
6515     if (have_branch || paren != ':') {
6516         /* Make a closing node, and hook it on the end. */
6517         switch (paren) {
6518         case ':':
6519             ender = reg_node(pRExC_state, TAIL);
6520             break;
6521         case 1:
6522             ender = reganode(pRExC_state, CLOSE, parno);
6523             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6524                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6525                         "Setting close paren #%"IVdf" to %d\n",
6526                         (IV)parno, REG_NODE_NUM(ender)));
6527                 RExC_close_parens[parno-1]= ender;
6528                 if (RExC_nestroot == parno)
6529                     RExC_nestroot = 0;
6530             }
6531             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6532             Set_Node_Length(ender,1); /* MJD */
6533             break;
6534         case '<':
6535         case ',':
6536         case '=':
6537         case '!':
6538             *flagp &= ~HASWIDTH;
6539             /* FALL THROUGH */
6540         case '>':
6541             ender = reg_node(pRExC_state, SUCCEED);
6542             break;
6543         case 0:
6544             ender = reg_node(pRExC_state, END);
6545             if (!SIZE_ONLY) {
6546                 assert(!RExC_opend); /* there can only be one! */
6547                 RExC_opend = ender;
6548             }
6549             break;
6550         }
6551         REGTAIL(pRExC_state, lastbr, ender);
6552
6553         if (have_branch && !SIZE_ONLY) {
6554             if (depth==1)
6555                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6556
6557             /* Hook the tails of the branches to the closing node. */
6558             for (br = ret; br; br = regnext(br)) {
6559                 const U8 op = PL_regkind[OP(br)];
6560                 if (op == BRANCH) {
6561                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6562                 }
6563                 else if (op == BRANCHJ) {
6564                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6565                 }
6566             }
6567         }
6568     }
6569
6570     {
6571         const char *p;
6572         static const char parens[] = "=!<,>";
6573
6574         if (paren && (p = strchr(parens, paren))) {
6575             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6576             int flag = (p - parens) > 1;
6577
6578             if (paren == '>')
6579                 node = SUSPEND, flag = 0;
6580             reginsert(pRExC_state, node,ret, depth+1);
6581             Set_Node_Cur_Length(ret);
6582             Set_Node_Offset(ret, parse_start + 1);
6583             ret->flags = flag;
6584             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6585         }
6586     }
6587
6588     /* Check for proper termination. */
6589     if (paren) {
6590         RExC_flags = oregflags;
6591         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6592             RExC_parse = oregcomp_parse;
6593             vFAIL("Unmatched (");
6594         }
6595     }
6596     else if (!paren && RExC_parse < RExC_end) {
6597         if (*RExC_parse == ')') {
6598             RExC_parse++;
6599             vFAIL("Unmatched )");
6600         }
6601         else
6602             FAIL("Junk on end of regexp");      /* "Can't happen". */
6603         /* NOTREACHED */
6604     }
6605     if (after_freeze)
6606         RExC_npar = after_freeze;
6607     return(ret);
6608 }
6609
6610 /*
6611  - regbranch - one alternative of an | operator
6612  *
6613  * Implements the concatenation operator.
6614  */
6615 STATIC regnode *
6616 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6617 {
6618     dVAR;
6619     register regnode *ret;
6620     register regnode *chain = NULL;
6621     register regnode *latest;
6622     I32 flags = 0, c = 0;
6623     GET_RE_DEBUG_FLAGS_DECL;
6624
6625     PERL_ARGS_ASSERT_REGBRANCH;
6626
6627     DEBUG_PARSE("brnc");
6628
6629     if (first)
6630         ret = NULL;
6631     else {
6632         if (!SIZE_ONLY && RExC_extralen)
6633             ret = reganode(pRExC_state, BRANCHJ,0);
6634         else {
6635             ret = reg_node(pRExC_state, BRANCH);
6636             Set_Node_Length(ret, 1);
6637         }
6638     }
6639
6640     if (!first && SIZE_ONLY)
6641         RExC_extralen += 1;                     /* BRANCHJ */
6642
6643     *flagp = WORST;                     /* Tentatively. */
6644
6645     RExC_parse--;
6646     nextchar(pRExC_state);
6647     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6648         flags &= ~TRYAGAIN;
6649         latest = regpiece(pRExC_state, &flags,depth+1);
6650         if (latest == NULL) {
6651             if (flags & TRYAGAIN)
6652                 continue;
6653             return(NULL);
6654         }
6655         else if (ret == NULL)
6656             ret = latest;
6657         *flagp |= flags&(HASWIDTH|POSTPONED);
6658         if (chain == NULL)      /* First piece. */
6659             *flagp |= flags&SPSTART;
6660         else {
6661             RExC_naughty++;
6662             REGTAIL(pRExC_state, chain, latest);
6663         }
6664         chain = latest;
6665         c++;
6666     }
6667     if (chain == NULL) {        /* Loop ran zero times. */
6668         chain = reg_node(pRExC_state, NOTHING);
6669         if (ret == NULL)
6670             ret = chain;
6671     }
6672     if (c == 1) {
6673         *flagp |= flags&SIMPLE;
6674     }
6675
6676     return ret;
6677 }
6678
6679 /*
6680  - regpiece - something followed by possible [*+?]
6681  *
6682  * Note that the branching code sequences used for ? and the general cases
6683  * of * and + are somewhat optimized:  they use the same NOTHING node as
6684  * both the endmarker for their branch list and the body of the last branch.
6685  * It might seem that this node could be dispensed with entirely, but the
6686  * endmarker role is not redundant.
6687  */
6688 STATIC regnode *
6689 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6690 {
6691     dVAR;
6692     register regnode *ret;
6693     register char op;
6694     register char *next;
6695     I32 flags;
6696     const char * const origparse = RExC_parse;
6697     I32 min;
6698     I32 max = REG_INFTY;
6699     char *parse_start;
6700     const char *maxpos = NULL;
6701     GET_RE_DEBUG_FLAGS_DECL;
6702
6703     PERL_ARGS_ASSERT_REGPIECE;
6704
6705     DEBUG_PARSE("piec");
6706
6707     ret = regatom(pRExC_state, &flags,depth+1);
6708     if (ret == NULL) {
6709         if (flags & TRYAGAIN)
6710             *flagp |= TRYAGAIN;
6711         return(NULL);
6712     }
6713
6714     op = *RExC_parse;
6715
6716     if (op == '{' && regcurly(RExC_parse)) {
6717         maxpos = NULL;
6718         parse_start = RExC_parse; /* MJD */
6719         next = RExC_parse + 1;
6720         while (isDIGIT(*next) || *next == ',') {
6721             if (*next == ',') {
6722                 if (maxpos)
6723                     break;
6724                 else
6725                     maxpos = next;
6726             }
6727             next++;
6728         }
6729         if (*next == '}') {             /* got one */
6730             if (!maxpos)
6731                 maxpos = next;
6732             RExC_parse++;
6733             min = atoi(RExC_parse);
6734             if (*maxpos == ',')
6735                 maxpos++;
6736             else
6737                 maxpos = RExC_parse;
6738             max = atoi(maxpos);
6739             if (!max && *maxpos != '0')
6740                 max = REG_INFTY;                /* meaning "infinity" */
6741             else if (max >= REG_INFTY)
6742                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6743             RExC_parse = next;
6744             nextchar(pRExC_state);
6745
6746         do_curly:
6747             if ((flags&SIMPLE)) {
6748                 RExC_naughty += 2 + RExC_naughty / 2;
6749                 reginsert(pRExC_state, CURLY, ret, depth+1);
6750                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6751                 Set_Node_Cur_Length(ret);
6752             }
6753             else {
6754                 regnode * const w = reg_node(pRExC_state, WHILEM);
6755
6756                 w->flags = 0;
6757                 REGTAIL(pRExC_state, ret, w);
6758                 if (!SIZE_ONLY && RExC_extralen) {
6759                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6760                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6761                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6762                 }
6763                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6764                                 /* MJD hk */
6765                 Set_Node_Offset(ret, parse_start+1);
6766                 Set_Node_Length(ret,
6767                                 op == '{' ? (RExC_parse - parse_start) : 1);
6768
6769                 if (!SIZE_ONLY && RExC_extralen)
6770                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6771                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6772                 if (SIZE_ONLY)
6773                     RExC_whilem_seen++, RExC_extralen += 3;
6774                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6775             }
6776             ret->flags = 0;
6777
6778             if (min > 0)
6779                 *flagp = WORST;
6780             if (max > 0)
6781                 *flagp |= HASWIDTH;
6782             if (max < min)
6783                 vFAIL("Can't do {n,m} with n > m");
6784             if (!SIZE_ONLY) {
6785                 ARG1_SET(ret, (U16)min);
6786                 ARG2_SET(ret, (U16)max);
6787             }
6788
6789             goto nest_check;
6790         }
6791     }
6792
6793     if (!ISMULT1(op)) {
6794         *flagp = flags;
6795         return(ret);
6796     }
6797
6798 #if 0                           /* Now runtime fix should be reliable. */
6799
6800     /* if this is reinstated, don't forget to put this back into perldiag:
6801
6802             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6803
6804            (F) The part of the regexp subject to either the * or + quantifier
6805            could match an empty string. The {#} shows in the regular
6806            expression about where the problem was discovered.
6807
6808     */
6809
6810     if (!(flags&HASWIDTH) && op != '?')
6811       vFAIL("Regexp *+ operand could be empty");
6812 #endif
6813
6814     parse_start = RExC_parse;
6815     nextchar(pRExC_state);
6816
6817     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6818
6819     if (op == '*' && (flags&SIMPLE)) {
6820         reginsert(pRExC_state, STAR, ret, depth+1);
6821         ret->flags = 0;
6822         RExC_naughty += 4;
6823     }
6824     else if (op == '*') {
6825         min = 0;
6826         goto do_curly;
6827     }
6828     else if (op == '+' && (flags&SIMPLE)) {
6829         reginsert(pRExC_state, PLUS, ret, depth+1);
6830         ret->flags = 0;
6831         RExC_naughty += 3;
6832     }
6833     else if (op == '+') {
6834         min = 1;
6835         goto do_curly;
6836     }
6837     else if (op == '?') {
6838         min = 0; max = 1;
6839         goto do_curly;
6840     }
6841   nest_check:
6842     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6843         ckWARN3reg(RExC_parse,
6844                    "%.*s matches null string many times",
6845                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6846                    origparse);
6847     }
6848
6849     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6850         nextchar(pRExC_state);
6851         reginsert(pRExC_state, MINMOD, ret, depth+1);
6852         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6853     }
6854 #ifndef REG_ALLOW_MINMOD_SUSPEND
6855     else
6856 #endif
6857     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6858         regnode *ender;
6859         nextchar(pRExC_state);
6860         ender = reg_node(pRExC_state, SUCCEED);
6861         REGTAIL(pRExC_state, ret, ender);
6862         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6863         ret->flags = 0;
6864         ender = reg_node(pRExC_state, TAIL);
6865         REGTAIL(pRExC_state, ret, ender);
6866         /*ret= ender;*/
6867     }
6868
6869     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6870         RExC_parse++;
6871         vFAIL("Nested quantifiers");
6872     }
6873
6874     return(ret);
6875 }
6876
6877
6878 /* reg_namedseq(pRExC_state,UVp)
6879
6880    This is expected to be called by a parser routine that has
6881    recognized '\N' and needs to handle the rest. RExC_parse is
6882    expected to point at the first char following the N at the time
6883    of the call.
6884
6885    The \N may be inside (indicated by valuep not being NULL) or outside a
6886    character class.
6887
6888    \N may begin either a named sequence, or if outside a character class, mean
6889    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6890    attempted to decide which, and in the case of a named sequence converted it
6891    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6892    where c1... are the characters in the sequence.  For single-quoted regexes,
6893    the tokenizer passes the \N sequence through unchanged; this code will not
6894    attempt to determine this nor expand those.  The net effect is that if the
6895    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6896    signals that this \N occurrence means to match a non-newline.
6897
6898    Only the \N{U+...} form should occur in a character class, for the same
6899    reason that '.' inside a character class means to just match a period: it
6900    just doesn't make sense.
6901
6902    If valuep is non-null then it is assumed that we are parsing inside
6903    of a charclass definition and the first codepoint in the resolved
6904    string is returned via *valuep and the routine will return NULL.
6905    In this mode if a multichar string is returned from the charnames
6906    handler, a warning will be issued, and only the first char in the
6907    sequence will be examined. If the string returned is zero length
6908    then the value of *valuep is undefined and NON-NULL will
6909    be returned to indicate failure. (This will NOT be a valid pointer
6910    to a regnode.)
6911
6912    If valuep is null then it is assumed that we are parsing normal text and a
6913    new EXACT node is inserted into the program containing the resolved string,
6914    and a pointer to the new node is returned.  But if the string is zero length
6915    a NOTHING node is emitted instead.
6916
6917    On success RExC_parse is set to the char following the endbrace.
6918    Parsing failures will generate a fatal error via vFAIL(...)
6919  */
6920 STATIC regnode *
6921 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6922 {
6923     char * endbrace;    /* '}' following the name */
6924     regnode *ret = NULL;
6925 #ifdef DEBUGGING
6926     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6927 #endif
6928     char* p;
6929
6930     GET_RE_DEBUG_FLAGS_DECL;
6931
6932     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6933
6934     GET_RE_DEBUG_FLAGS;
6935
6936     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6937      * modifier.  The other meaning does not */
6938     p = (RExC_flags & RXf_PMf_EXTENDED)
6939         ? regwhite( pRExC_state, RExC_parse )
6940         : RExC_parse;
6941
6942     /* Disambiguate between \N meaning a named character versus \N meaning
6943      * [^\n].  The former is assumed when it can't be the latter. */
6944     if (*p != '{' || regcurly(p)) {
6945         RExC_parse = p;
6946         if (valuep) {
6947             /* no bare \N in a charclass */
6948             vFAIL("\\N in a character class must be a named character: \\N{...}");
6949         }
6950         nextchar(pRExC_state);
6951         ret = reg_node(pRExC_state, REG_ANY);
6952         *flagp |= HASWIDTH|SIMPLE;
6953         RExC_naughty++;
6954         RExC_parse--;
6955         Set_Node_Length(ret, 1); /* MJD */
6956         return ret;
6957     }
6958
6959     /* Here, we have decided it should be a named sequence */
6960
6961     /* The test above made sure that the next real character is a '{', but
6962      * under the /x modifier, it could be separated by space (or a comment and
6963      * \n) and this is not allowed (for consistency with \x{...} and the
6964      * tokenizer handling of \N{NAME}). */
6965     if (*RExC_parse != '{') {
6966         vFAIL("Missing braces on \\N{}");
6967     }
6968
6969     RExC_parse++;       /* Skip past the '{' */
6970
6971     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6972         || ! (endbrace == RExC_parse            /* nothing between the {} */
6973               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
6974                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6975     {
6976         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
6977         vFAIL("\\N{NAME} must be resolved by the lexer");
6978     }
6979
6980     if (endbrace == RExC_parse) {   /* empty: \N{} */
6981         if (! valuep) {
6982             RExC_parse = endbrace + 1;
6983             return reg_node(pRExC_state,NOTHING);
6984         }
6985
6986         if (SIZE_ONLY) {
6987             ckWARNreg(RExC_parse,
6988                     "Ignoring zero length \\N{} in character class"
6989             );
6990             RExC_parse = endbrace + 1;
6991         }
6992         *valuep = 0;
6993         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6994     }
6995
6996     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
6997     RExC_parse += 2;    /* Skip past the 'U+' */
6998
6999     if (valuep) {   /* In a bracketed char class */
7000         /* We only pay attention to the first char of
7001         multichar strings being returned. I kinda wonder
7002         if this makes sense as it does change the behaviour
7003         from earlier versions, OTOH that behaviour was broken
7004         as well. XXX Solution is to recharacterize as
7005         [rest-of-class]|multi1|multi2... */
7006
7007         STRLEN length_of_hex;
7008         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7009             | PERL_SCAN_DISALLOW_PREFIX
7010             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7011
7012         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7013         if (endchar < endbrace) {
7014             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7015         }
7016
7017         length_of_hex = (STRLEN)(endchar - RExC_parse);
7018         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7019
7020         /* The tokenizer should have guaranteed validity, but it's possible to
7021          * bypass it by using single quoting, so check */
7022         if (length_of_hex == 0
7023             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7024         {
7025             RExC_parse += length_of_hex;        /* Includes all the valid */
7026             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7027                             ? UTF8SKIP(RExC_parse)
7028                             : 1;
7029             /* Guard against malformed utf8 */
7030             if (RExC_parse >= endchar) RExC_parse = endchar;
7031             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7032         }
7033
7034         RExC_parse = endbrace + 1;
7035         if (endchar == endbrace) return NULL;
7036
7037         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7038     }
7039     else {      /* Not a char class */
7040         char *s;            /* String to put in generated EXACT node */
7041         STRLEN len = 0;     /* Its current byte length */
7042         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7043                                stream */
7044
7045         ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7046                                                    : (LOC)
7047                                                       ? EXACTFL
7048                                                       : UNI_SEMANTICS
7049                                                         ? EXACTFU
7050                                                         : EXACTF));
7051         s= STRING(ret);
7052
7053         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7054          * the input which is of the form now 'c1.c2.c3...}' until find the
7055          * ending brace or exceed length 255.  The characters that exceed this
7056          * limit are dropped.  The limit could be relaxed should it become
7057          * desirable by reparsing this as (?:\N{NAME}), so could generate
7058          * multiple EXACT nodes, as is done for just regular input.  But this
7059          * is primarily a named character, and not intended to be a huge long
7060          * string, so 255 bytes should be good enough */
7061         while (1) {
7062             STRLEN length_of_hex;
7063             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7064                             | PERL_SCAN_DISALLOW_PREFIX
7065                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7066             UV cp;  /* Ord of current character */
7067
7068             /* Code points are separated by dots.  If none, there is only one
7069              * code point, and is terminated by the brace */
7070             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7071
7072             /* The values are Unicode even on EBCDIC machines */
7073             length_of_hex = (STRLEN)(endchar - RExC_parse);
7074             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7075             if ( length_of_hex == 0
7076                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7077             {
7078                 RExC_parse += length_of_hex;        /* Includes all the valid */
7079                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7080                                 ? UTF8SKIP(RExC_parse)
7081                                 : 1;
7082                 /* Guard against malformed utf8 */
7083                 if (RExC_parse >= endchar) RExC_parse = endchar;
7084                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7085             }
7086
7087             if (! FOLD) {       /* Not folding, just append to the string */
7088                 STRLEN unilen;
7089
7090                 /* Quit before adding this character if would exceed limit */
7091                 if (len + UNISKIP(cp) > U8_MAX) break;
7092
7093                 unilen = reguni(pRExC_state, cp, s);
7094                 if (unilen > 0) {
7095                     s   += unilen;
7096                     len += unilen;
7097                 }
7098             } else {    /* Folding, output the folded equivalent */
7099                 STRLEN foldlen,numlen;
7100                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7101                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7102
7103                 /* Quit before exceeding size limit */
7104                 if (len + foldlen > U8_MAX) break;
7105
7106                 for (foldbuf = tmpbuf;
7107                     foldlen;
7108                     foldlen -= numlen)
7109                 {
7110                     cp = utf8_to_uvchr(foldbuf, &numlen);
7111                     if (numlen > 0) {
7112                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7113                         s       += unilen;
7114                         len     += unilen;
7115                         /* In EBCDIC the numlen and unilen can differ. */
7116                         foldbuf += numlen;
7117                         if (numlen >= foldlen)
7118                             break;
7119                     }
7120                     else
7121                         break; /* "Can't happen." */
7122                 }
7123             }
7124
7125             /* Point to the beginning of the next character in the sequence. */
7126             RExC_parse = endchar + 1;
7127
7128             /* Quit if no more characters */
7129             if (RExC_parse >= endbrace) break;
7130         }
7131
7132
7133         if (SIZE_ONLY) {
7134             if (RExC_parse < endbrace) {
7135                 ckWARNreg(RExC_parse - 1,
7136                           "Using just the first characters returned by \\N{}");
7137             }
7138
7139             RExC_size += STR_SZ(len);
7140         } else {
7141             STR_LEN(ret) = len;
7142             RExC_emit += STR_SZ(len);
7143         }
7144
7145         RExC_parse = endbrace + 1;
7146
7147         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7148                                with malformed in t/re/pat_advanced.t */
7149         RExC_parse --;
7150         Set_Node_Cur_Length(ret); /* MJD */
7151         nextchar(pRExC_state);
7152     }
7153
7154     return ret;
7155 }
7156
7157
7158 /*
7159  * reg_recode
7160  *
7161  * It returns the code point in utf8 for the value in *encp.
7162  *    value: a code value in the source encoding
7163  *    encp:  a pointer to an Encode object
7164  *
7165  * If the result from Encode is not a single character,
7166  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7167  */
7168 STATIC UV
7169 S_reg_recode(pTHX_ const char value, SV **encp)
7170 {
7171     STRLEN numlen = 1;
7172     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7173     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7174     const STRLEN newlen = SvCUR(sv);
7175     UV uv = UNICODE_REPLACEMENT;
7176
7177     PERL_ARGS_ASSERT_REG_RECODE;
7178
7179     if (newlen)
7180         uv = SvUTF8(sv)
7181              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7182              : *(U8*)s;
7183
7184     if (!newlen || numlen != newlen) {
7185         uv = UNICODE_REPLACEMENT;
7186         *encp = NULL;
7187     }
7188     return uv;
7189 }
7190
7191
7192 /*
7193  - regatom - the lowest level
7194
7195    Try to identify anything special at the start of the pattern. If there
7196    is, then handle it as required. This may involve generating a single regop,
7197    such as for an assertion; or it may involve recursing, such as to
7198    handle a () structure.
7199
7200    If the string doesn't start with something special then we gobble up
7201    as much literal text as we can.
7202
7203    Once we have been able to handle whatever type of thing started the
7204    sequence, we return.
7205
7206    Note: we have to be careful with escapes, as they can be both literal
7207    and special, and in the case of \10 and friends can either, depending
7208    on context. Specifically there are two seperate switches for handling
7209    escape sequences, with the one for handling literal escapes requiring
7210    a dummy entry for all of the special escapes that are actually handled
7211    by the other.
7212 */
7213
7214 STATIC regnode *
7215 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7216 {
7217     dVAR;
7218     register regnode *ret = NULL;
7219     I32 flags;
7220     char *parse_start = RExC_parse;
7221     GET_RE_DEBUG_FLAGS_DECL;
7222     DEBUG_PARSE("atom");
7223     *flagp = WORST;             /* Tentatively. */
7224
7225     PERL_ARGS_ASSERT_REGATOM;
7226
7227 tryagain:
7228     switch ((U8)*RExC_parse) {
7229     case '^':
7230         RExC_seen_zerolen++;
7231         nextchar(pRExC_state);
7232         if (RExC_flags & RXf_PMf_MULTILINE)
7233             ret = reg_node(pRExC_state, MBOL);
7234         else if (RExC_flags & RXf_PMf_SINGLELINE)
7235             ret = reg_node(pRExC_state, SBOL);
7236         else
7237             ret = reg_node(pRExC_state, BOL);
7238         Set_Node_Length(ret, 1); /* MJD */
7239         break;
7240     case '$':
7241         nextchar(pRExC_state);
7242         if (*RExC_parse)
7243             RExC_seen_zerolen++;
7244         if (RExC_flags & RXf_PMf_MULTILINE)
7245             ret = reg_node(pRExC_state, MEOL);
7246         else if (RExC_flags & RXf_PMf_SINGLELINE)
7247             ret = reg_node(pRExC_state, SEOL);
7248         else
7249             ret = reg_node(pRExC_state, EOL);
7250         Set_Node_Length(ret, 1); /* MJD */
7251         break;
7252     case '.':
7253         nextchar(pRExC_state);
7254         if (RExC_flags & RXf_PMf_SINGLELINE)
7255             ret = reg_node(pRExC_state, SANY);
7256         else
7257             ret = reg_node(pRExC_state, REG_ANY);
7258         *flagp |= HASWIDTH|SIMPLE;
7259         RExC_naughty++;
7260         Set_Node_Length(ret, 1); /* MJD */
7261         break;
7262     case '[':
7263     {
7264         char * const oregcomp_parse = ++RExC_parse;
7265         ret = regclass(pRExC_state,depth+1);
7266         if (*RExC_parse != ']') {
7267             RExC_parse = oregcomp_parse;
7268             vFAIL("Unmatched [");
7269         }
7270         nextchar(pRExC_state);
7271         *flagp |= HASWIDTH|SIMPLE;
7272         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7273         break;
7274     }
7275     case '(':
7276         nextchar(pRExC_state);
7277         ret = reg(pRExC_state, 1, &flags,depth+1);
7278         if (ret == NULL) {
7279                 if (flags & TRYAGAIN) {
7280                     if (RExC_parse == RExC_end) {
7281                          /* Make parent create an empty node if needed. */
7282                         *flagp |= TRYAGAIN;
7283                         return(NULL);
7284                     }
7285                     goto tryagain;
7286                 }
7287                 return(NULL);
7288         }
7289         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7290         break;
7291     case '|':
7292     case ')':
7293         if (flags & TRYAGAIN) {
7294             *flagp |= TRYAGAIN;
7295             return NULL;
7296         }
7297         vFAIL("Internal urp");
7298                                 /* Supposed to be caught earlier. */
7299         break;
7300     case '{':
7301         if (!regcurly(RExC_parse)) {
7302             RExC_parse++;
7303             goto defchar;
7304         }
7305         /* FALL THROUGH */
7306     case '?':
7307     case '+':
7308     case '*':
7309         RExC_parse++;
7310         vFAIL("Quantifier follows nothing");
7311         break;
7312     case LATIN_SMALL_LETTER_SHARP_S:
7313     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7314     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7315 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7316 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
7317     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7318 #endif
7319         do_foldchar:
7320         if (!LOC && FOLD) {
7321             U32 len,cp;
7322             len=0; /* silence a spurious compiler warning */
7323             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7324                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7325                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7326                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7327                 Set_Node_Length(ret, 1); /* MJD */
7328                 nextchar(pRExC_state); /* kill whitespace under /x */
7329                 return ret;
7330             }
7331         }
7332         goto outer_default;
7333     case '\\':
7334         /* Special Escapes
7335
7336            This switch handles escape sequences that resolve to some kind
7337            of special regop and not to literal text. Escape sequnces that
7338            resolve to literal text are handled below in the switch marked
7339            "Literal Escapes".
7340
7341            Every entry in this switch *must* have a corresponding entry
7342            in the literal escape switch. However, the opposite is not
7343            required, as the default for this switch is to jump to the
7344            literal text handling code.
7345         */
7346         switch ((U8)*++RExC_parse) {
7347         case LATIN_SMALL_LETTER_SHARP_S:
7348         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7349         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7350                    goto do_foldchar;
7351         /* Special Escapes */
7352         case 'A':
7353             RExC_seen_zerolen++;
7354             ret = reg_node(pRExC_state, SBOL);
7355             *flagp |= SIMPLE;
7356             goto finish_meta_pat;
7357         case 'G':
7358             ret = reg_node(pRExC_state, GPOS);
7359             RExC_seen |= REG_SEEN_GPOS;
7360             *flagp |= SIMPLE;
7361             goto finish_meta_pat;
7362         case 'K':
7363             RExC_seen_zerolen++;
7364             ret = reg_node(pRExC_state, KEEPS);
7365             *flagp |= SIMPLE;
7366             /* XXX:dmq : disabling in-place substitution seems to
7367              * be necessary here to avoid cases of memory corruption, as
7368              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7369              */
7370             RExC_seen |= REG_SEEN_LOOKBEHIND;
7371             goto finish_meta_pat;
7372         case 'Z':
7373             ret = reg_node(pRExC_state, SEOL);
7374             *flagp |= SIMPLE;
7375             RExC_seen_zerolen++;                /* Do not optimize RE away */
7376             goto finish_meta_pat;
7377         case 'z':
7378             ret = reg_node(pRExC_state, EOS);
7379             *flagp |= SIMPLE;
7380             RExC_seen_zerolen++;                /* Do not optimize RE away */
7381             goto finish_meta_pat;
7382         case 'C':
7383             ret = reg_node(pRExC_state, CANY);
7384             RExC_seen |= REG_SEEN_CANY;
7385             *flagp |= HASWIDTH|SIMPLE;
7386             goto finish_meta_pat;
7387         case 'X':
7388             ret = reg_node(pRExC_state, CLUMP);
7389             *flagp |= HASWIDTH;
7390             goto finish_meta_pat;
7391         case 'w':
7392             if (LOC) {
7393                 ret = reg_node(pRExC_state, (U8)(ALNUML));
7394             } else {
7395                 ret = reg_node(pRExC_state, (U8)(ALNUM));
7396                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7397             }
7398             *flagp |= HASWIDTH|SIMPLE;
7399             goto finish_meta_pat;
7400         case 'W':
7401             if (LOC) {
7402                 ret = reg_node(pRExC_state, (U8)(NALNUML));
7403             } else {
7404                 ret = reg_node(pRExC_state, (U8)(NALNUM));
7405                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7406             }
7407             *flagp |= HASWIDTH|SIMPLE;
7408             goto finish_meta_pat;
7409         case 'b':
7410             RExC_seen_zerolen++;
7411             RExC_seen |= REG_SEEN_LOOKBEHIND;
7412             if (LOC) {
7413                 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7414             } else {
7415                 ret = reg_node(pRExC_state, (U8)(BOUND));
7416                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7417             }
7418             *flagp |= SIMPLE;
7419             goto finish_meta_pat;
7420         case 'B':
7421             RExC_seen_zerolen++;
7422             RExC_seen |= REG_SEEN_LOOKBEHIND;
7423             if (LOC) {
7424                 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7425             } else {
7426                 ret = reg_node(pRExC_state, (U8)(NBOUND));
7427                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7428             }
7429             *flagp |= SIMPLE;
7430             goto finish_meta_pat;
7431         case 's':
7432             if (LOC) {
7433                 ret = reg_node(pRExC_state, (U8)(SPACEL));
7434             } else {
7435                 ret = reg_node(pRExC_state, (U8)(SPACE));
7436                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7437             }
7438             *flagp |= HASWIDTH|SIMPLE;
7439             goto finish_meta_pat;
7440         case 'S':
7441             if (LOC) {
7442                 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7443             } else {
7444                 ret = reg_node(pRExC_state, (U8)(NSPACE));
7445                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7446             }
7447             *flagp |= HASWIDTH|SIMPLE;
7448             goto finish_meta_pat;
7449         case 'd':
7450             if (LOC) {
7451                 ret = reg_node(pRExC_state, (U8)(DIGITL));
7452             } else {
7453                 ret = reg_node(pRExC_state, (U8)(DIGIT));
7454             }
7455             *flagp |= HASWIDTH|SIMPLE;
7456             goto finish_meta_pat;
7457         case 'D':
7458             if (LOC) {
7459                 ret = reg_node(pRExC_state, (U8)(NDIGITL));
7460             } else {
7461                 ret = reg_node(pRExC_state, (U8)(NDIGIT));
7462             }
7463             *flagp |= HASWIDTH|SIMPLE;
7464             goto finish_meta_pat;
7465         case 'R':
7466             ret = reg_node(pRExC_state, LNBREAK);
7467             *flagp |= HASWIDTH|SIMPLE;
7468             goto finish_meta_pat;
7469         case 'h':
7470             ret = reg_node(pRExC_state, HORIZWS);
7471             *flagp |= HASWIDTH|SIMPLE;
7472             goto finish_meta_pat;
7473         case 'H':
7474             ret = reg_node(pRExC_state, NHORIZWS);
7475             *flagp |= HASWIDTH|SIMPLE;
7476             goto finish_meta_pat;
7477         case 'v':
7478             ret = reg_node(pRExC_state, VERTWS);
7479             *flagp |= HASWIDTH|SIMPLE;
7480             goto finish_meta_pat;
7481         case 'V':
7482             ret = reg_node(pRExC_state, NVERTWS);
7483             *flagp |= HASWIDTH|SIMPLE;
7484          finish_meta_pat:
7485             nextchar(pRExC_state);
7486             Set_Node_Length(ret, 2); /* MJD */
7487             break;
7488         case 'p':
7489         case 'P':
7490             {
7491                 char* const oldregxend = RExC_end;
7492 #ifdef DEBUGGING
7493                 char* parse_start = RExC_parse - 2;
7494 #endif
7495
7496                 if (RExC_parse[1] == '{') {
7497                   /* a lovely hack--pretend we saw [\pX] instead */
7498                     RExC_end = strchr(RExC_parse, '}');
7499                     if (!RExC_end) {
7500                         const U8 c = (U8)*RExC_parse;
7501                         RExC_parse += 2;
7502                         RExC_end = oldregxend;
7503                         vFAIL2("Missing right brace on \\%c{}", c);
7504                     }
7505                     RExC_end++;
7506                 }
7507                 else {
7508                     RExC_end = RExC_parse + 2;
7509                     if (RExC_end > oldregxend)
7510                         RExC_end = oldregxend;
7511                 }
7512                 RExC_parse--;
7513
7514                 ret = regclass(pRExC_state,depth+1);
7515
7516                 RExC_end = oldregxend;
7517                 RExC_parse--;
7518
7519                 Set_Node_Offset(ret, parse_start + 2);
7520                 Set_Node_Cur_Length(ret);
7521                 nextchar(pRExC_state);
7522                 *flagp |= HASWIDTH|SIMPLE;
7523             }
7524             break;
7525         case 'N':
7526             /* Handle \N and \N{NAME} here and not below because it can be
7527             multicharacter. join_exact() will join them up later on.
7528             Also this makes sure that things like /\N{BLAH}+/ and
7529             \N{BLAH} being multi char Just Happen. dmq*/
7530             ++RExC_parse;
7531             ret= reg_namedseq(pRExC_state, NULL, flagp);
7532             break;
7533         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7534         parse_named_seq:
7535         {
7536             char ch= RExC_parse[1];
7537             if (ch != '<' && ch != '\'' && ch != '{') {
7538                 RExC_parse++;
7539                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7540             } else {
7541                 /* this pretty much dupes the code for (?P=...) in reg(), if
7542                    you change this make sure you change that */
7543                 char* name_start = (RExC_parse += 2);
7544                 U32 num = 0;
7545                 SV *sv_dat = reg_scan_name(pRExC_state,
7546                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7547                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7548                 if (RExC_parse == name_start || *RExC_parse != ch)
7549                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7550
7551                 if (!SIZE_ONLY) {
7552                     num = add_data( pRExC_state, 1, "S" );
7553                     RExC_rxi->data->data[num]=(void*)sv_dat;
7554                     SvREFCNT_inc_simple_void(sv_dat);
7555                 }
7556
7557                 RExC_sawback = 1;
7558                 ret = reganode(pRExC_state,
7559                                ((! FOLD)
7560                                  ? NREF
7561                                  : (UNI_SEMANTICS)
7562                                    ? NREFFU
7563                                    : (LOC)
7564                                      ? NREFFL
7565                                      : NREFF),
7566                                 num);
7567                 *flagp |= HASWIDTH;
7568
7569                 /* override incorrect value set in reganode MJD */
7570                 Set_Node_Offset(ret, parse_start+1);
7571                 Set_Node_Cur_Length(ret); /* MJD */
7572                 nextchar(pRExC_state);
7573
7574             }
7575             break;
7576         }
7577         case 'g':
7578         case '1': case '2': case '3': case '4':
7579         case '5': case '6': case '7': case '8': case '9':
7580             {
7581                 I32 num;
7582                 bool isg = *RExC_parse == 'g';
7583                 bool isrel = 0;
7584                 bool hasbrace = 0;
7585                 if (isg) {
7586                     RExC_parse++;
7587                     if (*RExC_parse == '{') {
7588                         RExC_parse++;
7589                         hasbrace = 1;
7590                     }
7591                     if (*RExC_parse == '-') {
7592                         RExC_parse++;
7593                         isrel = 1;
7594                     }
7595                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7596                         if (isrel) RExC_parse--;
7597                         RExC_parse -= 2;
7598                         goto parse_named_seq;
7599                 }   }
7600                 num = atoi(RExC_parse);
7601                 if (isg && num == 0)
7602                     vFAIL("Reference to invalid group 0");
7603                 if (isrel) {
7604                     num = RExC_npar - num;
7605                     if (num < 1)
7606                         vFAIL("Reference to nonexistent or unclosed group");
7607                 }
7608                 if (!isg && num > 9 && num >= RExC_npar)
7609                     goto defchar;
7610                 else {
7611                     char * const parse_start = RExC_parse - 1; /* MJD */
7612                     while (isDIGIT(*RExC_parse))
7613                         RExC_parse++;
7614                     if (parse_start == RExC_parse - 1)
7615                         vFAIL("Unterminated \\g... pattern");
7616                     if (hasbrace) {
7617                         if (*RExC_parse != '}')
7618                             vFAIL("Unterminated \\g{...} pattern");
7619                         RExC_parse++;
7620                     }
7621                     if (!SIZE_ONLY) {
7622                         if (num > (I32)RExC_rx->nparens)
7623                             vFAIL("Reference to nonexistent group");
7624                     }
7625                     RExC_sawback = 1;
7626                     ret = reganode(pRExC_state,
7627                                    ((! FOLD)
7628                                      ? REF
7629                                      : (UNI_SEMANTICS)
7630                                        ? REFFU
7631                                        : (LOC)
7632                                          ? REFFL
7633                                          : REFF),
7634                                     num);
7635                     *flagp |= HASWIDTH;
7636
7637                     /* override incorrect value set in reganode MJD */
7638                     Set_Node_Offset(ret, parse_start+1);
7639                     Set_Node_Cur_Length(ret); /* MJD */
7640                     RExC_parse--;
7641                     nextchar(pRExC_state);
7642                 }
7643             }
7644             break;
7645         case '\0':
7646             if (RExC_parse >= RExC_end)
7647                 FAIL("Trailing \\");
7648             /* FALL THROUGH */
7649         default:
7650             /* Do not generate "unrecognized" warnings here, we fall
7651                back into the quick-grab loop below */
7652             parse_start--;
7653             goto defchar;
7654         }
7655         break;
7656
7657     case '#':
7658         if (RExC_flags & RXf_PMf_EXTENDED) {
7659             if ( reg_skipcomment( pRExC_state ) )
7660                 goto tryagain;
7661         }
7662         /* FALL THROUGH */
7663
7664     default:
7665         outer_default:{
7666             register STRLEN len;
7667             register UV ender;
7668             register char *p;
7669             char *s;
7670             STRLEN foldlen;
7671             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7672
7673             parse_start = RExC_parse - 1;
7674
7675             RExC_parse++;
7676
7677         defchar:
7678             ender = 0;
7679             ret = reg_node(pRExC_state,
7680                            (U8) ((! FOLD) ? EXACT
7681                                           : (LOC)
7682                                              ? EXACTFL
7683                                              : (UNI_SEMANTICS)
7684                                                ? EXACTFU
7685                                                : EXACTF)
7686                     );
7687             s = STRING(ret);
7688             for (len = 0, p = RExC_parse - 1;
7689               len < 127 && p < RExC_end;
7690               len++)
7691             {
7692                 char * const oldp = p;
7693
7694                 if (RExC_flags & RXf_PMf_EXTENDED)
7695                     p = regwhite( pRExC_state, p );
7696                 switch ((U8)*p) {
7697                 case LATIN_SMALL_LETTER_SHARP_S:
7698                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7699                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7700                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7701                                 goto normal_default;
7702                 case '^':
7703                 case '$':
7704                 case '.':
7705                 case '[':
7706                 case '(':
7707                 case ')':
7708                 case '|':
7709                     goto loopdone;
7710                 case '\\':
7711                     /* Literal Escapes Switch
7712
7713                        This switch is meant to handle escape sequences that
7714                        resolve to a literal character.
7715
7716                        Every escape sequence that represents something
7717                        else, like an assertion or a char class, is handled
7718                        in the switch marked 'Special Escapes' above in this
7719                        routine, but also has an entry here as anything that
7720                        isn't explicitly mentioned here will be treated as
7721                        an unescaped equivalent literal.
7722                     */
7723
7724                     switch ((U8)*++p) {
7725                     /* These are all the special escapes. */
7726                     case LATIN_SMALL_LETTER_SHARP_S:
7727                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7728                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7729                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7730                                 goto normal_default;
7731                     case 'A':             /* Start assertion */
7732                     case 'b': case 'B':   /* Word-boundary assertion*/
7733                     case 'C':             /* Single char !DANGEROUS! */
7734                     case 'd': case 'D':   /* digit class */
7735                     case 'g': case 'G':   /* generic-backref, pos assertion */
7736                     case 'h': case 'H':   /* HORIZWS */
7737                     case 'k': case 'K':   /* named backref, keep marker */
7738                     case 'N':             /* named char sequence */
7739                     case 'p': case 'P':   /* Unicode property */
7740                               case 'R':   /* LNBREAK */
7741                     case 's': case 'S':   /* space class */
7742                     case 'v': case 'V':   /* VERTWS */
7743                     case 'w': case 'W':   /* word class */
7744                     case 'X':             /* eXtended Unicode "combining character sequence" */
7745                     case 'z': case 'Z':   /* End of line/string assertion */
7746                         --p;
7747                         goto loopdone;
7748
7749                     /* Anything after here is an escape that resolves to a
7750                        literal. (Except digits, which may or may not)
7751                      */
7752                     case 'n':
7753                         ender = '\n';
7754                         p++;
7755                         break;
7756                     case 'r':
7757                         ender = '\r';
7758                         p++;
7759                         break;
7760                     case 't':
7761                         ender = '\t';
7762                         p++;
7763                         break;
7764                     case 'f':
7765                         ender = '\f';
7766                         p++;
7767                         break;
7768                     case 'e':
7769                           ender = ASCII_TO_NATIVE('\033');
7770                         p++;
7771                         break;
7772                     case 'a':
7773                           ender = ASCII_TO_NATIVE('\007');
7774                         p++;
7775                         break;
7776                     case 'o':
7777                         {
7778                             STRLEN brace_len = len;
7779                             UV result;
7780                             const char* error_msg;
7781
7782                             bool valid = grok_bslash_o(p,
7783                                                        &result,
7784                                                        &brace_len,
7785                                                        &error_msg,
7786                                                        1);
7787                             p += brace_len;
7788                             if (! valid) {
7789                                 RExC_parse = p; /* going to die anyway; point
7790                                                    to exact spot of failure */
7791                                 vFAIL(error_msg);
7792                             }
7793                             else
7794                             {
7795                                 ender = result;
7796                             }
7797                             if (PL_encoding && ender < 0x100) {
7798                                 goto recode_encoding;
7799                             }
7800                             if (ender > 0xff) {
7801                                 REQUIRE_UTF8;
7802                             }
7803                             break;
7804                         }
7805                     case 'x':
7806                         if (*++p == '{') {
7807                             char* const e = strchr(p, '}');
7808
7809                             if (!e) {
7810                                 RExC_parse = p + 1;
7811                                 vFAIL("Missing right brace on \\x{}");
7812                             }
7813                             else {
7814                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7815                                     | PERL_SCAN_DISALLOW_PREFIX;
7816                                 STRLEN numlen = e - p - 1;
7817                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7818                                 if (ender > 0xff)
7819                                     REQUIRE_UTF8;
7820                                 p = e + 1;
7821                             }
7822                         }
7823                         else {
7824                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7825                             STRLEN numlen = 2;
7826                             ender = grok_hex(p, &numlen, &flags, NULL);
7827                             p += numlen;
7828                         }
7829                         if (PL_encoding && ender < 0x100)
7830                             goto recode_encoding;
7831                         break;
7832                     case 'c':
7833                         p++;
7834                         ender = grok_bslash_c(*p++, SIZE_ONLY);
7835                         break;
7836                     case '0': case '1': case '2': case '3':case '4':
7837                     case '5': case '6': case '7': case '8':case '9':
7838                         if (*p == '0' ||
7839                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7840                         {
7841                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7842                             STRLEN numlen = 3;
7843                             ender = grok_oct(p, &numlen, &flags, NULL);
7844                             if (ender > 0xff) {
7845                                 REQUIRE_UTF8;
7846                             }
7847                             p += numlen;
7848                         }
7849                         else {
7850                             --p;
7851                             goto loopdone;
7852                         }
7853                         if (PL_encoding && ender < 0x100)
7854                             goto recode_encoding;
7855                         break;
7856                     recode_encoding:
7857                         {
7858                             SV* enc = PL_encoding;
7859                             ender = reg_recode((const char)(U8)ender, &enc);
7860                             if (!enc && SIZE_ONLY)
7861                                 ckWARNreg(p, "Invalid escape in the specified encoding");
7862                             REQUIRE_UTF8;
7863                         }
7864                         break;
7865                     case '\0':
7866                         if (p >= RExC_end)
7867                             FAIL("Trailing \\");
7868                         /* FALL THROUGH */
7869                     default:
7870                         if (!SIZE_ONLY&& isALPHA(*p))
7871                             ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7872                         goto normal_default;
7873                     }
7874                     break;
7875                 default:
7876                   normal_default:
7877                     if (UTF8_IS_START(*p) && UTF) {
7878                         STRLEN numlen;
7879                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7880                                                &numlen, UTF8_ALLOW_DEFAULT);
7881                         p += numlen;
7882                     }
7883                     else
7884                         ender = *p++;
7885                     break;
7886                 }
7887                 if ( RExC_flags & RXf_PMf_EXTENDED)
7888                     p = regwhite( pRExC_state, p );
7889                 if (UTF && FOLD) {
7890                     /* Prime the casefolded buffer. */
7891                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7892                 }
7893                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7894                     if (len)
7895                         p = oldp;
7896                     else if (UTF) {
7897                          if (FOLD) {
7898                               /* Emit all the Unicode characters. */
7899                               STRLEN numlen;
7900                               for (foldbuf = tmpbuf;
7901                                    foldlen;
7902                                    foldlen -= numlen) {
7903                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7904                                    if (numlen > 0) {
7905                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7906                                         s       += unilen;
7907                                         len     += unilen;
7908                                         /* In EBCDIC the numlen
7909                                          * and unilen can differ. */
7910                                         foldbuf += numlen;
7911                                         if (numlen >= foldlen)
7912                                              break;
7913                                    }
7914                                    else
7915                                         break; /* "Can't happen." */
7916                               }
7917                          }
7918                          else {
7919                               const STRLEN unilen = reguni(pRExC_state, ender, s);
7920                               if (unilen > 0) {
7921                                    s   += unilen;
7922                                    len += unilen;
7923                               }
7924                          }
7925                     }
7926                     else {
7927                         len++;
7928                         REGC((char)ender, s++);
7929                     }
7930                     break;
7931                 }
7932                 if (UTF) {
7933                      if (FOLD) {
7934                           /* Emit all the Unicode characters. */
7935                           STRLEN numlen;
7936                           for (foldbuf = tmpbuf;
7937                                foldlen;
7938                                foldlen -= numlen) {
7939                                ender = utf8_to_uvchr(foldbuf, &numlen);
7940                                if (numlen > 0) {
7941                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
7942                                     len     += unilen;
7943                                     s       += unilen;
7944                                     /* In EBCDIC the numlen
7945                                      * and unilen can differ. */
7946                                     foldbuf += numlen;
7947                                     if (numlen >= foldlen)
7948                                          break;
7949                                }
7950                                else
7951                                     break;
7952                           }
7953                      }
7954                      else {
7955                           const STRLEN unilen = reguni(pRExC_state, ender, s);
7956                           if (unilen > 0) {
7957                                s   += unilen;
7958                                len += unilen;
7959                           }
7960                      }
7961                      len--;
7962                 }
7963                 else
7964                     REGC((char)ender, s++);
7965             }
7966         loopdone:
7967             RExC_parse = p - 1;
7968             Set_Node_Cur_Length(ret); /* MJD */
7969             nextchar(pRExC_state);
7970             {
7971                 /* len is STRLEN which is unsigned, need to copy to signed */
7972                 IV iv = len;
7973                 if (iv < 0)
7974                     vFAIL("Internal disaster");
7975             }
7976             if (len > 0)
7977                 *flagp |= HASWIDTH;
7978             if (len == 1 && UNI_IS_INVARIANT(ender))
7979                 *flagp |= SIMPLE;
7980
7981             if (SIZE_ONLY)
7982                 RExC_size += STR_SZ(len);
7983             else {
7984                 STR_LEN(ret) = len;
7985                 RExC_emit += STR_SZ(len);
7986             }
7987         }
7988         break;
7989     }
7990
7991     return(ret);
7992 }
7993
7994 STATIC char *
7995 S_regwhite( RExC_state_t *pRExC_state, char *p )
7996 {
7997     const char *e = RExC_end;
7998
7999     PERL_ARGS_ASSERT_REGWHITE;
8000
8001     while (p < e) {
8002         if (isSPACE(*p))
8003             ++p;
8004         else if (*p == '#') {
8005             bool ended = 0;
8006             do {
8007                 if (*p++ == '\n') {
8008                     ended = 1;
8009                     break;
8010                 }
8011             } while (p < e);
8012             if (!ended)
8013                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8014         }
8015         else
8016             break;
8017     }
8018     return p;
8019 }
8020
8021 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8022    Character classes ([:foo:]) can also be negated ([:^foo:]).
8023    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8024    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
8025    but trigger failures because they are currently unimplemented. */
8026
8027 #define POSIXCC_DONE(c)   ((c) == ':')
8028 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8029 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8030
8031 STATIC I32
8032 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
8033 {
8034     dVAR;
8035     I32 namedclass = OOB_NAMEDCLASS;
8036
8037     PERL_ARGS_ASSERT_REGPPOSIXCC;
8038
8039     if (value == '[' && RExC_parse + 1 < RExC_end &&
8040         /* I smell either [: or [= or [. -- POSIX has been here, right? */
8041         POSIXCC(UCHARAT(RExC_parse))) {
8042         const char c = UCHARAT(RExC_parse);
8043         char* const s = RExC_parse++;
8044
8045         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8046             RExC_parse++;
8047         if (RExC_parse == RExC_end)
8048             /* Grandfather lone [:, [=, [. */
8049             RExC_parse = s;
8050         else {
8051             const char* const t = RExC_parse++; /* skip over the c */
8052             assert(*t == c);
8053
8054             if (UCHARAT(RExC_parse) == ']') {
8055                 const char *posixcc = s + 1;
8056                 RExC_parse++; /* skip over the ending ] */
8057
8058                 if (*s == ':') {
8059                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8060                     const I32 skip = t - posixcc;
8061
8062                     /* Initially switch on the length of the name.  */
8063                     switch (skip) {
8064                     case 4:
8065                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8066                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
8067                         break;
8068                     case 5:
8069                         /* Names all of length 5.  */
8070                         /* alnum alpha ascii blank cntrl digit graph lower
8071                            print punct space upper  */
8072                         /* Offset 4 gives the best switch position.  */
8073                         switch (posixcc[4]) {
8074                         case 'a':
8075                             if (memEQ(posixcc, "alph", 4)) /* alpha */
8076                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
8077                             break;
8078                         case 'e':
8079                             if (memEQ(posixcc, "spac", 4)) /* space */
8080                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
8081                             break;
8082                         case 'h':
8083                             if (memEQ(posixcc, "grap", 4)) /* graph */
8084                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
8085                             break;
8086                         case 'i':
8087                             if (memEQ(posixcc, "asci", 4)) /* ascii */
8088                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
8089                             break;
8090                         case 'k':
8091                             if (memEQ(posixcc, "blan", 4)) /* blank */
8092                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
8093                             break;
8094                         case 'l':
8095                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8096                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
8097                             break;
8098                         case 'm':
8099                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
8100                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
8101                             break;
8102                         case 'r':
8103                             if (memEQ(posixcc, "lowe", 4)) /* lower */
8104                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8105                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
8106                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8107                             break;
8108                         case 't':
8109                             if (memEQ(posixcc, "digi", 4)) /* digit */
8110                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8111                             else if (memEQ(posixcc, "prin", 4)) /* print */
8112                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8113                             else if (memEQ(posixcc, "punc", 4)) /* punct */
8114                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8115                             break;
8116                         }
8117                         break;
8118                     case 6:
8119                         if (memEQ(posixcc, "xdigit", 6))
8120                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8121                         break;
8122                     }
8123
8124                     if (namedclass == OOB_NAMEDCLASS)
8125                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8126                                       t - s - 1, s + 1);
8127                     assert (posixcc[skip] == ':');
8128                     assert (posixcc[skip+1] == ']');
8129                 } else if (!SIZE_ONLY) {
8130                     /* [[=foo=]] and [[.foo.]] are still future. */
8131
8132                     /* adjust RExC_parse so the warning shows after
8133                        the class closes */
8134                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8135                         RExC_parse++;
8136                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8137                 }
8138             } else {
8139                 /* Maternal grandfather:
8140                  * "[:" ending in ":" but not in ":]" */
8141                 RExC_parse = s;
8142             }
8143         }
8144     }
8145
8146     return namedclass;
8147 }
8148
8149 STATIC void
8150 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8151 {
8152     dVAR;
8153
8154     PERL_ARGS_ASSERT_CHECKPOSIXCC;
8155
8156     if (POSIXCC(UCHARAT(RExC_parse))) {
8157         const char *s = RExC_parse;
8158         const char  c = *s++;
8159
8160         while (isALNUM(*s))
8161             s++;
8162         if (*s && c == *s && s[1] == ']') {
8163             ckWARN3reg(s+2,
8164                        "POSIX syntax [%c %c] belongs inside character classes",
8165                        c, c);
8166
8167             /* [[=foo=]] and [[.foo.]] are still future. */
8168             if (POSIXCC_NOTYET(c)) {
8169                 /* adjust RExC_parse so the error shows after
8170                    the class closes */
8171                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8172                     NOOP;
8173                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8174             }
8175         }
8176     }
8177 }
8178
8179 /* No locale test */
8180 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
8181 ANYOF_##NAME:                                           \
8182         for (value = 0; value < 256; value++)           \
8183             if (TEST)                                   \
8184                 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8185     yesno = '+';                                        \
8186     what = WORD;                                        \
8187     break;                                              \
8188 case ANYOF_N##NAME:                                     \
8189         for (value = 0; value < 256; value++)           \
8190             if (!TEST)                                  \
8191                 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8192     yesno = '!';                                        \
8193     what = WORD;                                        \
8194     break
8195
8196 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8197  * there are two tests passed in, to use depending on that. There aren't any
8198  * cases where the label is different from the name, so no need for that
8199  * parameter */
8200 #define _C_C_T_(NAME,TEST_8,TEST_7,WORD)       \
8201 ANYOF_##NAME:                                           \
8202     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
8203     else if (UNI_SEMANTICS) {                           \
8204         for (value = 0; value < 256; value++) {         \
8205             if (TEST_8) stored +=                       \
8206                       S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8207         }                                               \
8208     }                                                   \
8209     else {                                              \
8210         for (value = 0; value < 128; value++) {         \
8211             if (TEST_7) stored +=                       \
8212                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
8213         }                                               \
8214     }                                                   \
8215     yesno = '+';                                        \
8216     what = WORD;                                        \
8217     break;                                              \
8218 case ANYOF_N##NAME:                                     \
8219     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
8220     else if (UNI_SEMANTICS) {                           \
8221         for (value = 0; value < 256; value++) {         \
8222             if (! TEST_8) stored +=                     \
8223                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8224         }                                               \
8225     }                                                   \
8226     else {                                              \
8227         for (value = 0; value < 128; value++) {         \
8228             if (! TEST_7) stored +=                     \
8229                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8230         }                                               \
8231         for (value = 128; value < 256; value++) {         \
8232                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8233         }                                               \
8234     }                                                   \
8235     yesno = '!';                                        \
8236     what = WORD;                                        \
8237     break
8238
8239 /*
8240    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8241    so that it is possible to override the option here without having to
8242    rebuild the entire core. as we are required to do if we change regcomp.h
8243    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8244 */
8245 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8246 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8247 #endif
8248
8249 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8250 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8251 #else
8252 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8253 #endif
8254
8255 STATIC U8
8256 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8257 {
8258
8259     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8260      * Locale folding is done at run-time, so this function should not be
8261      * called for nodes that are for locales.
8262      *
8263      * This function simply sets the bit corresponding to the fold of the input
8264      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
8265      * 'F' is 'f'.
8266      *
8267      * It also sets any necessary flags, and returns the number of bits that
8268      * actually changed from 0 to 1 */
8269
8270     U8 stored = 0;
8271     U8 fold;
8272
8273     fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
8274                            : PL_fold[value];
8275
8276     /* It assumes the bit for 'value' has already been set */
8277     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8278         ANYOF_BITMAP_SET(node, fold);
8279         stored++;
8280     }
8281
8282     /* The fold of the German sharp s is two ASCII characters, so isn't in the
8283      * bitmap and doesn't have to be in utf8, but we only process it if unicode
8284      * semantics are called for */
8285     if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
8286         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
8287     }
8288     else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8289              || (! UNI_SEMANTICS
8290                  && ! isASCII(value)
8291                  && PL_fold_latin1[value] != value))
8292     {   /* A character that has a fold outside of Latin1 matches outside the
8293            bitmap, but only when the target string is utf8.  Similarly when we
8294            don't have unicode semantics for the above ASCII Latin-1 characters,
8295            and they have a fold, they should match if the target is utf8, and
8296            not otherwise */
8297         ANYOF_FLAGS(node) |= ANYOF_UTF8;
8298     }
8299
8300     return stored;
8301 }
8302
8303
8304 PERL_STATIC_INLINE U8
8305 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8306 {
8307     /* This inline function sets a bit in the bitmap if not already set, and if
8308      * appropriate, its fold, returning the number of bits that actually
8309      * changed from 0 to 1 */
8310
8311     U8 stored;
8312
8313     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
8314         return 0;
8315     }
8316
8317     ANYOF_BITMAP_SET(node, value);
8318     stored = 1;
8319
8320     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
8321         stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8322     }
8323
8324     return stored;
8325 }
8326
8327 /*
8328    parse a class specification and produce either an ANYOF node that
8329    matches the pattern or if the pattern matches a single char only and
8330    that char is < 256 and we are case insensitive then we produce an
8331    EXACT node instead.
8332 */
8333
8334 STATIC regnode *
8335 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8336 {
8337     dVAR;
8338     register UV nextvalue;
8339     register IV prevvalue = OOB_UNICODE;
8340     register IV range = 0;
8341     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8342     register regnode *ret;
8343     STRLEN numlen;
8344     IV namedclass;
8345     char *rangebegin = NULL;
8346     bool need_class = 0;
8347     SV *listsv = NULL;
8348     UV n;
8349     AV* unicode_alternate  = NULL;
8350 #ifdef EBCDIC
8351     UV literal_endpoint = 0;
8352 #endif
8353     UV stored = 0;  /* how many chars stored in the bitmap */
8354
8355     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8356         case we need to change the emitted regop to an EXACT. */
8357     const char * orig_parse = RExC_parse;
8358     GET_RE_DEBUG_FLAGS_DECL;
8359
8360     PERL_ARGS_ASSERT_REGCLASS;
8361 #ifndef DEBUGGING
8362     PERL_UNUSED_ARG(depth);
8363 #endif
8364
8365     DEBUG_PARSE("clas");
8366
8367     /* Assume we are going to generate an ANYOF node. */
8368     ret = reganode(pRExC_state, ANYOF, 0);
8369
8370     if (!SIZE_ONLY)
8371         ANYOF_FLAGS(ret) = 0;
8372
8373     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
8374         RExC_naughty++;
8375         RExC_parse++;
8376         if (!SIZE_ONLY)
8377             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8378     }
8379
8380     if (SIZE_ONLY) {
8381         RExC_size += ANYOF_SKIP;
8382 #ifdef ANYOF_ADD_LOC_SKIP
8383         if (LOC) {
8384             RExC_size += ANYOF_ADD_LOC_SKIP;
8385         }
8386 #endif
8387         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8388     }
8389     else {
8390         RExC_emit += ANYOF_SKIP;
8391         if (LOC) {
8392             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8393 #ifdef ANYOF_ADD_LOC_SKIP
8394             RExC_emit += ANYOF_ADD_LOC_SKIP;
8395 #endif
8396         }
8397         ANYOF_BITMAP_ZERO(ret);
8398         listsv = newSVpvs("# comment\n");
8399     }
8400
8401     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8402
8403     if (!SIZE_ONLY && POSIXCC(nextvalue))
8404         checkposixcc(pRExC_state);
8405
8406     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8407     if (UCHARAT(RExC_parse) == ']')
8408         goto charclassloop;
8409
8410 parseit:
8411     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8412
8413     charclassloop:
8414
8415         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8416
8417         if (!range)
8418             rangebegin = RExC_parse;
8419         if (UTF) {
8420             value = utf8n_to_uvchr((U8*)RExC_parse,
8421                                    RExC_end - RExC_parse,
8422                                    &numlen, UTF8_ALLOW_DEFAULT);
8423             RExC_parse += numlen;
8424         }
8425         else
8426             value = UCHARAT(RExC_parse++);
8427
8428         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8429         if (value == '[' && POSIXCC(nextvalue))
8430             namedclass = regpposixcc(pRExC_state, value);
8431         else if (value == '\\') {
8432             if (UTF) {
8433                 value = utf8n_to_uvchr((U8*)RExC_parse,
8434                                    RExC_end - RExC_parse,
8435                                    &numlen, UTF8_ALLOW_DEFAULT);
8436                 RExC_parse += numlen;
8437             }
8438             else
8439                 value = UCHARAT(RExC_parse++);
8440             /* Some compilers cannot handle switching on 64-bit integer
8441              * values, therefore value cannot be an UV.  Yes, this will
8442              * be a problem later if we want switch on Unicode.
8443              * A similar issue a little bit later when switching on
8444              * namedclass. --jhi */
8445             switch ((I32)value) {
8446             case 'w':   namedclass = ANYOF_ALNUM;       break;
8447             case 'W':   namedclass = ANYOF_NALNUM;      break;
8448             case 's':   namedclass = ANYOF_SPACE;       break;
8449             case 'S':   namedclass = ANYOF_NSPACE;      break;
8450             case 'd':   namedclass = ANYOF_DIGIT;       break;
8451             case 'D':   namedclass = ANYOF_NDIGIT;      break;
8452             case 'v':   namedclass = ANYOF_VERTWS;      break;
8453             case 'V':   namedclass = ANYOF_NVERTWS;     break;
8454             case 'h':   namedclass = ANYOF_HORIZWS;     break;
8455             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
8456             case 'N':  /* Handle \N{NAME} in class */
8457                 {
8458                     /* We only pay attention to the first char of
8459                     multichar strings being returned. I kinda wonder
8460                     if this makes sense as it does change the behaviour
8461                     from earlier versions, OTOH that behaviour was broken
8462                     as well. */
8463                     UV v; /* value is register so we cant & it /grrr */
8464                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8465                         goto parseit;
8466                     }
8467                     value= v;
8468                 }
8469                 break;
8470             case 'p':
8471             case 'P':
8472                 {
8473                 char *e;
8474                 if (RExC_parse >= RExC_end)
8475                     vFAIL2("Empty \\%c{}", (U8)value);
8476                 if (*RExC_parse == '{') {
8477                     const U8 c = (U8)value;
8478                     e = strchr(RExC_parse++, '}');
8479                     if (!e)
8480                         vFAIL2("Missing right brace on \\%c{}", c);
8481                     while (isSPACE(UCHARAT(RExC_parse)))
8482                         RExC_parse++;
8483                     if (e == RExC_parse)
8484                         vFAIL2("Empty \\%c{}", c);
8485                     n = e - RExC_parse;
8486                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8487                         n--;
8488                 }
8489                 else {
8490                     e = RExC_parse;
8491                     n = 1;
8492                 }
8493                 if (!SIZE_ONLY) {
8494                     if (UCHARAT(RExC_parse) == '^') {
8495                          RExC_parse++;
8496                          n--;
8497                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8498                          while (isSPACE(UCHARAT(RExC_parse))) {
8499                               RExC_parse++;
8500                               n--;
8501                          }
8502                     }
8503                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8504                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8505                 }
8506                 RExC_parse = e + 1;
8507
8508                 /* The \p could match something in the Latin1 range, hence
8509                  * something that isn't utf8 */
8510                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
8511                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8512                 }
8513                 break;
8514             case 'n':   value = '\n';                   break;
8515             case 'r':   value = '\r';                   break;
8516             case 't':   value = '\t';                   break;
8517             case 'f':   value = '\f';                   break;
8518             case 'b':   value = '\b';                   break;
8519             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8520             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8521             case 'o':
8522                 RExC_parse--;   /* function expects to be pointed at the 'o' */
8523                 {
8524                     const char* error_msg;
8525                     bool valid = grok_bslash_o(RExC_parse,
8526                                                &value,
8527                                                &numlen,
8528                                                &error_msg,
8529                                                SIZE_ONLY);
8530                     RExC_parse += numlen;
8531                     if (! valid) {
8532                         vFAIL(error_msg);
8533                     }
8534                 }
8535                 if (PL_encoding && value < 0x100) {
8536                     goto recode_encoding;
8537                 }
8538                 break;
8539             case 'x':
8540                 if (*RExC_parse == '{') {
8541                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8542                         | PERL_SCAN_DISALLOW_PREFIX;
8543                     char * const e = strchr(RExC_parse++, '}');
8544                     if (!e)
8545                         vFAIL("Missing right brace on \\x{}");
8546
8547                     numlen = e - RExC_parse;
8548                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8549                     RExC_parse = e + 1;
8550                 }
8551                 else {
8552                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8553                     numlen = 2;
8554                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8555                     RExC_parse += numlen;
8556                 }
8557                 if (PL_encoding && value < 0x100)
8558                     goto recode_encoding;
8559                 break;
8560             case 'c':
8561                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8562                 break;
8563             case '0': case '1': case '2': case '3': case '4':
8564             case '5': case '6': case '7':
8565                 {
8566                     /* Take 1-3 octal digits */
8567                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8568                     numlen = 3;
8569                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8570                     RExC_parse += numlen;
8571                     if (PL_encoding && value < 0x100)
8572                         goto recode_encoding;
8573                     break;
8574                 }
8575             recode_encoding:
8576                 {
8577                     SV* enc = PL_encoding;
8578                     value = reg_recode((const char)(U8)value, &enc);
8579                     if (!enc && SIZE_ONLY)
8580                         ckWARNreg(RExC_parse,
8581                                   "Invalid escape in the specified encoding");
8582                     break;
8583                 }
8584             default:
8585                 /* Allow \_ to not give an error */
8586                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8587                     ckWARN2reg(RExC_parse,
8588                                "Unrecognized escape \\%c in character class passed through",
8589                                (int)value);
8590                 }
8591                 break;
8592             }
8593         } /* end of \blah */
8594 #ifdef EBCDIC
8595         else
8596             literal_endpoint++;
8597 #endif
8598
8599         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8600
8601             /* What matches in a locale is not known until runtime, so need to
8602              * (one time per class) allocate extra space to pass to regexec.
8603              * The space will contain a bit for each named class that is to be
8604              * matched against.  This isn't needed for \p{} and pseudo-classes,
8605              * as they are not affected by locale, and hence are dealt with
8606              * separately */
8607             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8608                 need_class = 1;
8609                 if (SIZE_ONLY) {
8610 #ifdef ANYOF_CLASS_ADD_SKIP
8611                     RExC_size += ANYOF_CLASS_ADD_SKIP;
8612 #endif
8613                 }
8614                 else {
8615 #ifdef ANYOF_CLASS_ADD_SKIP
8616                     RExC_emit += ANYOF_CLASS_ADD_SKIP;
8617 #endif
8618                     ANYOF_CLASS_ZERO(ret);
8619                 }
8620                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8621             }
8622
8623             /* a bad range like a-\d, a-[:digit:] ? */
8624             if (range) {
8625                 if (!SIZE_ONLY) {
8626                     const int w =
8627                         RExC_parse >= rangebegin ?
8628                         RExC_parse - rangebegin : 0;
8629                     ckWARN4reg(RExC_parse,
8630                                "False [] range \"%*.*s\"",
8631                                w, w, rangebegin);
8632
8633                     if (prevvalue < 256) {
8634                         stored +=
8635                          S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
8636                         stored +=
8637                          S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8638                     }
8639                     else {
8640                         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8641                         Perl_sv_catpvf(aTHX_ listsv,
8642                            "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8643                     }
8644                 }
8645
8646                 range = 0; /* this was not a true range */
8647             }
8648
8649
8650
8651             if (!SIZE_ONLY) {
8652                 const char *what = NULL;
8653                 char yesno = 0;
8654
8655                 /* Possible truncation here but in some 64-bit environments
8656                  * the compiler gets heartburn about switch on 64-bit values.
8657                  * A similar issue a little earlier when switching on value.
8658                  * --jhi */
8659                 switch ((I32)namedclass) {
8660
8661                 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8662                 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8663                 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8664                 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8665                 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8666                 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8667                 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8668                 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8669                 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8670                 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
8671 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8672                 /* \s, \w match all unicode if utf8. */
8673                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8674                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8675 #else
8676                 /* \s, \w match ascii and locale only */
8677                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8678                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8679 #endif
8680                 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
8681                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8682                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8683                 case ANYOF_ASCII:
8684                     if (LOC)
8685                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8686                     else {
8687                         for (value = 0; value < 128; value++)
8688                             stored +=
8689                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8690                     }
8691                     yesno = '+';
8692                     what = NULL;        /* Doesn't match outside ascii, so
8693                                            don't want to add +utf8:: */
8694                     break;
8695                 case ANYOF_NASCII:
8696                     if (LOC)
8697                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8698                     else {
8699                         for (value = 128; value < 256; value++)
8700                             stored +=
8701                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8702                     }
8703                     yesno = '!';
8704                     what = "ASCII";
8705                     break;
8706                 case ANYOF_DIGIT:
8707                     if (LOC)
8708                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8709                     else {
8710                         /* consecutive digits assumed */
8711                         for (value = '0'; value <= '9'; value++)
8712                             stored +=
8713                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8714                     }
8715                     yesno = '+';
8716                     what = POSIX_CC_UNI_NAME("Digit");
8717                     break;
8718                 case ANYOF_NDIGIT:
8719                     if (LOC)
8720                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8721                     else {
8722                         /* consecutive digits assumed */
8723                         for (value = 0; value < '0'; value++)
8724                             stored +=
8725                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8726                         for (value = '9' + 1; value < 256; value++)
8727                             stored +=
8728                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8729                     }
8730                     yesno = '!';
8731                     what = POSIX_CC_UNI_NAME("Digit");
8732                     break;
8733                 case ANYOF_MAX:
8734                     /* this is to handle \p and \P */
8735                     break;
8736                 default:
8737                     vFAIL("Invalid [::] class");
8738                     break;
8739                 }
8740                 if (what) {
8741                     /* Strings such as "+utf8::isWord\n" */
8742                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8743                     ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8744                 }
8745
8746                 continue;
8747             }
8748         } /* end of namedclass \blah */
8749
8750         if (range) {
8751             if (prevvalue > (IV)value) /* b-a */ {
8752                 const int w = RExC_parse - rangebegin;
8753                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8754                 range = 0; /* not a valid range */
8755             }
8756         }
8757         else {
8758             prevvalue = value; /* save the beginning of the range */
8759             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8760                 RExC_parse[1] != ']') {
8761                 RExC_parse++;
8762
8763                 /* a bad range like \w-, [:word:]- ? */
8764                 if (namedclass > OOB_NAMEDCLASS) {
8765                     if (ckWARN(WARN_REGEXP)) {
8766                         const int w =
8767                             RExC_parse >= rangebegin ?
8768                             RExC_parse - rangebegin : 0;
8769                         vWARN4(RExC_parse,
8770                                "False [] range \"%*.*s\"",
8771                                w, w, rangebegin);
8772                     }
8773                     if (!SIZE_ONLY)
8774                         stored +=
8775                             S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8776                 } else
8777                     range = 1;  /* yeah, it's a range! */
8778                 continue;       /* but do it the next time */
8779             }
8780         }
8781
8782         /* now is the next time */
8783         if (!SIZE_ONLY) {
8784             if (prevvalue < 256) {
8785                 const IV ceilvalue = value < 256 ? value : 255;
8786                 IV i;
8787 #ifdef EBCDIC
8788                 /* In EBCDIC [\x89-\x91] should include
8789                  * the \x8e but [i-j] should not. */
8790                 if (literal_endpoint == 2 &&
8791                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8792                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8793                 {
8794                     if (isLOWER(prevvalue)) {
8795                         for (i = prevvalue; i <= ceilvalue; i++)
8796                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8797                                 stored +=
8798                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8799                             }
8800                     } else {
8801                         for (i = prevvalue; i <= ceilvalue; i++)
8802                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8803                                 stored +=
8804                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8805                             }
8806                     }
8807                 }
8808                 else
8809 #endif
8810                       for (i = prevvalue; i <= ceilvalue; i++) {
8811                         stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8812                       }
8813           }
8814           if (value > 255 || UTF) {
8815                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8816                 const UV natvalue      = NATIVE_TO_UNI(value);
8817
8818                 /* If the code point requires utf8 to represent, and we are not
8819                  * folding, it can't match unless the target is in utf8.  Only
8820                  * a few code points above 255 fold to below it, so XXX an
8821                  * optimization would be to know which ones and set the flag
8822                  * appropriately. */
8823                 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8824                                     ? ANYOF_NONBITMAP
8825                                     : ANYOF_UTF8;
8826                 if (prevnatvalue < natvalue) { /* what about > ? */
8827                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8828                                    prevnatvalue, natvalue);
8829                 }
8830                 else if (prevnatvalue == natvalue) {
8831                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8832                     if (FOLD) {
8833                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8834                          STRLEN foldlen;
8835                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8836
8837 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8838                          if (RExC_precomp[0] == ':' &&
8839                              RExC_precomp[1] == '[' &&
8840                              (f == 0xDF || f == 0x92)) {
8841                              f = NATIVE_TO_UNI(f);
8842                         }
8843 #endif
8844                          /* If folding and foldable and a single
8845                           * character, insert also the folded version
8846                           * to the charclass. */
8847                          if (f != value) {
8848 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8849                              if ((RExC_precomp[0] == ':' &&
8850                                   RExC_precomp[1] == '[' &&
8851                                   (f == 0xA2 &&
8852                                    (value == 0xFB05 || value == 0xFB06))) ?
8853                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8854                                  foldlen == (STRLEN)UNISKIP(f) )
8855 #else
8856                               if (foldlen == (STRLEN)UNISKIP(f))
8857 #endif
8858                                   Perl_sv_catpvf(aTHX_ listsv,
8859                                                  "%04"UVxf"\n", f);
8860                               else {
8861                                   /* Any multicharacter foldings
8862                                    * require the following transform:
8863                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8864                                    * where E folds into "pq" and F folds
8865                                    * into "rst", all other characters
8866                                    * fold to single characters.  We save
8867                                    * away these multicharacter foldings,
8868                                    * to be later saved as part of the
8869                                    * additional "s" data. */
8870                                   SV *sv;
8871
8872                                   if (!unicode_alternate)
8873                                       unicode_alternate = newAV();
8874                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8875                                                      TRUE);
8876                                   av_push(unicode_alternate, sv);
8877                               }
8878                          }
8879
8880                          /* If folding and the value is one of the Greek
8881                           * sigmas insert a few more sigmas to make the
8882                           * folding rules of the sigmas to work right.
8883                           * Note that not all the possible combinations
8884                           * are handled here: some of them are handled
8885                           * by the standard folding rules, and some of
8886                           * them (literal or EXACTF cases) are handled
8887                           * during runtime in regexec.c:S_find_byclass(). */
8888                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8889                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8890                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8891                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8892                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8893                          }
8894                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8895                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8896                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8897                     }
8898                 }
8899             }
8900 #ifdef EBCDIC
8901             literal_endpoint = 0;
8902 #endif
8903         }
8904
8905         range = 0; /* this range (if it was one) is done now */
8906     }
8907
8908
8909
8910     if (SIZE_ONLY)
8911         return ret;
8912     /****** !SIZE_ONLY AFTER HERE *********/
8913
8914     /* Folding in the bitmap is taken care of above, but not for locale, for
8915      * which we have to wait to see what folding is in effect at runtime, and
8916      * for things not in the bitmap */
8917     if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) {
8918         ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8919     }
8920
8921     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that this doesn't
8922      * optimize locale.  Doing so perhaps could be done as long as there is
8923      * nothing like \w in it; some thought also would have to be given to the
8924      * interaction with above 0x100 chars */
8925     if ((ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8926         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8927             ANYOF_BITMAP(ret)[value] ^= 0xFF;
8928         stored = 256 - stored;
8929
8930         /* The inversion means that everything above 255 is matched */
8931         ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
8932     }
8933
8934     /* A single character class can be "optimized" into an EXACTish node.
8935      * Note that since we don't currently count how many characters there are
8936      * outside the bitmap, we are XXX missing optimization possibilities for
8937      * them.  This optimization can't happen unless this is a truly single
8938      * character class, which means that it can't be an inversion into a
8939      * many-character class, and there must be no possibility of there being
8940      * things outside the bitmap.  'stored' (only) for locales doesn't include
8941      * \w, etc, so have to make a special test that they aren't present
8942      *
8943      * Similarly A 2-character class of the very special form like [bB] can be
8944      * optimized into an EXACTFish node, but only for non-locales, and for
8945      * characters which only have the two folds; so things like 'fF' and 'Ii'
8946      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
8947      * FI'. */
8948     if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
8949         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8950                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
8951             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8952                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
8953                                  /* If the latest code point has a fold whose
8954                                   * bit is set, it must be the only other one */
8955                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
8956                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
8957     {
8958         /* Note that the information needed to decide to do this optimization
8959          * is not currently available until the 2nd pass, and that the actually
8960          * used EXACTish node takes less space than the calculated ANYOF node,
8961          * and hence the amount of space calculated in the first pass is larger
8962          * than actually used, so this optimization doesn't gain us any space.
8963          * But an EXACT node is faster than an ANYOF node, and can be combined
8964          * with any adjacent EXACT nodes later by the optimizer for further
8965          * gains.  The speed of executing an EXACTF is similar to an ANYOF
8966          * node, so the optimization advantage comes from the ability to join
8967          * it to adjacent EXACT nodes */
8968
8969         const char * cur_parse= RExC_parse;
8970         U8 op;
8971         RExC_emit = (regnode *)orig_emit;
8972         RExC_parse = (char *)orig_parse;
8973
8974         if (stored == 1) {
8975
8976             /* A locale node with one point can be folded; all the other cases
8977              * with folding will have two points, since we calculate them above
8978              */
8979             if (ANYOF_FLAGS(ret) & ANYOF_FOLD) {
8980                  op = EXACTFL;
8981             }
8982             else {
8983                 op = EXACT;
8984             }
8985         }   /* else 2 chars in the bit map: the folds of each other */
8986         else if (UNI_SEMANTICS || !isASCII(value)) {
8987
8988             /* To join adjacent nodes, they must be the exact EXACTish type.
8989              * Try to use the most likely type, by using EXACTFU if the regex
8990              * calls for them, or is required because the character is
8991              * non-ASCII */
8992             op = EXACTFU;
8993         }
8994         else {    /* Otherwise, more likely to be EXACTF type */
8995             op = EXACTF;
8996         }
8997
8998         ret = reg_node(pRExC_state, op);
8999         RExC_parse = (char *)cur_parse;
9000         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
9001             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
9002             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
9003             STR_LEN(ret)= 2;
9004             RExC_emit += STR_SZ(2);
9005         }
9006         else {
9007             *STRING(ret)= (char)value;
9008             STR_LEN(ret)= 1;
9009             RExC_emit += STR_SZ(1);
9010         }
9011         SvREFCNT_dec(listsv);
9012         return ret;
9013     }
9014
9015     {
9016         AV * const av = newAV();
9017         SV *rv;
9018         /* The 0th element stores the character class description
9019          * in its textual form: used later (regexec.c:Perl_regclass_swash())
9020          * to initialize the appropriate swash (which gets stored in
9021          * the 1st element), and also useful for dumping the regnode.
9022          * The 2nd element stores the multicharacter foldings,
9023          * used later (regexec.c:S_reginclass()). */
9024         av_store(av, 0, listsv);
9025         av_store(av, 1, NULL);
9026         av_store(av, 2, MUTABLE_SV(unicode_alternate));
9027         rv = newRV_noinc(MUTABLE_SV(av));
9028         n = add_data(pRExC_state, 1, "s");
9029         RExC_rxi->data->data[n] = (void*)rv;
9030         ARG_SET(ret, n);
9031     }
9032     return ret;
9033 }
9034 #undef _C_C_T_
9035
9036
9037 /* reg_skipcomment()
9038
9039    Absorbs an /x style # comments from the input stream.
9040    Returns true if there is more text remaining in the stream.
9041    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
9042    terminates the pattern without including a newline.
9043
9044    Note its the callers responsibility to ensure that we are
9045    actually in /x mode
9046
9047 */
9048
9049 STATIC bool
9050 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
9051 {
9052     bool ended = 0;
9053
9054     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
9055
9056     while (RExC_parse < RExC_end)
9057         if (*RExC_parse++ == '\n') {
9058             ended = 1;
9059             break;
9060         }
9061     if (!ended) {
9062         /* we ran off the end of the pattern without ending
9063            the comment, so we have to add an \n when wrapping */
9064         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9065         return 0;
9066     } else
9067         return 1;
9068 }
9069
9070 /* nextchar()
9071
9072    Advances the parse position, and optionally absorbs
9073    "whitespace" from the inputstream.
9074
9075    Without /x "whitespace" means (?#...) style comments only,
9076    with /x this means (?#...) and # comments and whitespace proper.
9077
9078    Returns the RExC_parse point from BEFORE the scan occurs.
9079
9080    This is the /x friendly way of saying RExC_parse++.
9081 */
9082
9083 STATIC char*
9084 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
9085 {
9086     char* const retval = RExC_parse++;
9087
9088     PERL_ARGS_ASSERT_NEXTCHAR;
9089
9090     for (;;) {
9091         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9092                 RExC_parse[2] == '#') {
9093             while (*RExC_parse != ')') {
9094                 if (RExC_parse == RExC_end)
9095                     FAIL("Sequence (?#... not terminated");
9096                 RExC_parse++;
9097             }
9098             RExC_parse++;
9099             continue;
9100         }
9101         if (RExC_flags & RXf_PMf_EXTENDED) {
9102             if (isSPACE(*RExC_parse)) {
9103                 RExC_parse++;
9104                 continue;
9105             }
9106             else if (*RExC_parse == '#') {
9107                 if ( reg_skipcomment( pRExC_state ) )
9108                     continue;
9109             }
9110         }
9111         return retval;
9112     }
9113 }
9114
9115 /*
9116 - reg_node - emit a node
9117 */
9118 STATIC regnode *                        /* Location. */
9119 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
9120 {
9121     dVAR;
9122     register regnode *ptr;
9123     regnode * const ret = RExC_emit;
9124     GET_RE_DEBUG_FLAGS_DECL;
9125
9126     PERL_ARGS_ASSERT_REG_NODE;
9127
9128     if (SIZE_ONLY) {
9129         SIZE_ALIGN(RExC_size);
9130         RExC_size += 1;
9131         return(ret);
9132     }
9133     if (RExC_emit >= RExC_emit_bound)
9134         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9135
9136     NODE_ALIGN_FILL(ret);
9137     ptr = ret;
9138     FILL_ADVANCE_NODE(ptr, op);
9139     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
9140 #ifdef RE_TRACK_PATTERN_OFFSETS
9141     if (RExC_offsets) {         /* MJD */
9142         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
9143               "reg_node", __LINE__,
9144               PL_reg_name[op],
9145               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9146                 ? "Overwriting end of array!\n" : "OK",
9147               (UV)(RExC_emit - RExC_emit_start),
9148               (UV)(RExC_parse - RExC_start),
9149               (UV)RExC_offsets[0]));
9150         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
9151     }
9152 #endif
9153     RExC_emit = ptr;
9154     return(ret);
9155 }
9156
9157 /*
9158 - reganode - emit a node with an argument
9159 */
9160 STATIC regnode *                        /* Location. */
9161 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
9162 {
9163     dVAR;
9164     register regnode *ptr;
9165     regnode * const ret = RExC_emit;
9166     GET_RE_DEBUG_FLAGS_DECL;
9167
9168     PERL_ARGS_ASSERT_REGANODE;
9169
9170     if (SIZE_ONLY) {
9171         SIZE_ALIGN(RExC_size);
9172         RExC_size += 2;
9173         /*
9174            We can't do this:
9175
9176            assert(2==regarglen[op]+1);
9177
9178            Anything larger than this has to allocate the extra amount.
9179            If we changed this to be:
9180
9181            RExC_size += (1 + regarglen[op]);
9182
9183            then it wouldn't matter. Its not clear what side effect
9184            might come from that so its not done so far.
9185            -- dmq
9186         */
9187         return(ret);
9188     }
9189     if (RExC_emit >= RExC_emit_bound)
9190         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9191
9192     NODE_ALIGN_FILL(ret);
9193     ptr = ret;
9194     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
9195     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
9196 #ifdef RE_TRACK_PATTERN_OFFSETS
9197     if (RExC_offsets) {         /* MJD */
9198         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9199               "reganode",
9200               __LINE__,
9201               PL_reg_name[op],
9202               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
9203               "Overwriting end of array!\n" : "OK",
9204               (UV)(RExC_emit - RExC_emit_start),
9205               (UV)(RExC_parse - RExC_start),
9206               (UV)RExC_offsets[0]));
9207         Set_Cur_Node_Offset;
9208     }
9209 #endif
9210     RExC_emit = ptr;
9211     return(ret);
9212 }
9213
9214 /*
9215 - reguni - emit (if appropriate) a Unicode character
9216 */
9217 STATIC STRLEN
9218 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
9219 {
9220     dVAR;
9221
9222     PERL_ARGS_ASSERT_REGUNI;
9223
9224     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
9225 }
9226
9227 /*
9228 - reginsert - insert an operator in front of already-emitted operand
9229 *
9230 * Means relocating the operand.
9231 */
9232 STATIC void
9233 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
9234 {
9235     dVAR;
9236     register regnode *src;
9237     register regnode *dst;
9238     register regnode *place;
9239     const int offset = regarglen[(U8)op];
9240     const int size = NODE_STEP_REGNODE + offset;
9241     GET_RE_DEBUG_FLAGS_DECL;
9242
9243     PERL_ARGS_ASSERT_REGINSERT;
9244     PERL_UNUSED_ARG(depth);
9245 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
9246     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9247     if (SIZE_ONLY) {
9248         RExC_size += size;
9249         return;
9250     }
9251
9252     src = RExC_emit;
9253     RExC_emit += size;
9254     dst = RExC_emit;
9255     if (RExC_open_parens) {
9256         int paren;
9257         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9258         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9259             if ( RExC_open_parens[paren] >= opnd ) {
9260                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9261                 RExC_open_parens[paren] += size;
9262             } else {
9263                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9264             }
9265             if ( RExC_close_parens[paren] >= opnd ) {
9266                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9267                 RExC_close_parens[paren] += size;
9268             } else {
9269                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9270             }
9271         }
9272     }
9273
9274     while (src > opnd) {
9275         StructCopy(--src, --dst, regnode);
9276 #ifdef RE_TRACK_PATTERN_OFFSETS
9277         if (RExC_offsets) {     /* MJD 20010112 */
9278             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9279                   "reg_insert",
9280                   __LINE__,
9281                   PL_reg_name[op],
9282                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9283                     ? "Overwriting end of array!\n" : "OK",
9284                   (UV)(src - RExC_emit_start),
9285                   (UV)(dst - RExC_emit_start),
9286                   (UV)RExC_offsets[0]));
9287             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9288             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9289         }
9290 #endif
9291     }
9292
9293
9294     place = opnd;               /* Op node, where operand used to be. */
9295 #ifdef RE_TRACK_PATTERN_OFFSETS
9296     if (RExC_offsets) {         /* MJD */
9297         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9298               "reginsert",
9299               __LINE__,
9300               PL_reg_name[op],
9301               (UV)(place - RExC_emit_start) > RExC_offsets[0]
9302               ? "Overwriting end of array!\n" : "OK",
9303               (UV)(place - RExC_emit_start),
9304               (UV)(RExC_parse - RExC_start),
9305               (UV)RExC_offsets[0]));
9306         Set_Node_Offset(place, RExC_parse);
9307         Set_Node_Length(place, 1);
9308     }
9309 #endif
9310     src = NEXTOPER(place);
9311     FILL_ADVANCE_NODE(place, op);
9312     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
9313     Zero(src, offset, regnode);
9314 }
9315
9316 /*
9317 - regtail - set the next-pointer at the end of a node chain of p to val.
9318 - SEE ALSO: regtail_study
9319 */
9320 /* TODO: All three parms should be const */
9321 STATIC void
9322 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9323 {
9324     dVAR;
9325     register regnode *scan;
9326     GET_RE_DEBUG_FLAGS_DECL;
9327
9328     PERL_ARGS_ASSERT_REGTAIL;
9329 #ifndef DEBUGGING
9330     PERL_UNUSED_ARG(depth);
9331 #endif
9332
9333     if (SIZE_ONLY)
9334         return;
9335
9336     /* Find last node. */
9337     scan = p;
9338     for (;;) {
9339         regnode * const temp = regnext(scan);
9340         DEBUG_PARSE_r({
9341             SV * const mysv=sv_newmortal();
9342             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9343             regprop(RExC_rx, mysv, scan);
9344             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9345                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9346                     (temp == NULL ? "->" : ""),
9347                     (temp == NULL ? PL_reg_name[OP(val)] : "")
9348             );
9349         });
9350         if (temp == NULL)
9351             break;
9352         scan = temp;
9353     }
9354
9355     if (reg_off_by_arg[OP(scan)]) {
9356         ARG_SET(scan, val - scan);
9357     }
9358     else {
9359         NEXT_OFF(scan) = val - scan;
9360     }
9361 }
9362
9363 #ifdef DEBUGGING
9364 /*
9365 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9366 - Look for optimizable sequences at the same time.
9367 - currently only looks for EXACT chains.
9368
9369 This is expermental code. The idea is to use this routine to perform
9370 in place optimizations on branches and groups as they are constructed,
9371 with the long term intention of removing optimization from study_chunk so
9372 that it is purely analytical.
9373
9374 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9375 to control which is which.
9376
9377 */
9378 /* TODO: All four parms should be const */
9379
9380 STATIC U8
9381 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9382 {
9383     dVAR;
9384     register regnode *scan;
9385     U8 exact = PSEUDO;
9386 #ifdef EXPERIMENTAL_INPLACESCAN
9387     I32 min = 0;
9388 #endif
9389     GET_RE_DEBUG_FLAGS_DECL;
9390
9391     PERL_ARGS_ASSERT_REGTAIL_STUDY;
9392
9393
9394     if (SIZE_ONLY)
9395         return exact;
9396
9397     /* Find last node. */
9398
9399     scan = p;
9400     for (;;) {
9401         regnode * const temp = regnext(scan);
9402 #ifdef EXPERIMENTAL_INPLACESCAN
9403         if (PL_regkind[OP(scan)] == EXACT)
9404             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9405                 return EXACT;
9406 #endif
9407         if ( exact ) {
9408             switch (OP(scan)) {
9409                 case EXACT:
9410                 case EXACTF:
9411                 case EXACTFU:
9412                 case EXACTFL:
9413                         if( exact == PSEUDO )
9414                             exact= OP(scan);
9415                         else if ( exact != OP(scan) )
9416                             exact= 0;
9417                 case NOTHING:
9418                     break;
9419                 default:
9420                     exact= 0;
9421             }
9422         }
9423         DEBUG_PARSE_r({
9424             SV * const mysv=sv_newmortal();
9425             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9426             regprop(RExC_rx, mysv, scan);
9427             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9428                 SvPV_nolen_const(mysv),
9429                 REG_NODE_NUM(scan),
9430                 PL_reg_name[exact]);
9431         });
9432         if (temp == NULL)
9433             break;
9434         scan = temp;
9435     }
9436     DEBUG_PARSE_r({
9437         SV * const mysv_val=sv_newmortal();
9438         DEBUG_PARSE_MSG("");
9439         regprop(RExC_rx, mysv_val, val);
9440         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9441                       SvPV_nolen_const(mysv_val),
9442                       (IV)REG_NODE_NUM(val),
9443                       (IV)(val - scan)
9444         );
9445     });
9446     if (reg_off_by_arg[OP(scan)]) {
9447         ARG_SET(scan, val - scan);
9448     }
9449     else {
9450         NEXT_OFF(scan) = val - scan;
9451     }
9452
9453     return exact;
9454 }
9455 #endif
9456
9457 /*
9458  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9459  */
9460 #ifdef DEBUGGING
9461 static void
9462 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9463 {
9464     int bit;
9465     int set=0;
9466
9467     for (bit=0; bit<32; bit++) {
9468         if (flags & (1<<bit)) {
9469             if (!set++ && lead)
9470                 PerlIO_printf(Perl_debug_log, "%s",lead);
9471             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9472         }
9473     }
9474     if (lead)  {
9475         if (set)
9476             PerlIO_printf(Perl_debug_log, "\n");
9477         else
9478             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9479     }
9480 }
9481 #endif
9482
9483 void
9484 Perl_regdump(pTHX_ const regexp *r)
9485 {
9486 #ifdef DEBUGGING
9487     dVAR;
9488     SV * const sv = sv_newmortal();
9489     SV *dsv= sv_newmortal();
9490     RXi_GET_DECL(r,ri);
9491     GET_RE_DEBUG_FLAGS_DECL;
9492
9493     PERL_ARGS_ASSERT_REGDUMP;
9494
9495     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9496
9497     /* Header fields of interest. */
9498     if (r->anchored_substr) {
9499         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9500             RE_SV_DUMPLEN(r->anchored_substr), 30);
9501         PerlIO_printf(Perl_debug_log,
9502                       "anchored %s%s at %"IVdf" ",
9503                       s, RE_SV_TAIL(r->anchored_substr),
9504                       (IV)r->anchored_offset);
9505     } else if (r->anchored_utf8) {
9506         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9507             RE_SV_DUMPLEN(r->anchored_utf8), 30);
9508         PerlIO_printf(Perl_debug_log,
9509                       "anchored utf8 %s%s at %"IVdf" ",
9510                       s, RE_SV_TAIL(r->anchored_utf8),
9511                       (IV)r->anchored_offset);
9512     }
9513     if (r->float_substr) {
9514         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9515             RE_SV_DUMPLEN(r->float_substr), 30);
9516         PerlIO_printf(Perl_debug_log,
9517                       "floating %s%s at %"IVdf"..%"UVuf" ",
9518                       s, RE_SV_TAIL(r->float_substr),
9519                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9520     } else if (r->float_utf8) {
9521         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9522             RE_SV_DUMPLEN(r->float_utf8), 30);
9523         PerlIO_printf(Perl_debug_log,
9524                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9525                       s, RE_SV_TAIL(r->float_utf8),
9526                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9527     }
9528     if (r->check_substr || r->check_utf8)
9529         PerlIO_printf(Perl_debug_log,
9530                       (const char *)
9531                       (r->check_substr == r->float_substr
9532                        && r->check_utf8 == r->float_utf8
9533                        ? "(checking floating" : "(checking anchored"));
9534     if (r->extflags & RXf_NOSCAN)
9535         PerlIO_printf(Perl_debug_log, " noscan");
9536     if (r->extflags & RXf_CHECK_ALL)
9537         PerlIO_printf(Perl_debug_log, " isall");
9538     if (r->check_substr || r->check_utf8)
9539         PerlIO_printf(Perl_debug_log, ") ");
9540
9541     if (ri->regstclass) {
9542         regprop(r, sv, ri->regstclass);
9543         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9544     }
9545     if (r->extflags & RXf_ANCH) {
9546         PerlIO_printf(Perl_debug_log, "anchored");
9547         if (r->extflags & RXf_ANCH_BOL)
9548             PerlIO_printf(Perl_debug_log, "(BOL)");
9549         if (r->extflags & RXf_ANCH_MBOL)
9550             PerlIO_printf(Perl_debug_log, "(MBOL)");
9551         if (r->extflags & RXf_ANCH_SBOL)
9552             PerlIO_printf(Perl_debug_log, "(SBOL)");
9553         if (r->extflags & RXf_ANCH_GPOS)
9554             PerlIO_printf(Perl_debug_log, "(GPOS)");
9555         PerlIO_putc(Perl_debug_log, ' ');
9556     }
9557     if (r->extflags & RXf_GPOS_SEEN)
9558         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9559     if (r->intflags & PREGf_SKIP)
9560         PerlIO_printf(Perl_debug_log, "plus ");
9561     if (r->intflags & PREGf_IMPLICIT)
9562         PerlIO_printf(Perl_debug_log, "implicit ");
9563     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9564     if (r->extflags & RXf_EVAL_SEEN)
9565         PerlIO_printf(Perl_debug_log, "with eval ");
9566     PerlIO_printf(Perl_debug_log, "\n");
9567     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9568 #else
9569     PERL_ARGS_ASSERT_REGDUMP;
9570     PERL_UNUSED_CONTEXT;
9571     PERL_UNUSED_ARG(r);
9572 #endif  /* DEBUGGING */
9573 }
9574
9575 /*
9576 - regprop - printable representation of opcode
9577 */
9578 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9579 STMT_START { \
9580         if (do_sep) {                           \
9581             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9582             if (flags & ANYOF_INVERT)           \
9583                 /*make sure the invert info is in each */ \
9584                 sv_catpvs(sv, "^");             \
9585             do_sep = 0;                         \
9586         }                                       \
9587 } STMT_END
9588
9589 void
9590 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9591 {
9592 #ifdef DEBUGGING
9593     dVAR;
9594     register int k;
9595     RXi_GET_DECL(prog,progi);
9596     GET_RE_DEBUG_FLAGS_DECL;
9597
9598     PERL_ARGS_ASSERT_REGPROP;
9599
9600     sv_setpvs(sv, "");
9601
9602     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9603         /* It would be nice to FAIL() here, but this may be called from
9604            regexec.c, and it would be hard to supply pRExC_state. */
9605         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9606     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9607
9608     k = PL_regkind[OP(o)];
9609
9610     if (k == EXACT) {
9611         sv_catpvs(sv, " ");
9612         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9613          * is a crude hack but it may be the best for now since
9614          * we have no flag "this EXACTish node was UTF-8"
9615          * --jhi */
9616         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9617                   PERL_PV_ESCAPE_UNI_DETECT |
9618                   PERL_PV_PRETTY_ELLIPSES   |
9619                   PERL_PV_PRETTY_LTGT       |
9620                   PERL_PV_PRETTY_NOCLEAR
9621                   );
9622     } else if (k == TRIE) {
9623         /* print the details of the trie in dumpuntil instead, as
9624          * progi->data isn't available here */
9625         const char op = OP(o);
9626         const U32 n = ARG(o);
9627         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9628                (reg_ac_data *)progi->data->data[n] :
9629                NULL;
9630         const reg_trie_data * const trie
9631             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9632
9633         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9634         DEBUG_TRIE_COMPILE_r(
9635             Perl_sv_catpvf(aTHX_ sv,
9636                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9637                 (UV)trie->startstate,
9638                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9639                 (UV)trie->wordcount,
9640                 (UV)trie->minlen,
9641                 (UV)trie->maxlen,
9642                 (UV)TRIE_CHARCOUNT(trie),
9643                 (UV)trie->uniquecharcount
9644             )
9645         );
9646         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9647             int i;
9648             int rangestart = -1;
9649             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9650             sv_catpvs(sv, "[");
9651             for (i = 0; i <= 256; i++) {
9652                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9653                     if (rangestart == -1)
9654                         rangestart = i;
9655                 } else if (rangestart != -1) {
9656                     if (i <= rangestart + 3)
9657                         for (; rangestart < i; rangestart++)
9658                             put_byte(sv, rangestart);
9659                     else {
9660                         put_byte(sv, rangestart);
9661                         sv_catpvs(sv, "-");
9662                         put_byte(sv, i - 1);
9663                     }
9664                     rangestart = -1;
9665                 }
9666             }
9667             sv_catpvs(sv, "]");
9668         }
9669
9670     } else if (k == CURLY) {
9671         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9672             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9673         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9674     }
9675     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9676         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9677     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9678         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9679         if ( RXp_PAREN_NAMES(prog) ) {
9680             if ( k != REF || (OP(o) < NREF)) {
9681                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9682                 SV **name= av_fetch(list, ARG(o), 0 );
9683                 if (name)
9684                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9685             }
9686             else {
9687                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9688                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9689                 I32 *nums=(I32*)SvPVX(sv_dat);
9690                 SV **name= av_fetch(list, nums[0], 0 );
9691                 I32 n;
9692                 if (name) {
9693                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9694                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9695                                     (n ? "," : ""), (IV)nums[n]);
9696                     }
9697                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9698                 }
9699             }
9700         }
9701     } else if (k == GOSUB)
9702         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9703     else if (k == VERB) {
9704         if (!o->flags)
9705             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9706                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9707     } else if (k == LOGICAL)
9708         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9709     else if (k == FOLDCHAR)
9710         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9711     else if (k == ANYOF) {
9712         int i, rangestart = -1;
9713         const U8 flags = ANYOF_FLAGS(o);
9714         int do_sep = 0;
9715
9716         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9717         static const char * const anyofs[] = {
9718             "\\w",
9719             "\\W",
9720             "\\s",
9721             "\\S",
9722             "\\d",
9723             "\\D",
9724             "[:alnum:]",
9725             "[:^alnum:]",
9726             "[:alpha:]",
9727             "[:^alpha:]",
9728             "[:ascii:]",
9729             "[:^ascii:]",
9730             "[:cntrl:]",
9731             "[:^cntrl:]",
9732             "[:graph:]",
9733             "[:^graph:]",
9734             "[:lower:]",
9735             "[:^lower:]",
9736             "[:print:]",
9737             "[:^print:]",
9738             "[:punct:]",
9739             "[:^punct:]",
9740             "[:upper:]",
9741             "[:^upper:]",
9742             "[:xdigit:]",
9743             "[:^xdigit:]",
9744             "[:space:]",
9745             "[:^space:]",
9746             "[:blank:]",
9747             "[:^blank:]"
9748         };
9749
9750         if (flags & ANYOF_LOCALE)
9751             sv_catpvs(sv, "{loc}");
9752         if (flags & ANYOF_FOLD)
9753             sv_catpvs(sv, "{i}");
9754         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9755         if (flags & ANYOF_INVERT)
9756             sv_catpvs(sv, "^");
9757
9758         /* output what the standard cp 0-255 bitmap matches */
9759         for (i = 0; i <= 256; i++) {
9760             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9761                 if (rangestart == -1)
9762                     rangestart = i;
9763             } else if (rangestart != -1) {
9764                 if (i <= rangestart + 3)
9765                     for (; rangestart < i; rangestart++)
9766                         put_byte(sv, rangestart);
9767                 else {
9768                     put_byte(sv, rangestart);
9769                     sv_catpvs(sv, "-");
9770                     put_byte(sv, i - 1);
9771                 }
9772                 do_sep = 1;
9773                 rangestart = -1;
9774             }
9775         }
9776
9777         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9778         /* output any special charclass tests (used entirely under use locale) */
9779         if (ANYOF_CLASS_TEST_ANY_SET(o))
9780             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9781                 if (ANYOF_CLASS_TEST(o,i)) {
9782                     sv_catpv(sv, anyofs[i]);
9783                     do_sep = 1;
9784                 }
9785
9786         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9787
9788         /* output information about the unicode matching */
9789         if (flags & ANYOF_UNICODE_ALL)
9790             sv_catpvs(sv, "{unicode_all}");
9791         else if (flags & ANYOF_UTF8)
9792             sv_catpvs(sv, "{unicode}");
9793         if (flags & ANYOF_NONBITMAP_NON_UTF8)
9794             sv_catpvs(sv, "{outside bitmap}");
9795
9796         {
9797             SV *lv;
9798             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9799
9800             if (lv) {
9801                 if (sw) {
9802                     U8 s[UTF8_MAXBYTES_CASE+1];
9803
9804                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9805                         uvchr_to_utf8(s, i);
9806
9807                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9808                             if (rangestart == -1)
9809                                 rangestart = i;
9810                         } else if (rangestart != -1) {
9811                             if (i <= rangestart + 3)
9812                                 for (; rangestart < i; rangestart++) {
9813                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9814                                     U8 *p;
9815                                     for(p = s; p < e; p++)
9816                                         put_byte(sv, *p);
9817                                 }
9818                             else {
9819                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9820                                 U8 *p;
9821                                 for (p = s; p < e; p++)
9822                                     put_byte(sv, *p);
9823                                 sv_catpvs(sv, "-");
9824                                 e = uvchr_to_utf8(s, i-1);
9825                                 for (p = s; p < e; p++)
9826                                     put_byte(sv, *p);
9827                                 }
9828                                 rangestart = -1;
9829                             }
9830                         }
9831
9832                     sv_catpvs(sv, "..."); /* et cetera */
9833                 }
9834
9835                 {
9836                     char *s = savesvpv(lv);
9837                     char * const origs = s;
9838
9839                     while (*s && *s != '\n')
9840                         s++;
9841
9842                     if (*s == '\n') {
9843                         const char * const t = ++s;
9844
9845                         while (*s) {
9846                             if (*s == '\n')
9847                                 *s = ' ';
9848                             s++;
9849                         }
9850                         if (s[-1] == ' ')
9851                             s[-1] = 0;
9852
9853                         sv_catpv(sv, t);
9854                     }
9855
9856                     Safefree(origs);
9857                 }
9858             }
9859         }
9860
9861         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9862     }
9863     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9864         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9865 #else
9866     PERL_UNUSED_CONTEXT;
9867     PERL_UNUSED_ARG(sv);
9868     PERL_UNUSED_ARG(o);
9869     PERL_UNUSED_ARG(prog);
9870 #endif  /* DEBUGGING */
9871 }
9872
9873 SV *
9874 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9875 {                               /* Assume that RE_INTUIT is set */
9876     dVAR;
9877     struct regexp *const prog = (struct regexp *)SvANY(r);
9878     GET_RE_DEBUG_FLAGS_DECL;
9879
9880     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9881     PERL_UNUSED_CONTEXT;
9882
9883     DEBUG_COMPILE_r(
9884         {
9885             const char * const s = SvPV_nolen_const(prog->check_substr
9886                       ? prog->check_substr : prog->check_utf8);
9887
9888             if (!PL_colorset) reginitcolors();
9889             PerlIO_printf(Perl_debug_log,
9890                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9891                       PL_colors[4],
9892                       prog->check_substr ? "" : "utf8 ",
9893                       PL_colors[5],PL_colors[0],
9894                       s,
9895                       PL_colors[1],
9896                       (strlen(s) > 60 ? "..." : ""));
9897         } );
9898
9899     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9900 }
9901
9902 /*
9903    pregfree()
9904
9905    handles refcounting and freeing the perl core regexp structure. When
9906    it is necessary to actually free the structure the first thing it
9907    does is call the 'free' method of the regexp_engine associated to
9908    the regexp, allowing the handling of the void *pprivate; member
9909    first. (This routine is not overridable by extensions, which is why
9910    the extensions free is called first.)
9911
9912    See regdupe and regdupe_internal if you change anything here.
9913 */
9914 #ifndef PERL_IN_XSUB_RE
9915 void
9916 Perl_pregfree(pTHX_ REGEXP *r)
9917 {
9918     SvREFCNT_dec(r);
9919 }
9920
9921 void
9922 Perl_pregfree2(pTHX_ REGEXP *rx)
9923 {
9924     dVAR;
9925     struct regexp *const r = (struct regexp *)SvANY(rx);
9926     GET_RE_DEBUG_FLAGS_DECL;
9927
9928     PERL_ARGS_ASSERT_PREGFREE2;
9929
9930     if (r->mother_re) {
9931         ReREFCNT_dec(r->mother_re);
9932     } else {
9933         CALLREGFREE_PVT(rx); /* free the private data */
9934         SvREFCNT_dec(RXp_PAREN_NAMES(r));
9935     }
9936     if (r->substrs) {
9937         SvREFCNT_dec(r->anchored_substr);
9938         SvREFCNT_dec(r->anchored_utf8);
9939         SvREFCNT_dec(r->float_substr);
9940         SvREFCNT_dec(r->float_utf8);
9941         Safefree(r->substrs);
9942     }
9943     RX_MATCH_COPY_FREE(rx);
9944 #ifdef PERL_OLD_COPY_ON_WRITE
9945     SvREFCNT_dec(r->saved_copy);
9946 #endif
9947     Safefree(r->offs);
9948 }
9949
9950 /*  reg_temp_copy()
9951
9952     This is a hacky workaround to the structural issue of match results
9953     being stored in the regexp structure which is in turn stored in
9954     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9955     could be PL_curpm in multiple contexts, and could require multiple
9956     result sets being associated with the pattern simultaneously, such
9957     as when doing a recursive match with (??{$qr})
9958
9959     The solution is to make a lightweight copy of the regexp structure
9960     when a qr// is returned from the code executed by (??{$qr}) this
9961     lightweight copy doesnt actually own any of its data except for
9962     the starp/end and the actual regexp structure itself.
9963
9964 */
9965
9966
9967 REGEXP *
9968 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9969 {
9970     struct regexp *ret;
9971     struct regexp *const r = (struct regexp *)SvANY(rx);
9972     register const I32 npar = r->nparens+1;
9973
9974     PERL_ARGS_ASSERT_REG_TEMP_COPY;
9975
9976     if (!ret_x)
9977         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9978     ret = (struct regexp *)SvANY(ret_x);
9979
9980     (void)ReREFCNT_inc(rx);
9981     /* We can take advantage of the existing "copied buffer" mechanism in SVs
9982        by pointing directly at the buffer, but flagging that the allocated
9983        space in the copy is zero. As we've just done a struct copy, it's now
9984        a case of zero-ing that, rather than copying the current length.  */
9985     SvPV_set(ret_x, RX_WRAPPED(rx));
9986     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9987     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9988            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9989     SvLEN_set(ret_x, 0);
9990     SvSTASH_set(ret_x, NULL);
9991     SvMAGIC_set(ret_x, NULL);
9992     Newx(ret->offs, npar, regexp_paren_pair);
9993     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9994     if (r->substrs) {
9995         Newx(ret->substrs, 1, struct reg_substr_data);
9996         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9997
9998         SvREFCNT_inc_void(ret->anchored_substr);
9999         SvREFCNT_inc_void(ret->anchored_utf8);
10000         SvREFCNT_inc_void(ret->float_substr);
10001         SvREFCNT_inc_void(ret->float_utf8);
10002
10003         /* check_substr and check_utf8, if non-NULL, point to either their
10004            anchored or float namesakes, and don't hold a second reference.  */
10005     }
10006     RX_MATCH_COPIED_off(ret_x);
10007 #ifdef PERL_OLD_COPY_ON_WRITE
10008     ret->saved_copy = NULL;
10009 #endif
10010     ret->mother_re = rx;
10011
10012     return ret_x;
10013 }
10014 #endif
10015
10016 /* regfree_internal()
10017
10018    Free the private data in a regexp. This is overloadable by
10019    extensions. Perl takes care of the regexp structure in pregfree(),
10020    this covers the *pprivate pointer which technically perl doesn't
10021    know about, however of course we have to handle the
10022    regexp_internal structure when no extension is in use.
10023
10024    Note this is called before freeing anything in the regexp
10025    structure.
10026  */
10027
10028 void
10029 Perl_regfree_internal(pTHX_ REGEXP * const rx)
10030 {
10031     dVAR;
10032     struct regexp *const r = (struct regexp *)SvANY(rx);
10033     RXi_GET_DECL(r,ri);
10034     GET_RE_DEBUG_FLAGS_DECL;
10035
10036     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
10037
10038     DEBUG_COMPILE_r({
10039         if (!PL_colorset)
10040             reginitcolors();
10041         {
10042             SV *dsv= sv_newmortal();
10043             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
10044                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
10045             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
10046                 PL_colors[4],PL_colors[5],s);
10047         }
10048     });
10049 #ifdef RE_TRACK_PATTERN_OFFSETS
10050     if (ri->u.offsets)
10051         Safefree(ri->u.offsets);             /* 20010421 MJD */
10052 #endif
10053     if (ri->data) {
10054         int n = ri->data->count;
10055         PAD* new_comppad = NULL;
10056         PAD* old_comppad;
10057         PADOFFSET refcnt;
10058
10059         while (--n >= 0) {
10060           /* If you add a ->what type here, update the comment in regcomp.h */
10061             switch (ri->data->what[n]) {
10062             case 'a':
10063             case 's':
10064             case 'S':
10065             case 'u':
10066                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
10067                 break;
10068             case 'f':
10069                 Safefree(ri->data->data[n]);
10070                 break;
10071             case 'p':
10072                 new_comppad = MUTABLE_AV(ri->data->data[n]);
10073                 break;
10074             case 'o':
10075                 if (new_comppad == NULL)
10076                     Perl_croak(aTHX_ "panic: pregfree comppad");
10077                 PAD_SAVE_LOCAL(old_comppad,
10078                     /* Watch out for global destruction's random ordering. */
10079                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
10080                 );
10081                 OP_REFCNT_LOCK;
10082                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
10083                 OP_REFCNT_UNLOCK;
10084                 if (!refcnt)
10085                     op_free((OP_4tree*)ri->data->data[n]);
10086
10087                 PAD_RESTORE_LOCAL(old_comppad);
10088                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
10089                 new_comppad = NULL;
10090                 break;
10091             case 'n':
10092                 break;
10093             case 'T':
10094                 { /* Aho Corasick add-on structure for a trie node.
10095                      Used in stclass optimization only */
10096                     U32 refcount;
10097                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
10098                     OP_REFCNT_LOCK;
10099                     refcount = --aho->refcount;
10100                     OP_REFCNT_UNLOCK;
10101                     if ( !refcount ) {
10102                         PerlMemShared_free(aho->states);
10103                         PerlMemShared_free(aho->fail);
10104                          /* do this last!!!! */
10105                         PerlMemShared_free(ri->data->data[n]);
10106                         PerlMemShared_free(ri->regstclass);
10107                     }
10108                 }
10109                 break;
10110             case 't':
10111                 {
10112                     /* trie structure. */
10113                     U32 refcount;
10114                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
10115                     OP_REFCNT_LOCK;
10116                     refcount = --trie->refcount;
10117                     OP_REFCNT_UNLOCK;
10118                     if ( !refcount ) {
10119                         PerlMemShared_free(trie->charmap);
10120                         PerlMemShared_free(trie->states);
10121                         PerlMemShared_free(trie->trans);
10122                         if (trie->bitmap)
10123                             PerlMemShared_free(trie->bitmap);
10124                         if (trie->jump)
10125                             PerlMemShared_free(trie->jump);
10126                         PerlMemShared_free(trie->wordinfo);
10127                         /* do this last!!!! */
10128                         PerlMemShared_free(ri->data->data[n]);
10129                     }
10130                 }
10131                 break;
10132             default:
10133                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
10134             }
10135         }
10136         Safefree(ri->data->what);
10137         Safefree(ri->data);
10138     }
10139
10140     Safefree(ri);
10141 }
10142
10143 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10144 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
10145 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
10146
10147 /*
10148    re_dup - duplicate a regexp.
10149
10150    This routine is expected to clone a given regexp structure. It is only
10151    compiled under USE_ITHREADS.
10152
10153    After all of the core data stored in struct regexp is duplicated
10154    the regexp_engine.dupe method is used to copy any private data
10155    stored in the *pprivate pointer. This allows extensions to handle
10156    any duplication it needs to do.
10157
10158    See pregfree() and regfree_internal() if you change anything here.
10159 */
10160 #if defined(USE_ITHREADS)
10161 #ifndef PERL_IN_XSUB_RE
10162 void
10163 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
10164 {
10165     dVAR;
10166     I32 npar;
10167     const struct regexp *r = (const struct regexp *)SvANY(sstr);
10168     struct regexp *ret = (struct regexp *)SvANY(dstr);
10169
10170     PERL_ARGS_ASSERT_RE_DUP_GUTS;
10171
10172     npar = r->nparens+1;
10173     Newx(ret->offs, npar, regexp_paren_pair);
10174     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10175     if(ret->swap) {
10176         /* no need to copy these */
10177         Newx(ret->swap, npar, regexp_paren_pair);
10178     }
10179
10180     if (ret->substrs) {
10181         /* Do it this way to avoid reading from *r after the StructCopy().
10182            That way, if any of the sv_dup_inc()s dislodge *r from the L1
10183            cache, it doesn't matter.  */
10184         const bool anchored = r->check_substr
10185             ? r->check_substr == r->anchored_substr
10186             : r->check_utf8 == r->anchored_utf8;
10187         Newx(ret->substrs, 1, struct reg_substr_data);
10188         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10189
10190         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10191         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10192         ret->float_substr = sv_dup_inc(ret->float_substr, param);
10193         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
10194
10195         /* check_substr and check_utf8, if non-NULL, point to either their
10196            anchored or float namesakes, and don't hold a second reference.  */
10197
10198         if (ret->check_substr) {
10199             if (anchored) {
10200                 assert(r->check_utf8 == r->anchored_utf8);
10201                 ret->check_substr = ret->anchored_substr;
10202                 ret->check_utf8 = ret->anchored_utf8;
10203             } else {
10204                 assert(r->check_substr == r->float_substr);
10205                 assert(r->check_utf8 == r->float_utf8);
10206                 ret->check_substr = ret->float_substr;
10207                 ret->check_utf8 = ret->float_utf8;
10208             }
10209         } else if (ret->check_utf8) {
10210             if (anchored) {
10211                 ret->check_utf8 = ret->anchored_utf8;
10212             } else {
10213                 ret->check_utf8 = ret->float_utf8;
10214             }
10215         }
10216     }
10217
10218     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
10219
10220     if (ret->pprivate)
10221         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
10222
10223     if (RX_MATCH_COPIED(dstr))
10224         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
10225     else
10226         ret->subbeg = NULL;
10227 #ifdef PERL_OLD_COPY_ON_WRITE
10228     ret->saved_copy = NULL;
10229 #endif
10230
10231     if (ret->mother_re) {
10232         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10233             /* Our storage points directly to our mother regexp, but that's
10234                1: a buffer in a different thread
10235                2: something we no longer hold a reference on
10236                so we need to copy it locally.  */
10237             /* Note we need to sue SvCUR() on our mother_re, because it, in
10238                turn, may well be pointing to its own mother_re.  */
10239             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10240                                    SvCUR(ret->mother_re)+1));
10241             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10242         }
10243         ret->mother_re      = NULL;
10244     }
10245     ret->gofs = 0;
10246 }
10247 #endif /* PERL_IN_XSUB_RE */
10248
10249 /*
10250    regdupe_internal()
10251
10252    This is the internal complement to regdupe() which is used to copy
10253    the structure pointed to by the *pprivate pointer in the regexp.
10254    This is the core version of the extension overridable cloning hook.
10255    The regexp structure being duplicated will be copied by perl prior
10256    to this and will be provided as the regexp *r argument, however
10257    with the /old/ structures pprivate pointer value. Thus this routine
10258    may override any copying normally done by perl.
10259
10260    It returns a pointer to the new regexp_internal structure.
10261 */
10262
10263 void *
10264 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10265 {
10266     dVAR;
10267     struct regexp *const r = (struct regexp *)SvANY(rx);
10268     regexp_internal *reti;
10269     int len, npar;
10270     RXi_GET_DECL(r,ri);
10271
10272     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10273
10274     npar = r->nparens+1;
10275     len = ProgLen(ri);
10276
10277     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10278     Copy(ri->program, reti->program, len+1, regnode);
10279
10280
10281     reti->regstclass = NULL;
10282
10283     if (ri->data) {
10284         struct reg_data *d;
10285         const int count = ri->data->count;
10286         int i;
10287
10288         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10289                 char, struct reg_data);
10290         Newx(d->what, count, U8);
10291
10292         d->count = count;
10293         for (i = 0; i < count; i++) {
10294             d->what[i] = ri->data->what[i];
10295             switch (d->what[i]) {
10296                 /* legal options are one of: sSfpontTua
10297                    see also regcomp.h and pregfree() */
10298             case 'a': /* actually an AV, but the dup function is identical.  */
10299             case 's':
10300             case 'S':
10301             case 'p': /* actually an AV, but the dup function is identical.  */
10302             case 'u': /* actually an HV, but the dup function is identical.  */
10303                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10304                 break;
10305             case 'f':
10306                 /* This is cheating. */
10307                 Newx(d->data[i], 1, struct regnode_charclass_class);
10308                 StructCopy(ri->data->data[i], d->data[i],
10309                             struct regnode_charclass_class);
10310                 reti->regstclass = (regnode*)d->data[i];
10311                 break;
10312             case 'o':
10313                 /* Compiled op trees are readonly and in shared memory,
10314                    and can thus be shared without duplication. */
10315                 OP_REFCNT_LOCK;
10316                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10317                 OP_REFCNT_UNLOCK;
10318                 break;
10319             case 'T':
10320                 /* Trie stclasses are readonly and can thus be shared
10321                  * without duplication. We free the stclass in pregfree
10322                  * when the corresponding reg_ac_data struct is freed.
10323                  */
10324                 reti->regstclass= ri->regstclass;
10325                 /* Fall through */
10326             case 't':
10327                 OP_REFCNT_LOCK;
10328                 ((reg_trie_data*)ri->data->data[i])->refcount++;
10329                 OP_REFCNT_UNLOCK;
10330                 /* Fall through */
10331             case 'n':
10332                 d->data[i] = ri->data->data[i];
10333                 break;
10334             default:
10335                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10336             }
10337         }
10338
10339         reti->data = d;
10340     }
10341     else
10342         reti->data = NULL;
10343
10344     reti->name_list_idx = ri->name_list_idx;
10345
10346 #ifdef RE_TRACK_PATTERN_OFFSETS
10347     if (ri->u.offsets) {
10348         Newx(reti->u.offsets, 2*len+1, U32);
10349         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10350     }
10351 #else
10352     SetProgLen(reti,len);
10353 #endif
10354
10355     return (void*)reti;
10356 }
10357
10358 #endif    /* USE_ITHREADS */
10359
10360 #ifndef PERL_IN_XSUB_RE
10361
10362 /*
10363  - regnext - dig the "next" pointer out of a node
10364  */
10365 regnode *
10366 Perl_regnext(pTHX_ register regnode *p)
10367 {
10368     dVAR;
10369     register I32 offset;
10370
10371     if (!p)
10372         return(NULL);
10373
10374     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
10375         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10376     }
10377
10378     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10379     if (offset == 0)
10380         return(NULL);
10381
10382     return(p+offset);
10383 }
10384 #endif
10385
10386 STATIC void
10387 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10388 {
10389     va_list args;
10390     STRLEN l1 = strlen(pat1);
10391     STRLEN l2 = strlen(pat2);
10392     char buf[512];
10393     SV *msv;
10394     const char *message;
10395
10396     PERL_ARGS_ASSERT_RE_CROAK2;
10397
10398     if (l1 > 510)
10399         l1 = 510;
10400     if (l1 + l2 > 510)
10401         l2 = 510 - l1;
10402     Copy(pat1, buf, l1 , char);
10403     Copy(pat2, buf + l1, l2 , char);
10404     buf[l1 + l2] = '\n';
10405     buf[l1 + l2 + 1] = '\0';
10406 #ifdef I_STDARG
10407     /* ANSI variant takes additional second argument */
10408     va_start(args, pat2);
10409 #else
10410     va_start(args);
10411 #endif
10412     msv = vmess(buf, &args);
10413     va_end(args);
10414     message = SvPV_const(msv,l1);
10415     if (l1 > 512)
10416         l1 = 512;
10417     Copy(message, buf, l1 , char);
10418     buf[l1-1] = '\0';                   /* Overwrite \n */
10419     Perl_croak(aTHX_ "%s", buf);
10420 }
10421
10422 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
10423
10424 #ifndef PERL_IN_XSUB_RE
10425 void
10426 Perl_save_re_context(pTHX)
10427 {
10428     dVAR;
10429
10430     struct re_save_state *state;
10431
10432     SAVEVPTR(PL_curcop);
10433     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10434
10435     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10436     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10437     SSPUSHUV(SAVEt_RE_STATE);
10438
10439     Copy(&PL_reg_state, state, 1, struct re_save_state);
10440
10441     PL_reg_start_tmp = 0;
10442     PL_reg_start_tmpl = 0;
10443     PL_reg_oldsaved = NULL;
10444     PL_reg_oldsavedlen = 0;
10445     PL_reg_maxiter = 0;
10446     PL_reg_leftiter = 0;
10447     PL_reg_poscache = NULL;
10448     PL_reg_poscache_size = 0;
10449 #ifdef PERL_OLD_COPY_ON_WRITE
10450     PL_nrs = NULL;
10451 #endif
10452
10453     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10454     if (PL_curpm) {
10455         const REGEXP * const rx = PM_GETRE(PL_curpm);
10456         if (rx) {
10457             U32 i;
10458             for (i = 1; i <= RX_NPARENS(rx); i++) {
10459                 char digits[TYPE_CHARS(long)];
10460                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10461                 GV *const *const gvp
10462                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10463
10464                 if (gvp) {
10465                     GV * const gv = *gvp;
10466                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10467                         save_scalar(gv);
10468                 }
10469             }
10470         }
10471     }
10472 }
10473 #endif
10474
10475 static void
10476 clear_re(pTHX_ void *r)
10477 {
10478     dVAR;
10479     ReREFCNT_dec((REGEXP *)r);
10480 }
10481
10482 #ifdef DEBUGGING
10483
10484 STATIC void
10485 S_put_byte(pTHX_ SV *sv, int c)
10486 {
10487     PERL_ARGS_ASSERT_PUT_BYTE;
10488
10489     /* Our definition of isPRINT() ignores locales, so only bytes that are
10490        not part of UTF-8 are considered printable. I assume that the same
10491        holds for UTF-EBCDIC.
10492        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10493        which Wikipedia says:
10494
10495        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10496        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10497        identical, to the ASCII delete (DEL) or rubout control character.
10498        ) So the old condition can be simplified to !isPRINT(c)  */
10499     if (!isPRINT(c)) {
10500         if (c < 256) {
10501             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10502         }
10503         else {
10504             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10505         }
10506     }
10507     else {
10508         const char string = c;
10509         if (c == '-' || c == ']' || c == '\\' || c == '^')
10510             sv_catpvs(sv, "\\");
10511         sv_catpvn(sv, &string, 1);
10512     }
10513 }
10514
10515
10516 #define CLEAR_OPTSTART \
10517     if (optstart) STMT_START { \
10518             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10519             optstart=NULL; \
10520     } STMT_END
10521
10522 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10523
10524 STATIC const regnode *
10525 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10526             const regnode *last, const regnode *plast,
10527             SV* sv, I32 indent, U32 depth)
10528 {
10529     dVAR;
10530     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
10531     register const regnode *next;
10532     const regnode *optstart= NULL;
10533
10534     RXi_GET_DECL(r,ri);
10535     GET_RE_DEBUG_FLAGS_DECL;
10536
10537     PERL_ARGS_ASSERT_DUMPUNTIL;
10538
10539 #ifdef DEBUG_DUMPUNTIL
10540     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10541         last ? last-start : 0,plast ? plast-start : 0);
10542 #endif
10543
10544     if (plast && plast < last)
10545         last= plast;
10546
10547     while (PL_regkind[op] != END && (!last || node < last)) {
10548         /* While that wasn't END last time... */
10549         NODE_ALIGN(node);
10550         op = OP(node);
10551         if (op == CLOSE || op == WHILEM)
10552             indent--;
10553         next = regnext((regnode *)node);
10554
10555         /* Where, what. */
10556         if (OP(node) == OPTIMIZED) {
10557             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10558                 optstart = node;
10559             else
10560                 goto after_print;
10561         } else
10562             CLEAR_OPTSTART;
10563
10564         regprop(r, sv, node);
10565         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10566                       (int)(2*indent + 1), "", SvPVX_const(sv));
10567
10568         if (OP(node) != OPTIMIZED) {
10569             if (next == NULL)           /* Next ptr. */
10570                 PerlIO_printf(Perl_debug_log, " (0)");
10571             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10572                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10573             else
10574                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10575             (void)PerlIO_putc(Perl_debug_log, '\n');
10576         }
10577
10578       after_print:
10579         if (PL_regkind[(U8)op] == BRANCHJ) {
10580             assert(next);
10581             {
10582                 register const regnode *nnode = (OP(next) == LONGJMP
10583                                              ? regnext((regnode *)next)
10584                                              : next);
10585                 if (last && nnode > last)
10586                     nnode = last;
10587                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10588             }
10589         }
10590         else if (PL_regkind[(U8)op] == BRANCH) {
10591             assert(next);
10592             DUMPUNTIL(NEXTOPER(node), next);
10593         }
10594         else if ( PL_regkind[(U8)op]  == TRIE ) {
10595             const regnode *this_trie = node;
10596             const char op = OP(node);
10597             const U32 n = ARG(node);
10598             const reg_ac_data * const ac = op>=AHOCORASICK ?
10599                (reg_ac_data *)ri->data->data[n] :
10600                NULL;
10601             const reg_trie_data * const trie =
10602                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10603 #ifdef DEBUGGING
10604             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10605 #endif
10606             const regnode *nextbranch= NULL;
10607             I32 word_idx;
10608             sv_setpvs(sv, "");
10609             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10610                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10611
10612                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10613                    (int)(2*(indent+3)), "",
10614                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10615                             PL_colors[0], PL_colors[1],
10616                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10617                             PERL_PV_PRETTY_ELLIPSES    |
10618                             PERL_PV_PRETTY_LTGT
10619                             )
10620                             : "???"
10621                 );
10622                 if (trie->jump) {
10623                     U16 dist= trie->jump[word_idx+1];
10624                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10625                                   (UV)((dist ? this_trie + dist : next) - start));
10626                     if (dist) {
10627                         if (!nextbranch)
10628                             nextbranch= this_trie + trie->jump[0];
10629                         DUMPUNTIL(this_trie + dist, nextbranch);
10630                     }
10631                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10632                         nextbranch= regnext((regnode *)nextbranch);
10633                 } else {
10634                     PerlIO_printf(Perl_debug_log, "\n");
10635                 }
10636             }
10637             if (last && next > last)
10638                 node= last;
10639             else
10640                 node= next;
10641         }
10642         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10643             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10644                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10645         }
10646         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10647             assert(next);
10648             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10649         }
10650         else if ( op == PLUS || op == STAR) {
10651             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10652         }
10653         else if (op == ANYOF) {
10654             /* arglen 1 + class block */
10655             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
10656                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10657             node = NEXTOPER(node);
10658         }
10659         else if (PL_regkind[(U8)op] == EXACT) {
10660             /* Literal string, where present. */
10661             node += NODE_SZ_STR(node) - 1;
10662             node = NEXTOPER(node);
10663         }
10664         else {
10665             node = NEXTOPER(node);
10666             node += regarglen[(U8)op];
10667         }
10668         if (op == CURLYX || op == OPEN)
10669             indent++;
10670     }
10671     CLEAR_OPTSTART;
10672 #ifdef DEBUG_DUMPUNTIL
10673     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10674 #endif
10675     return node;
10676 }
10677
10678 #endif  /* DEBUGGING */
10679
10680 /*
10681  * Local variables:
10682  * c-indentation-style: bsd
10683  * c-basic-offset: 4
10684  * indent-tabs-mode: t
10685  * End:
10686  *
10687  * ex: set ts=8 sts=4 sw=4 noet:
10688  */