src/5013008/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     HV          *paren_names;           /* Paren names */
 138
 139     regnode     **recurse;              /* Recurse regops */
 140     I32         recurse_count;          /* Number of recurse regops */
 141 #if ADD_TO_REGEXEC
 142     char        *starttry;              /* -Dr: where regtry was called. */
 143 #define RExC_starttry   (pRExC_state->starttry)
 144 #endif
 145 #ifdef DEBUGGING
 146     const char  *lastparse;
 147     I32         lastnum;
 148     AV          *paren_name_list;       /* idx -> name */
 149 #define RExC_lastparse  (pRExC_state->lastparse)
 150 #define RExC_lastnum    (pRExC_state->lastnum)
 151 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 152 #endif
 153 } RExC_state_t;
 154
 155 #define RExC_flags      (pRExC_state->flags)
 156 #define RExC_precomp    (pRExC_state->precomp)
 157 #define RExC_rx_sv      (pRExC_state->rx_sv)
 158 #define RExC_rx         (pRExC_state->rx)
 159 #define RExC_rxi        (pRExC_state->rxi)
 160 #define RExC_start      (pRExC_state->start)
 161 #define RExC_end        (pRExC_state->end)
 162 #define RExC_parse      (pRExC_state->parse)
 163 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 164 #ifdef RE_TRACK_PATTERN_OFFSETS
 165 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 166 #endif
 167 #define RExC_emit       (pRExC_state->emit)
 168 #define RExC_emit_start (pRExC_state->emit_start)
 169 #define RExC_emit_bound (pRExC_state->emit_bound)
 170 #define RExC_naughty    (pRExC_state->naughty)
 171 #define RExC_sawback    (pRExC_state->sawback)
 172 #define RExC_seen       (pRExC_state->seen)
 173 #define RExC_size       (pRExC_state->size)
 174 #define RExC_npar       (pRExC_state->npar)
 175 #define RExC_nestroot   (pRExC_state->nestroot)
 176 #define RExC_extralen   (pRExC_state->extralen)
 177 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 178 #define RExC_seen_evals (pRExC_state->seen_evals)
 179 #define RExC_utf8       (pRExC_state->utf8)
 180 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 181 #define RExC_open_parens        (pRExC_state->open_parens)
 182 #define RExC_close_parens       (pRExC_state->close_parens)
 183 #define RExC_opend      (pRExC_state->opend)
 184 #define RExC_paren_names        (pRExC_state->paren_names)
 185 #define RExC_recurse    (pRExC_state->recurse)
 186 #define RExC_recurse_count      (pRExC_state->recurse_count)
 187
 188
 189 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 190 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 191         ((*s) == '{' && regcurly(s)))
 192
 193 #ifdef SPSTART
 194 #undef SPSTART          /* dratted cpp namespace... */
 195 #endif
 196 /*
 197  * Flags to be passed up and down.
 198  */
 199 #define WORST           0       /* Worst case. */
 200 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 201
 202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 203  * character, and if utf8, must be invariant. */
 204 #define SIMPLE          0x02
 205 #define SPSTART         0x04    /* Starts with * or +. */
 206 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 207 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 208
 209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 210
 211 /* whether trie related optimizations are enabled */
 212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 213 #define TRIE_STUDY_OPT
 214 #define FULL_TRIE_STUDY
 215 #define TRIE_STCLASS
 216 #endif
 217
 218
 219
 220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 221 #define PBITVAL(paren) (1 << ((paren) & 7))
 222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 225
 226 /* If not already in utf8, do a longjmp back to the beginning */
 227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 228 #define REQUIRE_UTF8    STMT_START {                                       \
 229                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 230                         } STMT_END
 231
 232 /* About scan_data_t.
 233
 234   During optimisation we recurse through the regexp program performing
 235   various inplace (keyhole style) optimisations. In addition study_chunk
 236   and scan_commit populate this data structure with information about
 237   what strings MUST appear in the pattern. We look for the longest
 238   string that must appear at a fixed location, and we look for the
 239   longest string that may appear at a floating location. So for instance
 240   in the pattern:
 241
 242     /FOO[xX]A.*B[xX]BAR/
 243
 244   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 245   strings (because they follow a .* construct). study_chunk will identify
 246   both FOO and BAR as being the longest fixed and floating strings respectively.
 247
 248   The strings can be composites, for instance
 249
 250      /(f)(o)(o)/
 251
 252   will result in a composite fixed substring 'foo'.
 253
 254   For each string some basic information is maintained:
 255
 256   - offset or min_offset
 257     This is the position the string must appear at, or not before.
 258     It also implicitly (when combined with minlenp) tells us how many
 259     characters must match before the string we are searching for.
 260     Likewise when combined with minlenp and the length of the string it
 261     tells us how many characters must appear after the string we have
 262     found.
 263
 264   - max_offset
 265     Only used for floating strings. This is the rightmost point that
 266     the string can appear at. If set to I32 max it indicates that the
 267     string can occur infinitely far to the right.
 268
 269   - minlenp
 270     A pointer to the minimum length of the pattern that the string
 271     was found inside. This is important as in the case of positive
 272     lookahead or positive lookbehind we can have multiple patterns
 273     involved. Consider
 274
 275     /(?=FOO).*F/
 276
 277     The minimum length of the pattern overall is 3, the minimum length
 278     of the lookahead part is 3, but the minimum length of the part that
 279     will actually match is 1. So 'FOO's minimum length is 3, but the
 280     minimum length for the F is 1. This is important as the minimum length
 281     is used to determine offsets in front of and behind the string being
 282     looked for.  Since strings can be composites this is the length of the
 283     pattern at the time it was commited with a scan_commit. Note that
 284     the length is calculated by study_chunk, so that the minimum lengths
 285     are not known until the full pattern has been compiled, thus the
 286     pointer to the value.
 287
 288   - lookbehind
 289
 290     In the case of lookbehind the string being searched for can be
 291     offset past the start point of the final matching string.
 292     If this value was just blithely removed from the min_offset it would
 293     invalidate some of the calculations for how many chars must match
 294     before or after (as they are derived from min_offset and minlen and
 295     the length of the string being searched for).
 296     When the final pattern is compiled and the data is moved from the
 297     scan_data_t structure into the regexp structure the information
 298     about lookbehind is factored in, with the information that would
 299     have been lost precalculated in the end_shift field for the
 300     associated string.
 301
 302   The fields pos_min and pos_delta are used to store the minimum offset
 303   and the delta to the maximum offset at the current point in the pattern.
 304
 305 */
 306
 307 typedef struct scan_data_t {
 308     /*I32 len_min;      unused */
 309     /*I32 len_delta;    unused */
 310     I32 pos_min;
 311     I32 pos_delta;
 312     SV *last_found;
 313     I32 last_end;           /* min value, <0 unless valid. */
 314     I32 last_start_min;
 315     I32 last_start_max;
 316     SV **longest;           /* Either &l_fixed, or &l_float. */
 317     SV *longest_fixed;      /* longest fixed string found in pattern */
 318     I32 offset_fixed;       /* offset where it starts */
 319     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 320     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 321     SV *longest_float;      /* longest floating string found in pattern */
 322     I32 offset_float_min;   /* earliest point in string it can appear */
 323     I32 offset_float_max;   /* latest point in string it can appear */
 324     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 325     I32 lookbehind_float;   /* is the position of the string modified by LB */
 326     I32 flags;
 327     I32 whilem_c;
 328     I32 *last_closep;
 329     struct regnode_charclass_class *start_class;
 330 } scan_data_t;
 331
 332 /*
 333  * Forward declarations for pregcomp()'s friends.
 334  */
 335
 336 static const scan_data_t zero_scan_data =
 337   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 338
 339 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 340 #define SF_BEFORE_SEOL          0x0001
 341 #define SF_BEFORE_MEOL          0x0002
 342 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 343 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 344
 345 #ifdef NO_UNARY_PLUS
 346 #  define SF_FIX_SHIFT_EOL      (0+2)
 347 #  define SF_FL_SHIFT_EOL               (0+4)
 348 #else
 349 #  define SF_FIX_SHIFT_EOL      (+2)
 350 #  define SF_FL_SHIFT_EOL               (+4)
 351 #endif
 352
 353 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 354 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 355
 356 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 357 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 358 #define SF_IS_INF               0x0040
 359 #define SF_HAS_PAR              0x0080
 360 #define SF_IN_PAR               0x0100
 361 #define SF_HAS_EVAL             0x0200
 362 #define SCF_DO_SUBSTR           0x0400
 363 #define SCF_DO_STCLASS_AND      0x0800
 364 #define SCF_DO_STCLASS_OR       0x1000
 365 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 366 #define SCF_WHILEM_VISITED_POS  0x2000
 367
 368 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 369 #define SCF_SEEN_ACCEPT         0x8000
 370
 371 #define UTF cBOOL(RExC_utf8)
 372 #define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
 373 #define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
 374 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 375
 376 #define OOB_UNICODE             12345678
 377 #define OOB_NAMEDCLASS          -1
 378
 379 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 380 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 381
 382
 383 /* length of regex to show in messages that don't mark a position within */
 384 #define RegexLengthToShowInErrorMessages 127
 385
 386 /*
 387  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 388  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 389  * op/pragma/warn/regcomp.
 390  */
 391 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 392 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 393
 394 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 395
 396 /*
 397  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 398  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 399  * "...".
 400  */
 401 #define _FAIL(code) STMT_START {                                        \
 402     const char *ellipses = "";                                          \
 403     IV len = RExC_end - RExC_precomp;                                   \
 404                                                                         \
 405     if (!SIZE_ONLY)                                                     \
 406         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 407     if (len > RegexLengthToShowInErrorMessages) {                       \
 408         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 409         len = RegexLengthToShowInErrorMessages - 10;                    \
 410         ellipses = "...";                                               \
 411     }                                                                   \
 412     code;                                                               \
 413 } STMT_END
 414
 415 #define FAIL(msg) _FAIL(                            \
 416     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 417             msg, (int)len, RExC_precomp, ellipses))
 418
 419 #define FAIL2(msg,arg) _FAIL(                       \
 420     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 421             arg, (int)len, RExC_precomp, ellipses))
 422
 423 /*
 424  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 425  */
 426 #define Simple_vFAIL(m) STMT_START {                                    \
 427     const IV offset = RExC_parse - RExC_precomp;                        \
 428     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 429             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 430 } STMT_END
 431
 432 /*
 433  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 434  */
 435 #define vFAIL(m) STMT_START {                           \
 436     if (!SIZE_ONLY)                                     \
 437         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 438     Simple_vFAIL(m);                                    \
 439 } STMT_END
 440
 441 /*
 442  * Like Simple_vFAIL(), but accepts two arguments.
 443  */
 444 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 445     const IV offset = RExC_parse - RExC_precomp;                        \
 446     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 447             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 448 } STMT_END
 449
 450 /*
 451  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 452  */
 453 #define vFAIL2(m,a1) STMT_START {                       \
 454     if (!SIZE_ONLY)                                     \
 455         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 456     Simple_vFAIL2(m, a1);                               \
 457 } STMT_END
 458
 459
 460 /*
 461  * Like Simple_vFAIL(), but accepts three arguments.
 462  */
 463 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 464     const IV offset = RExC_parse - RExC_precomp;                \
 465     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 466             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 467 } STMT_END
 468
 469 /*
 470  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 471  */
 472 #define vFAIL3(m,a1,a2) STMT_START {                    \
 473     if (!SIZE_ONLY)                                     \
 474         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 475     Simple_vFAIL3(m, a1, a2);                           \
 476 } STMT_END
 477
 478 /*
 479  * Like Simple_vFAIL(), but accepts four arguments.
 480  */
 481 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 482     const IV offset = RExC_parse - RExC_precomp;                \
 483     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 484             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 485 } STMT_END
 486
 487 #define ckWARNreg(loc,m) STMT_START {                                   \
 488     const IV offset = loc - RExC_precomp;                               \
 489     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 490             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 491 } STMT_END
 492
 493 #define ckWARNregdep(loc,m) STMT_START {                                \
 494     const IV offset = loc - RExC_precomp;                               \
 495     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 496             m REPORT_LOCATION,                                          \
 497             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 498 } STMT_END
 499
 500 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 501     const IV offset = loc - RExC_precomp;                               \
 502     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 503             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 504 } STMT_END
 505
 506 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 507     const IV offset = loc - RExC_precomp;                               \
 508     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 509             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 510 } STMT_END
 511
 512 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 513     const IV offset = loc - RExC_precomp;                               \
 514     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 515             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 516 } STMT_END
 517
 518 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 519     const IV offset = loc - RExC_precomp;                               \
 520     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 521             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 522 } STMT_END
 523
 524 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 525     const IV offset = loc - RExC_precomp;                               \
 526     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 527             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 528 } STMT_END
 529
 530 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 531     const IV offset = loc - RExC_precomp;                               \
 532     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 533             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 534 } STMT_END
 535
 536
 537 /* Allow for side effects in s */
 538 #define REGC(c,s) STMT_START {                  \
 539     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 540 } STMT_END
 541
 542 /* Macros for recording node offsets.   20001227 mjd@plover.com
 543  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 544  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 545  * Element 0 holds the number n.
 546  * Position is 1 indexed.
 547  */
 548 #ifndef RE_TRACK_PATTERN_OFFSETS
 549 #define Set_Node_Offset_To_R(node,byte)
 550 #define Set_Node_Offset(node,byte)
 551 #define Set_Cur_Node_Offset
 552 #define Set_Node_Length_To_R(node,len)
 553 #define Set_Node_Length(node,len)
 554 #define Set_Node_Cur_Length(node)
 555 #define Node_Offset(n)
 556 #define Node_Length(n)
 557 #define Set_Node_Offset_Length(node,offset,len)
 558 #define ProgLen(ri) ri->u.proglen
 559 #define SetProgLen(ri,x) ri->u.proglen = x
 560 #else
 561 #define ProgLen(ri) ri->u.offsets[0]
 562 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 563 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 564     if (! SIZE_ONLY) {                                                  \
 565         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 566                     __LINE__, (int)(node), (int)(byte)));               \
 567         if((node) < 0) {                                                \
 568             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 569         } else {                                                        \
 570             RExC_offsets[2*(node)-1] = (byte);                          \
 571         }                                                               \
 572     }                                                                   \
 573 } STMT_END
 574
 575 #define Set_Node_Offset(node,byte) \
 576     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 577 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 578
 579 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 580     if (! SIZE_ONLY) {                                                  \
 581         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 582                 __LINE__, (int)(node), (int)(len)));                    \
 583         if((node) < 0) {                                                \
 584             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 585         } else {                                                        \
 586             RExC_offsets[2*(node)] = (len);                             \
 587         }                                                               \
 588     }                                                                   \
 589 } STMT_END
 590
 591 #define Set_Node_Length(node,len) \
 592     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 593 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 594 #define Set_Node_Cur_Length(node) \
 595     Set_Node_Length(node, RExC_parse - parse_start)
 596
 597 /* Get offsets and lengths */
 598 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 599 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 600
 601 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 602     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 603     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 604 } STMT_END
 605 #endif
 606
 607 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 608 #define EXPERIMENTAL_INPLACESCAN
 609 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 610
 611 #define DEBUG_STUDYDATA(str,data,depth)                              \
 612 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 613     PerlIO_printf(Perl_debug_log,                                    \
 614         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 615         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 616         (int)(depth)*2, "",                                          \
 617         (IV)((data)->pos_min),                                       \
 618         (IV)((data)->pos_delta),                                     \
 619         (UV)((data)->flags),                                         \
 620         (IV)((data)->whilem_c),                                      \
 621         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 622         is_inf ? "INF " : ""                                         \
 623     );                                                               \
 624     if ((data)->last_found)                                          \
 625         PerlIO_printf(Perl_debug_log,                                \
 626             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 627             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 628             SvPVX_const((data)->last_found),                         \
 629             (IV)((data)->last_end),                                  \
 630             (IV)((data)->last_start_min),                            \
 631             (IV)((data)->last_start_max),                            \
 632             ((data)->longest &&                                      \
 633              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 634             SvPVX_const((data)->longest_fixed),                      \
 635             (IV)((data)->offset_fixed),                              \
 636             ((data)->longest &&                                      \
 637              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 638             SvPVX_const((data)->longest_float),                      \
 639             (IV)((data)->offset_float_min),                          \
 640             (IV)((data)->offset_float_max)                           \
 641         );                                                           \
 642     PerlIO_printf(Perl_debug_log,"\n");                              \
 643 });
 644
 645 static void clear_re(pTHX_ void *r);
 646
 647 /* Mark that we cannot extend a found fixed substring at this point.
 648    Update the longest found anchored substring and the longest found
 649    floating substrings if needed. */
 650
 651 STATIC void
 652 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 653 {
 654     const STRLEN l = CHR_SVLEN(data->last_found);
 655     const STRLEN old_l = CHR_SVLEN(*data->longest);
 656     GET_RE_DEBUG_FLAGS_DECL;
 657
 658     PERL_ARGS_ASSERT_SCAN_COMMIT;
 659
 660     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 661         SvSetMagicSV(*data->longest, data->last_found);
 662         if (*data->longest == data->longest_fixed) {
 663             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 664             if (data->flags & SF_BEFORE_EOL)
 665                 data->flags
 666                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 667             else
 668                 data->flags &= ~SF_FIX_BEFORE_EOL;
 669             data->minlen_fixed=minlenp;
 670             data->lookbehind_fixed=0;
 671         }
 672         else { /* *data->longest == data->longest_float */
 673             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 674             data->offset_float_max = (l
 675                                       ? data->last_start_max
 676                                       : data->pos_min + data->pos_delta);
 677             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 678                 data->offset_float_max = I32_MAX;
 679             if (data->flags & SF_BEFORE_EOL)
 680                 data->flags
 681                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 682             else
 683                 data->flags &= ~SF_FL_BEFORE_EOL;
 684             data->minlen_float=minlenp;
 685             data->lookbehind_float=0;
 686         }
 687     }
 688     SvCUR_set(data->last_found, 0);
 689     {
 690         SV * const sv = data->last_found;
 691         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 692             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 693             if (mg)
 694                 mg->mg_len = 0;
 695         }
 696     }
 697     data->last_end = -1;
 698     data->flags &= ~SF_BEFORE_EOL;
 699     DEBUG_STUDYDATA("commit: ",data,0);
 700 }
 701
 702 /* Can match anything (initialization) */
 703 STATIC void
 704 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 705 {
 706     PERL_ARGS_ASSERT_CL_ANYTHING;
 707
 708     ANYOF_CLASS_ZERO(cl);
 709     ANYOF_BITMAP_SETALL(cl);
 710     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 711     if (LOC)
 712         cl->flags |= ANYOF_LOCALE;
 713     cl->flags |= ANYOF_FOLD;
 714 }
 715
 716 /* Can match anything (initialization) */
 717 STATIC int
 718 S_cl_is_anything(const struct regnode_charclass_class *cl)
 719 {
 720     int value;
 721
 722     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 723
 724     for (value = 0; value <= ANYOF_MAX; value += 2)
 725         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 726             return 1;
 727     if (!(cl->flags & ANYOF_UNICODE_ALL))
 728         return 0;
 729     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 730         return 0;
 731     return 1;
 732 }
 733
 734 /* Can match anything (initialization) */
 735 STATIC void
 736 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 737 {
 738     PERL_ARGS_ASSERT_CL_INIT;
 739
 740     Zero(cl, 1, struct regnode_charclass_class);
 741     cl->type = ANYOF;
 742     cl_anything(pRExC_state, cl);
 743 }
 744
 745 STATIC void
 746 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 747 {
 748     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 749
 750     Zero(cl, 1, struct regnode_charclass_class);
 751     cl->type = ANYOF;
 752     cl_anything(pRExC_state, cl);
 753     if (LOC)
 754         cl->flags |= ANYOF_LOCALE;
 755 }
 756
 757 /* 'And' a given class with another one.  Can create false positives */
 758 /* We assume that cl is not inverted */
 759 STATIC void
 760 S_cl_and(struct regnode_charclass_class *cl,
 761         const struct regnode_charclass_class *and_with)
 762 {
 763     PERL_ARGS_ASSERT_CL_AND;
 764
 765     assert(and_with->type == ANYOF);
 766
 767     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 768         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 769         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 770         && !(and_with->flags & ANYOF_FOLD)
 771         && !(cl->flags & ANYOF_FOLD)) {
 772         int i;
 773
 774         if (and_with->flags & ANYOF_INVERT)
 775             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 776                 cl->bitmap[i] &= ~and_with->bitmap[i];
 777         else
 778             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 779                 cl->bitmap[i] &= and_with->bitmap[i];
 780     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 781     if (!(and_with->flags & ANYOF_EOS))
 782         cl->flags &= ~ANYOF_EOS;
 783
 784     if (!(and_with->flags & ANYOF_FOLD))
 785         cl->flags &= ~ANYOF_FOLD;
 786
 787     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
 788         !(and_with->flags & ANYOF_INVERT)) {
 789         cl->flags &= ~ANYOF_UNICODE_ALL;
 790         cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
 791                                                            only the one(s)
 792                                                            actually set */
 793         ARG_SET(cl, ARG(and_with));
 794     }
 795     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 796         !(and_with->flags & ANYOF_INVERT))
 797         cl->flags &= ~ANYOF_UNICODE_ALL;
 798     if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
 799         !(and_with->flags & ANYOF_INVERT))
 800         cl->flags &= ~ANYOF_NONBITMAP;
 801 }
 802
 803 /* 'OR' a given class with another one.  Can create false positives */
 804 /* We assume that cl is not inverted */
 805 STATIC void
 806 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 807 {
 808     PERL_ARGS_ASSERT_CL_OR;
 809
 810     if (or_with->flags & ANYOF_INVERT) {
 811         /* We do not use
 812          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 813          *   <= (B1 | !B2) | (CL1 | !CL2)
 814          * which is wasteful if CL2 is small, but we ignore CL2:
 815          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 816          * XXXX Can we handle case-fold?  Unclear:
 817          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 818          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 819          */
 820         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 821              && !(or_with->flags & ANYOF_FOLD)
 822              && !(cl->flags & ANYOF_FOLD) ) {
 823             int i;
 824
 825             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 826                 cl->bitmap[i] |= ~or_with->bitmap[i];
 827         } /* XXXX: logic is complicated otherwise */
 828         else {
 829             cl_anything(pRExC_state, cl);
 830         }
 831     } else {
 832         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 833         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 834              && (!(or_with->flags & ANYOF_FOLD)
 835                  || (cl->flags & ANYOF_FOLD)) ) {
 836             int i;
 837
 838             /* OR char bitmap and class bitmap separately */
 839             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 840                 cl->bitmap[i] |= or_with->bitmap[i];
 841             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 842                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 843                     cl->classflags[i] |= or_with->classflags[i];
 844                 cl->flags |= ANYOF_CLASS;
 845             }
 846         }
 847         else { /* XXXX: logic is complicated, leave it along for a moment. */
 848             cl_anything(pRExC_state, cl);
 849         }
 850     }
 851     if (or_with->flags & ANYOF_EOS)
 852         cl->flags |= ANYOF_EOS;
 853
 854     if (or_with->flags & ANYOF_FOLD)
 855         cl->flags |= ANYOF_FOLD;
 856
 857     /* If both nodes match something outside the bitmap, but what they match
 858      * outside is not the same pointer, and hence not easily compared, give up
 859      * and allow the start class to match everything outside the bitmap */
 860     if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
 861         ARG(cl) != ARG(or_with)) {
 862         cl->flags |= ANYOF_UNICODE_ALL;
 863     }
 864
 865     if (or_with->flags & ANYOF_UNICODE_ALL) {
 866         cl->flags |= ANYOF_UNICODE_ALL;
 867     }
 868 }
 869
 870 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 871 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 872 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 873 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 874
 875
 876 #ifdef DEBUGGING
 877 /*
 878    dump_trie(trie,widecharmap,revcharmap)
 879    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 880    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 881
 882    These routines dump out a trie in a somewhat readable format.
 883    The _interim_ variants are used for debugging the interim
 884    tables that are used to generate the final compressed
 885    representation which is what dump_trie expects.
 886
 887    Part of the reason for their existance is to provide a form
 888    of documentation as to how the different representations function.
 889
 890 */
 891
 892 /*
 893   Dumps the final compressed table form of the trie to Perl_debug_log.
 894   Used for debugging make_trie().
 895 */
 896
 897 STATIC void
 898 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 899             AV *revcharmap, U32 depth)
 900 {
 901     U32 state;
 902     SV *sv=sv_newmortal();
 903     int colwidth= widecharmap ? 6 : 4;
 904     U16 word;
 905     GET_RE_DEBUG_FLAGS_DECL;
 906
 907     PERL_ARGS_ASSERT_DUMP_TRIE;
 908
 909     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 910         (int)depth * 2 + 2,"",
 911         "Match","Base","Ofs" );
 912
 913     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 914         SV ** const tmp = av_fetch( revcharmap, state, 0);
 915         if ( tmp ) {
 916             PerlIO_printf( Perl_debug_log, "%*s",
 917                 colwidth,
 918                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 919                             PL_colors[0], PL_colors[1],
 920                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 921                             PERL_PV_ESCAPE_FIRSTCHAR
 922                 )
 923             );
 924         }
 925     }
 926     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 927         (int)depth * 2 + 2,"");
 928
 929     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 930         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 931     PerlIO_printf( Perl_debug_log, "\n");
 932
 933     for( state = 1 ; state < trie->statecount ; state++ ) {
 934         const U32 base = trie->states[ state ].trans.base;
 935
 936         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 937
 938         if ( trie->states[ state ].wordnum ) {
 939             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 940         } else {
 941             PerlIO_printf( Perl_debug_log, "%6s", "" );
 942         }
 943
 944         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 945
 946         if ( base ) {
 947             U32 ofs = 0;
 948
 949             while( ( base + ofs  < trie->uniquecharcount ) ||
 950                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 951                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 952                     ofs++;
 953
 954             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 955
 956             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 957                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 958                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 959                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 960                 {
 961                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 962                     colwidth,
 963                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 964                 } else {
 965                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 966                 }
 967             }
 968
 969             PerlIO_printf( Perl_debug_log, "]");
 970
 971         }
 972         PerlIO_printf( Perl_debug_log, "\n" );
 973     }
 974     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 975     for (word=1; word <= trie->wordcount; word++) {
 976         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 977             (int)word, (int)(trie->wordinfo[word].prev),
 978             (int)(trie->wordinfo[word].len));
 979     }
 980     PerlIO_printf(Perl_debug_log, "\n" );
 981 }
 982 /*
 983   Dumps a fully constructed but uncompressed trie in list form.
 984   List tries normally only are used for construction when the number of
 985   possible chars (trie->uniquecharcount) is very high.
 986   Used for debugging make_trie().
 987 */
 988 STATIC void
 989 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 990                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
 991                          U32 depth)
 992 {
 993     U32 state;
 994     SV *sv=sv_newmortal();
 995     int colwidth= widecharmap ? 6 : 4;
 996     GET_RE_DEBUG_FLAGS_DECL;
 997
 998     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 999
1000     /* print out the table precompression.  */
1001     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1002         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1003         "------:-----+-----------------\n" );
1004
1005     for( state=1 ; state < next_alloc ; state ++ ) {
1006         U16 charid;
1007
1008         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1009             (int)depth * 2 + 2,"", (UV)state  );
1010         if ( ! trie->states[ state ].wordnum ) {
1011             PerlIO_printf( Perl_debug_log, "%5s| ","");
1012         } else {
1013             PerlIO_printf( Perl_debug_log, "W%4x| ",
1014                 trie->states[ state ].wordnum
1015             );
1016         }
1017         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1018             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1019             if ( tmp ) {
1020                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1021                     colwidth,
1022                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1023                             PL_colors[0], PL_colors[1],
1024                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1025                             PERL_PV_ESCAPE_FIRSTCHAR
1026                     ) ,
1027                     TRIE_LIST_ITEM(state,charid).forid,
1028                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1029                 );
1030                 if (!(charid % 10))
1031                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1032                         (int)((depth * 2) + 14), "");
1033             }
1034         }
1035         PerlIO_printf( Perl_debug_log, "\n");
1036     }
1037 }
1038
1039 /*
1040   Dumps a fully constructed but uncompressed trie in table form.
1041   This is the normal DFA style state transition table, with a few
1042   twists to facilitate compression later.
1043   Used for debugging make_trie().
1044 */
1045 STATIC void
1046 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1047                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1048                           U32 depth)
1049 {
1050     U32 state;
1051     U16 charid;
1052     SV *sv=sv_newmortal();
1053     int colwidth= widecharmap ? 6 : 4;
1054     GET_RE_DEBUG_FLAGS_DECL;
1055
1056     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1057
1058     /*
1059        print out the table precompression so that we can do a visual check
1060        that they are identical.
1061      */
1062
1063     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1064
1065     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1066         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1067         if ( tmp ) {
1068             PerlIO_printf( Perl_debug_log, "%*s",
1069                 colwidth,
1070                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1071                             PL_colors[0], PL_colors[1],
1072                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1073                             PERL_PV_ESCAPE_FIRSTCHAR
1074                 )
1075             );
1076         }
1077     }
1078
1079     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1080
1081     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1082         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1083     }
1084
1085     PerlIO_printf( Perl_debug_log, "\n" );
1086
1087     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1088
1089         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1090             (int)depth * 2 + 2,"",
1091             (UV)TRIE_NODENUM( state ) );
1092
1093         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1094             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1095             if (v)
1096                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1097             else
1098                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1099         }
1100         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1101             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1102         } else {
1103             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1104             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1105         }
1106     }
1107 }
1108
1109 #endif
1110
1111
1112 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1113   startbranch: the first branch in the whole branch sequence
1114   first      : start branch of sequence of branch-exact nodes.
1115                May be the same as startbranch
1116   last       : Thing following the last branch.
1117                May be the same as tail.
1118   tail       : item following the branch sequence
1119   count      : words in the sequence
1120   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1121   depth      : indent depth
1122
1123 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1124
1125 A trie is an N'ary tree where the branches are determined by digital
1126 decomposition of the key. IE, at the root node you look up the 1st character and
1127 follow that branch repeat until you find the end of the branches. Nodes can be
1128 marked as "accepting" meaning they represent a complete word. Eg:
1129
1130   /he|she|his|hers/
1131
1132 would convert into the following structure. Numbers represent states, letters
1133 following numbers represent valid transitions on the letter from that state, if
1134 the number is in square brackets it represents an accepting state, otherwise it
1135 will be in parenthesis.
1136
1137       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1138       |    |
1139       |   (2)
1140       |    |
1141      (1)   +-i->(6)-+-s->[7]
1142       |
1143       +-s->(3)-+-h->(4)-+-e->[5]
1144
1145       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1146
1147 This shows that when matching against the string 'hers' we will begin at state 1
1148 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1149 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1150 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1151 single traverse. We store a mapping from accepting to state to which word was
1152 matched, and then when we have multiple possibilities we try to complete the
1153 rest of the regex in the order in which they occured in the alternation.
1154
1155 The only prior NFA like behaviour that would be changed by the TRIE support is
1156 the silent ignoring of duplicate alternations which are of the form:
1157
1158  / (DUPE|DUPE) X? (?{ ... }) Y /x
1159
1160 Thus EVAL blocks following a trie may be called a different number of times with
1161 and without the optimisation. With the optimisations dupes will be silently
1162 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1163 the following demonstrates:
1164
1165  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1166
1167 which prints out 'word' three times, but
1168
1169  'words'=~/(word|word|word)(?{ print $1 })S/
1170
1171 which doesnt print it out at all. This is due to other optimisations kicking in.
1172
1173 Example of what happens on a structural level:
1174
1175 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1176
1177    1: CURLYM[1] {1,32767}(18)
1178    5:   BRANCH(8)
1179    6:     EXACT <ac>(16)
1180    8:   BRANCH(11)
1181    9:     EXACT <ad>(16)
1182   11:   BRANCH(14)
1183   12:     EXACT <ab>(16)
1184   16:   SUCCEED(0)
1185   17:   NOTHING(18)
1186   18: END(0)
1187
1188 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1189 and should turn into:
1190
1191    1: CURLYM[1] {1,32767}(18)
1192    5:   TRIE(16)
1193         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1194           <ac>
1195           <ad>
1196           <ab>
1197   16:   SUCCEED(0)
1198   17:   NOTHING(18)
1199   18: END(0)
1200
1201 Cases where tail != last would be like /(?foo|bar)baz/:
1202
1203    1: BRANCH(4)
1204    2:   EXACT <foo>(8)
1205    4: BRANCH(7)
1206    5:   EXACT <bar>(8)
1207    7: TAIL(8)
1208    8: EXACT <baz>(10)
1209   10: END(0)
1210
1211 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1212 and would end up looking like:
1213
1214     1: TRIE(8)
1215       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1216         <foo>
1217         <bar>
1218    7: TAIL(8)
1219    8: EXACT <baz>(10)
1220   10: END(0)
1221
1222     d = uvuni_to_utf8_flags(d, uv, 0);
1223
1224 is the recommended Unicode-aware way of saying
1225
1226     *(d++) = uv;
1227 */
1228
1229 #define TRIE_STORE_REVCHAR                                                 \
1230     STMT_START {                                                           \
1231         if (UTF) {                                                         \
1232             SV *zlopp = newSV(2);                                          \
1233             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1234             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1235             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1236             SvPOK_on(zlopp);                                               \
1237             SvUTF8_on(zlopp);                                              \
1238             av_push(revcharmap, zlopp);                                    \
1239         } else {                                                           \
1240             char ooooff = (char)uvc;                                               \
1241             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1242         }                                                                  \
1243         } STMT_END
1244
1245 #define TRIE_READ_CHAR STMT_START {                                           \
1246     wordlen++;                                                                \
1247     if ( UTF ) {                                                              \
1248         if ( folder ) {                                                       \
1249             if ( foldlen > 0 ) {                                              \
1250                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1251                foldlen -= len;                                                \
1252                scan += len;                                                   \
1253                len = 0;                                                       \
1254             } else {                                                          \
1255                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1256                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1257                 foldlen -= UNISKIP( uvc );                                    \
1258                 scan = foldbuf + UNISKIP( uvc );                              \
1259             }                                                                 \
1260         } else {                                                              \
1261             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1262         }                                                                     \
1263     } else {                                                                  \
1264         uvc = (U32)*uc;                                                       \
1265         len = 1;                                                              \
1266     }                                                                         \
1267 } STMT_END
1268
1269
1270
1271 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1272     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1273         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1274         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1275     }                                                           \
1276     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1277     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1278     TRIE_LIST_CUR( state )++;                                   \
1279 } STMT_END
1280
1281 #define TRIE_LIST_NEW(state) STMT_START {                       \
1282     Newxz( trie->states[ state ].trans.list,               \
1283         4, reg_trie_trans_le );                                 \
1284      TRIE_LIST_CUR( state ) = 1;                                \
1285      TRIE_LIST_LEN( state ) = 4;                                \
1286 } STMT_END
1287
1288 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1289     U16 dupe= trie->states[ state ].wordnum;                    \
1290     regnode * const noper_next = regnext( noper );              \
1291                                                                 \
1292     DEBUG_r({                                                   \
1293         /* store the word for dumping */                        \
1294         SV* tmp;                                                \
1295         if (OP(noper) != NOTHING)                               \
1296             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1297         else                                                    \
1298             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1299         av_push( trie_words, tmp );                             \
1300     });                                                         \
1301                                                                 \
1302     curword++;                                                  \
1303     trie->wordinfo[curword].prev   = 0;                         \
1304     trie->wordinfo[curword].len    = wordlen;                   \
1305     trie->wordinfo[curword].accept = state;                     \
1306                                                                 \
1307     if ( noper_next < tail ) {                                  \
1308         if (!trie->jump)                                        \
1309             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1310         trie->jump[curword] = (U16)(noper_next - convert);      \
1311         if (!jumper)                                            \
1312             jumper = noper_next;                                \
1313         if (!nextbranch)                                        \
1314             nextbranch= regnext(cur);                           \
1315     }                                                           \
1316                                                                 \
1317     if ( dupe ) {                                               \
1318         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1319         /* chain, so that when the bits of chain are later    */\
1320         /* linked together, the dups appear in the chain      */\
1321         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1322         trie->wordinfo[dupe].prev = curword;                    \
1323     } else {                                                    \
1324         /* we haven't inserted this word yet.                */ \
1325         trie->states[ state ].wordnum = curword;                \
1326     }                                                           \
1327 } STMT_END
1328
1329
1330 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1331      ( ( base + charid >=  ucharcount                                   \
1332          && base + charid < ubound                                      \
1333          && state == trie->trans[ base - ucharcount + charid ].check    \
1334          && trie->trans[ base - ucharcount + charid ].next )            \
1335            ? trie->trans[ base - ucharcount + charid ].next             \
1336            : ( state==1 ? special : 0 )                                 \
1337       )
1338
1339 #define MADE_TRIE       1
1340 #define MADE_JUMP_TRIE  2
1341 #define MADE_EXACT_TRIE 4
1342
1343 STATIC I32
1344 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1345 {
1346     dVAR;
1347     /* first pass, loop through and scan words */
1348     reg_trie_data *trie;
1349     HV *widecharmap = NULL;
1350     AV *revcharmap = newAV();
1351     regnode *cur;
1352     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1353     STRLEN len = 0;
1354     UV uvc = 0;
1355     U16 curword = 0;
1356     U32 next_alloc = 0;
1357     regnode *jumper = NULL;
1358     regnode *nextbranch = NULL;
1359     regnode *convert = NULL;
1360     U32 *prev_states; /* temp array mapping each state to previous one */
1361     /* we just use folder as a flag in utf8 */
1362     const U8 * folder = NULL;
1363
1364 #ifdef DEBUGGING
1365     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1366     AV *trie_words = NULL;
1367     /* along with revcharmap, this only used during construction but both are
1368      * useful during debugging so we store them in the struct when debugging.
1369      */
1370 #else
1371     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1372     STRLEN trie_charcount=0;
1373 #endif
1374     SV *re_trie_maxbuff;
1375     GET_RE_DEBUG_FLAGS_DECL;
1376
1377     PERL_ARGS_ASSERT_MAKE_TRIE;
1378 #ifndef DEBUGGING
1379     PERL_UNUSED_ARG(depth);
1380 #endif
1381
1382     switch (flags) {
1383         case EXACTFU: folder = PL_fold_latin1; break;
1384         case EXACTF:  folder = PL_fold; break;
1385         case EXACTFL: folder = PL_fold_locale; break;
1386     }
1387
1388     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1389     trie->refcount = 1;
1390     trie->startstate = 1;
1391     trie->wordcount = word_count;
1392     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1393     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1394     if (!(UTF && folder))
1395         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1396     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1397                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1398
1399     DEBUG_r({
1400         trie_words = newAV();
1401     });
1402
1403     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1404     if (!SvIOK(re_trie_maxbuff)) {
1405         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1406     }
1407     DEBUG_OPTIMISE_r({
1408                 PerlIO_printf( Perl_debug_log,
1409                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1410                   (int)depth * 2 + 2, "",
1411                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1412                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1413                   (int)depth);
1414     });
1415
1416    /* Find the node we are going to overwrite */
1417     if ( first == startbranch && OP( last ) != BRANCH ) {
1418         /* whole branch chain */
1419         convert = first;
1420     } else {
1421         /* branch sub-chain */
1422         convert = NEXTOPER( first );
1423     }
1424
1425     /*  -- First loop and Setup --
1426
1427        We first traverse the branches and scan each word to determine if it
1428        contains widechars, and how many unique chars there are, this is
1429        important as we have to build a table with at least as many columns as we
1430        have unique chars.
1431
1432        We use an array of integers to represent the character codes 0..255
1433        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1434        native representation of the character value as the key and IV's for the
1435        coded index.
1436
1437        *TODO* If we keep track of how many times each character is used we can
1438        remap the columns so that the table compression later on is more
1439        efficient in terms of memory by ensuring the most common value is in the
1440        middle and the least common are on the outside.  IMO this would be better
1441        than a most to least common mapping as theres a decent chance the most
1442        common letter will share a node with the least common, meaning the node
1443        will not be compressable. With a middle is most common approach the worst
1444        case is when we have the least common nodes twice.
1445
1446      */
1447
1448     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1449         regnode * const noper = NEXTOPER( cur );
1450         const U8 *uc = (U8*)STRING( noper );
1451         const U8 * const e  = uc + STR_LEN( noper );
1452         STRLEN foldlen = 0;
1453         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1454         const U8 *scan = (U8*)NULL;
1455         U32 wordlen      = 0;         /* required init */
1456         STRLEN chars = 0;
1457         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1458
1459         if (OP(noper) == NOTHING) {
1460             trie->minlen= 0;
1461             continue;
1462         }
1463         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1464             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1465                                           regardless of encoding */
1466
1467         for ( ; uc < e ; uc += len ) {
1468             TRIE_CHARCOUNT(trie)++;
1469             TRIE_READ_CHAR;
1470             chars++;
1471             if ( uvc < 256 ) {
1472                 if ( !trie->charmap[ uvc ] ) {
1473                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1474                     if ( folder )
1475                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1476                     TRIE_STORE_REVCHAR;
1477                 }
1478                 if ( set_bit ) {
1479                     /* store the codepoint in the bitmap, and its folded
1480                      * equivalent. */
1481                     TRIE_BITMAP_SET(trie,uvc);
1482
1483                     /* store the folded codepoint */
1484                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1485
1486                     if ( !UTF ) {
1487                         /* store first byte of utf8 representation of
1488                            variant codepoints */
1489                         if (! UNI_IS_INVARIANT(uvc)) {
1490                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1491                         }
1492                     }
1493                     set_bit = 0; /* We've done our bit :-) */
1494                 }
1495             } else {
1496                 SV** svpp;
1497                 if ( !widecharmap )
1498                     widecharmap = newHV();
1499
1500                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1501
1502                 if ( !svpp )
1503                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1504
1505                 if ( !SvTRUE( *svpp ) ) {
1506                     sv_setiv( *svpp, ++trie->uniquecharcount );
1507                     TRIE_STORE_REVCHAR;
1508                 }
1509             }
1510         }
1511         if( cur == first ) {
1512             trie->minlen=chars;
1513             trie->maxlen=chars;
1514         } else if (chars < trie->minlen) {
1515             trie->minlen=chars;
1516         } else if (chars > trie->maxlen) {
1517             trie->maxlen=chars;
1518         }
1519
1520     } /* end first pass */
1521     DEBUG_TRIE_COMPILE_r(
1522         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1523                 (int)depth * 2 + 2,"",
1524                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1525                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1526                 (int)trie->minlen, (int)trie->maxlen )
1527     );
1528
1529     /*
1530         We now know what we are dealing with in terms of unique chars and
1531         string sizes so we can calculate how much memory a naive
1532         representation using a flat table  will take. If it's over a reasonable
1533         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1534         conservative but potentially much slower representation using an array
1535         of lists.
1536
1537         At the end we convert both representations into the same compressed
1538         form that will be used in regexec.c for matching with. The latter
1539         is a form that cannot be used to construct with but has memory
1540         properties similar to the list form and access properties similar
1541         to the table form making it both suitable for fast searches and
1542         small enough that its feasable to store for the duration of a program.
1543
1544         See the comment in the code where the compressed table is produced
1545         inplace from the flat tabe representation for an explanation of how
1546         the compression works.
1547
1548     */
1549
1550
1551     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1552     prev_states[1] = 0;
1553
1554     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1555         /*
1556             Second Pass -- Array Of Lists Representation
1557
1558             Each state will be represented by a list of charid:state records
1559             (reg_trie_trans_le) the first such element holds the CUR and LEN
1560             points of the allocated array. (See defines above).
1561
1562             We build the initial structure using the lists, and then convert
1563             it into the compressed table form which allows faster lookups
1564             (but cant be modified once converted).
1565         */
1566
1567         STRLEN transcount = 1;
1568
1569         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1570             "%*sCompiling trie using list compiler\n",
1571             (int)depth * 2 + 2, ""));
1572
1573         trie->states = (reg_trie_state *)
1574             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1575                                   sizeof(reg_trie_state) );
1576         TRIE_LIST_NEW(1);
1577         next_alloc = 2;
1578
1579         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1580
1581             regnode * const noper = NEXTOPER( cur );
1582             U8 *uc           = (U8*)STRING( noper );
1583             const U8 * const e = uc + STR_LEN( noper );
1584             U32 state        = 1;         /* required init */
1585             U16 charid       = 0;         /* sanity init */
1586             U8 *scan         = (U8*)NULL; /* sanity init */
1587             STRLEN foldlen   = 0;         /* required init */
1588             U32 wordlen      = 0;         /* required init */
1589             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1590
1591             if (OP(noper) != NOTHING) {
1592                 for ( ; uc < e ; uc += len ) {
1593
1594                     TRIE_READ_CHAR;
1595
1596                     if ( uvc < 256 ) {
1597                         charid = trie->charmap[ uvc ];
1598                     } else {
1599                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1600                         if ( !svpp ) {
1601                             charid = 0;
1602                         } else {
1603                             charid=(U16)SvIV( *svpp );
1604                         }
1605                     }
1606                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1607                     if ( charid ) {
1608
1609                         U16 check;
1610                         U32 newstate = 0;
1611
1612                         charid--;
1613                         if ( !trie->states[ state ].trans.list ) {
1614                             TRIE_LIST_NEW( state );
1615                         }
1616                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1617                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1618                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1619                                 break;
1620                             }
1621                         }
1622                         if ( ! newstate ) {
1623                             newstate = next_alloc++;
1624                             prev_states[newstate] = state;
1625                             TRIE_LIST_PUSH( state, charid, newstate );
1626                             transcount++;
1627                         }
1628                         state = newstate;
1629                     } else {
1630                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1631                     }
1632                 }
1633             }
1634             TRIE_HANDLE_WORD(state);
1635
1636         } /* end second pass */
1637
1638         /* next alloc is the NEXT state to be allocated */
1639         trie->statecount = next_alloc;
1640         trie->states = (reg_trie_state *)
1641             PerlMemShared_realloc( trie->states,
1642                                    next_alloc
1643                                    * sizeof(reg_trie_state) );
1644
1645         /* and now dump it out before we compress it */
1646         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1647                                                          revcharmap, next_alloc,
1648                                                          depth+1)
1649         );
1650
1651         trie->trans = (reg_trie_trans *)
1652             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1653         {
1654             U32 state;
1655             U32 tp = 0;
1656             U32 zp = 0;
1657
1658
1659             for( state=1 ; state < next_alloc ; state ++ ) {
1660                 U32 base=0;
1661
1662                 /*
1663                 DEBUG_TRIE_COMPILE_MORE_r(
1664                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1665                 );
1666                 */
1667
1668                 if (trie->states[state].trans.list) {
1669                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1670                     U16 maxid=minid;
1671                     U16 idx;
1672
1673                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1674                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1675                         if ( forid < minid ) {
1676                             minid=forid;
1677                         } else if ( forid > maxid ) {
1678                             maxid=forid;
1679                         }
1680                     }
1681                     if ( transcount < tp + maxid - minid + 1) {
1682                         transcount *= 2;
1683                         trie->trans = (reg_trie_trans *)
1684                             PerlMemShared_realloc( trie->trans,
1685                                                      transcount
1686                                                      * sizeof(reg_trie_trans) );
1687                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1688                     }
1689                     base = trie->uniquecharcount + tp - minid;
1690                     if ( maxid == minid ) {
1691                         U32 set = 0;
1692                         for ( ; zp < tp ; zp++ ) {
1693                             if ( ! trie->trans[ zp ].next ) {
1694                                 base = trie->uniquecharcount + zp - minid;
1695                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1696                                 trie->trans[ zp ].check = state;
1697                                 set = 1;
1698                                 break;
1699                             }
1700                         }
1701                         if ( !set ) {
1702                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1703                             trie->trans[ tp ].check = state;
1704                             tp++;
1705                             zp = tp;
1706                         }
1707                     } else {
1708                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1709                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1710                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1711                             trie->trans[ tid ].check = state;
1712                         }
1713                         tp += ( maxid - minid + 1 );
1714                     }
1715                     Safefree(trie->states[ state ].trans.list);
1716                 }
1717                 /*
1718                 DEBUG_TRIE_COMPILE_MORE_r(
1719                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1720                 );
1721                 */
1722                 trie->states[ state ].trans.base=base;
1723             }
1724             trie->lasttrans = tp + 1;
1725         }
1726     } else {
1727         /*
1728            Second Pass -- Flat Table Representation.
1729
1730            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1731            We know that we will need Charcount+1 trans at most to store the data
1732            (one row per char at worst case) So we preallocate both structures
1733            assuming worst case.
1734
1735            We then construct the trie using only the .next slots of the entry
1736            structs.
1737
1738            We use the .check field of the first entry of the node temporarily to
1739            make compression both faster and easier by keeping track of how many non
1740            zero fields are in the node.
1741
1742            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1743            transition.
1744
1745            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1746            number representing the first entry of the node, and state as a
1747            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1748            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1749            are 2 entrys per node. eg:
1750
1751              A B       A B
1752           1. 2 4    1. 3 7
1753           2. 0 3    3. 0 5
1754           3. 0 0    5. 0 0
1755           4. 0 0    7. 0 0
1756
1757            The table is internally in the right hand, idx form. However as we also
1758            have to deal with the states array which is indexed by nodenum we have to
1759            use TRIE_NODENUM() to convert.
1760
1761         */
1762         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1763             "%*sCompiling trie using table compiler\n",
1764             (int)depth * 2 + 2, ""));
1765
1766         trie->trans = (reg_trie_trans *)
1767             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1768                                   * trie->uniquecharcount + 1,
1769                                   sizeof(reg_trie_trans) );
1770         trie->states = (reg_trie_state *)
1771             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1772                                   sizeof(reg_trie_state) );
1773         next_alloc = trie->uniquecharcount + 1;
1774
1775
1776         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1777
1778             regnode * const noper   = NEXTOPER( cur );
1779             const U8 *uc     = (U8*)STRING( noper );
1780             const U8 * const e = uc + STR_LEN( noper );
1781
1782             U32 state        = 1;         /* required init */
1783
1784             U16 charid       = 0;         /* sanity init */
1785             U32 accept_state = 0;         /* sanity init */
1786             U8 *scan         = (U8*)NULL; /* sanity init */
1787
1788             STRLEN foldlen   = 0;         /* required init */
1789             U32 wordlen      = 0;         /* required init */
1790             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1791
1792             if ( OP(noper) != NOTHING ) {
1793                 for ( ; uc < e ; uc += len ) {
1794
1795                     TRIE_READ_CHAR;
1796
1797                     if ( uvc < 256 ) {
1798                         charid = trie->charmap[ uvc ];
1799                     } else {
1800                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1801                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1802                     }
1803                     if ( charid ) {
1804                         charid--;
1805                         if ( !trie->trans[ state + charid ].next ) {
1806                             trie->trans[ state + charid ].next = next_alloc;
1807                             trie->trans[ state ].check++;
1808                             prev_states[TRIE_NODENUM(next_alloc)]
1809                                     = TRIE_NODENUM(state);
1810                             next_alloc += trie->uniquecharcount;
1811                         }
1812                         state = trie->trans[ state + charid ].next;
1813                     } else {
1814                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1815                     }
1816                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1817                 }
1818             }
1819             accept_state = TRIE_NODENUM( state );
1820             TRIE_HANDLE_WORD(accept_state);
1821
1822         } /* end second pass */
1823
1824         /* and now dump it out before we compress it */
1825         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1826                                                           revcharmap,
1827                                                           next_alloc, depth+1));
1828
1829         {
1830         /*
1831            * Inplace compress the table.*
1832
1833            For sparse data sets the table constructed by the trie algorithm will
1834            be mostly 0/FAIL transitions or to put it another way mostly empty.
1835            (Note that leaf nodes will not contain any transitions.)
1836
1837            This algorithm compresses the tables by eliminating most such
1838            transitions, at the cost of a modest bit of extra work during lookup:
1839
1840            - Each states[] entry contains a .base field which indicates the
1841            index in the state[] array wheres its transition data is stored.
1842
1843            - If .base is 0 there are no valid transitions from that node.
1844
1845            - If .base is nonzero then charid is added to it to find an entry in
1846            the trans array.
1847
1848            -If trans[states[state].base+charid].check!=state then the
1849            transition is taken to be a 0/Fail transition. Thus if there are fail
1850            transitions at the front of the node then the .base offset will point
1851            somewhere inside the previous nodes data (or maybe even into a node
1852            even earlier), but the .check field determines if the transition is
1853            valid.
1854
1855            XXX - wrong maybe?
1856            The following process inplace converts the table to the compressed
1857            table: We first do not compress the root node 1,and mark all its
1858            .check pointers as 1 and set its .base pointer as 1 as well. This
1859            allows us to do a DFA construction from the compressed table later,
1860            and ensures that any .base pointers we calculate later are greater
1861            than 0.
1862
1863            - We set 'pos' to indicate the first entry of the second node.
1864
1865            - We then iterate over the columns of the node, finding the first and
1866            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1867            and set the .check pointers accordingly, and advance pos
1868            appropriately and repreat for the next node. Note that when we copy
1869            the next pointers we have to convert them from the original
1870            NODEIDX form to NODENUM form as the former is not valid post
1871            compression.
1872
1873            - If a node has no transitions used we mark its base as 0 and do not
1874            advance the pos pointer.
1875
1876            - If a node only has one transition we use a second pointer into the
1877            structure to fill in allocated fail transitions from other states.
1878            This pointer is independent of the main pointer and scans forward
1879            looking for null transitions that are allocated to a state. When it
1880            finds one it writes the single transition into the "hole".  If the
1881            pointer doesnt find one the single transition is appended as normal.
1882
1883            - Once compressed we can Renew/realloc the structures to release the
1884            excess space.
1885
1886            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1887            specifically Fig 3.47 and the associated pseudocode.
1888
1889            demq
1890         */
1891         const U32 laststate = TRIE_NODENUM( next_alloc );
1892         U32 state, charid;
1893         U32 pos = 0, zp=0;
1894         trie->statecount = laststate;
1895
1896         for ( state = 1 ; state < laststate ; state++ ) {
1897             U8 flag = 0;
1898             const U32 stateidx = TRIE_NODEIDX( state );
1899             const U32 o_used = trie->trans[ stateidx ].check;
1900             U32 used = trie->trans[ stateidx ].check;
1901             trie->trans[ stateidx ].check = 0;
1902
1903             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1904                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1905                     if ( trie->trans[ stateidx + charid ].next ) {
1906                         if (o_used == 1) {
1907                             for ( ; zp < pos ; zp++ ) {
1908                                 if ( ! trie->trans[ zp ].next ) {
1909                                     break;
1910                                 }
1911                             }
1912                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1913                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1914                             trie->trans[ zp ].check = state;
1915                             if ( ++zp > pos ) pos = zp;
1916                             break;
1917                         }
1918                         used--;
1919                     }
1920                     if ( !flag ) {
1921                         flag = 1;
1922                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1923                     }
1924                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1925                     trie->trans[ pos ].check = state;
1926                     pos++;
1927                 }
1928             }
1929         }
1930         trie->lasttrans = pos + 1;
1931         trie->states = (reg_trie_state *)
1932             PerlMemShared_realloc( trie->states, laststate
1933                                    * sizeof(reg_trie_state) );
1934         DEBUG_TRIE_COMPILE_MORE_r(
1935                 PerlIO_printf( Perl_debug_log,
1936                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1937                     (int)depth * 2 + 2,"",
1938                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1939                     (IV)next_alloc,
1940                     (IV)pos,
1941                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1942             );
1943
1944         } /* end table compress */
1945     }
1946     DEBUG_TRIE_COMPILE_MORE_r(
1947             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1948                 (int)depth * 2 + 2, "",
1949                 (UV)trie->statecount,
1950                 (UV)trie->lasttrans)
1951     );
1952     /* resize the trans array to remove unused space */
1953     trie->trans = (reg_trie_trans *)
1954         PerlMemShared_realloc( trie->trans, trie->lasttrans
1955                                * sizeof(reg_trie_trans) );
1956
1957     {   /* Modify the program and insert the new TRIE node */
1958         U8 nodetype =(U8)(flags & 0xFF);
1959         char *str=NULL;
1960
1961 #ifdef DEBUGGING
1962         regnode *optimize = NULL;
1963 #ifdef RE_TRACK_PATTERN_OFFSETS
1964
1965         U32 mjd_offset = 0;
1966         U32 mjd_nodelen = 0;
1967 #endif /* RE_TRACK_PATTERN_OFFSETS */
1968 #endif /* DEBUGGING */
1969         /*
1970            This means we convert either the first branch or the first Exact,
1971            depending on whether the thing following (in 'last') is a branch
1972            or not and whther first is the startbranch (ie is it a sub part of
1973            the alternation or is it the whole thing.)
1974            Assuming its a sub part we convert the EXACT otherwise we convert
1975            the whole branch sequence, including the first.
1976          */
1977         /* Find the node we are going to overwrite */
1978         if ( first != startbranch || OP( last ) == BRANCH ) {
1979             /* branch sub-chain */
1980             NEXT_OFF( first ) = (U16)(last - first);
1981 #ifdef RE_TRACK_PATTERN_OFFSETS
1982             DEBUG_r({
1983                 mjd_offset= Node_Offset((convert));
1984                 mjd_nodelen= Node_Length((convert));
1985             });
1986 #endif
1987             /* whole branch chain */
1988         }
1989 #ifdef RE_TRACK_PATTERN_OFFSETS
1990         else {
1991             DEBUG_r({
1992                 const  regnode *nop = NEXTOPER( convert );
1993                 mjd_offset= Node_Offset((nop));
1994                 mjd_nodelen= Node_Length((nop));
1995             });
1996         }
1997         DEBUG_OPTIMISE_r(
1998             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1999                 (int)depth * 2 + 2, "",
2000                 (UV)mjd_offset, (UV)mjd_nodelen)
2001         );
2002 #endif
2003         /* But first we check to see if there is a common prefix we can
2004            split out as an EXACT and put in front of the TRIE node.  */
2005         trie->startstate= 1;
2006         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2007             U32 state;
2008             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2009                 U32 ofs = 0;
2010                 I32 idx = -1;
2011                 U32 count = 0;
2012                 const U32 base = trie->states[ state ].trans.base;
2013
2014                 if ( trie->states[state].wordnum )
2015                         count = 1;
2016
2017                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2018                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2019                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2020                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2021                     {
2022                         if ( ++count > 1 ) {
2023                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2024                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2025                             if ( state == 1 ) break;
2026                             if ( count == 2 ) {
2027                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2028                                 DEBUG_OPTIMISE_r(
2029                                     PerlIO_printf(Perl_debug_log,
2030                                         "%*sNew Start State=%"UVuf" Class: [",
2031                                         (int)depth * 2 + 2, "",
2032                                         (UV)state));
2033                                 if (idx >= 0) {
2034                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2035                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2036
2037                                     TRIE_BITMAP_SET(trie,*ch);
2038                                     if ( folder )
2039                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2040                                     DEBUG_OPTIMISE_r(
2041                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2042                                     );
2043                                 }
2044                             }
2045                             TRIE_BITMAP_SET(trie,*ch);
2046                             if ( folder )
2047                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2048                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2049                         }
2050                         idx = ofs;
2051                     }
2052                 }
2053                 if ( count == 1 ) {
2054                     SV **tmp = av_fetch( revcharmap, idx, 0);
2055                     STRLEN len;
2056                     char *ch = SvPV( *tmp, len );
2057                     DEBUG_OPTIMISE_r({
2058                         SV *sv=sv_newmortal();
2059                         PerlIO_printf( Perl_debug_log,
2060                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2061                             (int)depth * 2 + 2, "",
2062                             (UV)state, (UV)idx,
2063                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2064                                 PL_colors[0], PL_colors[1],
2065                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2066                                 PERL_PV_ESCAPE_FIRSTCHAR
2067                             )
2068                         );
2069                     });
2070                     if ( state==1 ) {
2071                         OP( convert ) = nodetype;
2072                         str=STRING(convert);
2073                         STR_LEN(convert)=0;
2074                     }
2075                     STR_LEN(convert) += len;
2076                     while (len--)
2077                         *str++ = *ch++;
2078                 } else {
2079 #ifdef DEBUGGING
2080                     if (state>1)
2081                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2082 #endif
2083                     break;
2084                 }
2085             }
2086             trie->prefixlen = (state-1);
2087             if (str) {
2088                 regnode *n = convert+NODE_SZ_STR(convert);
2089                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2090                 trie->startstate = state;
2091                 trie->minlen -= (state - 1);
2092                 trie->maxlen -= (state - 1);
2093 #ifdef DEBUGGING
2094                /* At least the UNICOS C compiler choked on this
2095                 * being argument to DEBUG_r(), so let's just have
2096                 * it right here. */
2097                if (
2098 #ifdef PERL_EXT_RE_BUILD
2099                    1
2100 #else
2101                    DEBUG_r_TEST
2102 #endif
2103                    ) {
2104                    regnode *fix = convert;
2105                    U32 word = trie->wordcount;
2106                    mjd_nodelen++;
2107                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2108                    while( ++fix < n ) {
2109                        Set_Node_Offset_Length(fix, 0, 0);
2110                    }
2111                    while (word--) {
2112                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2113                        if (tmp) {
2114                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2115                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2116                            else
2117                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2118                        }
2119                    }
2120                }
2121 #endif
2122                 if (trie->maxlen) {
2123                     convert = n;
2124                 } else {
2125                     NEXT_OFF(convert) = (U16)(tail - convert);
2126                     DEBUG_r(optimize= n);
2127                 }
2128             }
2129         }
2130         if (!jumper)
2131             jumper = last;
2132         if ( trie->maxlen ) {
2133             NEXT_OFF( convert ) = (U16)(tail - convert);
2134             ARG_SET( convert, data_slot );
2135             /* Store the offset to the first unabsorbed branch in
2136                jump[0], which is otherwise unused by the jump logic.
2137                We use this when dumping a trie and during optimisation. */
2138             if (trie->jump)
2139                 trie->jump[0] = (U16)(nextbranch - convert);
2140
2141             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2142              *   and there is a bitmap
2143              *   and the first "jump target" node we found leaves enough room
2144              * then convert the TRIE node into a TRIEC node, with the bitmap
2145              * embedded inline in the opcode - this is hypothetically faster.
2146              */
2147             if ( !trie->states[trie->startstate].wordnum
2148                  && trie->bitmap
2149                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2150             {
2151                 OP( convert ) = TRIEC;
2152                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2153                 PerlMemShared_free(trie->bitmap);
2154                 trie->bitmap= NULL;
2155             } else
2156                 OP( convert ) = TRIE;
2157
2158             /* store the type in the flags */
2159             convert->flags = nodetype;
2160             DEBUG_r({
2161             optimize = convert
2162                       + NODE_STEP_REGNODE
2163                       + regarglen[ OP( convert ) ];
2164             });
2165             /* XXX We really should free up the resource in trie now,
2166                    as we won't use them - (which resources?) dmq */
2167         }
2168         /* needed for dumping*/
2169         DEBUG_r(if (optimize) {
2170             regnode *opt = convert;
2171
2172             while ( ++opt < optimize) {
2173                 Set_Node_Offset_Length(opt,0,0);
2174             }
2175             /*
2176                 Try to clean up some of the debris left after the
2177                 optimisation.
2178              */
2179             while( optimize < jumper ) {
2180                 mjd_nodelen += Node_Length((optimize));
2181                 OP( optimize ) = OPTIMIZED;
2182                 Set_Node_Offset_Length(optimize,0,0);
2183                 optimize++;
2184             }
2185             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2186         });
2187     } /* end node insert */
2188
2189     /*  Finish populating the prev field of the wordinfo array.  Walk back
2190      *  from each accept state until we find another accept state, and if
2191      *  so, point the first word's .prev field at the second word. If the
2192      *  second already has a .prev field set, stop now. This will be the
2193      *  case either if we've already processed that word's accept state,
2194      *  or that state had multiple words, and the overspill words were
2195      *  already linked up earlier.
2196      */
2197     {
2198         U16 word;
2199         U32 state;
2200         U16 prev;
2201
2202         for (word=1; word <= trie->wordcount; word++) {
2203             prev = 0;
2204             if (trie->wordinfo[word].prev)
2205                 continue;
2206             state = trie->wordinfo[word].accept;
2207             while (state) {
2208                 state = prev_states[state];
2209                 if (!state)
2210                     break;
2211                 prev = trie->states[state].wordnum;
2212                 if (prev)
2213                     break;
2214             }
2215             trie->wordinfo[word].prev = prev;
2216         }
2217         Safefree(prev_states);
2218     }
2219
2220
2221     /* and now dump out the compressed format */
2222     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2223
2224     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2225 #ifdef DEBUGGING
2226     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2227     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2228 #else
2229     SvREFCNT_dec(revcharmap);
2230 #endif
2231     return trie->jump
2232            ? MADE_JUMP_TRIE
2233            : trie->startstate>1
2234              ? MADE_EXACT_TRIE
2235              : MADE_TRIE;
2236 }
2237
2238 STATIC void
2239 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2240 {
2241 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2242
2243    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2244    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2245    ISBN 0-201-10088-6
2246
2247    We find the fail state for each state in the trie, this state is the longest proper
2248    suffix of the current state's 'word' that is also a proper prefix of another word in our
2249    trie. State 1 represents the word '' and is thus the default fail state. This allows
2250    the DFA not to have to restart after its tried and failed a word at a given point, it
2251    simply continues as though it had been matching the other word in the first place.
2252    Consider
2253       'abcdgu'=~/abcdefg|cdgu/
2254    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2255    fail, which would bring us to the state representing 'd' in the second word where we would
2256    try 'g' and succeed, proceeding to match 'cdgu'.
2257  */
2258  /* add a fail transition */
2259     const U32 trie_offset = ARG(source);
2260     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2261     U32 *q;
2262     const U32 ucharcount = trie->uniquecharcount;
2263     const U32 numstates = trie->statecount;
2264     const U32 ubound = trie->lasttrans + ucharcount;
2265     U32 q_read = 0;
2266     U32 q_write = 0;
2267     U32 charid;
2268     U32 base = trie->states[ 1 ].trans.base;
2269     U32 *fail;
2270     reg_ac_data *aho;
2271     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2272     GET_RE_DEBUG_FLAGS_DECL;
2273
2274     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2275 #ifndef DEBUGGING
2276     PERL_UNUSED_ARG(depth);
2277 #endif
2278
2279
2280     ARG_SET( stclass, data_slot );
2281     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2282     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2283     aho->trie=trie_offset;
2284     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2285     Copy( trie->states, aho->states, numstates, reg_trie_state );
2286     Newxz( q, numstates, U32);
2287     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2288     aho->refcount = 1;
2289     fail = aho->fail;
2290     /* initialize fail[0..1] to be 1 so that we always have
2291        a valid final fail state */
2292     fail[ 0 ] = fail[ 1 ] = 1;
2293
2294     for ( charid = 0; charid < ucharcount ; charid++ ) {
2295         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2296         if ( newstate ) {
2297             q[ q_write ] = newstate;
2298             /* set to point at the root */
2299             fail[ q[ q_write++ ] ]=1;
2300         }
2301     }
2302     while ( q_read < q_write) {
2303         const U32 cur = q[ q_read++ % numstates ];
2304         base = trie->states[ cur ].trans.base;
2305
2306         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2307             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2308             if (ch_state) {
2309                 U32 fail_state = cur;
2310                 U32 fail_base;
2311                 do {
2312                     fail_state = fail[ fail_state ];
2313                     fail_base = aho->states[ fail_state ].trans.base;
2314                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2315
2316                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2317                 fail[ ch_state ] = fail_state;
2318                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2319                 {
2320                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2321                 }
2322                 q[ q_write++ % numstates] = ch_state;
2323             }
2324         }
2325     }
2326     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2327        when we fail in state 1, this allows us to use the
2328        charclass scan to find a valid start char. This is based on the principle
2329        that theres a good chance the string being searched contains lots of stuff
2330        that cant be a start char.
2331      */
2332     fail[ 0 ] = fail[ 1 ] = 0;
2333     DEBUG_TRIE_COMPILE_r({
2334         PerlIO_printf(Perl_debug_log,
2335                       "%*sStclass Failtable (%"UVuf" states): 0",
2336                       (int)(depth * 2), "", (UV)numstates
2337         );
2338         for( q_read=1; q_read<numstates; q_read++ ) {
2339             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2340         }
2341         PerlIO_printf(Perl_debug_log, "\n");
2342     });
2343     Safefree(q);
2344     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2345 }
2346
2347
2348 /*
2349  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2350  * These need to be revisited when a newer toolchain becomes available.
2351  */
2352 #if defined(__sparc64__) && defined(__GNUC__)
2353 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2354 #       undef  SPARC64_GCC_WORKAROUND
2355 #       define SPARC64_GCC_WORKAROUND 1
2356 #   endif
2357 #endif
2358
2359 #define DEBUG_PEEP(str,scan,depth) \
2360     DEBUG_OPTIMISE_r({if (scan){ \
2361        SV * const mysv=sv_newmortal(); \
2362        regnode *Next = regnext(scan); \
2363        regprop(RExC_rx, mysv, scan); \
2364        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2365        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2366        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2367    }});
2368
2369
2370
2371
2372
2373 #define JOIN_EXACT(scan,min,flags) \
2374     if (PL_regkind[OP(scan)] == EXACT) \
2375         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2376
2377 STATIC U32
2378 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2379     /* Merge several consecutive EXACTish nodes into one. */
2380     regnode *n = regnext(scan);
2381     U32 stringok = 1;
2382     regnode *next = scan + NODE_SZ_STR(scan);
2383     U32 merged = 0;
2384     U32 stopnow = 0;
2385 #ifdef DEBUGGING
2386     regnode *stop = scan;
2387     GET_RE_DEBUG_FLAGS_DECL;
2388 #else
2389     PERL_UNUSED_ARG(depth);
2390 #endif
2391
2392     PERL_ARGS_ASSERT_JOIN_EXACT;
2393 #ifndef EXPERIMENTAL_INPLACESCAN
2394     PERL_UNUSED_ARG(flags);
2395     PERL_UNUSED_ARG(val);
2396 #endif
2397     DEBUG_PEEP("join",scan,depth);
2398
2399     /* Skip NOTHING, merge EXACT*. */
2400     while (n &&
2401            ( PL_regkind[OP(n)] == NOTHING ||
2402              (stringok && (OP(n) == OP(scan))))
2403            && NEXT_OFF(n)
2404            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2405
2406         if (OP(n) == TAIL || n > next)
2407             stringok = 0;
2408         if (PL_regkind[OP(n)] == NOTHING) {
2409             DEBUG_PEEP("skip:",n,depth);
2410             NEXT_OFF(scan) += NEXT_OFF(n);
2411             next = n + NODE_STEP_REGNODE;
2412 #ifdef DEBUGGING
2413             if (stringok)
2414                 stop = n;
2415 #endif
2416             n = regnext(n);
2417         }
2418         else if (stringok) {
2419             const unsigned int oldl = STR_LEN(scan);
2420             regnode * const nnext = regnext(n);
2421
2422             DEBUG_PEEP("merg",n,depth);
2423
2424             merged++;
2425             if (oldl + STR_LEN(n) > U8_MAX)
2426                 break;
2427             NEXT_OFF(scan) += NEXT_OFF(n);
2428             STR_LEN(scan) += STR_LEN(n);
2429             next = n + NODE_SZ_STR(n);
2430             /* Now we can overwrite *n : */
2431             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2432 #ifdef DEBUGGING
2433             stop = next - 1;
2434 #endif
2435             n = nnext;
2436             if (stopnow) break;
2437         }
2438
2439 #ifdef EXPERIMENTAL_INPLACESCAN
2440         if (flags && !NEXT_OFF(n)) {
2441             DEBUG_PEEP("atch", val, depth);
2442             if (reg_off_by_arg[OP(n)]) {
2443                 ARG_SET(n, val - n);
2444             }
2445             else {
2446                 NEXT_OFF(n) = val - n;
2447             }
2448             stopnow = 1;
2449         }
2450 #endif
2451     }
2452 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2453 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2454 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2455 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2456
2457     if (UTF
2458         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2459         && ( STR_LEN(scan) >= 6 ) )
2460     {
2461     /*
2462     Two problematic code points in Unicode casefolding of EXACT nodes:
2463
2464     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2465     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2466
2467     which casefold to
2468
2469     Unicode                      UTF-8
2470
2471     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2472     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2473
2474     This means that in case-insensitive matching (or "loose matching",
2475     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2476     length of the above casefolded versions) can match a target string
2477     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2478     This would rather mess up the minimum length computation.
2479
2480     What we'll do is to look for the tail four bytes, and then peek
2481     at the preceding two bytes to see whether we need to decrease
2482     the minimum length by four (six minus two).
2483
2484     Thanks to the design of UTF-8, there cannot be false matches:
2485     A sequence of valid UTF-8 bytes cannot be a subsequence of
2486     another valid sequence of UTF-8 bytes.
2487
2488     */
2489          char * const s0 = STRING(scan), *s, *t;
2490          char * const s1 = s0 + STR_LEN(scan) - 1;
2491          char * const s2 = s1 - 4;
2492 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2493          const char t0[] = "\xaf\x49\xaf\x42";
2494 #else
2495          const char t0[] = "\xcc\x88\xcc\x81";
2496 #endif
2497          const char * const t1 = t0 + 3;
2498
2499          for (s = s0 + 2;
2500               s < s2 && (t = ninstr(s, s1, t0, t1));
2501               s = t + 4) {
2502 #ifdef EBCDIC
2503               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2504                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2505 #else
2506               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2507                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2508 #endif
2509                    *min -= 4;
2510          }
2511     }
2512
2513 #ifdef DEBUGGING
2514     /* Allow dumping */
2515     n = scan + NODE_SZ_STR(scan);
2516     while (n <= stop) {
2517         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2518             OP(n) = OPTIMIZED;
2519             NEXT_OFF(n) = 0;
2520         }
2521         n++;
2522     }
2523 #endif
2524     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2525     return stopnow;
2526 }
2527
2528 /* REx optimizer.  Converts nodes into quickier variants "in place".
2529    Finds fixed substrings.  */
2530
2531 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2532    to the position after last scanned or to NULL. */
2533
2534 #define INIT_AND_WITHP \
2535     assert(!and_withp); \
2536     Newx(and_withp,1,struct regnode_charclass_class); \
2537     SAVEFREEPV(and_withp)
2538
2539 /* this is a chain of data about sub patterns we are processing that
2540    need to be handled seperately/specially in study_chunk. Its so
2541    we can simulate recursion without losing state.  */
2542 struct scan_frame;
2543 typedef struct scan_frame {
2544     regnode *last;  /* last node to process in this frame */
2545     regnode *next;  /* next node to process when last is reached */
2546     struct scan_frame *prev; /*previous frame*/
2547     I32 stop; /* what stopparen do we use */
2548 } scan_frame;
2549
2550
2551 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2552
2553 #define CASE_SYNST_FNC(nAmE)                                       \
2554 case nAmE:                                                         \
2555     if (flags & SCF_DO_STCLASS_AND) {                              \
2556             for (value = 0; value < 256; value++)                  \
2557                 if (!is_ ## nAmE ## _cp(value))                       \
2558                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2559     }                                                              \
2560     else {                                                         \
2561             for (value = 0; value < 256; value++)                  \
2562                 if (is_ ## nAmE ## _cp(value))                        \
2563                     ANYOF_BITMAP_SET(data->start_class, value);    \
2564     }                                                              \
2565     break;                                                         \
2566 case N ## nAmE:                                                    \
2567     if (flags & SCF_DO_STCLASS_AND) {                              \
2568             for (value = 0; value < 256; value++)                   \
2569                 if (is_ ## nAmE ## _cp(value))                         \
2570                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2571     }                                                               \
2572     else {                                                          \
2573             for (value = 0; value < 256; value++)                   \
2574                 if (!is_ ## nAmE ## _cp(value))                        \
2575                     ANYOF_BITMAP_SET(data->start_class, value);     \
2576     }                                                               \
2577     break
2578
2579
2580
2581 STATIC I32
2582 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2583                         I32 *minlenp, I32 *deltap,
2584                         regnode *last,
2585                         scan_data_t *data,
2586                         I32 stopparen,
2587                         U8* recursed,
2588                         struct regnode_charclass_class *and_withp,
2589                         U32 flags, U32 depth)
2590                         /* scanp: Start here (read-write). */
2591                         /* deltap: Write maxlen-minlen here. */
2592                         /* last: Stop before this one. */
2593                         /* data: string data about the pattern */
2594                         /* stopparen: treat close N as END */
2595                         /* recursed: which subroutines have we recursed into */
2596                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2597 {
2598     dVAR;
2599     I32 min = 0, pars = 0, code;
2600     regnode *scan = *scanp, *next;
2601     I32 delta = 0;
2602     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2603     int is_inf_internal = 0;            /* The studied chunk is infinite */
2604     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2605     scan_data_t data_fake;
2606     SV *re_trie_maxbuff = NULL;
2607     regnode *first_non_open = scan;
2608     I32 stopmin = I32_MAX;
2609     scan_frame *frame = NULL;
2610     GET_RE_DEBUG_FLAGS_DECL;
2611
2612     PERL_ARGS_ASSERT_STUDY_CHUNK;
2613
2614 #ifdef DEBUGGING
2615     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2616 #endif
2617
2618     if ( depth == 0 ) {
2619         while (first_non_open && OP(first_non_open) == OPEN)
2620             first_non_open=regnext(first_non_open);
2621     }
2622
2623
2624   fake_study_recurse:
2625     while ( scan && OP(scan) != END && scan < last ){
2626         /* Peephole optimizer: */
2627         DEBUG_STUDYDATA("Peep:", data,depth);
2628         DEBUG_PEEP("Peep",scan,depth);
2629         JOIN_EXACT(scan,&min,0);
2630
2631         /* Follow the next-chain of the current node and optimize
2632            away all the NOTHINGs from it.  */
2633         if (OP(scan) != CURLYX) {
2634             const int max = (reg_off_by_arg[OP(scan)]
2635                        ? I32_MAX
2636                        /* I32 may be smaller than U16 on CRAYs! */
2637                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2638             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2639             int noff;
2640             regnode *n = scan;
2641
2642             /* Skip NOTHING and LONGJMP. */
2643             while ((n = regnext(n))
2644                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2645                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2646                    && off + noff < max)
2647                 off += noff;
2648             if (reg_off_by_arg[OP(scan)])
2649                 ARG(scan) = off;
2650             else
2651                 NEXT_OFF(scan) = off;
2652         }
2653
2654
2655
2656         /* The principal pseudo-switch.  Cannot be a switch, since we
2657            look into several different things.  */
2658         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2659                    || OP(scan) == IFTHEN) {
2660             next = regnext(scan);
2661             code = OP(scan);
2662             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2663
2664             if (OP(next) == code || code == IFTHEN) {
2665                 /* NOTE - There is similar code to this block below for handling
2666                    TRIE nodes on a re-study.  If you change stuff here check there
2667                    too. */
2668                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2669                 struct regnode_charclass_class accum;
2670                 regnode * const startbranch=scan;
2671
2672                 if (flags & SCF_DO_SUBSTR)
2673                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2674                 if (flags & SCF_DO_STCLASS)
2675                     cl_init_zero(pRExC_state, &accum);
2676
2677                 while (OP(scan) == code) {
2678                     I32 deltanext, minnext, f = 0, fake;
2679                     struct regnode_charclass_class this_class;
2680
2681                     num++;
2682                     data_fake.flags = 0;
2683                     if (data) {
2684                         data_fake.whilem_c = data->whilem_c;
2685                         data_fake.last_closep = data->last_closep;
2686                     }
2687                     else
2688                         data_fake.last_closep = &fake;
2689
2690                     data_fake.pos_delta = delta;
2691                     next = regnext(scan);
2692                     scan = NEXTOPER(scan);
2693                     if (code != BRANCH)
2694                         scan = NEXTOPER(scan);
2695                     if (flags & SCF_DO_STCLASS) {
2696                         cl_init(pRExC_state, &this_class);
2697                         data_fake.start_class = &this_class;
2698                         f = SCF_DO_STCLASS_AND;
2699                     }
2700                     if (flags & SCF_WHILEM_VISITED_POS)
2701                         f |= SCF_WHILEM_VISITED_POS;
2702
2703                     /* we suppose the run is continuous, last=next...*/
2704                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2705                                           next, &data_fake,
2706                                           stopparen, recursed, NULL, f,depth+1);
2707                     if (min1 > minnext)
2708                         min1 = minnext;
2709                     if (max1 < minnext + deltanext)
2710                         max1 = minnext + deltanext;
2711                     if (deltanext == I32_MAX)
2712                         is_inf = is_inf_internal = 1;
2713                     scan = next;
2714                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2715                         pars++;
2716                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2717                         if ( stopmin > minnext)
2718                             stopmin = min + min1;
2719                         flags &= ~SCF_DO_SUBSTR;
2720                         if (data)
2721                             data->flags |= SCF_SEEN_ACCEPT;
2722                     }
2723                     if (data) {
2724                         if (data_fake.flags & SF_HAS_EVAL)
2725                             data->flags |= SF_HAS_EVAL;
2726                         data->whilem_c = data_fake.whilem_c;
2727                     }
2728                     if (flags & SCF_DO_STCLASS)
2729                         cl_or(pRExC_state, &accum, &this_class);
2730                 }
2731                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2732                     min1 = 0;
2733                 if (flags & SCF_DO_SUBSTR) {
2734                     data->pos_min += min1;
2735                     data->pos_delta += max1 - min1;
2736                     if (max1 != min1 || is_inf)
2737                         data->longest = &(data->longest_float);
2738                 }
2739                 min += min1;
2740                 delta += max1 - min1;
2741                 if (flags & SCF_DO_STCLASS_OR) {
2742                     cl_or(pRExC_state, data->start_class, &accum);
2743                     if (min1) {
2744                         cl_and(data->start_class, and_withp);
2745                         flags &= ~SCF_DO_STCLASS;
2746                     }
2747                 }
2748                 else if (flags & SCF_DO_STCLASS_AND) {
2749                     if (min1) {
2750                         cl_and(data->start_class, &accum);
2751                         flags &= ~SCF_DO_STCLASS;
2752                     }
2753                     else {
2754                         /* Switch to OR mode: cache the old value of
2755                          * data->start_class */
2756                         INIT_AND_WITHP;
2757                         StructCopy(data->start_class, and_withp,
2758                                    struct regnode_charclass_class);
2759                         flags &= ~SCF_DO_STCLASS_AND;
2760                         StructCopy(&accum, data->start_class,
2761                                    struct regnode_charclass_class);
2762                         flags |= SCF_DO_STCLASS_OR;
2763                         data->start_class->flags |= ANYOF_EOS;
2764                     }
2765                 }
2766
2767                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2768                 /* demq.
2769
2770                    Assuming this was/is a branch we are dealing with: 'scan' now
2771                    points at the item that follows the branch sequence, whatever
2772                    it is. We now start at the beginning of the sequence and look
2773                    for subsequences of
2774
2775                    BRANCH->EXACT=>x1
2776                    BRANCH->EXACT=>x2
2777                    tail
2778
2779                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2780
2781                    If we can find such a subseqence we need to turn the first
2782                    element into a trie and then add the subsequent branch exact
2783                    strings to the trie.
2784
2785                    We have two cases
2786
2787                      1. patterns where the whole set of branches can be converted.
2788
2789                      2. patterns where only a subset can be converted.
2790
2791                    In case 1 we can replace the whole set with a single regop
2792                    for the trie. In case 2 we need to keep the start and end
2793                    branches so
2794
2795                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2796                      becomes BRANCH TRIE; BRANCH X;
2797
2798                   There is an additional case, that being where there is a
2799                   common prefix, which gets split out into an EXACT like node
2800                   preceding the TRIE node.
2801
2802                   If x(1..n)==tail then we can do a simple trie, if not we make
2803                   a "jump" trie, such that when we match the appropriate word
2804                   we "jump" to the appopriate tail node. Essentailly we turn
2805                   a nested if into a case structure of sorts.
2806
2807                 */
2808
2809                     int made=0;
2810                     if (!re_trie_maxbuff) {
2811                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2812                         if (!SvIOK(re_trie_maxbuff))
2813                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2814                     }
2815                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2816                         regnode *cur;
2817                         regnode *first = (regnode *)NULL;
2818                         regnode *last = (regnode *)NULL;
2819                         regnode *tail = scan;
2820                         U8 optype = 0;
2821                         U32 count=0;
2822
2823 #ifdef DEBUGGING
2824                         SV * const mysv = sv_newmortal();       /* for dumping */
2825 #endif
2826                         /* var tail is used because there may be a TAIL
2827                            regop in the way. Ie, the exacts will point to the
2828                            thing following the TAIL, but the last branch will
2829                            point at the TAIL. So we advance tail. If we
2830                            have nested (?:) we may have to move through several
2831                            tails.
2832                          */
2833
2834                         while ( OP( tail ) == TAIL ) {
2835                             /* this is the TAIL generated by (?:) */
2836                             tail = regnext( tail );
2837                         }
2838
2839
2840                         DEBUG_OPTIMISE_r({
2841                             regprop(RExC_rx, mysv, tail );
2842                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2843                                 (int)depth * 2 + 2, "",
2844                                 "Looking for TRIE'able sequences. Tail node is: ",
2845                                 SvPV_nolen_const( mysv )
2846                             );
2847                         });
2848
2849                         /*
2850
2851                            step through the branches, cur represents each
2852                            branch, noper is the first thing to be matched
2853                            as part of that branch and noper_next is the
2854                            regnext() of that node. if noper is an EXACT
2855                            and noper_next is the same as scan (our current
2856                            position in the regex) then the EXACT branch is
2857                            a possible optimization target. Once we have
2858                            two or more consequetive such branches we can
2859                            create a trie of the EXACT's contents and stich
2860                            it in place. If the sequence represents all of
2861                            the branches we eliminate the whole thing and
2862                            replace it with a single TRIE. If it is a
2863                            subsequence then we need to stitch it in. This
2864                            means the first branch has to remain, and needs
2865                            to be repointed at the item on the branch chain
2866                            following the last branch optimized. This could
2867                            be either a BRANCH, in which case the
2868                            subsequence is internal, or it could be the
2869                            item following the branch sequence in which
2870                            case the subsequence is at the end.
2871
2872                         */
2873
2874                         /* dont use tail as the end marker for this traverse */
2875                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2876                             regnode * const noper = NEXTOPER( cur );
2877 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2878                             regnode * const noper_next = regnext( noper );
2879 #endif
2880
2881                             DEBUG_OPTIMISE_r({
2882                                 regprop(RExC_rx, mysv, cur);
2883                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2884                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2885
2886                                 regprop(RExC_rx, mysv, noper);
2887                                 PerlIO_printf( Perl_debug_log, " -> %s",
2888                                     SvPV_nolen_const(mysv));
2889
2890                                 if ( noper_next ) {
2891                                   regprop(RExC_rx, mysv, noper_next );
2892                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2893                                     SvPV_nolen_const(mysv));
2894                                 }
2895                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2896                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2897                             });
2898                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2899                                          : PL_regkind[ OP( noper ) ] == EXACT )
2900                                   || OP(noper) == NOTHING )
2901 #ifdef NOJUMPTRIE
2902                                   && noper_next == tail
2903 #endif
2904                                   && count < U16_MAX)
2905                             {
2906                                 count++;
2907                                 if ( !first || optype == NOTHING ) {
2908                                     if (!first) first = cur;
2909                                     optype = OP( noper );
2910                                 } else {
2911                                     last = cur;
2912                                 }
2913                             } else {
2914 /*
2915     Currently we do not believe that the trie logic can
2916     handle case insensitive matching properly when the
2917     pattern is not unicode (thus forcing unicode semantics).
2918
2919     If/when this is fixed the following define can be swapped
2920     in below to fully enable trie logic.
2921
2922 #define TRIE_TYPE_IS_SAFE 1
2923
2924 */
2925 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2926
2927                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2928                                     make_trie( pRExC_state,
2929                                             startbranch, first, cur, tail, count,
2930                                             optype, depth+1 );
2931                                 }
2932                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2933 #ifdef NOJUMPTRIE
2934                                      && noper_next == tail
2935 #endif
2936                                 ){
2937                                     count = 1;
2938                                     first = cur;
2939                                     optype = OP( noper );
2940                                 } else {
2941                                     count = 0;
2942                                     first = NULL;
2943                                     optype = 0;
2944                                 }
2945                                 last = NULL;
2946                             }
2947                         }
2948                         DEBUG_OPTIMISE_r({
2949                             regprop(RExC_rx, mysv, cur);
2950                             PerlIO_printf( Perl_debug_log,
2951                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2952                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2953
2954                         });
2955
2956                         if ( last && TRIE_TYPE_IS_SAFE ) {
2957                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2958 #ifdef TRIE_STUDY_OPT
2959                             if ( ((made == MADE_EXACT_TRIE &&
2960                                  startbranch == first)
2961                                  || ( first_non_open == first )) &&
2962                                  depth==0 ) {
2963                                 flags |= SCF_TRIE_RESTUDY;
2964                                 if ( startbranch == first
2965                                      && scan == tail )
2966                                 {
2967                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2968                                 }
2969                             }
2970 #endif
2971                         }
2972                     }
2973
2974                 } /* do trie */
2975
2976             }
2977             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2978                 scan = NEXTOPER(NEXTOPER(scan));
2979             } else                      /* single branch is optimized. */
2980                 scan = NEXTOPER(scan);
2981             continue;
2982         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2983             scan_frame *newframe = NULL;
2984             I32 paren;
2985             regnode *start;
2986             regnode *end;
2987
2988             if (OP(scan) != SUSPEND) {
2989             /* set the pointer */
2990                 if (OP(scan) == GOSUB) {
2991                     paren = ARG(scan);
2992                     RExC_recurse[ARG2L(scan)] = scan;
2993                     start = RExC_open_parens[paren-1];
2994                     end   = RExC_close_parens[paren-1];
2995                 } else {
2996                     paren = 0;
2997                     start = RExC_rxi->program + 1;
2998                     end   = RExC_opend;
2999                 }
3000                 if (!recursed) {
3001                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3002                     SAVEFREEPV(recursed);
3003                 }
3004                 if (!PAREN_TEST(recursed,paren+1)) {
3005                     PAREN_SET(recursed,paren+1);
3006                     Newx(newframe,1,scan_frame);
3007                 } else {
3008                     if (flags & SCF_DO_SUBSTR) {
3009                         SCAN_COMMIT(pRExC_state,data,minlenp);
3010                         data->longest = &(data->longest_float);
3011                     }
3012                     is_inf = is_inf_internal = 1;
3013                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3014                         cl_anything(pRExC_state, data->start_class);
3015                     flags &= ~SCF_DO_STCLASS;
3016                 }
3017             } else {
3018                 Newx(newframe,1,scan_frame);
3019                 paren = stopparen;
3020                 start = scan+2;
3021                 end = regnext(scan);
3022             }
3023             if (newframe) {
3024                 assert(start);
3025                 assert(end);
3026                 SAVEFREEPV(newframe);
3027                 newframe->next = regnext(scan);
3028                 newframe->last = last;
3029                 newframe->stop = stopparen;
3030                 newframe->prev = frame;
3031
3032                 frame = newframe;
3033                 scan =  start;
3034                 stopparen = paren;
3035                 last = end;
3036
3037                 continue;
3038             }
3039         }
3040         else if (OP(scan) == EXACT) {
3041             I32 l = STR_LEN(scan);
3042             UV uc;
3043             if (UTF) {
3044                 const U8 * const s = (U8*)STRING(scan);
3045                 l = utf8_length(s, s + l);
3046                 uc = utf8_to_uvchr(s, NULL);
3047             } else {
3048                 uc = *((U8*)STRING(scan));
3049             }
3050             min += l;
3051             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3052                 /* The code below prefers earlier match for fixed
3053                    offset, later match for variable offset.  */
3054                 if (data->last_end == -1) { /* Update the start info. */
3055                     data->last_start_min = data->pos_min;
3056                     data->last_start_max = is_inf
3057                         ? I32_MAX : data->pos_min + data->pos_delta;
3058                 }
3059                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3060                 if (UTF)
3061                     SvUTF8_on(data->last_found);
3062                 {
3063                     SV * const sv = data->last_found;
3064                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3065                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3066                     if (mg && mg->mg_len >= 0)
3067                         mg->mg_len += utf8_length((U8*)STRING(scan),
3068                                                   (U8*)STRING(scan)+STR_LEN(scan));
3069                 }
3070                 data->last_end = data->pos_min + l;
3071                 data->pos_min += l; /* As in the first entry. */
3072                 data->flags &= ~SF_BEFORE_EOL;
3073             }
3074             if (flags & SCF_DO_STCLASS_AND) {
3075                 /* Check whether it is compatible with what we know already! */
3076                 int compat = 1;
3077
3078
3079                 /* If compatibile, we or it in below.  It is compatible if is
3080                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3081                  * it's for a locale.  Even if there isn't unicode semantics
3082                  * here, at runtime there may be because of matching against a
3083                  * utf8 string, so accept a possible false positive for
3084                  * latin1-range folds */
3085                 if (uc >= 0x100 ||
3086                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3087                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3088                     && (!(data->start_class->flags & ANYOF_FOLD)
3089                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3090                     )
3091                     compat = 0;
3092                 ANYOF_CLASS_ZERO(data->start_class);
3093                 ANYOF_BITMAP_ZERO(data->start_class);
3094                 if (compat)
3095                     ANYOF_BITMAP_SET(data->start_class, uc);
3096                 data->start_class->flags &= ~ANYOF_EOS;
3097                 if (uc < 0x100)
3098                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3099             }
3100             else if (flags & SCF_DO_STCLASS_OR) {
3101                 /* false positive possible if the class is case-folded */
3102                 if (uc < 0x100)
3103                     ANYOF_BITMAP_SET(data->start_class, uc);
3104                 else
3105                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3106                 data->start_class->flags &= ~ANYOF_EOS;
3107                 cl_and(data->start_class, and_withp);
3108             }
3109             flags &= ~SCF_DO_STCLASS;
3110         }
3111         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3112             I32 l = STR_LEN(scan);
3113             UV uc = *((U8*)STRING(scan));
3114
3115             /* Search for fixed substrings supports EXACT only. */
3116             if (flags & SCF_DO_SUBSTR) {
3117                 assert(data);
3118                 SCAN_COMMIT(pRExC_state, data, minlenp);
3119             }
3120             if (UTF) {
3121                 const U8 * const s = (U8 *)STRING(scan);
3122                 l = utf8_length(s, s + l);
3123                 uc = utf8_to_uvchr(s, NULL);
3124             }
3125             min += l;
3126             if (flags & SCF_DO_SUBSTR)
3127                 data->pos_min += l;
3128             if (flags & SCF_DO_STCLASS_AND) {
3129                 /* Check whether it is compatible with what we know already! */
3130                 int compat = 1;
3131                 if (uc >= 0x100 ||
3132                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3133                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3134                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3135                 {
3136                     compat = 0;
3137                 }
3138                 ANYOF_CLASS_ZERO(data->start_class);
3139                 ANYOF_BITMAP_ZERO(data->start_class);
3140                 if (compat) {
3141                     ANYOF_BITMAP_SET(data->start_class, uc);
3142                     data->start_class->flags &= ~ANYOF_EOS;
3143                     data->start_class->flags |= ANYOF_FOLD;
3144                     if (OP(scan) == EXACTFL) {
3145                         data->start_class->flags |= ANYOF_LOCALE;
3146                     }
3147                     else {
3148
3149                         /* Also set the other member of the fold pair.  In case
3150                          * that unicode semantics is called for at runtime, use
3151                          * the full latin1 fold.  (Can't do this for locale,
3152                          * because not known until runtime */
3153                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3154                     }
3155                 }
3156             }
3157             else if (flags & SCF_DO_STCLASS_OR) {
3158                 if (data->start_class->flags & ANYOF_FOLD) {
3159                     /* false positive possible if the class is case-folded.
3160                        Assume that the locale settings are the same... */
3161                     if (uc < 0x100) {
3162                         ANYOF_BITMAP_SET(data->start_class, uc);
3163                         if (OP(scan) != EXACTFL) {
3164
3165                             /* And set the other member of the fold pair, but
3166                              * can't do that in locale because not known until
3167                              * run-time */
3168                             ANYOF_BITMAP_SET(data->start_class,
3169                                              PL_fold_latin1[uc]);
3170                         }
3171                     }
3172                     data->start_class->flags &= ~ANYOF_EOS;
3173                 }
3174                 cl_and(data->start_class, and_withp);
3175             }
3176             flags &= ~SCF_DO_STCLASS;
3177         }
3178         else if (REGNODE_VARIES(OP(scan))) {
3179             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3180             I32 f = flags, pos_before = 0;
3181             regnode * const oscan = scan;
3182             struct regnode_charclass_class this_class;
3183             struct regnode_charclass_class *oclass = NULL;
3184             I32 next_is_eval = 0;
3185
3186             switch (PL_regkind[OP(scan)]) {
3187             case WHILEM:                /* End of (?:...)* . */
3188                 scan = NEXTOPER(scan);
3189                 goto finish;
3190             case PLUS:
3191                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3192                     next = NEXTOPER(scan);
3193                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3194                         mincount = 1;
3195                         maxcount = REG_INFTY;
3196                         next = regnext(scan);
3197                         scan = NEXTOPER(scan);
3198                         goto do_curly;
3199                     }
3200                 }
3201                 if (flags & SCF_DO_SUBSTR)
3202                     data->pos_min++;
3203                 min++;
3204                 /* Fall through. */
3205             case STAR:
3206                 if (flags & SCF_DO_STCLASS) {
3207                     mincount = 0;
3208                     maxcount = REG_INFTY;
3209                     next = regnext(scan);
3210                     scan = NEXTOPER(scan);
3211                     goto do_curly;
3212                 }
3213                 is_inf = is_inf_internal = 1;
3214                 scan = regnext(scan);
3215                 if (flags & SCF_DO_SUBSTR) {
3216                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3217                     data->longest = &(data->longest_float);
3218                 }
3219                 goto optimize_curly_tail;
3220             case CURLY:
3221                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3222                     && (scan->flags == stopparen))
3223                 {
3224                     mincount = 1;
3225                     maxcount = 1;
3226                 } else {
3227                     mincount = ARG1(scan);
3228                     maxcount = ARG2(scan);
3229                 }
3230                 next = regnext(scan);
3231                 if (OP(scan) == CURLYX) {
3232                     I32 lp = (data ? *(data->last_closep) : 0);
3233                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3234                 }
3235                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3236                 next_is_eval = (OP(scan) == EVAL);
3237               do_curly:
3238                 if (flags & SCF_DO_SUBSTR) {
3239                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3240                     pos_before = data->pos_min;
3241                 }
3242                 if (data) {
3243                     fl = data->flags;
3244                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3245                     if (is_inf)
3246                         data->flags |= SF_IS_INF;
3247                 }
3248                 if (flags & SCF_DO_STCLASS) {
3249                     cl_init(pRExC_state, &this_class);
3250                     oclass = data->start_class;
3251                     data->start_class = &this_class;
3252                     f |= SCF_DO_STCLASS_AND;
3253                     f &= ~SCF_DO_STCLASS_OR;
3254                 }
3255                 /* Exclude from super-linear cache processing any {n,m}
3256                    regops for which the combination of input pos and regex
3257                    pos is not enough information to determine if a match
3258                    will be possible.
3259
3260                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3261                    regex pos at the \s*, the prospects for a match depend not
3262                    only on the input position but also on how many (bar\s*)
3263                    repeats into the {4,8} we are. */
3264                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3265                     f &= ~SCF_WHILEM_VISITED_POS;
3266
3267                 /* This will finish on WHILEM, setting scan, or on NULL: */
3268                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3269                                       last, data, stopparen, recursed, NULL,
3270                                       (mincount == 0
3271                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3272
3273                 if (flags & SCF_DO_STCLASS)
3274                     data->start_class = oclass;
3275                 if (mincount == 0 || minnext == 0) {
3276                     if (flags & SCF_DO_STCLASS_OR) {
3277                         cl_or(pRExC_state, data->start_class, &this_class);
3278                     }
3279                     else if (flags & SCF_DO_STCLASS_AND) {
3280                         /* Switch to OR mode: cache the old value of
3281                          * data->start_class */
3282                         INIT_AND_WITHP;
3283                         StructCopy(data->start_class, and_withp,
3284                                    struct regnode_charclass_class);
3285                         flags &= ~SCF_DO_STCLASS_AND;
3286                         StructCopy(&this_class, data->start_class,
3287                                    struct regnode_charclass_class);
3288                         flags |= SCF_DO_STCLASS_OR;
3289                         data->start_class->flags |= ANYOF_EOS;
3290                     }
3291                 } else {                /* Non-zero len */
3292                     if (flags & SCF_DO_STCLASS_OR) {
3293                         cl_or(pRExC_state, data->start_class, &this_class);
3294                         cl_and(data->start_class, and_withp);
3295                     }
3296                     else if (flags & SCF_DO_STCLASS_AND)
3297                         cl_and(data->start_class, &this_class);
3298                     flags &= ~SCF_DO_STCLASS;
3299                 }
3300                 if (!scan)              /* It was not CURLYX, but CURLY. */
3301                     scan = next;
3302                 if ( /* ? quantifier ok, except for (?{ ... }) */
3303                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3304                     && (minnext == 0) && (deltanext == 0)
3305                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3306                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3307                 {
3308                     ckWARNreg(RExC_parse,
3309                               "Quantifier unexpected on zero-length expression");
3310                 }
3311
3312                 min += minnext * mincount;
3313                 is_inf_internal |= ((maxcount == REG_INFTY
3314                                      && (minnext + deltanext) > 0)
3315                                     || deltanext == I32_MAX);
3316                 is_inf |= is_inf_internal;
3317                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3318
3319                 /* Try powerful optimization CURLYX => CURLYN. */
3320                 if (  OP(oscan) == CURLYX && data
3321                       && data->flags & SF_IN_PAR
3322                       && !(data->flags & SF_HAS_EVAL)
3323                       && !deltanext && minnext == 1 ) {
3324                     /* Try to optimize to CURLYN.  */
3325                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3326                     regnode * const nxt1 = nxt;
3327 #ifdef DEBUGGING
3328                     regnode *nxt2;
3329 #endif
3330
3331                     /* Skip open. */
3332                     nxt = regnext(nxt);
3333                     if (!REGNODE_SIMPLE(OP(nxt))
3334                         && !(PL_regkind[OP(nxt)] == EXACT
3335                              && STR_LEN(nxt) == 1))
3336                         goto nogo;
3337 #ifdef DEBUGGING
3338                     nxt2 = nxt;
3339 #endif
3340                     nxt = regnext(nxt);
3341                     if (OP(nxt) != CLOSE)
3342                         goto nogo;
3343                     if (RExC_open_parens) {
3344                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3345                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3346                     }
3347                     /* Now we know that nxt2 is the only contents: */
3348                     oscan->flags = (U8)ARG(nxt);
3349                     OP(oscan) = CURLYN;
3350                     OP(nxt1) = NOTHING; /* was OPEN. */
3351
3352 #ifdef DEBUGGING
3353                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3354                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3355                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3356                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3357                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3358                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3359 #endif
3360                 }
3361               nogo:
3362
3363                 /* Try optimization CURLYX => CURLYM. */
3364                 if (  OP(oscan) == CURLYX && data
3365                       && !(data->flags & SF_HAS_PAR)
3366                       && !(data->flags & SF_HAS_EVAL)
3367                       && !deltanext     /* atom is fixed width */
3368                       && minnext != 0   /* CURLYM can't handle zero width */
3369                 ) {
3370                     /* XXXX How to optimize if data == 0? */
3371                     /* Optimize to a simpler form.  */
3372                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3373                     regnode *nxt2;
3374
3375                     OP(oscan) = CURLYM;
3376                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3377                             && (OP(nxt2) != WHILEM))
3378                         nxt = nxt2;
3379                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3380                     /* Need to optimize away parenths. */
3381                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3382                         /* Set the parenth number.  */
3383                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3384
3385                         oscan->flags = (U8)ARG(nxt);
3386                         if (RExC_open_parens) {
3387                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3388                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3389                         }
3390                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3391                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3392
3393 #ifdef DEBUGGING
3394                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3395                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3396                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3397                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3398 #endif
3399 #if 0
3400                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3401                             regnode *nnxt = regnext(nxt1);
3402                             if (nnxt == nxt) {
3403                                 if (reg_off_by_arg[OP(nxt1)])
3404                                     ARG_SET(nxt1, nxt2 - nxt1);
3405                                 else if (nxt2 - nxt1 < U16_MAX)
3406                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3407                                 else
3408                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3409                             }
3410                             nxt1 = nnxt;
3411                         }
3412 #endif
3413                         /* Optimize again: */
3414                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3415                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3416                     }
3417                     else
3418                         oscan->flags = 0;
3419                 }
3420                 else if ((OP(oscan) == CURLYX)
3421                          && (flags & SCF_WHILEM_VISITED_POS)
3422                          /* See the comment on a similar expression above.
3423                             However, this time it's not a subexpression
3424                             we care about, but the expression itself. */
3425                          && (maxcount == REG_INFTY)
3426                          && data && ++data->whilem_c < 16) {
3427                     /* This stays as CURLYX, we can put the count/of pair. */
3428                     /* Find WHILEM (as in regexec.c) */
3429                     regnode *nxt = oscan + NEXT_OFF(oscan);
3430
3431                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3432                         nxt += ARG(nxt);
3433                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3434                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3435                 }
3436                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3437                     pars++;
3438                 if (flags & SCF_DO_SUBSTR) {
3439                     SV *last_str = NULL;
3440                     int counted = mincount != 0;
3441
3442                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3443 #if defined(SPARC64_GCC_WORKAROUND)
3444                         I32 b = 0;
3445                         STRLEN l = 0;
3446                         const char *s = NULL;
3447                         I32 old = 0;
3448
3449                         if (pos_before >= data->last_start_min)
3450                             b = pos_before;
3451                         else
3452                             b = data->last_start_min;
3453
3454                         l = 0;
3455                         s = SvPV_const(data->last_found, l);
3456                         old = b - data->last_start_min;
3457
3458 #else
3459                         I32 b = pos_before >= data->last_start_min
3460                             ? pos_before : data->last_start_min;
3461                         STRLEN l;
3462                         const char * const s = SvPV_const(data->last_found, l);
3463                         I32 old = b - data->last_start_min;
3464 #endif
3465
3466                         if (UTF)
3467                             old = utf8_hop((U8*)s, old) - (U8*)s;
3468                         l -= old;
3469                         /* Get the added string: */
3470                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3471                         if (deltanext == 0 && pos_before == b) {
3472                             /* What was added is a constant string */
3473                             if (mincount > 1) {
3474                                 SvGROW(last_str, (mincount * l) + 1);
3475                                 repeatcpy(SvPVX(last_str) + l,
3476                                           SvPVX_const(last_str), l, mincount - 1);
3477                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3478                                 /* Add additional parts. */
3479                                 SvCUR_set(data->last_found,
3480                                           SvCUR(data->last_found) - l);
3481                                 sv_catsv(data->last_found, last_str);
3482                                 {
3483                                     SV * sv = data->last_found;
3484                                     MAGIC *mg =
3485                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3486                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3487                                     if (mg && mg->mg_len >= 0)
3488                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3489                                 }
3490                                 data->last_end += l * (mincount - 1);
3491                             }
3492                         } else {
3493                             /* start offset must point into the last copy */
3494                             data->last_start_min += minnext * (mincount - 1);
3495                             data->last_start_max += is_inf ? I32_MAX
3496                                 : (maxcount - 1) * (minnext + data->pos_delta);
3497                         }
3498                     }
3499                     /* It is counted once already... */
3500                     data->pos_min += minnext * (mincount - counted);
3501                     data->pos_delta += - counted * deltanext +
3502                         (minnext + deltanext) * maxcount - minnext * mincount;
3503                     if (mincount != maxcount) {
3504                          /* Cannot extend fixed substrings found inside
3505                             the group.  */
3506                         SCAN_COMMIT(pRExC_state,data,minlenp);
3507                         if (mincount && last_str) {
3508                             SV * const sv = data->last_found;
3509                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3510                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3511
3512                             if (mg)
3513                                 mg->mg_len = -1;
3514                             sv_setsv(sv, last_str);
3515                             data->last_end = data->pos_min;
3516                             data->last_start_min =
3517                                 data->pos_min - CHR_SVLEN(last_str);
3518                             data->last_start_max = is_inf
3519                                 ? I32_MAX
3520                                 : data->pos_min + data->pos_delta
3521                                 - CHR_SVLEN(last_str);
3522                         }
3523                         data->longest = &(data->longest_float);
3524                     }
3525                     SvREFCNT_dec(last_str);
3526                 }
3527                 if (data && (fl & SF_HAS_EVAL))
3528                     data->flags |= SF_HAS_EVAL;
3529               optimize_curly_tail:
3530                 if (OP(oscan) != CURLYX) {
3531                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3532                            && NEXT_OFF(next))
3533                         NEXT_OFF(oscan) += NEXT_OFF(next);
3534                 }
3535                 continue;
3536             default:                    /* REF and CLUMP only? */
3537                 if (flags & SCF_DO_SUBSTR) {
3538                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3539                     data->longest = &(data->longest_float);
3540                 }
3541                 is_inf = is_inf_internal = 1;
3542                 if (flags & SCF_DO_STCLASS_OR)
3543                     cl_anything(pRExC_state, data->start_class);
3544                 flags &= ~SCF_DO_STCLASS;
3545                 break;
3546             }
3547         }
3548         else if (OP(scan) == LNBREAK) {
3549             if (flags & SCF_DO_STCLASS) {
3550                 int value = 0;
3551                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3552                 if (flags & SCF_DO_STCLASS_AND) {
3553                     for (value = 0; value < 256; value++)
3554                         if (!is_VERTWS_cp(value))
3555                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3556                 }
3557                 else {
3558                     for (value = 0; value < 256; value++)
3559                         if (is_VERTWS_cp(value))
3560                             ANYOF_BITMAP_SET(data->start_class, value);
3561                 }
3562                 if (flags & SCF_DO_STCLASS_OR)
3563                     cl_and(data->start_class, and_withp);
3564                 flags &= ~SCF_DO_STCLASS;
3565             }
3566             min += 1;
3567             delta += 1;
3568             if (flags & SCF_DO_SUBSTR) {
3569                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3570                 data->pos_min += 1;
3571                 data->pos_delta += 1;
3572                 data->longest = &(data->longest_float);
3573             }
3574         }
3575         else if (OP(scan) == FOLDCHAR) {
3576             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3577             flags &= ~SCF_DO_STCLASS;
3578             min += 1;
3579             delta += d;
3580             if (flags & SCF_DO_SUBSTR) {
3581                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3582                 data->pos_min += 1;
3583                 data->pos_delta += d;
3584                 data->longest = &(data->longest_float);
3585             }
3586         }
3587         else if (REGNODE_SIMPLE(OP(scan))) {
3588             int value = 0;
3589
3590             if (flags & SCF_DO_SUBSTR) {
3591                 SCAN_COMMIT(pRExC_state,data,minlenp);
3592                 data->pos_min++;
3593             }
3594             min++;
3595             if (flags & SCF_DO_STCLASS) {
3596                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3597
3598                 /* Some of the logic below assumes that switching
3599                    locale on will only add false positives. */
3600                 switch (PL_regkind[OP(scan)]) {
3601                 case SANY:
3602                 default:
3603                   do_default:
3604                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3605                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3606                         cl_anything(pRExC_state, data->start_class);
3607                     break;
3608                 case REG_ANY:
3609                     if (OP(scan) == SANY)
3610                         goto do_default;
3611                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3612                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3613                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3614                         cl_anything(pRExC_state, data->start_class);
3615                     }
3616                     if (flags & SCF_DO_STCLASS_AND || !value)
3617                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3618                     break;
3619                 case ANYOF:
3620                     if (flags & SCF_DO_STCLASS_AND)
3621                         cl_and(data->start_class,
3622                                (struct regnode_charclass_class*)scan);
3623                     else
3624                         cl_or(pRExC_state, data->start_class,
3625                               (struct regnode_charclass_class*)scan);
3626                     break;
3627                 case ALNUM:
3628                     if (flags & SCF_DO_STCLASS_AND) {
3629                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3630                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3631                             if (FLAGS(scan) & USE_UNI) {
3632                                 for (value = 0; value < 256; value++) {
3633                                     if (!isWORDCHAR_L1(value)) {
3634                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3635                                     }
3636                                 }
3637                             } else {
3638                                 for (value = 0; value < 256; value++) {
3639                                     if (!isALNUM(value)) {
3640                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3641                                     }
3642                                 }
3643                             }
3644                         }
3645                     }
3646                     else {
3647                         if (data->start_class->flags & ANYOF_LOCALE)
3648                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3649                         else if (FLAGS(scan) & USE_UNI) {
3650                             for (value = 0; value < 256; value++) {
3651                                 if (isWORDCHAR_L1(value)) {
3652                                     ANYOF_BITMAP_SET(data->start_class, value);
3653                                 }
3654                             }
3655                         } else {
3656                             for (value = 0; value < 256; value++) {
3657                                 if (isALNUM(value)) {
3658                                     ANYOF_BITMAP_SET(data->start_class, value);
3659                                 }
3660                             }
3661                         }
3662                     }
3663                     break;
3664                 case ALNUML:
3665                     if (flags & SCF_DO_STCLASS_AND) {
3666                         if (data->start_class->flags & ANYOF_LOCALE)
3667                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3668                     }
3669                     else {
3670                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3671                         data->start_class->flags |= ANYOF_LOCALE;
3672                     }
3673                     break;
3674                 case NALNUM:
3675                     if (flags & SCF_DO_STCLASS_AND) {
3676                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3677                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3678                             if (FLAGS(scan) & USE_UNI) {
3679                                 for (value = 0; value < 256; value++) {
3680                                     if (isWORDCHAR_L1(value)) {
3681                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3682                                     }
3683                                 }
3684                             } else {
3685                                 for (value = 0; value < 256; value++) {
3686                                     if (isALNUM(value)) {
3687                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3688                                     }
3689                                 }
3690                             }
3691                         }
3692                     }
3693                     else {
3694                         if (data->start_class->flags & ANYOF_LOCALE)
3695                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3696                         else {
3697                             for (value = 0; value < 256; value++)
3698                                 if (!isALNUM(value))
3699                                     ANYOF_BITMAP_SET(data->start_class, value);
3700                         }
3701                     }
3702                     break;
3703                 case NALNUML:
3704                     if (flags & SCF_DO_STCLASS_AND) {
3705                         if (data->start_class->flags & ANYOF_LOCALE)
3706                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3707                     }
3708                     else {
3709                         data->start_class->flags |= ANYOF_LOCALE;
3710                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3711                     }
3712                     break;
3713                 case SPACE:
3714                     if (flags & SCF_DO_STCLASS_AND) {
3715                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3716                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3717                             if (FLAGS(scan) & USE_UNI) {
3718                                 for (value = 0; value < 256; value++) {
3719                                     if (!isSPACE_L1(value)) {
3720                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3721                                     }
3722                                 }
3723                             } else {
3724                                 for (value = 0; value < 256; value++) {
3725                                     if (!isSPACE(value)) {
3726                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3727                                     }
3728                                 }
3729                             }
3730                         }
3731                     }
3732                     else {
3733                         if (data->start_class->flags & ANYOF_LOCALE) {
3734                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3735                         }
3736                         else if (FLAGS(scan) & USE_UNI) {
3737                             for (value = 0; value < 256; value++) {
3738                                 if (isSPACE_L1(value)) {
3739                                     ANYOF_BITMAP_SET(data->start_class, value);
3740                                 }
3741                             }
3742                         } else {
3743                             for (value = 0; value < 256; value++) {
3744                                 if (isSPACE(value)) {
3745                                     ANYOF_BITMAP_SET(data->start_class, value);
3746                                 }
3747                             }
3748                         }
3749                     }
3750                     break;
3751                 case SPACEL:
3752                     if (flags & SCF_DO_STCLASS_AND) {
3753                         if (data->start_class->flags & ANYOF_LOCALE)
3754                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3755                     }
3756                     else {
3757                         data->start_class->flags |= ANYOF_LOCALE;
3758                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3759                     }
3760                     break;
3761                 case NSPACE:
3762                     if (flags & SCF_DO_STCLASS_AND) {
3763                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3764                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3765                             if (FLAGS(scan) & USE_UNI) {
3766                                 for (value = 0; value < 256; value++) {
3767                                     if (isSPACE_L1(value)) {
3768                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3769                                     }
3770                                 }
3771                             } else {
3772                                 for (value = 0; value < 256; value++) {
3773                                     if (isSPACE(value)) {
3774                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3775                                     }
3776                                 }
3777                             }
3778                         }
3779                     }
3780                     else {
3781                         if (data->start_class->flags & ANYOF_LOCALE)
3782                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3783                         else if (FLAGS(scan) & USE_UNI) {
3784                             for (value = 0; value < 256; value++) {
3785                                 if (!isSPACE_L1(value)) {
3786                                     ANYOF_BITMAP_SET(data->start_class, value);
3787                                 }
3788                             }
3789                         }
3790                         else {
3791                             for (value = 0; value < 256; value++) {
3792                                 if (!isSPACE(value)) {
3793                                     ANYOF_BITMAP_SET(data->start_class, value);
3794                                 }
3795                             }
3796                         }
3797                     }
3798                     break;
3799                 case NSPACEL:
3800                     if (flags & SCF_DO_STCLASS_AND) {
3801                         if (data->start_class->flags & ANYOF_LOCALE) {
3802                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3803                             for (value = 0; value < 256; value++)
3804                                 if (!isSPACE(value))
3805                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3806                         }
3807                     }
3808                     else {
3809                         data->start_class->flags |= ANYOF_LOCALE;
3810                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3811                     }
3812                     break;
3813                 case DIGIT:
3814                     if (flags & SCF_DO_STCLASS_AND) {
3815                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3816                         for (value = 0; value < 256; value++)
3817                             if (!isDIGIT(value))
3818                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3819                     }
3820                     else {
3821                         if (data->start_class->flags & ANYOF_LOCALE)
3822                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3823                         else {
3824                             for (value = 0; value < 256; value++)
3825                                 if (isDIGIT(value))
3826                                     ANYOF_BITMAP_SET(data->start_class, value);
3827                         }
3828                     }
3829                     break;
3830                 case NDIGIT:
3831                     if (flags & SCF_DO_STCLASS_AND) {
3832                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3833                         for (value = 0; value < 256; value++)
3834                             if (isDIGIT(value))
3835                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3836                     }
3837                     else {
3838                         if (data->start_class->flags & ANYOF_LOCALE)
3839                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3840                         else {
3841                             for (value = 0; value < 256; value++)
3842                                 if (!isDIGIT(value))
3843                                     ANYOF_BITMAP_SET(data->start_class, value);
3844                         }
3845                     }
3846                     break;
3847                 CASE_SYNST_FNC(VERTWS);
3848                 CASE_SYNST_FNC(HORIZWS);
3849
3850                 }
3851                 if (flags & SCF_DO_STCLASS_OR)
3852                     cl_and(data->start_class, and_withp);
3853                 flags &= ~SCF_DO_STCLASS;
3854             }
3855         }
3856         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3857             data->flags |= (OP(scan) == MEOL
3858                             ? SF_BEFORE_MEOL
3859                             : SF_BEFORE_SEOL);
3860         }
3861         else if (  PL_regkind[OP(scan)] == BRANCHJ
3862                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3863                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3864                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3865             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3866                 || OP(scan) == UNLESSM )
3867             {
3868                 /* Negative Lookahead/lookbehind
3869                    In this case we can't do fixed string optimisation.
3870                 */
3871
3872                 I32 deltanext, minnext, fake = 0;
3873                 regnode *nscan;
3874                 struct regnode_charclass_class intrnl;
3875                 int f = 0;
3876
3877                 data_fake.flags = 0;
3878                 if (data) {
3879                     data_fake.whilem_c = data->whilem_c;
3880                     data_fake.last_closep = data->last_closep;
3881                 }
3882                 else
3883                     data_fake.last_closep = &fake;
3884                 data_fake.pos_delta = delta;
3885                 if ( flags & SCF_DO_STCLASS && !scan->flags
3886                      && OP(scan) == IFMATCH ) { /* Lookahead */
3887                     cl_init(pRExC_state, &intrnl);
3888                     data_fake.start_class = &intrnl;
3889                     f |= SCF_DO_STCLASS_AND;
3890                 }
3891                 if (flags & SCF_WHILEM_VISITED_POS)
3892                     f |= SCF_WHILEM_VISITED_POS;
3893                 next = regnext(scan);
3894                 nscan = NEXTOPER(NEXTOPER(scan));
3895                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3896                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3897                 if (scan->flags) {
3898                     if (deltanext) {
3899                         FAIL("Variable length lookbehind not implemented");
3900                     }
3901                     else if (minnext > (I32)U8_MAX) {
3902                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3903                     }
3904                     scan->flags = (U8)minnext;
3905                 }
3906                 if (data) {
3907                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3908                         pars++;
3909                     if (data_fake.flags & SF_HAS_EVAL)
3910                         data->flags |= SF_HAS_EVAL;
3911                     data->whilem_c = data_fake.whilem_c;
3912                 }
3913                 if (f & SCF_DO_STCLASS_AND) {
3914                     if (flags & SCF_DO_STCLASS_OR) {
3915                         /* OR before, AND after: ideally we would recurse with
3916                          * data_fake to get the AND applied by study of the
3917                          * remainder of the pattern, and then derecurse;
3918                          * *** HACK *** for now just treat as "no information".
3919                          * See [perl #56690].
3920                          */
3921                         cl_init(pRExC_state, data->start_class);
3922                     }  else {
3923                         /* AND before and after: combine and continue */
3924                         const int was = (data->start_class->flags & ANYOF_EOS);
3925
3926                         cl_and(data->start_class, &intrnl);
3927                         if (was)
3928                             data->start_class->flags |= ANYOF_EOS;
3929                     }
3930                 }
3931             }
3932 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3933             else {
3934                 /* Positive Lookahead/lookbehind
3935                    In this case we can do fixed string optimisation,
3936                    but we must be careful about it. Note in the case of
3937                    lookbehind the positions will be offset by the minimum
3938                    length of the pattern, something we won't know about
3939                    until after the recurse.
3940                 */
3941                 I32 deltanext, fake = 0;
3942                 regnode *nscan;
3943                 struct regnode_charclass_class intrnl;
3944                 int f = 0;
3945                 /* We use SAVEFREEPV so that when the full compile
3946                     is finished perl will clean up the allocated
3947                     minlens when it's all done. This way we don't
3948                     have to worry about freeing them when we know
3949                     they wont be used, which would be a pain.
3950                  */
3951                 I32 *minnextp;
3952                 Newx( minnextp, 1, I32 );
3953                 SAVEFREEPV(minnextp);
3954
3955                 if (data) {
3956                     StructCopy(data, &data_fake, scan_data_t);
3957                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3958                         f |= SCF_DO_SUBSTR;
3959                         if (scan->flags)
3960                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3961                         data_fake.last_found=newSVsv(data->last_found);
3962                     }
3963                 }
3964                 else
3965                     data_fake.last_closep = &fake;
3966                 data_fake.flags = 0;
3967                 data_fake.pos_delta = delta;
3968                 if (is_inf)
3969                     data_fake.flags |= SF_IS_INF;
3970                 if ( flags & SCF_DO_STCLASS && !scan->flags
3971                      && OP(scan) == IFMATCH ) { /* Lookahead */
3972                     cl_init(pRExC_state, &intrnl);
3973                     data_fake.start_class = &intrnl;
3974                     f |= SCF_DO_STCLASS_AND;
3975                 }
3976                 if (flags & SCF_WHILEM_VISITED_POS)
3977                     f |= SCF_WHILEM_VISITED_POS;
3978                 next = regnext(scan);
3979                 nscan = NEXTOPER(NEXTOPER(scan));
3980
3981                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3982                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3983                 if (scan->flags) {
3984                     if (deltanext) {
3985                         FAIL("Variable length lookbehind not implemented");
3986                     }
3987                     else if (*minnextp > (I32)U8_MAX) {
3988                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3989                     }
3990                     scan->flags = (U8)*minnextp;
3991                 }
3992
3993                 *minnextp += min;
3994
3995                 if (f & SCF_DO_STCLASS_AND) {
3996                     const int was = (data->start_class->flags & ANYOF_EOS);
3997
3998                     cl_and(data->start_class, &intrnl);
3999                     if (was)
4000                         data->start_class->flags |= ANYOF_EOS;
4001                 }
4002                 if (data) {
4003                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4004                         pars++;
4005                     if (data_fake.flags & SF_HAS_EVAL)
4006                         data->flags |= SF_HAS_EVAL;
4007                     data->whilem_c = data_fake.whilem_c;
4008                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4009                         if (RExC_rx->minlen<*minnextp)
4010                             RExC_rx->minlen=*minnextp;
4011                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4012                         SvREFCNT_dec(data_fake.last_found);
4013
4014                         if ( data_fake.minlen_fixed != minlenp )
4015                         {
4016                             data->offset_fixed= data_fake.offset_fixed;
4017                             data->minlen_fixed= data_fake.minlen_fixed;
4018                             data->lookbehind_fixed+= scan->flags;
4019                         }
4020                         if ( data_fake.minlen_float != minlenp )
4021                         {
4022                             data->minlen_float= data_fake.minlen_float;
4023                             data->offset_float_min=data_fake.offset_float_min;
4024                             data->offset_float_max=data_fake.offset_float_max;
4025                             data->lookbehind_float+= scan->flags;
4026                         }
4027                     }
4028                 }
4029
4030
4031             }
4032 #endif
4033         }
4034         else if (OP(scan) == OPEN) {
4035             if (stopparen != (I32)ARG(scan))
4036                 pars++;
4037         }
4038         else if (OP(scan) == CLOSE) {
4039             if (stopparen == (I32)ARG(scan)) {
4040                 break;
4041             }
4042             if ((I32)ARG(scan) == is_par) {
4043                 next = regnext(scan);
4044
4045                 if ( next && (OP(next) != WHILEM) && next < last)
4046                     is_par = 0;         /* Disable optimization */
4047             }
4048             if (data)
4049                 *(data->last_closep) = ARG(scan);
4050         }
4051         else if (OP(scan) == EVAL) {
4052                 if (data)
4053                     data->flags |= SF_HAS_EVAL;
4054         }
4055         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4056             if (flags & SCF_DO_SUBSTR) {
4057                 SCAN_COMMIT(pRExC_state,data,minlenp);
4058                 flags &= ~SCF_DO_SUBSTR;
4059             }
4060             if (data && OP(scan)==ACCEPT) {
4061                 data->flags |= SCF_SEEN_ACCEPT;
4062                 if (stopmin > min)
4063                     stopmin = min;
4064             }
4065         }
4066         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4067         {
4068                 if (flags & SCF_DO_SUBSTR) {
4069                     SCAN_COMMIT(pRExC_state,data,minlenp);
4070                     data->longest = &(data->longest_float);
4071                 }
4072                 is_inf = is_inf_internal = 1;
4073                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4074                     cl_anything(pRExC_state, data->start_class);
4075                 flags &= ~SCF_DO_STCLASS;
4076         }
4077         else if (OP(scan) == GPOS) {
4078             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4079                 !(delta || is_inf || (data && data->pos_delta)))
4080             {
4081                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4082                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4083                 if (RExC_rx->gofs < (U32)min)
4084                     RExC_rx->gofs = min;
4085             } else {
4086                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4087                 RExC_rx->gofs = 0;
4088             }
4089         }
4090 #ifdef TRIE_STUDY_OPT
4091 #ifdef FULL_TRIE_STUDY
4092         else if (PL_regkind[OP(scan)] == TRIE) {
4093             /* NOTE - There is similar code to this block above for handling
4094                BRANCH nodes on the initial study.  If you change stuff here
4095                check there too. */
4096             regnode *trie_node= scan;
4097             regnode *tail= regnext(scan);
4098             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4099             I32 max1 = 0, min1 = I32_MAX;
4100             struct regnode_charclass_class accum;
4101
4102             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4103                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4104             if (flags & SCF_DO_STCLASS)
4105                 cl_init_zero(pRExC_state, &accum);
4106
4107             if (!trie->jump) {
4108                 min1= trie->minlen;
4109                 max1= trie->maxlen;
4110             } else {
4111                 const regnode *nextbranch= NULL;
4112                 U32 word;
4113
4114                 for ( word=1 ; word <= trie->wordcount ; word++)
4115                 {
4116                     I32 deltanext=0, minnext=0, f = 0, fake;
4117                     struct regnode_charclass_class this_class;
4118
4119                     data_fake.flags = 0;
4120                     if (data) {
4121                         data_fake.whilem_c = data->whilem_c;
4122                         data_fake.last_closep = data->last_closep;
4123                     }
4124                     else
4125                         data_fake.last_closep = &fake;
4126                     data_fake.pos_delta = delta;
4127                     if (flags & SCF_DO_STCLASS) {
4128                         cl_init(pRExC_state, &this_class);
4129                         data_fake.start_class = &this_class;
4130                         f = SCF_DO_STCLASS_AND;
4131                     }
4132                     if (flags & SCF_WHILEM_VISITED_POS)
4133                         f |= SCF_WHILEM_VISITED_POS;
4134
4135                     if (trie->jump[word]) {
4136                         if (!nextbranch)
4137                             nextbranch = trie_node + trie->jump[0];
4138                         scan= trie_node + trie->jump[word];
4139                         /* We go from the jump point to the branch that follows
4140                            it. Note this means we need the vestigal unused branches
4141                            even though they arent otherwise used.
4142                          */
4143                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4144                             &deltanext, (regnode *)nextbranch, &data_fake,
4145                             stopparen, recursed, NULL, f,depth+1);
4146                     }
4147                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4148                         nextbranch= regnext((regnode*)nextbranch);
4149
4150                     if (min1 > (I32)(minnext + trie->minlen))
4151                         min1 = minnext + trie->minlen;
4152                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4153                         max1 = minnext + deltanext + trie->maxlen;
4154                     if (deltanext == I32_MAX)
4155                         is_inf = is_inf_internal = 1;
4156
4157                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4158                         pars++;
4159                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4160                         if ( stopmin > min + min1)
4161                             stopmin = min + min1;
4162                         flags &= ~SCF_DO_SUBSTR;
4163                         if (data)
4164                             data->flags |= SCF_SEEN_ACCEPT;
4165                     }
4166                     if (data) {
4167                         if (data_fake.flags & SF_HAS_EVAL)
4168                             data->flags |= SF_HAS_EVAL;
4169                         data->whilem_c = data_fake.whilem_c;
4170                     }
4171                     if (flags & SCF_DO_STCLASS)
4172                         cl_or(pRExC_state, &accum, &this_class);
4173                 }
4174             }
4175             if (flags & SCF_DO_SUBSTR) {
4176                 data->pos_min += min1;
4177                 data->pos_delta += max1 - min1;
4178                 if (max1 != min1 || is_inf)
4179                     data->longest = &(data->longest_float);
4180             }
4181             min += min1;
4182             delta += max1 - min1;
4183             if (flags & SCF_DO_STCLASS_OR) {
4184                 cl_or(pRExC_state, data->start_class, &accum);
4185                 if (min1) {
4186                     cl_and(data->start_class, and_withp);
4187                     flags &= ~SCF_DO_STCLASS;
4188                 }
4189             }
4190             else if (flags & SCF_DO_STCLASS_AND) {
4191                 if (min1) {
4192                     cl_and(data->start_class, &accum);
4193                     flags &= ~SCF_DO_STCLASS;
4194                 }
4195                 else {
4196                     /* Switch to OR mode: cache the old value of
4197                      * data->start_class */
4198                     INIT_AND_WITHP;
4199                     StructCopy(data->start_class, and_withp,
4200                                struct regnode_charclass_class);
4201                     flags &= ~SCF_DO_STCLASS_AND;
4202                     StructCopy(&accum, data->start_class,
4203                                struct regnode_charclass_class);
4204                     flags |= SCF_DO_STCLASS_OR;
4205                     data->start_class->flags |= ANYOF_EOS;
4206                 }
4207             }
4208             scan= tail;
4209             continue;
4210         }
4211 #else
4212         else if (PL_regkind[OP(scan)] == TRIE) {
4213             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4214             U8*bang=NULL;
4215
4216             min += trie->minlen;
4217             delta += (trie->maxlen - trie->minlen);
4218             flags &= ~SCF_DO_STCLASS; /* xxx */
4219             if (flags & SCF_DO_SUBSTR) {
4220                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4221                 data->pos_min += trie->minlen;
4222                 data->pos_delta += (trie->maxlen - trie->minlen);
4223                 if (trie->maxlen != trie->minlen)
4224                     data->longest = &(data->longest_float);
4225             }
4226             if (trie->jump) /* no more substrings -- for now /grr*/
4227                 flags &= ~SCF_DO_SUBSTR;
4228         }
4229 #endif /* old or new */
4230 #endif /* TRIE_STUDY_OPT */
4231
4232         /* Else: zero-length, ignore. */
4233         scan = regnext(scan);
4234     }
4235     if (frame) {
4236         last = frame->last;
4237         scan = frame->next;
4238         stopparen = frame->stop;
4239         frame = frame->prev;
4240         goto fake_study_recurse;
4241     }
4242
4243   finish:
4244     assert(!frame);
4245     DEBUG_STUDYDATA("pre-fin:",data,depth);
4246
4247     *scanp = scan;
4248     *deltap = is_inf_internal ? I32_MAX : delta;
4249     if (flags & SCF_DO_SUBSTR && is_inf)
4250         data->pos_delta = I32_MAX - data->pos_min;
4251     if (is_par > (I32)U8_MAX)
4252         is_par = 0;
4253     if (is_par && pars==1 && data) {
4254         data->flags |= SF_IN_PAR;
4255         data->flags &= ~SF_HAS_PAR;
4256     }
4257     else if (pars && data) {
4258         data->flags |= SF_HAS_PAR;
4259         data->flags &= ~SF_IN_PAR;
4260     }
4261     if (flags & SCF_DO_STCLASS_OR)
4262         cl_and(data->start_class, and_withp);
4263     if (flags & SCF_TRIE_RESTUDY)
4264         data->flags |=  SCF_TRIE_RESTUDY;
4265
4266     DEBUG_STUDYDATA("post-fin:",data,depth);
4267
4268     return min < stopmin ? min : stopmin;
4269 }
4270
4271 STATIC U32
4272 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4273 {
4274     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4275
4276     PERL_ARGS_ASSERT_ADD_DATA;
4277
4278     Renewc(RExC_rxi->data,
4279            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4280            char, struct reg_data);
4281     if(count)
4282         Renew(RExC_rxi->data->what, count + n, U8);
4283     else
4284         Newx(RExC_rxi->data->what, n, U8);
4285     RExC_rxi->data->count = count + n;
4286     Copy(s, RExC_rxi->data->what + count, n, U8);
4287     return count;
4288 }
4289
4290 /*XXX: todo make this not included in a non debugging perl */
4291 #ifndef PERL_IN_XSUB_RE
4292 void
4293 Perl_reginitcolors(pTHX)
4294 {
4295     dVAR;
4296     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4297     if (s) {
4298         char *t = savepv(s);
4299         int i = 0;
4300         PL_colors[0] = t;
4301         while (++i < 6) {
4302             t = strchr(t, '\t');
4303             if (t) {
4304                 *t = '\0';
4305                 PL_colors[i] = ++t;
4306             }
4307             else
4308                 PL_colors[i] = t = (char *)"";
4309         }
4310     } else {
4311         int i = 0;
4312         while (i < 6)
4313             PL_colors[i++] = (char *)"";
4314     }
4315     PL_colorset = 1;
4316 }
4317 #endif
4318
4319
4320 #ifdef TRIE_STUDY_OPT
4321 #define CHECK_RESTUDY_GOTO                                  \
4322         if (                                                \
4323               (data.flags & SCF_TRIE_RESTUDY)               \
4324               && ! restudied++                              \
4325         )     goto reStudy
4326 #else
4327 #define CHECK_RESTUDY_GOTO
4328 #endif
4329
4330 /*
4331  - pregcomp - compile a regular expression into internal code
4332  *
4333  * We can't allocate space until we know how big the compiled form will be,
4334  * but we can't compile it (and thus know how big it is) until we've got a
4335  * place to put the code.  So we cheat:  we compile it twice, once with code
4336  * generation turned off and size counting turned on, and once "for real".
4337  * This also means that we don't allocate space until we are sure that the
4338  * thing really will compile successfully, and we never have to move the
4339  * code and thus invalidate pointers into it.  (Note that it has to be in
4340  * one piece because free() must be able to free it all.) [NB: not true in perl]
4341  *
4342  * Beware that the optimization-preparation code in here knows about some
4343  * of the structure of the compiled regexp.  [I'll say.]
4344  */
4345
4346
4347
4348 #ifndef PERL_IN_XSUB_RE
4349 #define RE_ENGINE_PTR &PL_core_reg_engine
4350 #else
4351 extern const struct regexp_engine my_reg_engine;
4352 #define RE_ENGINE_PTR &my_reg_engine
4353 #endif
4354
4355 #ifndef PERL_IN_XSUB_RE
4356 REGEXP *
4357 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4358 {
4359     dVAR;
4360     HV * const table = GvHV(PL_hintgv);
4361
4362     PERL_ARGS_ASSERT_PREGCOMP;
4363
4364     /* Dispatch a request to compile a regexp to correct
4365        regexp engine. */
4366     if (table) {
4367         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4368         GET_RE_DEBUG_FLAGS_DECL;
4369         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4370             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4371             DEBUG_COMPILE_r({
4372                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4373                     SvIV(*ptr));
4374             });
4375             return CALLREGCOMP_ENG(eng, pattern, flags);
4376         }
4377     }
4378     return Perl_re_compile(aTHX_ pattern, flags);
4379 }
4380 #endif
4381
4382 REGEXP *
4383 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4384 {
4385     dVAR;
4386     REGEXP *rx;
4387     struct regexp *r;
4388     register regexp_internal *ri;
4389     STRLEN plen;
4390     char  *exp;
4391     char* xend;
4392     regnode *scan;
4393     I32 flags;
4394     I32 minlen = 0;
4395     U32 pm_flags;
4396
4397     /* these are all flags - maybe they should be turned
4398      * into a single int with different bit masks */
4399     I32 sawlookahead = 0;
4400     I32 sawplus = 0;
4401     I32 sawopen = 0;
4402     bool used_setjump = FALSE;
4403
4404     U8 jump_ret = 0;
4405     dJMPENV;
4406     scan_data_t data;
4407     RExC_state_t RExC_state;
4408     RExC_state_t * const pRExC_state = &RExC_state;
4409 #ifdef TRIE_STUDY_OPT
4410     int restudied;
4411     RExC_state_t copyRExC_state;
4412 #endif
4413     GET_RE_DEBUG_FLAGS_DECL;
4414
4415     PERL_ARGS_ASSERT_RE_COMPILE;
4416
4417     DEBUG_r(if (!PL_colorset) reginitcolors());
4418
4419     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4420
4421     /****************** LONG JUMP TARGET HERE***********************/
4422     /* Longjmp back to here if have to switch in midstream to utf8 */
4423     if (! RExC_orig_utf8) {
4424         JMPENV_PUSH(jump_ret);
4425         used_setjump = TRUE;
4426     }
4427
4428     if (jump_ret == 0) {    /* First time through */
4429         exp = SvPV(pattern, plen);
4430         xend = exp + plen;
4431         /* ignore the utf8ness if the pattern is 0 length */
4432         if (plen == 0) {
4433             RExC_utf8 = RExC_orig_utf8 = 0;
4434         }
4435
4436         DEBUG_COMPILE_r({
4437             SV *dsv= sv_newmortal();
4438             RE_PV_QUOTED_DECL(s, RExC_utf8,
4439                 dsv, exp, plen, 60);
4440             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4441                            PL_colors[4],PL_colors[5],s);
4442         });
4443     }
4444     else {  /* longjumped back */
4445         STRLEN len = plen;
4446
4447         /* If the cause for the longjmp was other than changing to utf8, pop
4448          * our own setjmp, and longjmp to the correct handler */
4449         if (jump_ret != UTF8_LONGJMP) {
4450             JMPENV_POP;
4451             JMPENV_JUMP(jump_ret);
4452         }
4453
4454         GET_RE_DEBUG_FLAGS;
4455
4456         /* It's possible to write a regexp in ascii that represents Unicode
4457         codepoints outside of the byte range, such as via \x{100}. If we
4458         detect such a sequence we have to convert the entire pattern to utf8
4459         and then recompile, as our sizing calculation will have been based
4460         on 1 byte == 1 character, but we will need to use utf8 to encode
4461         at least some part of the pattern, and therefore must convert the whole
4462         thing.
4463         -- dmq */
4464         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4465             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4466         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4467         xend = exp + len;
4468         RExC_orig_utf8 = RExC_utf8 = 1;
4469         SAVEFREEPV(exp);
4470     }
4471
4472 #ifdef TRIE_STUDY_OPT
4473     restudied = 0;
4474 #endif
4475
4476     /* Set to use unicode semantics if the pattern is in utf8 and has the
4477      * 'dual' charset specified, as it means unicode when utf8  */
4478     pm_flags = orig_pm_flags;
4479     if (RExC_utf8  && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
4480         pm_flags |= RXf_PMf_UNICODE;
4481     }
4482
4483     RExC_precomp = exp;
4484     RExC_flags = pm_flags;
4485     RExC_sawback = 0;
4486
4487     RExC_seen = 0;
4488     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4489     RExC_seen_evals = 0;
4490     RExC_extralen = 0;
4491
4492     /* First pass: determine size, legality. */
4493     RExC_parse = exp;
4494     RExC_start = exp;
4495     RExC_end = xend;
4496     RExC_naughty = 0;
4497     RExC_npar = 1;
4498     RExC_nestroot = 0;
4499     RExC_size = 0L;
4500     RExC_emit = &PL_regdummy;
4501     RExC_whilem_seen = 0;
4502     RExC_open_parens = NULL;
4503     RExC_close_parens = NULL;
4504     RExC_opend = NULL;
4505     RExC_paren_names = NULL;
4506 #ifdef DEBUGGING
4507     RExC_paren_name_list = NULL;
4508 #endif
4509     RExC_recurse = NULL;
4510     RExC_recurse_count = 0;
4511
4512 #if 0 /* REGC() is (currently) a NOP at the first pass.
4513        * Clever compilers notice this and complain. --jhi */
4514     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4515 #endif
4516     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4517     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4518         RExC_precomp = NULL;
4519         return(NULL);
4520     }
4521
4522     /* Here, finished first pass.  Get rid of any added setjmp */
4523     if (used_setjump) {
4524         JMPENV_POP;
4525     }
4526     DEBUG_PARSE_r({
4527         PerlIO_printf(Perl_debug_log,
4528             "Required size %"IVdf" nodes\n"
4529             "Starting second pass (creation)\n",
4530             (IV)RExC_size);
4531         RExC_lastnum=0;
4532         RExC_lastparse=NULL;
4533     });
4534     /* Small enough for pointer-storage convention?
4535        If extralen==0, this means that we will not need long jumps. */
4536     if (RExC_size >= 0x10000L && RExC_extralen)
4537         RExC_size += RExC_extralen;
4538     else
4539         RExC_extralen = 0;
4540     if (RExC_whilem_seen > 15)
4541         RExC_whilem_seen = 15;
4542
4543     /* Allocate space and zero-initialize. Note, the two step process
4544        of zeroing when in debug mode, thus anything assigned has to
4545        happen after that */
4546     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4547     r = (struct regexp*)SvANY(rx);
4548     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4549          char, regexp_internal);
4550     if ( r == NULL || ri == NULL )
4551         FAIL("Regexp out of space");
4552 #ifdef DEBUGGING
4553     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4554     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4555 #else
4556     /* bulk initialize base fields with 0. */
4557     Zero(ri, sizeof(regexp_internal), char);
4558 #endif
4559
4560     /* non-zero initialization begins here */
4561     RXi_SET( r, ri );
4562     r->engine= RE_ENGINE_PTR;
4563     r->extflags = pm_flags;
4564     {
4565         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4566         bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
4567
4568         /* The caret is output if there are any defaults: if not all the STD
4569          * flags are set, or if no character set specifier is needed */
4570         bool has_default =
4571                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4572                     || ! has_charset);
4573         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4574         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4575                             >> RXf_PMf_STD_PMMOD_SHIFT);
4576         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4577         char *p;
4578         /* Allocate for the worst case, which is all the std flags are turned
4579          * on.  If more precision is desired, we could do a population count of
4580          * the flags set.  This could be done with a small lookup table, or by
4581          * shifting, masking and adding, or even, when available, assembly
4582          * language for a machine-language population count.
4583          * We never output a minus, as all those are defaults, so are
4584          * covered by the caret */
4585         const STRLEN wraplen = plen + has_p + has_runon
4586             + has_default       /* If needs a caret */
4587             + has_charset       /* If needs a character set specifier */
4588             + (sizeof(STD_PAT_MODS) - 1)
4589             + (sizeof("(?:)") - 1);
4590
4591         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4592         SvPOK_on(rx);
4593         SvFLAGS(rx) |= SvUTF8(pattern);
4594         *p++='('; *p++='?';
4595
4596         /* If a default, cover it using the caret */
4597         if (has_default) {
4598             *p++= DEFAULT_PAT_MOD;
4599         }
4600         if (has_charset) {
4601             if (r->extflags & RXf_PMf_LOCALE) {
4602                 *p++ = LOCALE_PAT_MOD;
4603             } else {
4604                 *p++ = UNICODE_PAT_MOD;
4605             }
4606         }
4607         if (has_p)
4608             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4609         {
4610             char ch;
4611             while((ch = *fptr++)) {
4612                 if(reganch & 1)
4613                     *p++ = ch;
4614                 reganch >>= 1;
4615             }
4616         }
4617
4618         *p++ = ':';
4619         Copy(RExC_precomp, p, plen, char);
4620         assert ((RX_WRAPPED(rx) - p) < 16);
4621         r->pre_prefix = p - RX_WRAPPED(rx);
4622         p += plen;
4623         if (has_runon)
4624             *p++ = '\n';
4625         *p++ = ')';
4626         *p = 0;
4627         SvCUR_set(rx, p - SvPVX_const(rx));
4628     }
4629
4630     r->intflags = 0;
4631     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4632
4633     if (RExC_seen & REG_SEEN_RECURSE) {
4634         Newxz(RExC_open_parens, RExC_npar,regnode *);
4635         SAVEFREEPV(RExC_open_parens);
4636         Newxz(RExC_close_parens,RExC_npar,regnode *);
4637         SAVEFREEPV(RExC_close_parens);
4638     }
4639
4640     /* Useful during FAIL. */
4641 #ifdef RE_TRACK_PATTERN_OFFSETS
4642     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4643     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4644                           "%s %"UVuf" bytes for offset annotations.\n",
4645                           ri->u.offsets ? "Got" : "Couldn't get",
4646                           (UV)((2*RExC_size+1) * sizeof(U32))));
4647 #endif
4648     SetProgLen(ri,RExC_size);
4649     RExC_rx_sv = rx;
4650     RExC_rx = r;
4651     RExC_rxi = ri;
4652     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4653
4654     /* Second pass: emit code. */
4655     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4656     RExC_parse = exp;
4657     RExC_end = xend;
4658     RExC_naughty = 0;
4659     RExC_npar = 1;
4660     RExC_emit_start = ri->program;
4661     RExC_emit = ri->program;
4662     RExC_emit_bound = ri->program + RExC_size + 1;
4663
4664     /* Store the count of eval-groups for security checks: */
4665     RExC_rx->seen_evals = RExC_seen_evals;
4666     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4667     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4668         ReREFCNT_dec(rx);
4669         return(NULL);
4670     }
4671     /* XXXX To minimize changes to RE engine we always allocate
4672        3-units-long substrs field. */
4673     Newx(r->substrs, 1, struct reg_substr_data);
4674     if (RExC_recurse_count) {
4675         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4676         SAVEFREEPV(RExC_recurse);
4677     }
4678
4679 reStudy:
4680     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4681     Zero(r->substrs, 1, struct reg_substr_data);
4682
4683 #ifdef TRIE_STUDY_OPT
4684     if (!restudied) {
4685         StructCopy(&zero_scan_data, &data, scan_data_t);
4686         copyRExC_state = RExC_state;
4687     } else {
4688         U32 seen=RExC_seen;
4689         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4690
4691         RExC_state = copyRExC_state;
4692         if (seen & REG_TOP_LEVEL_BRANCHES)
4693             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4694         else
4695             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4696         if (data.last_found) {
4697             SvREFCNT_dec(data.longest_fixed);
4698             SvREFCNT_dec(data.longest_float);
4699             SvREFCNT_dec(data.last_found);
4700         }
4701         StructCopy(&zero_scan_data, &data, scan_data_t);
4702     }
4703 #else
4704     StructCopy(&zero_scan_data, &data, scan_data_t);
4705 #endif
4706
4707     /* Dig out information for optimizations. */
4708     r->extflags = RExC_flags; /* was pm_op */
4709     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4710
4711     if (UTF)
4712         SvUTF8_on(rx);  /* Unicode in it? */
4713     ri->regstclass = NULL;
4714     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4715         r->intflags |= PREGf_NAUGHTY;
4716     scan = ri->program + 1;             /* First BRANCH. */
4717
4718     /* testing for BRANCH here tells us whether there is "must appear"
4719        data in the pattern. If there is then we can use it for optimisations */
4720     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4721         I32 fake;
4722         STRLEN longest_float_length, longest_fixed_length;
4723         struct regnode_charclass_class ch_class; /* pointed to by data */
4724         int stclass_flag;
4725         I32 last_close = 0; /* pointed to by data */
4726         regnode *first= scan;
4727         regnode *first_next= regnext(first);
4728         /*
4729          * Skip introductions and multiplicators >= 1
4730          * so that we can extract the 'meat' of the pattern that must
4731          * match in the large if() sequence following.
4732          * NOTE that EXACT is NOT covered here, as it is normally
4733          * picked up by the optimiser separately.
4734          *
4735          * This is unfortunate as the optimiser isnt handling lookahead
4736          * properly currently.
4737          *
4738          */
4739         while ((OP(first) == OPEN && (sawopen = 1)) ||
4740                /* An OR of *one* alternative - should not happen now. */
4741             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4742             /* for now we can't handle lookbehind IFMATCH*/
4743             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4744             (OP(first) == PLUS) ||
4745             (OP(first) == MINMOD) ||
4746                /* An {n,m} with n>0 */
4747             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4748             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4749         {
4750                 /*
4751                  * the only op that could be a regnode is PLUS, all the rest
4752                  * will be regnode_1 or regnode_2.
4753                  *
4754                  */
4755                 if (OP(first) == PLUS)
4756                     sawplus = 1;
4757                 else
4758                     first += regarglen[OP(first)];
4759
4760                 first = NEXTOPER(first);
4761                 first_next= regnext(first);
4762         }
4763
4764         /* Starting-point info. */
4765       again:
4766         DEBUG_PEEP("first:",first,0);
4767         /* Ignore EXACT as we deal with it later. */
4768         if (PL_regkind[OP(first)] == EXACT) {
4769             if (OP(first) == EXACT)
4770                 NOOP;   /* Empty, get anchored substr later. */
4771             else
4772                 ri->regstclass = first;
4773         }
4774 #ifdef TRIE_STCLASS
4775         else if (PL_regkind[OP(first)] == TRIE &&
4776                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4777         {
4778             regnode *trie_op;
4779             /* this can happen only on restudy */
4780             if ( OP(first) == TRIE ) {
4781                 struct regnode_1 *trieop = (struct regnode_1 *)
4782                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4783                 StructCopy(first,trieop,struct regnode_1);
4784                 trie_op=(regnode *)trieop;
4785             } else {
4786                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4787                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4788                 StructCopy(first,trieop,struct regnode_charclass);
4789                 trie_op=(regnode *)trieop;
4790             }
4791             OP(trie_op)+=2;
4792             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4793             ri->regstclass = trie_op;
4794         }
4795 #endif
4796         else if (REGNODE_SIMPLE(OP(first)))
4797             ri->regstclass = first;
4798         else if (PL_regkind[OP(first)] == BOUND ||
4799                  PL_regkind[OP(first)] == NBOUND)
4800             ri->regstclass = first;
4801         else if (PL_regkind[OP(first)] == BOL) {
4802             r->extflags |= (OP(first) == MBOL
4803                            ? RXf_ANCH_MBOL
4804                            : (OP(first) == SBOL
4805                               ? RXf_ANCH_SBOL
4806                               : RXf_ANCH_BOL));
4807             first = NEXTOPER(first);
4808             goto again;
4809         }
4810         else if (OP(first) == GPOS) {
4811             r->extflags |= RXf_ANCH_GPOS;
4812             first = NEXTOPER(first);
4813             goto again;
4814         }
4815         else if ((!sawopen || !RExC_sawback) &&
4816             (OP(first) == STAR &&
4817             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4818             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4819         {
4820             /* turn .* into ^.* with an implied $*=1 */
4821             const int type =
4822                 (OP(NEXTOPER(first)) == REG_ANY)
4823                     ? RXf_ANCH_MBOL
4824                     : RXf_ANCH_SBOL;
4825             r->extflags |= type;
4826             r->intflags |= PREGf_IMPLICIT;
4827             first = NEXTOPER(first);
4828             goto again;
4829         }
4830         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4831             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4832             /* x+ must match at the 1st pos of run of x's */
4833             r->intflags |= PREGf_SKIP;
4834
4835         /* Scan is after the zeroth branch, first is atomic matcher. */
4836 #ifdef TRIE_STUDY_OPT
4837         DEBUG_PARSE_r(
4838             if (!restudied)
4839                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4840                               (IV)(first - scan + 1))
4841         );
4842 #else
4843         DEBUG_PARSE_r(
4844             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4845                 (IV)(first - scan + 1))
4846         );
4847 #endif
4848
4849
4850         /*
4851         * If there's something expensive in the r.e., find the
4852         * longest literal string that must appear and make it the
4853         * regmust.  Resolve ties in favor of later strings, since
4854         * the regstart check works with the beginning of the r.e.
4855         * and avoiding duplication strengthens checking.  Not a
4856         * strong reason, but sufficient in the absence of others.
4857         * [Now we resolve ties in favor of the earlier string if
4858         * it happens that c_offset_min has been invalidated, since the
4859         * earlier string may buy us something the later one won't.]
4860         */
4861
4862         data.longest_fixed = newSVpvs("");
4863         data.longest_float = newSVpvs("");
4864         data.last_found = newSVpvs("");
4865         data.longest = &(data.longest_fixed);
4866         first = scan;
4867         if (!ri->regstclass) {
4868             cl_init(pRExC_state, &ch_class);
4869             data.start_class = &ch_class;
4870             stclass_flag = SCF_DO_STCLASS_AND;
4871         } else                          /* XXXX Check for BOUND? */
4872             stclass_flag = 0;
4873         data.last_closep = &last_close;
4874
4875         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4876             &data, -1, NULL, NULL,
4877             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4878
4879
4880         CHECK_RESTUDY_GOTO;
4881
4882
4883         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4884              && data.last_start_min == 0 && data.last_end > 0
4885              && !RExC_seen_zerolen
4886              && !(RExC_seen & REG_SEEN_VERBARG)
4887              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4888             r->extflags |= RXf_CHECK_ALL;
4889         scan_commit(pRExC_state, &data,&minlen,0);
4890         SvREFCNT_dec(data.last_found);
4891
4892         /* Note that code very similar to this but for anchored string
4893            follows immediately below, changes may need to be made to both.
4894            Be careful.
4895          */
4896         longest_float_length = CHR_SVLEN(data.longest_float);
4897         if (longest_float_length
4898             || (data.flags & SF_FL_BEFORE_EOL
4899                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4900                     || (RExC_flags & RXf_PMf_MULTILINE))))
4901         {
4902             I32 t,ml;
4903
4904             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4905                 && data.offset_fixed == data.offset_float_min
4906                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4907                     goto remove_float;          /* As in (a)+. */
4908
4909             /* copy the information about the longest float from the reg_scan_data
4910                over to the program. */
4911             if (SvUTF8(data.longest_float)) {
4912                 r->float_utf8 = data.longest_float;
4913                 r->float_substr = NULL;
4914             } else {
4915                 r->float_substr = data.longest_float;
4916                 r->float_utf8 = NULL;
4917             }
4918             /* float_end_shift is how many chars that must be matched that
4919                follow this item. We calculate it ahead of time as once the
4920                lookbehind offset is added in we lose the ability to correctly
4921                calculate it.*/
4922             ml = data.minlen_float ? *(data.minlen_float)
4923                                    : (I32)longest_float_length;
4924             r->float_end_shift = ml - data.offset_float_min
4925                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4926                 + data.lookbehind_float;
4927             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4928             r->float_max_offset = data.offset_float_max;
4929             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4930                 r->float_max_offset -= data.lookbehind_float;
4931
4932             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4933                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4934                            || (RExC_flags & RXf_PMf_MULTILINE)));
4935             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4936         }
4937         else {
4938           remove_float:
4939             r->float_substr = r->float_utf8 = NULL;
4940             SvREFCNT_dec(data.longest_float);
4941             longest_float_length = 0;
4942         }
4943
4944         /* Note that code very similar to this but for floating string
4945            is immediately above, changes may need to be made to both.
4946            Be careful.
4947          */
4948         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4949         if (longest_fixed_length
4950             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4951                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4952                     || (RExC_flags & RXf_PMf_MULTILINE))))
4953         {
4954             I32 t,ml;
4955
4956             /* copy the information about the longest fixed
4957                from the reg_scan_data over to the program. */
4958             if (SvUTF8(data.longest_fixed)) {
4959                 r->anchored_utf8 = data.longest_fixed;
4960                 r->anchored_substr = NULL;
4961             } else {
4962                 r->anchored_substr = data.longest_fixed;
4963                 r->anchored_utf8 = NULL;
4964             }
4965             /* fixed_end_shift is how many chars that must be matched that
4966                follow this item. We calculate it ahead of time as once the
4967                lookbehind offset is added in we lose the ability to correctly
4968                calculate it.*/
4969             ml = data.minlen_fixed ? *(data.minlen_fixed)
4970                                    : (I32)longest_fixed_length;
4971             r->anchored_end_shift = ml - data.offset_fixed
4972                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4973                 + data.lookbehind_fixed;
4974             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4975
4976             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4977                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4978                      || (RExC_flags & RXf_PMf_MULTILINE)));
4979             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4980         }
4981         else {
4982             r->anchored_substr = r->anchored_utf8 = NULL;
4983             SvREFCNT_dec(data.longest_fixed);
4984             longest_fixed_length = 0;
4985         }
4986         if (ri->regstclass
4987             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4988             ri->regstclass = NULL;
4989         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4990             && stclass_flag
4991             && !(data.start_class->flags & ANYOF_EOS)
4992             && !cl_is_anything(data.start_class))
4993         {
4994             const U32 n = add_data(pRExC_state, 1, "f");
4995
4996             Newx(RExC_rxi->data->data[n], 1,
4997                 struct regnode_charclass_class);
4998             StructCopy(data.start_class,
4999                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5000                        struct regnode_charclass_class);
5001             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5002             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5003             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5004                       regprop(r, sv, (regnode*)data.start_class);
5005                       PerlIO_printf(Perl_debug_log,
5006                                     "synthetic stclass \"%s\".\n",
5007                                     SvPVX_const(sv));});
5008         }
5009
5010         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5011         if (longest_fixed_length > longest_float_length) {
5012             r->check_end_shift = r->anchored_end_shift;
5013             r->check_substr = r->anchored_substr;
5014             r->check_utf8 = r->anchored_utf8;
5015             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5016             if (r->extflags & RXf_ANCH_SINGLE)
5017                 r->extflags |= RXf_NOSCAN;
5018         }
5019         else {
5020             r->check_end_shift = r->float_end_shift;
5021             r->check_substr = r->float_substr;
5022             r->check_utf8 = r->float_utf8;
5023             r->check_offset_min = r->float_min_offset;
5024             r->check_offset_max = r->float_max_offset;
5025         }
5026         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5027            This should be changed ASAP!  */
5028         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5029             r->extflags |= RXf_USE_INTUIT;
5030             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5031                 r->extflags |= RXf_INTUIT_TAIL;
5032         }
5033         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5034         if ( (STRLEN)minlen < longest_float_length )
5035             minlen= longest_float_length;
5036         if ( (STRLEN)minlen < longest_fixed_length )
5037             minlen= longest_fixed_length;
5038         */
5039     }
5040     else {
5041         /* Several toplevels. Best we can is to set minlen. */
5042         I32 fake;
5043         struct regnode_charclass_class ch_class;
5044         I32 last_close = 0;
5045
5046         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5047
5048         scan = ri->program + 1;
5049         cl_init(pRExC_state, &ch_class);
5050         data.start_class = &ch_class;
5051         data.last_closep = &last_close;
5052
5053
5054         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5055             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5056
5057         CHECK_RESTUDY_GOTO;
5058
5059         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5060                 = r->float_substr = r->float_utf8 = NULL;
5061         if (!(data.start_class->flags & ANYOF_EOS)
5062             && !cl_is_anything(data.start_class))
5063         {
5064             const U32 n = add_data(pRExC_state, 1, "f");
5065
5066             Newx(RExC_rxi->data->data[n], 1,
5067                 struct regnode_charclass_class);
5068             StructCopy(data.start_class,
5069                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5070                        struct regnode_charclass_class);
5071             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5072             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5073             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5074                       regprop(r, sv, (regnode*)data.start_class);
5075                       PerlIO_printf(Perl_debug_log,
5076                                     "synthetic stclass \"%s\".\n",
5077                                     SvPVX_const(sv));});
5078         }
5079     }
5080
5081     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5082        the "real" pattern. */
5083     DEBUG_OPTIMISE_r({
5084         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5085                       (IV)minlen, (IV)r->minlen);
5086     });
5087     r->minlenret = minlen;
5088     if (r->minlen < minlen)
5089         r->minlen = minlen;
5090
5091     if (RExC_seen & REG_SEEN_GPOS)
5092         r->extflags |= RXf_GPOS_SEEN;
5093     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5094         r->extflags |= RXf_LOOKBEHIND_SEEN;
5095     if (RExC_seen & REG_SEEN_EVAL)
5096         r->extflags |= RXf_EVAL_SEEN;
5097     if (RExC_seen & REG_SEEN_CANY)
5098         r->extflags |= RXf_CANY_SEEN;
5099     if (RExC_seen & REG_SEEN_VERBARG)
5100         r->intflags |= PREGf_VERBARG_SEEN;
5101     if (RExC_seen & REG_SEEN_CUTGROUP)
5102         r->intflags |= PREGf_CUTGROUP_SEEN;
5103     if (RExC_paren_names)
5104         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5105     else
5106         RXp_PAREN_NAMES(r) = NULL;
5107
5108 #ifdef STUPID_PATTERN_CHECKS
5109     if (RX_PRELEN(rx) == 0)
5110         r->extflags |= RXf_NULL;
5111     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5112         /* XXX: this should happen BEFORE we compile */
5113         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5114     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5115         r->extflags |= RXf_WHITE;
5116     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5117         r->extflags |= RXf_START_ONLY;
5118 #else
5119     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5120             /* XXX: this should happen BEFORE we compile */
5121             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5122     else {
5123         regnode *first = ri->program + 1;
5124         U8 fop = OP(first);
5125         U8 nop = OP(NEXTOPER(first));
5126
5127         if (PL_regkind[fop] == NOTHING && nop == END)
5128             r->extflags |= RXf_NULL;
5129         else if (PL_regkind[fop] == BOL && nop == END)
5130             r->extflags |= RXf_START_ONLY;
5131         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5132             r->extflags |= RXf_WHITE;
5133     }
5134 #endif
5135 #ifdef DEBUGGING
5136     if (RExC_paren_names) {
5137         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5138         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5139     } else
5140 #endif
5141         ri->name_list_idx = 0;
5142
5143     if (RExC_recurse_count) {
5144         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5145             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5146             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5147         }
5148     }
5149     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5150     /* assume we don't need to swap parens around before we match */
5151
5152     DEBUG_DUMP_r({
5153         PerlIO_printf(Perl_debug_log,"Final program:\n");
5154         regdump(r);
5155     });
5156 #ifdef RE_TRACK_PATTERN_OFFSETS
5157     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5158         const U32 len = ri->u.offsets[0];
5159         U32 i;
5160         GET_RE_DEBUG_FLAGS_DECL;
5161         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5162         for (i = 1; i <= len; i++) {
5163             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5164                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5165                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5166             }
5167         PerlIO_printf(Perl_debug_log, "\n");
5168     });
5169 #endif
5170     return rx;
5171 }
5172
5173 #undef RE_ENGINE_PTR
5174
5175
5176 SV*
5177 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5178                     const U32 flags)
5179 {
5180     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5181
5182     PERL_UNUSED_ARG(value);
5183
5184     if (flags & RXapif_FETCH) {
5185         return reg_named_buff_fetch(rx, key, flags);
5186     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5187         Perl_croak_no_modify(aTHX);
5188         return NULL;
5189     } else if (flags & RXapif_EXISTS) {
5190         return reg_named_buff_exists(rx, key, flags)
5191             ? &PL_sv_yes
5192             : &PL_sv_no;
5193     } else if (flags & RXapif_REGNAMES) {
5194         return reg_named_buff_all(rx, flags);
5195     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5196         return reg_named_buff_scalar(rx, flags);
5197     } else {
5198         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5199         return NULL;
5200     }
5201 }
5202
5203 SV*
5204 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5205                          const U32 flags)
5206 {
5207     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5208     PERL_UNUSED_ARG(lastkey);
5209
5210     if (flags & RXapif_FIRSTKEY)
5211         return reg_named_buff_firstkey(rx, flags);
5212     else if (flags & RXapif_NEXTKEY)
5213         return reg_named_buff_nextkey(rx, flags);
5214     else {
5215         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5216         return NULL;
5217     }
5218 }
5219
5220 SV*
5221 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5222                           const U32 flags)
5223 {
5224     AV *retarray = NULL;
5225     SV *ret;
5226     struct regexp *const rx = (struct regexp *)SvANY(r);
5227
5228     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5229
5230     if (flags & RXapif_ALL)
5231         retarray=newAV();
5232
5233     if (rx && RXp_PAREN_NAMES(rx)) {
5234         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5235         if (he_str) {
5236             IV i;
5237             SV* sv_dat=HeVAL(he_str);
5238             I32 *nums=(I32*)SvPVX(sv_dat);
5239             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5240                 if ((I32)(rx->nparens) >= nums[i]
5241                     && rx->offs[nums[i]].start != -1
5242                     && rx->offs[nums[i]].end != -1)
5243                 {
5244                     ret = newSVpvs("");
5245                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5246                     if (!retarray)
5247                         return ret;
5248                 } else {
5249                     ret = newSVsv(&PL_sv_undef);
5250                 }
5251                 if (retarray)
5252                     av_push(retarray, ret);
5253             }
5254             if (retarray)
5255                 return newRV_noinc(MUTABLE_SV(retarray));
5256         }
5257     }
5258     return NULL;
5259 }
5260
5261 bool
5262 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5263                            const U32 flags)
5264 {
5265     struct regexp *const rx = (struct regexp *)SvANY(r);
5266
5267     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5268
5269     if (rx && RXp_PAREN_NAMES(rx)) {
5270         if (flags & RXapif_ALL) {
5271             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5272         } else {
5273             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5274             if (sv) {
5275                 SvREFCNT_dec(sv);
5276                 return TRUE;
5277             } else {
5278                 return FALSE;
5279             }
5280         }
5281     } else {
5282         return FALSE;
5283     }
5284 }
5285
5286 SV*
5287 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5288 {
5289     struct regexp *const rx = (struct regexp *)SvANY(r);
5290
5291     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5292
5293     if ( rx && RXp_PAREN_NAMES(rx) ) {
5294         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5295
5296         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5297     } else {
5298         return FALSE;
5299     }
5300 }
5301
5302 SV*
5303 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5304 {
5305     struct regexp *const rx = (struct regexp *)SvANY(r);
5306     GET_RE_DEBUG_FLAGS_DECL;
5307
5308     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5309
5310     if (rx && RXp_PAREN_NAMES(rx)) {
5311         HV *hv = RXp_PAREN_NAMES(rx);
5312         HE *temphe;
5313         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5314             IV i;
5315             IV parno = 0;
5316             SV* sv_dat = HeVAL(temphe);
5317             I32 *nums = (I32*)SvPVX(sv_dat);
5318             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5319                 if ((I32)(rx->lastparen) >= nums[i] &&
5320                     rx->offs[nums[i]].start != -1 &&
5321                     rx->offs[nums[i]].end != -1)
5322                 {
5323                     parno = nums[i];
5324                     break;
5325                 }
5326             }
5327             if (parno || flags & RXapif_ALL) {
5328                 return newSVhek(HeKEY_hek(temphe));
5329             }
5330         }
5331     }
5332     return NULL;
5333 }
5334
5335 SV*
5336 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5337 {
5338     SV *ret;
5339     AV *av;
5340     I32 length;
5341     struct regexp *const rx = (struct regexp *)SvANY(r);
5342
5343     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5344
5345     if (rx && RXp_PAREN_NAMES(rx)) {
5346         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5347             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5348         } else if (flags & RXapif_ONE) {
5349             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5350             av = MUTABLE_AV(SvRV(ret));
5351             length = av_len(av);
5352             SvREFCNT_dec(ret);
5353             return newSViv(length + 1);
5354         } else {
5355             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5356             return NULL;
5357         }
5358     }
5359     return &PL_sv_undef;
5360 }
5361
5362 SV*
5363 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5364 {
5365     struct regexp *const rx = (struct regexp *)SvANY(r);
5366     AV *av = newAV();
5367
5368     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5369
5370     if (rx && RXp_PAREN_NAMES(rx)) {
5371         HV *hv= RXp_PAREN_NAMES(rx);
5372         HE *temphe;
5373         (void)hv_iterinit(hv);
5374         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5375             IV i;
5376             IV parno = 0;
5377             SV* sv_dat = HeVAL(temphe);
5378             I32 *nums = (I32*)SvPVX(sv_dat);
5379             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5380                 if ((I32)(rx->lastparen) >= nums[i] &&
5381                     rx->offs[nums[i]].start != -1 &&
5382                     rx->offs[nums[i]].end != -1)
5383                 {
5384                     parno = nums[i];
5385                     break;
5386                 }
5387             }
5388             if (parno || flags & RXapif_ALL) {
5389                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5390             }
5391         }
5392     }
5393
5394     return newRV_noinc(MUTABLE_SV(av));
5395 }
5396
5397 void
5398 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5399                              SV * const sv)
5400 {
5401     struct regexp *const rx = (struct regexp *)SvANY(r);
5402     char *s = NULL;
5403     I32 i = 0;
5404     I32 s1, t1;
5405
5406     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5407
5408     if (!rx->subbeg) {
5409         sv_setsv(sv,&PL_sv_undef);
5410         return;
5411     }
5412     else
5413     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5414         /* $` */
5415         i = rx->offs[0].start;
5416         s = rx->subbeg;
5417     }
5418     else
5419     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5420         /* $' */
5421         s = rx->subbeg + rx->offs[0].end;
5422         i = rx->sublen - rx->offs[0].end;
5423     }
5424     else
5425     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5426         (s1 = rx->offs[paren].start) != -1 &&
5427         (t1 = rx->offs[paren].end) != -1)
5428     {
5429         /* $& $1 ... */
5430         i = t1 - s1;
5431         s = rx->subbeg + s1;
5432     } else {
5433         sv_setsv(sv,&PL_sv_undef);
5434         return;
5435     }
5436     assert(rx->sublen >= (s - rx->subbeg) + i );
5437     if (i >= 0) {
5438         const int oldtainted = PL_tainted;
5439         TAINT_NOT;
5440         sv_setpvn(sv, s, i);
5441         PL_tainted = oldtainted;
5442         if ( (rx->extflags & RXf_CANY_SEEN)
5443             ? (RXp_MATCH_UTF8(rx)
5444                         && (!i || is_utf8_string((U8*)s, i)))
5445             : (RXp_MATCH_UTF8(rx)) )
5446         {
5447             SvUTF8_on(sv);
5448         }
5449         else
5450             SvUTF8_off(sv);
5451         if (PL_tainting) {
5452             if (RXp_MATCH_TAINTED(rx)) {
5453                 if (SvTYPE(sv) >= SVt_PVMG) {
5454                     MAGIC* const mg = SvMAGIC(sv);
5455                     MAGIC* mgt;
5456                     PL_tainted = 1;
5457                     SvMAGIC_set(sv, mg->mg_moremagic);
5458                     SvTAINT(sv);
5459                     if ((mgt = SvMAGIC(sv))) {
5460                         mg->mg_moremagic = mgt;
5461                         SvMAGIC_set(sv, mg);
5462                     }
5463                 } else {
5464                     PL_tainted = 1;
5465                     SvTAINT(sv);
5466                 }
5467             } else
5468                 SvTAINTED_off(sv);
5469         }
5470     } else {
5471         sv_setsv(sv,&PL_sv_undef);
5472         return;
5473     }
5474 }
5475
5476 void
5477 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5478                                                          SV const * const value)
5479 {
5480     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5481
5482     PERL_UNUSED_ARG(rx);
5483     PERL_UNUSED_ARG(paren);
5484     PERL_UNUSED_ARG(value);
5485
5486     if (!PL_localizing)
5487         Perl_croak_no_modify(aTHX);
5488 }
5489
5490 I32
5491 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5492                               const I32 paren)
5493 {
5494     struct regexp *const rx = (struct regexp *)SvANY(r);
5495     I32 i;
5496     I32 s1, t1;
5497
5498     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5499
5500     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5501         switch (paren) {
5502       /* $` / ${^PREMATCH} */
5503       case RX_BUFF_IDX_PREMATCH:
5504         if (rx->offs[0].start != -1) {
5505                         i = rx->offs[0].start;
5506                         if (i > 0) {
5507                                 s1 = 0;
5508                                 t1 = i;
5509                                 goto getlen;
5510                         }
5511             }
5512         return 0;
5513       /* $' / ${^POSTMATCH} */
5514       case RX_BUFF_IDX_POSTMATCH:
5515             if (rx->offs[0].end != -1) {
5516                         i = rx->sublen - rx->offs[0].end;
5517                         if (i > 0) {
5518                                 s1 = rx->offs[0].end;
5519                                 t1 = rx->sublen;
5520                                 goto getlen;
5521                         }
5522             }
5523         return 0;
5524       /* $& / ${^MATCH}, $1, $2, ... */
5525       default:
5526             if (paren <= (I32)rx->nparens &&
5527             (s1 = rx->offs[paren].start) != -1 &&
5528             (t1 = rx->offs[paren].end) != -1)
5529             {
5530             i = t1 - s1;
5531             goto getlen;
5532         } else {
5533             if (ckWARN(WARN_UNINITIALIZED))
5534                 report_uninit((const SV *)sv);
5535             return 0;
5536         }
5537     }
5538   getlen:
5539     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5540         const char * const s = rx->subbeg + s1;
5541         const U8 *ep;
5542         STRLEN el;
5543
5544         i = t1 - s1;
5545         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5546                         i = el;
5547     }
5548     return i;
5549 }
5550
5551 SV*
5552 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5553 {
5554     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5555         PERL_UNUSED_ARG(rx);
5556         if (0)
5557             return NULL;
5558         else
5559             return newSVpvs("Regexp");
5560 }
5561
5562 /* Scans the name of a named buffer from the pattern.
5563  * If flags is REG_RSN_RETURN_NULL returns null.
5564  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5565  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5566  * to the parsed name as looked up in the RExC_paren_names hash.
5567  * If there is an error throws a vFAIL().. type exception.
5568  */
5569
5570 #define REG_RSN_RETURN_NULL    0
5571 #define REG_RSN_RETURN_NAME    1
5572 #define REG_RSN_RETURN_DATA    2
5573
5574 STATIC SV*
5575 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5576 {
5577     char *name_start = RExC_parse;
5578
5579     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5580
5581     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5582          /* skip IDFIRST by using do...while */
5583         if (UTF)
5584             do {
5585                 RExC_parse += UTF8SKIP(RExC_parse);
5586             } while (isALNUM_utf8((U8*)RExC_parse));
5587         else
5588             do {
5589                 RExC_parse++;
5590             } while (isALNUM(*RExC_parse));
5591     }
5592
5593     if ( flags ) {
5594         SV* sv_name
5595             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5596                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5597         if ( flags == REG_RSN_RETURN_NAME)
5598             return sv_name;
5599         else if (flags==REG_RSN_RETURN_DATA) {
5600             HE *he_str = NULL;
5601             SV *sv_dat = NULL;
5602             if ( ! sv_name )      /* should not happen*/
5603                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5604             if (RExC_paren_names)
5605                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5606             if ( he_str )
5607                 sv_dat = HeVAL(he_str);
5608             if ( ! sv_dat )
5609                 vFAIL("Reference to nonexistent named group");
5610             return sv_dat;
5611         }
5612         else {
5613             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5614         }
5615         /* NOT REACHED */
5616     }
5617     return NULL;
5618 }
5619
5620 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5621     int rem=(int)(RExC_end - RExC_parse);                       \
5622     int cut;                                                    \
5623     int num;                                                    \
5624     int iscut=0;                                                \
5625     if (rem>10) {                                               \
5626         rem=10;                                                 \
5627         iscut=1;                                                \
5628     }                                                           \
5629     cut=10-rem;                                                 \
5630     if (RExC_lastparse!=RExC_parse)                             \
5631         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5632             rem, RExC_parse,                                    \
5633             cut + 4,                                            \
5634             iscut ? "..." : "<"                                 \
5635         );                                                      \
5636     else                                                        \
5637         PerlIO_printf(Perl_debug_log,"%16s","");                \
5638                                                                 \
5639     if (SIZE_ONLY)                                              \
5640        num = RExC_size + 1;                                     \
5641     else                                                        \
5642        num=REG_NODE_NUM(RExC_emit);                             \
5643     if (RExC_lastnum!=num)                                      \
5644        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5645     else                                                        \
5646        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5647     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5648         (int)((depth*2)), "",                                   \
5649         (funcname)                                              \
5650     );                                                          \
5651     RExC_lastnum=num;                                           \
5652     RExC_lastparse=RExC_parse;                                  \
5653 })
5654
5655
5656
5657 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5658     DEBUG_PARSE_MSG((funcname));                            \
5659     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5660 })
5661 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5662     DEBUG_PARSE_MSG((funcname));                            \
5663     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5664 })
5665 /*
5666  - reg - regular expression, i.e. main body or parenthesized thing
5667  *
5668  * Caller must absorb opening parenthesis.
5669  *
5670  * Combining parenthesis handling with the base level of regular expression
5671  * is a trifle forced, but the need to tie the tails of the branches to what
5672  * follows makes it hard to avoid.
5673  */
5674 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5675 #ifdef DEBUGGING
5676 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5677 #else
5678 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5679 #endif
5680
5681 STATIC regnode *
5682 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5683     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5684 {
5685     dVAR;
5686     register regnode *ret;              /* Will be the head of the group. */
5687     register regnode *br;
5688     register regnode *lastbr;
5689     register regnode *ender = NULL;
5690     register I32 parno = 0;
5691     I32 flags;
5692     U32 oregflags = RExC_flags;
5693     bool have_branch = 0;
5694     bool is_open = 0;
5695     I32 freeze_paren = 0;
5696     I32 after_freeze = 0;
5697
5698     /* for (?g), (?gc), and (?o) warnings; warning
5699        about (?c) will warn about (?g) -- japhy    */
5700
5701 #define WASTED_O  0x01
5702 #define WASTED_G  0x02
5703 #define WASTED_C  0x04
5704 #define WASTED_GC (0x02|0x04)
5705     I32 wastedflags = 0x00;
5706
5707     char * parse_start = RExC_parse; /* MJD */
5708     char * const oregcomp_parse = RExC_parse;
5709
5710     GET_RE_DEBUG_FLAGS_DECL;
5711
5712     PERL_ARGS_ASSERT_REG;
5713     DEBUG_PARSE("reg ");
5714
5715     *flagp = 0;                         /* Tentatively. */
5716
5717
5718     /* Make an OPEN node, if parenthesized. */
5719     if (paren) {
5720         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5721             char *start_verb = RExC_parse;
5722             STRLEN verb_len = 0;
5723             char *start_arg = NULL;
5724             unsigned char op = 0;
5725             int argok = 1;
5726             int internal_argval = 0; /* internal_argval is only useful if !argok */
5727             while ( *RExC_parse && *RExC_parse != ')' ) {
5728                 if ( *RExC_parse == ':' ) {
5729                     start_arg = RExC_parse + 1;
5730                     break;
5731                 }
5732                 RExC_parse++;
5733             }
5734             ++start_verb;
5735             verb_len = RExC_parse - start_verb;
5736             if ( start_arg ) {
5737                 RExC_parse++;
5738                 while ( *RExC_parse && *RExC_parse != ')' )
5739                     RExC_parse++;
5740                 if ( *RExC_parse != ')' )
5741                     vFAIL("Unterminated verb pattern argument");
5742                 if ( RExC_parse == start_arg )
5743                     start_arg = NULL;
5744             } else {
5745                 if ( *RExC_parse != ')' )
5746                     vFAIL("Unterminated verb pattern");
5747             }
5748
5749             switch ( *start_verb ) {
5750             case 'A':  /* (*ACCEPT) */
5751                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5752                     op = ACCEPT;
5753                     internal_argval = RExC_nestroot;
5754                 }
5755                 break;
5756             case 'C':  /* (*COMMIT) */
5757                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5758                     op = COMMIT;
5759                 break;
5760             case 'F':  /* (*FAIL) */
5761                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5762                     op = OPFAIL;
5763                     argok = 0;
5764                 }
5765                 break;
5766             case ':':  /* (*:NAME) */
5767             case 'M':  /* (*MARK:NAME) */
5768                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5769                     op = MARKPOINT;
5770                     argok = -1;
5771                 }
5772                 break;
5773             case 'P':  /* (*PRUNE) */
5774                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5775                     op = PRUNE;
5776                 break;
5777             case 'S':   /* (*SKIP) */
5778                 if ( memEQs(start_verb,verb_len,"SKIP") )
5779                     op = SKIP;
5780                 break;
5781             case 'T':  /* (*THEN) */
5782                 /* [19:06] <TimToady> :: is then */
5783                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5784                     op = CUTGROUP;
5785                     RExC_seen |= REG_SEEN_CUTGROUP;
5786                 }
5787                 break;
5788             }
5789             if ( ! op ) {
5790                 RExC_parse++;
5791                 vFAIL3("Unknown verb pattern '%.*s'",
5792                     verb_len, start_verb);
5793             }
5794             if ( argok ) {
5795                 if ( start_arg && internal_argval ) {
5796                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5797                         verb_len, start_verb);
5798                 } else if ( argok < 0 && !start_arg ) {
5799                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5800                         verb_len, start_verb);
5801                 } else {
5802                     ret = reganode(pRExC_state, op, internal_argval);
5803                     if ( ! internal_argval && ! SIZE_ONLY ) {
5804                         if (start_arg) {
5805                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5806                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5807                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5808                             ret->flags = 0;
5809                         } else {
5810                             ret->flags = 1;
5811                         }
5812                     }
5813                 }
5814                 if (!internal_argval)
5815                     RExC_seen |= REG_SEEN_VERBARG;
5816             } else if ( start_arg ) {
5817                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5818                         verb_len, start_verb);
5819             } else {
5820                 ret = reg_node(pRExC_state, op);
5821             }
5822             nextchar(pRExC_state);
5823             return ret;
5824         } else
5825         if (*RExC_parse == '?') { /* (?...) */
5826             bool is_logical = 0;
5827             const char * const seqstart = RExC_parse;
5828             bool has_use_defaults = FALSE;
5829
5830             RExC_parse++;
5831             paren = *RExC_parse++;
5832             ret = NULL;                 /* For look-ahead/behind. */
5833             switch (paren) {
5834
5835             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5836                 paren = *RExC_parse++;
5837                 if ( paren == '<')         /* (?P<...>) named capture */
5838                     goto named_capture;
5839                 else if (paren == '>') {   /* (?P>name) named recursion */
5840                     goto named_recursion;
5841                 }
5842                 else if (paren == '=') {   /* (?P=...)  named backref */
5843                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5844                        you change this make sure you change that */
5845                     char* name_start = RExC_parse;
5846                     U32 num = 0;
5847                     SV *sv_dat = reg_scan_name(pRExC_state,
5848                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5849                     if (RExC_parse == name_start || *RExC_parse != ')')
5850                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5851
5852                     if (!SIZE_ONLY) {
5853                         num = add_data( pRExC_state, 1, "S" );
5854                         RExC_rxi->data->data[num]=(void*)sv_dat;
5855                         SvREFCNT_inc_simple_void(sv_dat);
5856                     }
5857                     RExC_sawback = 1;
5858                     ret = reganode(pRExC_state,
5859                                    ((! FOLD)
5860                                      ? NREF
5861                                      : (UNI_SEMANTICS)
5862                                        ? NREFFU
5863                                        : (LOC)
5864                                          ? NREFFL
5865                                          : NREFF),
5866                                     num);
5867                     *flagp |= HASWIDTH;
5868
5869                     Set_Node_Offset(ret, parse_start+1);
5870                     Set_Node_Cur_Length(ret); /* MJD */
5871
5872                     nextchar(pRExC_state);
5873                     return ret;
5874                 }
5875                 RExC_parse++;
5876                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5877                 /*NOTREACHED*/
5878             case '<':           /* (?<...) */
5879                 if (*RExC_parse == '!')
5880                     paren = ',';
5881                 else if (*RExC_parse != '=')
5882               named_capture:
5883                 {               /* (?<...>) */
5884                     char *name_start;
5885                     SV *svname;
5886                     paren= '>';
5887             case '\'':          /* (?'...') */
5888                     name_start= RExC_parse;
5889                     svname = reg_scan_name(pRExC_state,
5890                         SIZE_ONLY ?  /* reverse test from the others */
5891                         REG_RSN_RETURN_NAME :
5892                         REG_RSN_RETURN_NULL);
5893                     if (RExC_parse == name_start) {
5894                         RExC_parse++;
5895                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5896                         /*NOTREACHED*/
5897                     }
5898                     if (*RExC_parse != paren)
5899                         vFAIL2("Sequence (?%c... not terminated",
5900                             paren=='>' ? '<' : paren);
5901                     if (SIZE_ONLY) {
5902                         HE *he_str;
5903                         SV *sv_dat = NULL;
5904                         if (!svname) /* shouldnt happen */
5905                             Perl_croak(aTHX_
5906                                 "panic: reg_scan_name returned NULL");
5907                         if (!RExC_paren_names) {
5908                             RExC_paren_names= newHV();
5909                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5910 #ifdef DEBUGGING
5911                             RExC_paren_name_list= newAV();
5912                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5913 #endif
5914                         }
5915                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5916                         if ( he_str )
5917                             sv_dat = HeVAL(he_str);
5918                         if ( ! sv_dat ) {
5919                             /* croak baby croak */
5920                             Perl_croak(aTHX_
5921                                 "panic: paren_name hash element allocation failed");
5922                         } else if ( SvPOK(sv_dat) ) {
5923                             /* (?|...) can mean we have dupes so scan to check
5924                                its already been stored. Maybe a flag indicating
5925                                we are inside such a construct would be useful,
5926                                but the arrays are likely to be quite small, so
5927                                for now we punt -- dmq */
5928                             IV count = SvIV(sv_dat);
5929                             I32 *pv = (I32*)SvPVX(sv_dat);
5930                             IV i;
5931                             for ( i = 0 ; i < count ; i++ ) {
5932                                 if ( pv[i] == RExC_npar ) {
5933                                     count = 0;
5934                                     break;
5935                                 }
5936                             }
5937                             if ( count ) {
5938                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5939                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5940                                 pv[count] = RExC_npar;
5941                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5942                             }
5943                         } else {
5944                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5945                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5946                             SvIOK_on(sv_dat);
5947                             SvIV_set(sv_dat, 1);
5948                         }
5949 #ifdef DEBUGGING
5950                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5951                             SvREFCNT_dec(svname);
5952 #endif
5953
5954                         /*sv_dump(sv_dat);*/
5955                     }
5956                     nextchar(pRExC_state);
5957                     paren = 1;
5958                     goto capturing_parens;
5959                 }
5960                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5961                 RExC_parse++;
5962             case '=':           /* (?=...) */
5963                 RExC_seen_zerolen++;
5964                 break;
5965             case '!':           /* (?!...) */
5966                 RExC_seen_zerolen++;
5967                 if (*RExC_parse == ')') {
5968                     ret=reg_node(pRExC_state, OPFAIL);
5969                     nextchar(pRExC_state);
5970                     return ret;
5971                 }
5972                 break;
5973             case '|':           /* (?|...) */
5974                 /* branch reset, behave like a (?:...) except that
5975                    buffers in alternations share the same numbers */
5976                 paren = ':';
5977                 after_freeze = freeze_paren = RExC_npar;
5978                 break;
5979             case ':':           /* (?:...) */
5980             case '>':           /* (?>...) */
5981                 break;
5982             case '$':           /* (?$...) */
5983             case '@':           /* (?@...) */
5984                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5985                 break;
5986             case '#':           /* (?#...) */
5987                 while (*RExC_parse && *RExC_parse != ')')
5988                     RExC_parse++;
5989                 if (*RExC_parse != ')')
5990                     FAIL("Sequence (?#... not terminated");
5991                 nextchar(pRExC_state);
5992                 *flagp = TRYAGAIN;
5993                 return NULL;
5994             case '0' :           /* (?0) */
5995             case 'R' :           /* (?R) */
5996                 if (*RExC_parse != ')')
5997                     FAIL("Sequence (?R) not terminated");
5998                 ret = reg_node(pRExC_state, GOSTART);
5999                 *flagp |= POSTPONED;
6000                 nextchar(pRExC_state);
6001                 return ret;
6002                 /*notreached*/
6003             { /* named and numeric backreferences */
6004                 I32 num;
6005             case '&':            /* (?&NAME) */
6006                 parse_start = RExC_parse - 1;
6007               named_recursion:
6008                 {
6009                     SV *sv_dat = reg_scan_name(pRExC_state,
6010                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6011                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6012                 }
6013                 goto gen_recurse_regop;
6014                 /* NOT REACHED */
6015             case '+':
6016                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6017                     RExC_parse++;
6018                     vFAIL("Illegal pattern");
6019                 }
6020                 goto parse_recursion;
6021                 /* NOT REACHED*/
6022             case '-': /* (?-1) */
6023                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6024                     RExC_parse--; /* rewind to let it be handled later */
6025                     goto parse_flags;
6026                 }
6027                 /*FALLTHROUGH */
6028             case '1': case '2': case '3': case '4': /* (?1) */
6029             case '5': case '6': case '7': case '8': case '9':
6030                 RExC_parse--;
6031               parse_recursion:
6032                 num = atoi(RExC_parse);
6033                 parse_start = RExC_parse - 1; /* MJD */
6034                 if (*RExC_parse == '-')
6035                     RExC_parse++;
6036                 while (isDIGIT(*RExC_parse))
6037                         RExC_parse++;
6038                 if (*RExC_parse!=')')
6039                     vFAIL("Expecting close bracket");
6040
6041               gen_recurse_regop:
6042                 if ( paren == '-' ) {
6043                     /*
6044                     Diagram of capture buffer numbering.
6045                     Top line is the normal capture buffer numbers
6046                     Bottom line is the negative indexing as from
6047                     the X (the (?-2))
6048
6049                     +   1 2    3 4 5 X          6 7
6050                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6051                     -   5 4    3 2 1 X          x x
6052
6053                     */
6054                     num = RExC_npar + num;
6055                     if (num < 1)  {
6056                         RExC_parse++;
6057                         vFAIL("Reference to nonexistent group");
6058                     }
6059                 } else if ( paren == '+' ) {
6060                     num = RExC_npar + num - 1;
6061                 }
6062
6063                 ret = reganode(pRExC_state, GOSUB, num);
6064                 if (!SIZE_ONLY) {
6065                     if (num > (I32)RExC_rx->nparens) {
6066                         RExC_parse++;
6067                         vFAIL("Reference to nonexistent group");
6068                     }
6069                     ARG2L_SET( ret, RExC_recurse_count++);
6070                     RExC_emit++;
6071                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6072                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6073                 } else {
6074                     RExC_size++;
6075                 }
6076                 RExC_seen |= REG_SEEN_RECURSE;
6077                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6078                 Set_Node_Offset(ret, parse_start); /* MJD */
6079
6080                 *flagp |= POSTPONED;
6081                 nextchar(pRExC_state);
6082                 return ret;
6083             } /* named and numeric backreferences */
6084             /* NOT REACHED */
6085
6086             case '?':           /* (??...) */
6087                 is_logical = 1;
6088                 if (*RExC_parse != '{') {
6089                     RExC_parse++;
6090                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6091                     /*NOTREACHED*/
6092                 }
6093                 *flagp |= POSTPONED;
6094                 paren = *RExC_parse++;
6095                 /* FALL THROUGH */
6096             case '{':           /* (?{...}) */
6097             {
6098                 I32 count = 1;
6099                 U32 n = 0;
6100                 char c;
6101                 char *s = RExC_parse;
6102
6103                 RExC_seen_zerolen++;
6104                 RExC_seen |= REG_SEEN_EVAL;
6105                 while (count && (c = *RExC_parse)) {
6106                     if (c == '\\') {
6107                         if (RExC_parse[1])
6108                             RExC_parse++;
6109                     }
6110                     else if (c == '{')
6111                         count++;
6112                     else if (c == '}')
6113                         count--;
6114                     RExC_parse++;
6115                 }
6116                 if (*RExC_parse != ')') {
6117                     RExC_parse = s;
6118                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6119                 }
6120                 if (!SIZE_ONLY) {
6121                     PAD *pad;
6122                     OP_4tree *sop, *rop;
6123                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6124
6125                     ENTER;
6126                     Perl_save_re_context(aTHX);
6127                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6128                     sop->op_private |= OPpREFCOUNTED;
6129                     /* re_dup will OpREFCNT_inc */
6130                     OpREFCNT_set(sop, 1);
6131                     LEAVE;
6132
6133                     n = add_data(pRExC_state, 3, "nop");
6134                     RExC_rxi->data->data[n] = (void*)rop;
6135                     RExC_rxi->data->data[n+1] = (void*)sop;
6136                     RExC_rxi->data->data[n+2] = (void*)pad;
6137                     SvREFCNT_dec(sv);
6138                 }
6139                 else {                                          /* First pass */
6140                     if (PL_reginterp_cnt < ++RExC_seen_evals
6141                         && IN_PERL_RUNTIME)
6142                         /* No compiled RE interpolated, has runtime
6143                            components ===> unsafe.  */
6144                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6145                     if (PL_tainting && PL_tainted)
6146                         FAIL("Eval-group in insecure regular expression");
6147 #if PERL_VERSION > 8
6148                     if (IN_PERL_COMPILETIME)
6149                         PL_cv_has_eval = 1;
6150 #endif
6151                 }
6152
6153                 nextchar(pRExC_state);
6154                 if (is_logical) {
6155                     ret = reg_node(pRExC_state, LOGICAL);
6156                     if (!SIZE_ONLY)
6157                         ret->flags = 2;
6158                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6159                     /* deal with the length of this later - MJD */
6160                     return ret;
6161                 }
6162                 ret = reganode(pRExC_state, EVAL, n);
6163                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6164                 Set_Node_Offset(ret, parse_start);
6165                 return ret;
6166             }
6167             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6168             {
6169                 int is_define= 0;
6170                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6171                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6172                         || RExC_parse[1] == '<'
6173                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6174                         I32 flag;
6175
6176                         ret = reg_node(pRExC_state, LOGICAL);
6177                         if (!SIZE_ONLY)
6178                             ret->flags = 1;
6179                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6180                         goto insert_if;
6181                     }
6182                 }
6183                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6184                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6185                 {
6186                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6187                     char *name_start= RExC_parse++;
6188                     U32 num = 0;
6189                     SV *sv_dat=reg_scan_name(pRExC_state,
6190                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6191                     if (RExC_parse == name_start || *RExC_parse != ch)
6192                         vFAIL2("Sequence (?(%c... not terminated",
6193                             (ch == '>' ? '<' : ch));
6194                     RExC_parse++;
6195                     if (!SIZE_ONLY) {
6196                         num = add_data( pRExC_state, 1, "S" );
6197                         RExC_rxi->data->data[num]=(void*)sv_dat;
6198                         SvREFCNT_inc_simple_void(sv_dat);
6199                     }
6200                     ret = reganode(pRExC_state,NGROUPP,num);
6201                     goto insert_if_check_paren;
6202                 }
6203                 else if (RExC_parse[0] == 'D' &&
6204                          RExC_parse[1] == 'E' &&
6205                          RExC_parse[2] == 'F' &&
6206                          RExC_parse[3] == 'I' &&
6207                          RExC_parse[4] == 'N' &&
6208                          RExC_parse[5] == 'E')
6209                 {
6210                     ret = reganode(pRExC_state,DEFINEP,0);
6211                     RExC_parse +=6 ;
6212                     is_define = 1;
6213                     goto insert_if_check_paren;
6214                 }
6215                 else if (RExC_parse[0] == 'R') {
6216                     RExC_parse++;
6217                     parno = 0;
6218                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6219                         parno = atoi(RExC_parse++);
6220                         while (isDIGIT(*RExC_parse))
6221                             RExC_parse++;
6222                     } else if (RExC_parse[0] == '&') {
6223                         SV *sv_dat;
6224                         RExC_parse++;
6225                         sv_dat = reg_scan_name(pRExC_state,
6226                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6227                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6228                     }
6229                     ret = reganode(pRExC_state,INSUBP,parno);
6230                     goto insert_if_check_paren;
6231                 }
6232                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6233                     /* (?(1)...) */
6234                     char c;
6235                     parno = atoi(RExC_parse++);
6236
6237                     while (isDIGIT(*RExC_parse))
6238                         RExC_parse++;
6239                     ret = reganode(pRExC_state, GROUPP, parno);
6240
6241                  insert_if_check_paren:
6242                     if ((c = *nextchar(pRExC_state)) != ')')
6243                         vFAIL("Switch condition not recognized");
6244                   insert_if:
6245                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6246                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6247                     if (br == NULL)
6248                         br = reganode(pRExC_state, LONGJMP, 0);
6249                     else
6250                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6251                     c = *nextchar(pRExC_state);
6252                     if (flags&HASWIDTH)
6253                         *flagp |= HASWIDTH;
6254                     if (c == '|') {
6255                         if (is_define)
6256                             vFAIL("(?(DEFINE)....) does not allow branches");
6257                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6258                         regbranch(pRExC_state, &flags, 1,depth+1);
6259                         REGTAIL(pRExC_state, ret, lastbr);
6260                         if (flags&HASWIDTH)
6261                             *flagp |= HASWIDTH;
6262                         c = *nextchar(pRExC_state);
6263                     }
6264                     else
6265                         lastbr = NULL;
6266                     if (c != ')')
6267                         vFAIL("Switch (?(condition)... contains too many branches");
6268                     ender = reg_node(pRExC_state, TAIL);
6269                     REGTAIL(pRExC_state, br, ender);
6270                     if (lastbr) {
6271                         REGTAIL(pRExC_state, lastbr, ender);
6272                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6273                     }
6274                     else
6275                         REGTAIL(pRExC_state, ret, ender);
6276                     RExC_size++; /* XXX WHY do we need this?!!
6277                                     For large programs it seems to be required
6278                                     but I can't figure out why. -- dmq*/
6279                     return ret;
6280                 }
6281                 else {
6282                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6283                 }
6284             }
6285             case 0:
6286                 RExC_parse--; /* for vFAIL to print correctly */
6287                 vFAIL("Sequence (? incomplete");
6288                 break;
6289             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
6290                                        that follow */
6291                 has_use_defaults = TRUE;
6292                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6293                 if (RExC_utf8) {    /* But the default for a utf8 pattern is
6294                                        unicode semantics */
6295                     RExC_flags |= RXf_PMf_UNICODE;
6296                 }
6297                 goto parse_flags;
6298             default:
6299                 --RExC_parse;
6300                 parse_flags:      /* (?i) */
6301             {
6302                 U32 posflags = 0, negflags = 0;
6303                 U32 *flagsp = &posflags;
6304                 bool has_charset_modifier = 0;
6305
6306                 while (*RExC_parse) {
6307                     /* && strchr("iogcmsx", *RExC_parse) */
6308                     /* (?g), (?gc) and (?o) are useless here
6309                        and must be globally applied -- japhy */
6310                     switch (*RExC_parse) {
6311                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6312                     case LOCALE_PAT_MOD:
6313                         if (has_charset_modifier || flagsp == &negflags) {
6314                             goto fail_modifiers;
6315                         }
6316                         posflags |= RXf_PMf_LOCALE;
6317                         negflags |= RXf_PMf_UNICODE;
6318                         has_charset_modifier = 1;
6319                         break;
6320                     case UNICODE_PAT_MOD:
6321                         if (has_charset_modifier || flagsp == &negflags) {
6322                             goto fail_modifiers;
6323                         }
6324                         posflags |= RXf_PMf_UNICODE;
6325                         negflags |= RXf_PMf_LOCALE;
6326                         has_charset_modifier = 1;
6327                         break;
6328                     case DUAL_PAT_MOD:
6329                         if (has_use_defaults
6330                             || has_charset_modifier
6331                             || flagsp == &negflags)
6332                         {
6333                             goto fail_modifiers;
6334                         }
6335
6336                         /* The dual charset means unicode semantics if the
6337                          * pattern (or target, not known until runtime) are
6338                          * utf8 */
6339                         if (RExC_utf8) {
6340                             posflags |= RXf_PMf_UNICODE;
6341                             negflags |= RXf_PMf_LOCALE;
6342                         }
6343                         else {
6344                             negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6345                         }
6346                         has_charset_modifier = 1;
6347                         break;
6348                     case ONCE_PAT_MOD: /* 'o' */
6349                     case GLOBAL_PAT_MOD: /* 'g' */
6350                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6351                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6352                             if (! (wastedflags & wflagbit) ) {
6353                                 wastedflags |= wflagbit;
6354                                 vWARN5(
6355                                     RExC_parse + 1,
6356                                     "Useless (%s%c) - %suse /%c modifier",
6357                                     flagsp == &negflags ? "?-" : "?",
6358                                     *RExC_parse,
6359                                     flagsp == &negflags ? "don't " : "",
6360                                     *RExC_parse
6361                                 );
6362                             }
6363                         }
6364                         break;
6365
6366                     case CONTINUE_PAT_MOD: /* 'c' */
6367                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6368                             if (! (wastedflags & WASTED_C) ) {
6369                                 wastedflags |= WASTED_GC;
6370                                 vWARN3(
6371                                     RExC_parse + 1,
6372                                     "Useless (%sc) - %suse /gc modifier",
6373                                     flagsp == &negflags ? "?-" : "?",
6374                                     flagsp == &negflags ? "don't " : ""
6375                                 );
6376                             }
6377                         }
6378                         break;
6379                     case KEEPCOPY_PAT_MOD: /* 'p' */
6380                         if (flagsp == &negflags) {
6381                             if (SIZE_ONLY)
6382                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6383                         } else {
6384                             *flagsp |= RXf_PMf_KEEPCOPY;
6385                         }
6386                         break;
6387                     case '-':
6388                         /* A flag is a default iff it is following a minus, so
6389                          * if there is a minus, it means will be trying to
6390                          * re-specify a default which is an error */
6391                         if (has_use_defaults || flagsp == &negflags) {
6392             fail_modifiers:
6393                             RExC_parse++;
6394                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6395                             /*NOTREACHED*/
6396                         }
6397                         flagsp = &negflags;
6398                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6399                         break;
6400                     case ':':
6401                         paren = ':';
6402                         /*FALLTHROUGH*/
6403                     case ')':
6404                         RExC_flags |= posflags;
6405                         RExC_flags &= ~negflags;
6406                         if (paren != ':') {
6407                             oregflags |= posflags;
6408                             oregflags &= ~negflags;
6409                         }
6410                         nextchar(pRExC_state);
6411                         if (paren != ':') {
6412                             *flagp = TRYAGAIN;
6413                             return NULL;
6414                         } else {
6415                             ret = NULL;
6416                             goto parse_rest;
6417                         }
6418                         /*NOTREACHED*/
6419                     default:
6420                         RExC_parse++;
6421                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6422                         /*NOTREACHED*/
6423                     }
6424                     ++RExC_parse;
6425                 }
6426             }} /* one for the default block, one for the switch */
6427         }
6428         else {                  /* (...) */
6429           capturing_parens:
6430             parno = RExC_npar;
6431             RExC_npar++;
6432
6433             ret = reganode(pRExC_state, OPEN, parno);
6434             if (!SIZE_ONLY ){
6435                 if (!RExC_nestroot)
6436                     RExC_nestroot = parno;
6437                 if (RExC_seen & REG_SEEN_RECURSE
6438                     && !RExC_open_parens[parno-1])
6439                 {
6440                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6441                         "Setting open paren #%"IVdf" to %d\n",
6442                         (IV)parno, REG_NODE_NUM(ret)));
6443                     RExC_open_parens[parno-1]= ret;
6444                 }
6445             }
6446             Set_Node_Length(ret, 1); /* MJD */
6447             Set_Node_Offset(ret, RExC_parse); /* MJD */
6448             is_open = 1;
6449         }
6450     }
6451     else                        /* ! paren */
6452         ret = NULL;
6453
6454    parse_rest:
6455     /* Pick up the branches, linking them together. */
6456     parse_start = RExC_parse;   /* MJD */
6457     br = regbranch(pRExC_state, &flags, 1,depth+1);
6458
6459     if (freeze_paren) {
6460         if (RExC_npar > after_freeze)
6461             after_freeze = RExC_npar;
6462         RExC_npar = freeze_paren;
6463     }
6464
6465     /*     branch_len = (paren != 0); */
6466
6467     if (br == NULL)
6468         return(NULL);
6469     if (*RExC_parse == '|') {
6470         if (!SIZE_ONLY && RExC_extralen) {
6471             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6472         }
6473         else {                  /* MJD */
6474             reginsert(pRExC_state, BRANCH, br, depth+1);
6475             Set_Node_Length(br, paren != 0);
6476             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6477         }
6478         have_branch = 1;
6479         if (SIZE_ONLY)
6480             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6481     }
6482     else if (paren == ':') {
6483         *flagp |= flags&SIMPLE;
6484     }
6485     if (is_open) {                              /* Starts with OPEN. */
6486         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6487     }
6488     else if (paren != '?')              /* Not Conditional */
6489         ret = br;
6490     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6491     lastbr = br;
6492     while (*RExC_parse == '|') {
6493         if (!SIZE_ONLY && RExC_extralen) {
6494             ender = reganode(pRExC_state, LONGJMP,0);
6495             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6496         }
6497         if (SIZE_ONLY)
6498             RExC_extralen += 2;         /* Account for LONGJMP. */
6499         nextchar(pRExC_state);
6500         if (freeze_paren) {
6501             if (RExC_npar > after_freeze)
6502                 after_freeze = RExC_npar;
6503             RExC_npar = freeze_paren;
6504         }
6505         br = regbranch(pRExC_state, &flags, 0, depth+1);
6506
6507         if (br == NULL)
6508             return(NULL);
6509         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6510         lastbr = br;
6511         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6512     }
6513
6514     if (have_branch || paren != ':') {
6515         /* Make a closing node, and hook it on the end. */
6516         switch (paren) {
6517         case ':':
6518             ender = reg_node(pRExC_state, TAIL);
6519             break;
6520         case 1:
6521             ender = reganode(pRExC_state, CLOSE, parno);
6522             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6523                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6524                         "Setting close paren #%"IVdf" to %d\n",
6525                         (IV)parno, REG_NODE_NUM(ender)));
6526                 RExC_close_parens[parno-1]= ender;
6527                 if (RExC_nestroot == parno)
6528                     RExC_nestroot = 0;
6529             }
6530             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6531             Set_Node_Length(ender,1); /* MJD */
6532             break;
6533         case '<':
6534         case ',':
6535         case '=':
6536         case '!':
6537             *flagp &= ~HASWIDTH;
6538             /* FALL THROUGH */
6539         case '>':
6540             ender = reg_node(pRExC_state, SUCCEED);
6541             break;
6542         case 0:
6543             ender = reg_node(pRExC_state, END);
6544             if (!SIZE_ONLY) {
6545                 assert(!RExC_opend); /* there can only be one! */
6546                 RExC_opend = ender;
6547             }
6548             break;
6549         }
6550         REGTAIL(pRExC_state, lastbr, ender);
6551
6552         if (have_branch && !SIZE_ONLY) {
6553             if (depth==1)
6554                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6555
6556             /* Hook the tails of the branches to the closing node. */
6557             for (br = ret; br; br = regnext(br)) {
6558                 const U8 op = PL_regkind[OP(br)];
6559                 if (op == BRANCH) {
6560                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6561                 }
6562                 else if (op == BRANCHJ) {
6563                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6564                 }
6565             }
6566         }
6567     }
6568
6569     {
6570         const char *p;
6571         static const char parens[] = "=!<,>";
6572
6573         if (paren && (p = strchr(parens, paren))) {
6574             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6575             int flag = (p - parens) > 1;
6576
6577             if (paren == '>')
6578                 node = SUSPEND, flag = 0;
6579             reginsert(pRExC_state, node,ret, depth+1);
6580             Set_Node_Cur_Length(ret);
6581             Set_Node_Offset(ret, parse_start + 1);
6582             ret->flags = flag;
6583             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6584         }
6585     }
6586
6587     /* Check for proper termination. */
6588     if (paren) {
6589         RExC_flags = oregflags;
6590         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6591             RExC_parse = oregcomp_parse;
6592             vFAIL("Unmatched (");
6593         }
6594     }
6595     else if (!paren && RExC_parse < RExC_end) {
6596         if (*RExC_parse == ')') {
6597             RExC_parse++;
6598             vFAIL("Unmatched )");
6599         }
6600         else
6601             FAIL("Junk on end of regexp");      /* "Can't happen". */
6602         /* NOTREACHED */
6603     }
6604     if (after_freeze)
6605         RExC_npar = after_freeze;
6606     return(ret);
6607 }
6608
6609 /*
6610  - regbranch - one alternative of an | operator
6611  *
6612  * Implements the concatenation operator.
6613  */
6614 STATIC regnode *
6615 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6616 {
6617     dVAR;
6618     register regnode *ret;
6619     register regnode *chain = NULL;
6620     register regnode *latest;
6621     I32 flags = 0, c = 0;
6622     GET_RE_DEBUG_FLAGS_DECL;
6623
6624     PERL_ARGS_ASSERT_REGBRANCH;
6625
6626     DEBUG_PARSE("brnc");
6627
6628     if (first)
6629         ret = NULL;
6630     else {
6631         if (!SIZE_ONLY && RExC_extralen)
6632             ret = reganode(pRExC_state, BRANCHJ,0);
6633         else {
6634             ret = reg_node(pRExC_state, BRANCH);
6635             Set_Node_Length(ret, 1);
6636         }
6637     }
6638
6639     if (!first && SIZE_ONLY)
6640         RExC_extralen += 1;                     /* BRANCHJ */
6641
6642     *flagp = WORST;                     /* Tentatively. */
6643
6644     RExC_parse--;
6645     nextchar(pRExC_state);
6646     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6647         flags &= ~TRYAGAIN;
6648         latest = regpiece(pRExC_state, &flags,depth+1);
6649         if (latest == NULL) {
6650             if (flags & TRYAGAIN)
6651                 continue;
6652             return(NULL);
6653         }
6654         else if (ret == NULL)
6655             ret = latest;
6656         *flagp |= flags&(HASWIDTH|POSTPONED);
6657         if (chain == NULL)      /* First piece. */
6658             *flagp |= flags&SPSTART;
6659         else {
6660             RExC_naughty++;
6661             REGTAIL(pRExC_state, chain, latest);
6662         }
6663         chain = latest;
6664         c++;
6665     }
6666     if (chain == NULL) {        /* Loop ran zero times. */
6667         chain = reg_node(pRExC_state, NOTHING);
6668         if (ret == NULL)
6669             ret = chain;
6670     }
6671     if (c == 1) {
6672         *flagp |= flags&SIMPLE;
6673     }
6674
6675     return ret;
6676 }
6677
6678 /*
6679  - regpiece - something followed by possible [*+?]
6680  *
6681  * Note that the branching code sequences used for ? and the general cases
6682  * of * and + are somewhat optimized:  they use the same NOTHING node as
6683  * both the endmarker for their branch list and the body of the last branch.
6684  * It might seem that this node could be dispensed with entirely, but the
6685  * endmarker role is not redundant.
6686  */
6687 STATIC regnode *
6688 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6689 {
6690     dVAR;
6691     register regnode *ret;
6692     register char op;
6693     register char *next;
6694     I32 flags;
6695     const char * const origparse = RExC_parse;
6696     I32 min;
6697     I32 max = REG_INFTY;
6698     char *parse_start;
6699     const char *maxpos = NULL;
6700     GET_RE_DEBUG_FLAGS_DECL;
6701
6702     PERL_ARGS_ASSERT_REGPIECE;
6703
6704     DEBUG_PARSE("piec");
6705
6706     ret = regatom(pRExC_state, &flags,depth+1);
6707     if (ret == NULL) {
6708         if (flags & TRYAGAIN)
6709             *flagp |= TRYAGAIN;
6710         return(NULL);
6711     }
6712
6713     op = *RExC_parse;
6714
6715     if (op == '{' && regcurly(RExC_parse)) {
6716         maxpos = NULL;
6717         parse_start = RExC_parse; /* MJD */
6718         next = RExC_parse + 1;
6719         while (isDIGIT(*next) || *next == ',') {
6720             if (*next == ',') {
6721                 if (maxpos)
6722                     break;
6723                 else
6724                     maxpos = next;
6725             }
6726             next++;
6727         }
6728         if (*next == '}') {             /* got one */
6729             if (!maxpos)
6730                 maxpos = next;
6731             RExC_parse++;
6732             min = atoi(RExC_parse);
6733             if (*maxpos == ',')
6734                 maxpos++;
6735             else
6736                 maxpos = RExC_parse;
6737             max = atoi(maxpos);
6738             if (!max && *maxpos != '0')
6739                 max = REG_INFTY;                /* meaning "infinity" */
6740             else if (max >= REG_INFTY)
6741                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6742             RExC_parse = next;
6743             nextchar(pRExC_state);
6744
6745         do_curly:
6746             if ((flags&SIMPLE)) {
6747                 RExC_naughty += 2 + RExC_naughty / 2;
6748                 reginsert(pRExC_state, CURLY, ret, depth+1);
6749                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6750                 Set_Node_Cur_Length(ret);
6751             }
6752             else {
6753                 regnode * const w = reg_node(pRExC_state, WHILEM);
6754
6755                 w->flags = 0;
6756                 REGTAIL(pRExC_state, ret, w);
6757                 if (!SIZE_ONLY && RExC_extralen) {
6758                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6759                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6760                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6761                 }
6762                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6763                                 /* MJD hk */
6764                 Set_Node_Offset(ret, parse_start+1);
6765                 Set_Node_Length(ret,
6766                                 op == '{' ? (RExC_parse - parse_start) : 1);
6767
6768                 if (!SIZE_ONLY && RExC_extralen)
6769                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6770                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6771                 if (SIZE_ONLY)
6772                     RExC_whilem_seen++, RExC_extralen += 3;
6773                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6774             }
6775             ret->flags = 0;
6776
6777             if (min > 0)
6778                 *flagp = WORST;
6779             if (max > 0)
6780                 *flagp |= HASWIDTH;
6781             if (max < min)
6782                 vFAIL("Can't do {n,m} with n > m");
6783             if (!SIZE_ONLY) {
6784                 ARG1_SET(ret, (U16)min);
6785                 ARG2_SET(ret, (U16)max);
6786             }
6787
6788             goto nest_check;
6789         }
6790     }
6791
6792     if (!ISMULT1(op)) {
6793         *flagp = flags;
6794         return(ret);
6795     }
6796
6797 #if 0                           /* Now runtime fix should be reliable. */
6798
6799     /* if this is reinstated, don't forget to put this back into perldiag:
6800
6801             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6802
6803            (F) The part of the regexp subject to either the * or + quantifier
6804            could match an empty string. The {#} shows in the regular
6805            expression about where the problem was discovered.
6806
6807     */
6808
6809     if (!(flags&HASWIDTH) && op != '?')
6810       vFAIL("Regexp *+ operand could be empty");
6811 #endif
6812
6813     parse_start = RExC_parse;
6814     nextchar(pRExC_state);
6815
6816     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6817
6818     if (op == '*' && (flags&SIMPLE)) {
6819         reginsert(pRExC_state, STAR, ret, depth+1);
6820         ret->flags = 0;
6821         RExC_naughty += 4;
6822     }
6823     else if (op == '*') {
6824         min = 0;
6825         goto do_curly;
6826     }
6827     else if (op == '+' && (flags&SIMPLE)) {
6828         reginsert(pRExC_state, PLUS, ret, depth+1);
6829         ret->flags = 0;
6830         RExC_naughty += 3;
6831     }
6832     else if (op == '+') {
6833         min = 1;
6834         goto do_curly;
6835     }
6836     else if (op == '?') {
6837         min = 0; max = 1;
6838         goto do_curly;
6839     }
6840   nest_check:
6841     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6842         ckWARN3reg(RExC_parse,
6843                    "%.*s matches null string many times",
6844                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6845                    origparse);
6846     }
6847
6848     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6849         nextchar(pRExC_state);
6850         reginsert(pRExC_state, MINMOD, ret, depth+1);
6851         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6852     }
6853 #ifndef REG_ALLOW_MINMOD_SUSPEND
6854     else
6855 #endif
6856     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6857         regnode *ender;
6858         nextchar(pRExC_state);
6859         ender = reg_node(pRExC_state, SUCCEED);
6860         REGTAIL(pRExC_state, ret, ender);
6861         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6862         ret->flags = 0;
6863         ender = reg_node(pRExC_state, TAIL);
6864         REGTAIL(pRExC_state, ret, ender);
6865         /*ret= ender;*/
6866     }
6867
6868     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6869         RExC_parse++;
6870         vFAIL("Nested quantifiers");
6871     }
6872
6873     return(ret);
6874 }
6875
6876
6877 /* reg_namedseq(pRExC_state,UVp)
6878
6879    This is expected to be called by a parser routine that has
6880    recognized '\N' and needs to handle the rest. RExC_parse is
6881    expected to point at the first char following the N at the time
6882    of the call.
6883
6884    The \N may be inside (indicated by valuep not being NULL) or outside a
6885    character class.
6886
6887    \N may begin either a named sequence, or if outside a character class, mean
6888    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6889    attempted to decide which, and in the case of a named sequence converted it
6890    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6891    where c1... are the characters in the sequence.  For single-quoted regexes,
6892    the tokenizer passes the \N sequence through unchanged; this code will not
6893    attempt to determine this nor expand those.  The net effect is that if the
6894    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6895    signals that this \N occurrence means to match a non-newline.
6896
6897    Only the \N{U+...} form should occur in a character class, for the same
6898    reason that '.' inside a character class means to just match a period: it
6899    just doesn't make sense.
6900
6901    If valuep is non-null then it is assumed that we are parsing inside
6902    of a charclass definition and the first codepoint in the resolved
6903    string is returned via *valuep and the routine will return NULL.
6904    In this mode if a multichar string is returned from the charnames
6905    handler, a warning will be issued, and only the first char in the
6906    sequence will be examined. If the string returned is zero length
6907    then the value of *valuep is undefined and NON-NULL will
6908    be returned to indicate failure. (This will NOT be a valid pointer
6909    to a regnode.)
6910
6911    If valuep is null then it is assumed that we are parsing normal text and a
6912    new EXACT node is inserted into the program containing the resolved string,
6913    and a pointer to the new node is returned.  But if the string is zero length
6914    a NOTHING node is emitted instead.
6915
6916    On success RExC_parse is set to the char following the endbrace.
6917    Parsing failures will generate a fatal error via vFAIL(...)
6918  */
6919 STATIC regnode *
6920 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6921 {
6922     char * endbrace;    /* '}' following the name */
6923     regnode *ret = NULL;
6924 #ifdef DEBUGGING
6925     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6926 #endif
6927     char* p;
6928
6929     GET_RE_DEBUG_FLAGS_DECL;
6930
6931     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6932
6933     GET_RE_DEBUG_FLAGS;
6934
6935     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6936      * modifier.  The other meaning does not */
6937     p = (RExC_flags & RXf_PMf_EXTENDED)
6938         ? regwhite( pRExC_state, RExC_parse )
6939         : RExC_parse;
6940
6941     /* Disambiguate between \N meaning a named character versus \N meaning
6942      * [^\n].  The former is assumed when it can't be the latter. */
6943     if (*p != '{' || regcurly(p)) {
6944         RExC_parse = p;
6945         if (valuep) {
6946             /* no bare \N in a charclass */
6947             vFAIL("\\N in a character class must be a named character: \\N{...}");
6948         }
6949         nextchar(pRExC_state);
6950         ret = reg_node(pRExC_state, REG_ANY);
6951         *flagp |= HASWIDTH|SIMPLE;
6952         RExC_naughty++;
6953         RExC_parse--;
6954         Set_Node_Length(ret, 1); /* MJD */
6955         return ret;
6956     }
6957
6958     /* Here, we have decided it should be a named sequence */
6959
6960     /* The test above made sure that the next real character is a '{', but
6961      * under the /x modifier, it could be separated by space (or a comment and
6962      * \n) and this is not allowed (for consistency with \x{...} and the
6963      * tokenizer handling of \N{NAME}). */
6964     if (*RExC_parse != '{') {
6965         vFAIL("Missing braces on \\N{}");
6966     }
6967
6968     RExC_parse++;       /* Skip past the '{' */
6969
6970     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6971         || ! (endbrace == RExC_parse            /* nothing between the {} */
6972               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
6973                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6974     {
6975         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
6976         vFAIL("\\N{NAME} must be resolved by the lexer");
6977     }
6978
6979     if (endbrace == RExC_parse) {   /* empty: \N{} */
6980         if (! valuep) {
6981             RExC_parse = endbrace + 1;
6982             return reg_node(pRExC_state,NOTHING);
6983         }
6984
6985         if (SIZE_ONLY) {
6986             ckWARNreg(RExC_parse,
6987                     "Ignoring zero length \\N{} in character class"
6988             );
6989             RExC_parse = endbrace + 1;
6990         }
6991         *valuep = 0;
6992         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6993     }
6994
6995     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
6996     RExC_parse += 2;    /* Skip past the 'U+' */
6997
6998     if (valuep) {   /* In a bracketed char class */
6999         /* We only pay attention to the first char of
7000         multichar strings being returned. I kinda wonder
7001         if this makes sense as it does change the behaviour
7002         from earlier versions, OTOH that behaviour was broken
7003         as well. XXX Solution is to recharacterize as
7004         [rest-of-class]|multi1|multi2... */
7005
7006         STRLEN length_of_hex;
7007         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7008             | PERL_SCAN_DISALLOW_PREFIX
7009             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7010
7011         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7012         if (endchar < endbrace) {
7013             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7014         }
7015
7016         length_of_hex = (STRLEN)(endchar - RExC_parse);
7017         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7018
7019         /* The tokenizer should have guaranteed validity, but it's possible to
7020          * bypass it by using single quoting, so check */
7021         if (length_of_hex == 0
7022             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7023         {
7024             RExC_parse += length_of_hex;        /* Includes all the valid */
7025             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7026                             ? UTF8SKIP(RExC_parse)
7027                             : 1;
7028             /* Guard against malformed utf8 */
7029             if (RExC_parse >= endchar) RExC_parse = endchar;
7030             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7031         }
7032
7033         RExC_parse = endbrace + 1;
7034         if (endchar == endbrace) return NULL;
7035
7036         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7037     }
7038     else {      /* Not a char class */
7039         char *s;            /* String to put in generated EXACT node */
7040         STRLEN len = 0;     /* Its current byte length */
7041         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7042                                stream */
7043
7044         ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7045                                                    : (LOC)
7046                                                       ? EXACTFL
7047                                                       : UNI_SEMANTICS
7048                                                         ? EXACTFU
7049                                                         : EXACTF));
7050         s= STRING(ret);
7051
7052         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7053          * the input which is of the form now 'c1.c2.c3...}' until find the
7054          * ending brace or exceed length 255.  The characters that exceed this
7055          * limit are dropped.  The limit could be relaxed should it become
7056          * desirable by reparsing this as (?:\N{NAME}), so could generate
7057          * multiple EXACT nodes, as is done for just regular input.  But this
7058          * is primarily a named character, and not intended to be a huge long
7059          * string, so 255 bytes should be good enough */
7060         while (1) {
7061             STRLEN length_of_hex;
7062             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7063                             | PERL_SCAN_DISALLOW_PREFIX
7064                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7065             UV cp;  /* Ord of current character */
7066
7067             /* Code points are separated by dots.  If none, there is only one
7068              * code point, and is terminated by the brace */
7069             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7070
7071             /* The values are Unicode even on EBCDIC machines */
7072             length_of_hex = (STRLEN)(endchar - RExC_parse);
7073             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7074             if ( length_of_hex == 0
7075                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7076             {
7077                 RExC_parse += length_of_hex;        /* Includes all the valid */
7078                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7079                                 ? UTF8SKIP(RExC_parse)
7080                                 : 1;
7081                 /* Guard against malformed utf8 */
7082                 if (RExC_parse >= endchar) RExC_parse = endchar;
7083                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7084             }
7085
7086             if (! FOLD) {       /* Not folding, just append to the string */
7087                 STRLEN unilen;
7088
7089                 /* Quit before adding this character if would exceed limit */
7090                 if (len + UNISKIP(cp) > U8_MAX) break;
7091
7092                 unilen = reguni(pRExC_state, cp, s);
7093                 if (unilen > 0) {
7094                     s   += unilen;
7095                     len += unilen;
7096                 }
7097             } else {    /* Folding, output the folded equivalent */
7098                 STRLEN foldlen,numlen;
7099                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7100                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7101
7102                 /* Quit before exceeding size limit */
7103                 if (len + foldlen > U8_MAX) break;
7104
7105                 for (foldbuf = tmpbuf;
7106                     foldlen;
7107                     foldlen -= numlen)
7108                 {
7109                     cp = utf8_to_uvchr(foldbuf, &numlen);
7110                     if (numlen > 0) {
7111                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7112                         s       += unilen;
7113                         len     += unilen;
7114                         /* In EBCDIC the numlen and unilen can differ. */
7115                         foldbuf += numlen;
7116                         if (numlen >= foldlen)
7117                             break;
7118                     }
7119                     else
7120                         break; /* "Can't happen." */
7121                 }
7122             }
7123
7124             /* Point to the beginning of the next character in the sequence. */
7125             RExC_parse = endchar + 1;
7126
7127             /* Quit if no more characters */
7128             if (RExC_parse >= endbrace) break;
7129         }
7130
7131
7132         if (SIZE_ONLY) {
7133             if (RExC_parse < endbrace) {
7134                 ckWARNreg(RExC_parse - 1,
7135                           "Using just the first characters returned by \\N{}");
7136             }
7137
7138             RExC_size += STR_SZ(len);
7139         } else {
7140             STR_LEN(ret) = len;
7141             RExC_emit += STR_SZ(len);
7142         }
7143
7144         RExC_parse = endbrace + 1;
7145
7146         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7147                                with malformed in t/re/pat_advanced.t */
7148         RExC_parse --;
7149         Set_Node_Cur_Length(ret); /* MJD */
7150         nextchar(pRExC_state);
7151     }
7152
7153     return ret;
7154 }
7155
7156
7157 /*
7158  * reg_recode
7159  *
7160  * It returns the code point in utf8 for the value in *encp.
7161  *    value: a code value in the source encoding
7162  *    encp:  a pointer to an Encode object
7163  *
7164  * If the result from Encode is not a single character,
7165  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7166  */
7167 STATIC UV
7168 S_reg_recode(pTHX_ const char value, SV **encp)
7169 {
7170     STRLEN numlen = 1;
7171     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7172     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7173     const STRLEN newlen = SvCUR(sv);
7174     UV uv = UNICODE_REPLACEMENT;
7175
7176     PERL_ARGS_ASSERT_REG_RECODE;
7177
7178     if (newlen)
7179         uv = SvUTF8(sv)
7180              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7181              : *(U8*)s;
7182
7183     if (!newlen || numlen != newlen) {
7184         uv = UNICODE_REPLACEMENT;
7185         *encp = NULL;
7186     }
7187     return uv;
7188 }
7189
7190
7191 /*
7192  - regatom - the lowest level
7193
7194    Try to identify anything special at the start of the pattern. If there
7195    is, then handle it as required. This may involve generating a single regop,
7196    such as for an assertion; or it may involve recursing, such as to
7197    handle a () structure.
7198
7199    If the string doesn't start with something special then we gobble up
7200    as much literal text as we can.
7201
7202    Once we have been able to handle whatever type of thing started the
7203    sequence, we return.
7204
7205    Note: we have to be careful with escapes, as they can be both literal
7206    and special, and in the case of \10 and friends can either, depending
7207    on context. Specifically there are two seperate switches for handling
7208    escape sequences, with the one for handling literal escapes requiring
7209    a dummy entry for all of the special escapes that are actually handled
7210    by the other.
7211 */
7212
7213 STATIC regnode *
7214 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7215 {
7216     dVAR;
7217     register regnode *ret = NULL;
7218     I32 flags;
7219     char *parse_start = RExC_parse;
7220     GET_RE_DEBUG_FLAGS_DECL;
7221     DEBUG_PARSE("atom");
7222     *flagp = WORST;             /* Tentatively. */
7223
7224     PERL_ARGS_ASSERT_REGATOM;
7225
7226 tryagain:
7227     switch ((U8)*RExC_parse) {
7228     case '^':
7229         RExC_seen_zerolen++;
7230         nextchar(pRExC_state);
7231         if (RExC_flags & RXf_PMf_MULTILINE)
7232             ret = reg_node(pRExC_state, MBOL);
7233         else if (RExC_flags & RXf_PMf_SINGLELINE)
7234             ret = reg_node(pRExC_state, SBOL);
7235         else
7236             ret = reg_node(pRExC_state, BOL);
7237         Set_Node_Length(ret, 1); /* MJD */
7238         break;
7239     case '$':
7240         nextchar(pRExC_state);
7241         if (*RExC_parse)
7242             RExC_seen_zerolen++;
7243         if (RExC_flags & RXf_PMf_MULTILINE)
7244             ret = reg_node(pRExC_state, MEOL);
7245         else if (RExC_flags & RXf_PMf_SINGLELINE)
7246             ret = reg_node(pRExC_state, SEOL);
7247         else
7248             ret = reg_node(pRExC_state, EOL);
7249         Set_Node_Length(ret, 1); /* MJD */
7250         break;
7251     case '.':
7252         nextchar(pRExC_state);
7253         if (RExC_flags & RXf_PMf_SINGLELINE)
7254             ret = reg_node(pRExC_state, SANY);
7255         else
7256             ret = reg_node(pRExC_state, REG_ANY);
7257         *flagp |= HASWIDTH|SIMPLE;
7258         RExC_naughty++;
7259         Set_Node_Length(ret, 1); /* MJD */
7260         break;
7261     case '[':
7262     {
7263         char * const oregcomp_parse = ++RExC_parse;
7264         ret = regclass(pRExC_state,depth+1);
7265         if (*RExC_parse != ']') {
7266             RExC_parse = oregcomp_parse;
7267             vFAIL("Unmatched [");
7268         }
7269         nextchar(pRExC_state);
7270         *flagp |= HASWIDTH|SIMPLE;
7271         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7272         break;
7273     }
7274     case '(':
7275         nextchar(pRExC_state);
7276         ret = reg(pRExC_state, 1, &flags,depth+1);
7277         if (ret == NULL) {
7278                 if (flags & TRYAGAIN) {
7279                     if (RExC_parse == RExC_end) {
7280                          /* Make parent create an empty node if needed. */
7281                         *flagp |= TRYAGAIN;
7282                         return(NULL);
7283                     }
7284                     goto tryagain;
7285                 }
7286                 return(NULL);
7287         }
7288         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7289         break;
7290     case '|':
7291     case ')':
7292         if (flags & TRYAGAIN) {
7293             *flagp |= TRYAGAIN;
7294             return NULL;
7295         }
7296         vFAIL("Internal urp");
7297                                 /* Supposed to be caught earlier. */
7298         break;
7299     case '{':
7300         if (!regcurly(RExC_parse)) {
7301             RExC_parse++;
7302             goto defchar;
7303         }
7304         /* FALL THROUGH */
7305     case '?':
7306     case '+':
7307     case '*':
7308         RExC_parse++;
7309         vFAIL("Quantifier follows nothing");
7310         break;
7311     case LATIN_SMALL_LETTER_SHARP_S:
7312     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7313     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7314 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7315 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
7316     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7317 #endif
7318         do_foldchar:
7319         if (!LOC && FOLD) {
7320             U32 len,cp;
7321             len=0; /* silence a spurious compiler warning */
7322             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7323                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7324                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7325                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7326                 Set_Node_Length(ret, 1); /* MJD */
7327                 nextchar(pRExC_state); /* kill whitespace under /x */
7328                 return ret;
7329             }
7330         }
7331         goto outer_default;
7332     case '\\':
7333         /* Special Escapes
7334
7335            This switch handles escape sequences that resolve to some kind
7336            of special regop and not to literal text. Escape sequnces that
7337            resolve to literal text are handled below in the switch marked
7338            "Literal Escapes".
7339
7340            Every entry in this switch *must* have a corresponding entry
7341            in the literal escape switch. However, the opposite is not
7342            required, as the default for this switch is to jump to the
7343            literal text handling code.
7344         */
7345         switch ((U8)*++RExC_parse) {
7346         case LATIN_SMALL_LETTER_SHARP_S:
7347         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7348         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7349                    goto do_foldchar;
7350         /* Special Escapes */
7351         case 'A':
7352             RExC_seen_zerolen++;
7353             ret = reg_node(pRExC_state, SBOL);
7354             *flagp |= SIMPLE;
7355             goto finish_meta_pat;
7356         case 'G':
7357             ret = reg_node(pRExC_state, GPOS);
7358             RExC_seen |= REG_SEEN_GPOS;
7359             *flagp |= SIMPLE;
7360             goto finish_meta_pat;
7361         case 'K':
7362             RExC_seen_zerolen++;
7363             ret = reg_node(pRExC_state, KEEPS);
7364             *flagp |= SIMPLE;
7365             /* XXX:dmq : disabling in-place substitution seems to
7366              * be necessary here to avoid cases of memory corruption, as
7367              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7368              */
7369             RExC_seen |= REG_SEEN_LOOKBEHIND;
7370             goto finish_meta_pat;
7371         case 'Z':
7372             ret = reg_node(pRExC_state, SEOL);
7373             *flagp |= SIMPLE;
7374             RExC_seen_zerolen++;                /* Do not optimize RE away */
7375             goto finish_meta_pat;
7376         case 'z':
7377             ret = reg_node(pRExC_state, EOS);
7378             *flagp |= SIMPLE;
7379             RExC_seen_zerolen++;                /* Do not optimize RE away */
7380             goto finish_meta_pat;
7381         case 'C':
7382             ret = reg_node(pRExC_state, CANY);
7383             RExC_seen |= REG_SEEN_CANY;
7384             *flagp |= HASWIDTH|SIMPLE;
7385             goto finish_meta_pat;
7386         case 'X':
7387             ret = reg_node(pRExC_state, CLUMP);
7388             *flagp |= HASWIDTH;
7389             goto finish_meta_pat;
7390         case 'w':
7391             if (LOC) {
7392                 ret = reg_node(pRExC_state, (U8)(ALNUML));
7393             } else {
7394                 ret = reg_node(pRExC_state, (U8)(ALNUM));
7395                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7396             }
7397             *flagp |= HASWIDTH|SIMPLE;
7398             goto finish_meta_pat;
7399         case 'W':
7400             if (LOC) {
7401                 ret = reg_node(pRExC_state, (U8)(NALNUML));
7402             } else {
7403                 ret = reg_node(pRExC_state, (U8)(NALNUM));
7404                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7405             }
7406             *flagp |= HASWIDTH|SIMPLE;
7407             goto finish_meta_pat;
7408         case 'b':
7409             RExC_seen_zerolen++;
7410             RExC_seen |= REG_SEEN_LOOKBEHIND;
7411             if (LOC) {
7412                 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7413             } else {
7414                 ret = reg_node(pRExC_state, (U8)(BOUND));
7415                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7416             }
7417             *flagp |= SIMPLE;
7418             goto finish_meta_pat;
7419         case 'B':
7420             RExC_seen_zerolen++;
7421             RExC_seen |= REG_SEEN_LOOKBEHIND;
7422             if (LOC) {
7423                 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7424             } else {
7425                 ret = reg_node(pRExC_state, (U8)(NBOUND));
7426                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7427             }
7428             *flagp |= SIMPLE;
7429             goto finish_meta_pat;
7430         case 's':
7431             if (LOC) {
7432                 ret = reg_node(pRExC_state, (U8)(SPACEL));
7433             } else {
7434                 ret = reg_node(pRExC_state, (U8)(SPACE));
7435                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7436             }
7437             *flagp |= HASWIDTH|SIMPLE;
7438             goto finish_meta_pat;
7439         case 'S':
7440             if (LOC) {
7441                 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7442             } else {
7443                 ret = reg_node(pRExC_state, (U8)(NSPACE));
7444                 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7445             }
7446             *flagp |= HASWIDTH|SIMPLE;
7447             goto finish_meta_pat;
7448         case 'd':
7449             if (LOC) {
7450                 ret = reg_node(pRExC_state, (U8)(DIGITL));
7451             } else {
7452                 ret = reg_node(pRExC_state, (U8)(DIGIT));
7453             }
7454             *flagp |= HASWIDTH|SIMPLE;
7455             goto finish_meta_pat;
7456         case 'D':
7457             if (LOC) {
7458                 ret = reg_node(pRExC_state, (U8)(NDIGITL));
7459             } else {
7460                 ret = reg_node(pRExC_state, (U8)(NDIGIT));
7461             }
7462             *flagp |= HASWIDTH|SIMPLE;
7463             goto finish_meta_pat;
7464         case 'R':
7465             ret = reg_node(pRExC_state, LNBREAK);
7466             *flagp |= HASWIDTH|SIMPLE;
7467             goto finish_meta_pat;
7468         case 'h':
7469             ret = reg_node(pRExC_state, HORIZWS);
7470             *flagp |= HASWIDTH|SIMPLE;
7471             goto finish_meta_pat;
7472         case 'H':
7473             ret = reg_node(pRExC_state, NHORIZWS);
7474             *flagp |= HASWIDTH|SIMPLE;
7475             goto finish_meta_pat;
7476         case 'v':
7477             ret = reg_node(pRExC_state, VERTWS);
7478             *flagp |= HASWIDTH|SIMPLE;
7479             goto finish_meta_pat;
7480         case 'V':
7481             ret = reg_node(pRExC_state, NVERTWS);
7482             *flagp |= HASWIDTH|SIMPLE;
7483          finish_meta_pat:
7484             nextchar(pRExC_state);
7485             Set_Node_Length(ret, 2); /* MJD */
7486             break;
7487         case 'p':
7488         case 'P':
7489             {
7490                 char* const oldregxend = RExC_end;
7491 #ifdef DEBUGGING
7492                 char* parse_start = RExC_parse - 2;
7493 #endif
7494
7495                 if (RExC_parse[1] == '{') {
7496                   /* a lovely hack--pretend we saw [\pX] instead */
7497                     RExC_end = strchr(RExC_parse, '}');
7498                     if (!RExC_end) {
7499                         const U8 c = (U8)*RExC_parse;
7500                         RExC_parse += 2;
7501                         RExC_end = oldregxend;
7502                         vFAIL2("Missing right brace on \\%c{}", c);
7503                     }
7504                     RExC_end++;
7505                 }
7506                 else {
7507                     RExC_end = RExC_parse + 2;
7508                     if (RExC_end > oldregxend)
7509                         RExC_end = oldregxend;
7510                 }
7511                 RExC_parse--;
7512
7513                 ret = regclass(pRExC_state,depth+1);
7514
7515                 RExC_end = oldregxend;
7516                 RExC_parse--;
7517
7518                 Set_Node_Offset(ret, parse_start + 2);
7519                 Set_Node_Cur_Length(ret);
7520                 nextchar(pRExC_state);
7521                 *flagp |= HASWIDTH|SIMPLE;
7522             }
7523             break;
7524         case 'N':
7525             /* Handle \N and \N{NAME} here and not below because it can be
7526             multicharacter. join_exact() will join them up later on.
7527             Also this makes sure that things like /\N{BLAH}+/ and
7528             \N{BLAH} being multi char Just Happen. dmq*/
7529             ++RExC_parse;
7530             ret= reg_namedseq(pRExC_state, NULL, flagp);
7531             break;
7532         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7533         parse_named_seq:
7534         {
7535             char ch= RExC_parse[1];
7536             if (ch != '<' && ch != '\'' && ch != '{') {
7537                 RExC_parse++;
7538                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7539             } else {
7540                 /* this pretty much dupes the code for (?P=...) in reg(), if
7541                    you change this make sure you change that */
7542                 char* name_start = (RExC_parse += 2);
7543                 U32 num = 0;
7544                 SV *sv_dat = reg_scan_name(pRExC_state,
7545                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7546                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7547                 if (RExC_parse == name_start || *RExC_parse != ch)
7548                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7549
7550                 if (!SIZE_ONLY) {
7551                     num = add_data( pRExC_state, 1, "S" );
7552                     RExC_rxi->data->data[num]=(void*)sv_dat;
7553                     SvREFCNT_inc_simple_void(sv_dat);
7554                 }
7555
7556                 RExC_sawback = 1;
7557                 ret = reganode(pRExC_state,
7558                                ((! FOLD)
7559                                  ? NREF
7560                                  : (UNI_SEMANTICS)
7561                                    ? NREFFU
7562                                    : (LOC)
7563                                      ? NREFFL
7564                                      : NREFF),
7565                                 num);
7566                 *flagp |= HASWIDTH;
7567
7568                 /* override incorrect value set in reganode MJD */
7569                 Set_Node_Offset(ret, parse_start+1);
7570                 Set_Node_Cur_Length(ret); /* MJD */
7571                 nextchar(pRExC_state);
7572
7573             }
7574             break;
7575         }
7576         case 'g':
7577         case '1': case '2': case '3': case '4':
7578         case '5': case '6': case '7': case '8': case '9':
7579             {
7580                 I32 num;
7581                 bool isg = *RExC_parse == 'g';
7582                 bool isrel = 0;
7583                 bool hasbrace = 0;
7584                 if (isg) {
7585                     RExC_parse++;
7586                     if (*RExC_parse == '{') {
7587                         RExC_parse++;
7588                         hasbrace = 1;
7589                     }
7590                     if (*RExC_parse == '-') {
7591                         RExC_parse++;
7592                         isrel = 1;
7593                     }
7594                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7595                         if (isrel) RExC_parse--;
7596                         RExC_parse -= 2;
7597                         goto parse_named_seq;
7598                 }   }
7599                 num = atoi(RExC_parse);
7600                 if (isg && num == 0)
7601                     vFAIL("Reference to invalid group 0");
7602                 if (isrel) {
7603                     num = RExC_npar - num;
7604                     if (num < 1)
7605                         vFAIL("Reference to nonexistent or unclosed group");
7606                 }
7607                 if (!isg && num > 9 && num >= RExC_npar)
7608                     goto defchar;
7609                 else {
7610                     char * const parse_start = RExC_parse - 1; /* MJD */
7611                     while (isDIGIT(*RExC_parse))
7612                         RExC_parse++;
7613                     if (parse_start == RExC_parse - 1)
7614                         vFAIL("Unterminated \\g... pattern");
7615                     if (hasbrace) {
7616                         if (*RExC_parse != '}')
7617                             vFAIL("Unterminated \\g{...} pattern");
7618                         RExC_parse++;
7619                     }
7620                     if (!SIZE_ONLY) {
7621                         if (num > (I32)RExC_rx->nparens)
7622                             vFAIL("Reference to nonexistent group");
7623                     }
7624                     RExC_sawback = 1;
7625                     ret = reganode(pRExC_state,
7626                                    ((! FOLD)
7627                                      ? REF
7628                                      : (UNI_SEMANTICS)
7629                                        ? REFFU
7630                                        : (LOC)
7631                                          ? REFFL
7632                                          : REFF),
7633                                     num);
7634                     *flagp |= HASWIDTH;
7635
7636                     /* override incorrect value set in reganode MJD */
7637                     Set_Node_Offset(ret, parse_start+1);
7638                     Set_Node_Cur_Length(ret); /* MJD */
7639                     RExC_parse--;
7640                     nextchar(pRExC_state);
7641                 }
7642             }
7643             break;
7644         case '\0':
7645             if (RExC_parse >= RExC_end)
7646                 FAIL("Trailing \\");
7647             /* FALL THROUGH */
7648         default:
7649             /* Do not generate "unrecognized" warnings here, we fall
7650                back into the quick-grab loop below */
7651             parse_start--;
7652             goto defchar;
7653         }
7654         break;
7655
7656     case '#':
7657         if (RExC_flags & RXf_PMf_EXTENDED) {
7658             if ( reg_skipcomment( pRExC_state ) )
7659                 goto tryagain;
7660         }
7661         /* FALL THROUGH */
7662
7663     default:
7664         outer_default:{
7665             register STRLEN len;
7666             register UV ender;
7667             register char *p;
7668             char *s;
7669             STRLEN foldlen;
7670             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7671
7672             parse_start = RExC_parse - 1;
7673
7674             RExC_parse++;
7675
7676         defchar:
7677             ender = 0;
7678             ret = reg_node(pRExC_state,
7679                            (U8) ((! FOLD) ? EXACT
7680                                           : (LOC)
7681                                              ? EXACTFL
7682                                              : (UNI_SEMANTICS)
7683                                                ? EXACTFU
7684                                                : EXACTF)
7685                     );
7686             s = STRING(ret);
7687             for (len = 0, p = RExC_parse - 1;
7688               len < 127 && p < RExC_end;
7689               len++)
7690             {
7691                 char * const oldp = p;
7692
7693                 if (RExC_flags & RXf_PMf_EXTENDED)
7694                     p = regwhite( pRExC_state, p );
7695                 switch ((U8)*p) {
7696                 case LATIN_SMALL_LETTER_SHARP_S:
7697                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7698                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7699                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7700                                 goto normal_default;
7701                 case '^':
7702                 case '$':
7703                 case '.':
7704                 case '[':
7705                 case '(':
7706                 case ')':
7707                 case '|':
7708                     goto loopdone;
7709                 case '\\':
7710                     /* Literal Escapes Switch
7711
7712                        This switch is meant to handle escape sequences that
7713                        resolve to a literal character.
7714
7715                        Every escape sequence that represents something
7716                        else, like an assertion or a char class, is handled
7717                        in the switch marked 'Special Escapes' above in this
7718                        routine, but also has an entry here as anything that
7719                        isn't explicitly mentioned here will be treated as
7720                        an unescaped equivalent literal.
7721                     */
7722
7723                     switch ((U8)*++p) {
7724                     /* These are all the special escapes. */
7725                     case LATIN_SMALL_LETTER_SHARP_S:
7726                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7727                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7728                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7729                                 goto normal_default;
7730                     case 'A':             /* Start assertion */
7731                     case 'b': case 'B':   /* Word-boundary assertion*/
7732                     case 'C':             /* Single char !DANGEROUS! */
7733                     case 'd': case 'D':   /* digit class */
7734                     case 'g': case 'G':   /* generic-backref, pos assertion */
7735                     case 'h': case 'H':   /* HORIZWS */
7736                     case 'k': case 'K':   /* named backref, keep marker */
7737                     case 'N':             /* named char sequence */
7738                     case 'p': case 'P':   /* Unicode property */
7739                               case 'R':   /* LNBREAK */
7740                     case 's': case 'S':   /* space class */
7741                     case 'v': case 'V':   /* VERTWS */
7742                     case 'w': case 'W':   /* word class */
7743                     case 'X':             /* eXtended Unicode "combining character sequence" */
7744                     case 'z': case 'Z':   /* End of line/string assertion */
7745                         --p;
7746                         goto loopdone;
7747
7748                     /* Anything after here is an escape that resolves to a
7749                        literal. (Except digits, which may or may not)
7750                      */
7751                     case 'n':
7752                         ender = '\n';
7753                         p++;
7754                         break;
7755                     case 'r':
7756                         ender = '\r';
7757                         p++;
7758                         break;
7759                     case 't':
7760                         ender = '\t';
7761                         p++;
7762                         break;
7763                     case 'f':
7764                         ender = '\f';
7765                         p++;
7766                         break;
7767                     case 'e':
7768                           ender = ASCII_TO_NATIVE('\033');
7769                         p++;
7770                         break;
7771                     case 'a':
7772                           ender = ASCII_TO_NATIVE('\007');
7773                         p++;
7774                         break;
7775                     case 'o':
7776                         {
7777                             STRLEN brace_len = len;
7778                             UV result;
7779                             const char* error_msg;
7780
7781                             bool valid = grok_bslash_o(p,
7782                                                        &result,
7783                                                        &brace_len,
7784                                                        &error_msg,
7785                                                        1);
7786                             p += brace_len;
7787                             if (! valid) {
7788                                 RExC_parse = p; /* going to die anyway; point
7789                                                    to exact spot of failure */
7790                                 vFAIL(error_msg);
7791                             }
7792                             else
7793                             {
7794                                 ender = result;
7795                             }
7796                             if (PL_encoding && ender < 0x100) {
7797                                 goto recode_encoding;
7798                             }
7799                             if (ender > 0xff) {
7800                                 REQUIRE_UTF8;
7801                             }
7802                             break;
7803                         }
7804                     case 'x':
7805                         if (*++p == '{') {
7806                             char* const e = strchr(p, '}');
7807
7808                             if (!e) {
7809                                 RExC_parse = p + 1;
7810                                 vFAIL("Missing right brace on \\x{}");
7811                             }
7812                             else {
7813                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7814                                     | PERL_SCAN_DISALLOW_PREFIX;
7815                                 STRLEN numlen = e - p - 1;
7816                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7817                                 if (ender > 0xff)
7818                                     REQUIRE_UTF8;
7819                                 p = e + 1;
7820                             }
7821                         }
7822                         else {
7823                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7824                             STRLEN numlen = 2;
7825                             ender = grok_hex(p, &numlen, &flags, NULL);
7826                             p += numlen;
7827                         }
7828                         if (PL_encoding && ender < 0x100)
7829                             goto recode_encoding;
7830                         break;
7831                     case 'c':
7832                         p++;
7833                         ender = grok_bslash_c(*p++, SIZE_ONLY);
7834                         break;
7835                     case '0': case '1': case '2': case '3':case '4':
7836                     case '5': case '6': case '7': case '8':case '9':
7837                         if (*p == '0' ||
7838                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7839                         {
7840                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7841                             STRLEN numlen = 3;
7842                             ender = grok_oct(p, &numlen, &flags, NULL);
7843                             if (ender > 0xff) {
7844                                 REQUIRE_UTF8;
7845                             }
7846                             p += numlen;
7847                         }
7848                         else {
7849                             --p;
7850                             goto loopdone;
7851                         }
7852                         if (PL_encoding && ender < 0x100)
7853                             goto recode_encoding;
7854                         break;
7855                     recode_encoding:
7856                         {
7857                             SV* enc = PL_encoding;
7858                             ender = reg_recode((const char)(U8)ender, &enc);
7859                             if (!enc && SIZE_ONLY)
7860                                 ckWARNreg(p, "Invalid escape in the specified encoding");
7861                             REQUIRE_UTF8;
7862                         }
7863                         break;
7864                     case '\0':
7865                         if (p >= RExC_end)
7866                             FAIL("Trailing \\");
7867                         /* FALL THROUGH */
7868                     default:
7869                         if (!SIZE_ONLY&& isALPHA(*p))
7870                             ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7871                         goto normal_default;
7872                     }
7873                     break;
7874                 default:
7875                   normal_default:
7876                     if (UTF8_IS_START(*p) && UTF) {
7877                         STRLEN numlen;
7878                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7879                                                &numlen, UTF8_ALLOW_DEFAULT);
7880                         p += numlen;
7881                     }
7882                     else
7883                         ender = *p++;
7884                     break;
7885                 }
7886                 if ( RExC_flags & RXf_PMf_EXTENDED)
7887                     p = regwhite( pRExC_state, p );
7888                 if (UTF && FOLD) {
7889                     /* Prime the casefolded buffer. */
7890                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7891                 }
7892                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7893                     if (len)
7894                         p = oldp;
7895                     else if (UTF) {
7896                          if (FOLD) {
7897                               /* Emit all the Unicode characters. */
7898                               STRLEN numlen;
7899                               for (foldbuf = tmpbuf;
7900                                    foldlen;
7901                                    foldlen -= numlen) {
7902                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7903                                    if (numlen > 0) {
7904                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7905                                         s       += unilen;
7906                                         len     += unilen;
7907                                         /* In EBCDIC the numlen
7908                                          * and unilen can differ. */
7909                                         foldbuf += numlen;
7910                                         if (numlen >= foldlen)
7911                                              break;
7912                                    }
7913                                    else
7914                                         break; /* "Can't happen." */
7915                               }
7916                          }
7917                          else {
7918                               const STRLEN unilen = reguni(pRExC_state, ender, s);
7919                               if (unilen > 0) {
7920                                    s   += unilen;
7921                                    len += unilen;
7922                               }
7923                          }
7924                     }
7925                     else {
7926                         len++;
7927                         REGC((char)ender, s++);
7928                     }
7929                     break;
7930                 }
7931                 if (UTF) {
7932                      if (FOLD) {
7933                           /* Emit all the Unicode characters. */
7934                           STRLEN numlen;
7935                           for (foldbuf = tmpbuf;
7936                                foldlen;
7937                                foldlen -= numlen) {
7938                                ender = utf8_to_uvchr(foldbuf, &numlen);
7939                                if (numlen > 0) {
7940                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
7941                                     len     += unilen;
7942                                     s       += unilen;
7943                                     /* In EBCDIC the numlen
7944                                      * and unilen can differ. */
7945                                     foldbuf += numlen;
7946                                     if (numlen >= foldlen)
7947                                          break;
7948                                }
7949                                else
7950                                     break;
7951                           }
7952                      }
7953                      else {
7954                           const STRLEN unilen = reguni(pRExC_state, ender, s);
7955                           if (unilen > 0) {
7956                                s   += unilen;
7957                                len += unilen;
7958                           }
7959                      }
7960                      len--;
7961                 }
7962                 else
7963                     REGC((char)ender, s++);
7964             }
7965         loopdone:
7966             RExC_parse = p - 1;
7967             Set_Node_Cur_Length(ret); /* MJD */
7968             nextchar(pRExC_state);
7969             {
7970                 /* len is STRLEN which is unsigned, need to copy to signed */
7971                 IV iv = len;
7972                 if (iv < 0)
7973                     vFAIL("Internal disaster");
7974             }
7975             if (len > 0)
7976                 *flagp |= HASWIDTH;
7977             if (len == 1 && UNI_IS_INVARIANT(ender))
7978                 *flagp |= SIMPLE;
7979
7980             if (SIZE_ONLY)
7981                 RExC_size += STR_SZ(len);
7982             else {
7983                 STR_LEN(ret) = len;
7984                 RExC_emit += STR_SZ(len);
7985             }
7986         }
7987         break;
7988     }
7989
7990     return(ret);
7991 }
7992
7993 STATIC char *
7994 S_regwhite( RExC_state_t *pRExC_state, char *p )
7995 {
7996     const char *e = RExC_end;
7997
7998     PERL_ARGS_ASSERT_REGWHITE;
7999
8000     while (p < e) {
8001         if (isSPACE(*p))
8002             ++p;
8003         else if (*p == '#') {
8004             bool ended = 0;
8005             do {
8006                 if (*p++ == '\n') {
8007                     ended = 1;
8008                     break;
8009                 }
8010             } while (p < e);
8011             if (!ended)
8012                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8013         }
8014         else
8015             break;
8016     }
8017     return p;
8018 }
8019
8020 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8021    Character classes ([:foo:]) can also be negated ([:^foo:]).
8022    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8023    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
8024    but trigger failures because they are currently unimplemented. */
8025
8026 #define POSIXCC_DONE(c)   ((c) == ':')
8027 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8028 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8029
8030 STATIC I32
8031 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
8032 {
8033     dVAR;
8034     I32 namedclass = OOB_NAMEDCLASS;
8035
8036     PERL_ARGS_ASSERT_REGPPOSIXCC;
8037
8038     if (value == '[' && RExC_parse + 1 < RExC_end &&
8039         /* I smell either [: or [= or [. -- POSIX has been here, right? */
8040         POSIXCC(UCHARAT(RExC_parse))) {
8041         const char c = UCHARAT(RExC_parse);
8042         char* const s = RExC_parse++;
8043
8044         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8045             RExC_parse++;
8046         if (RExC_parse == RExC_end)
8047             /* Grandfather lone [:, [=, [. */
8048             RExC_parse = s;
8049         else {
8050             const char* const t = RExC_parse++; /* skip over the c */
8051             assert(*t == c);
8052
8053             if (UCHARAT(RExC_parse) == ']') {
8054                 const char *posixcc = s + 1;
8055                 RExC_parse++; /* skip over the ending ] */
8056
8057                 if (*s == ':') {
8058                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8059                     const I32 skip = t - posixcc;
8060
8061                     /* Initially switch on the length of the name.  */
8062                     switch (skip) {
8063                     case 4:
8064                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8065                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
8066                         break;
8067                     case 5:
8068                         /* Names all of length 5.  */
8069                         /* alnum alpha ascii blank cntrl digit graph lower
8070                            print punct space upper  */
8071                         /* Offset 4 gives the best switch position.  */
8072                         switch (posixcc[4]) {
8073                         case 'a':
8074                             if (memEQ(posixcc, "alph", 4)) /* alpha */
8075                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
8076                             break;
8077                         case 'e':
8078                             if (memEQ(posixcc, "spac", 4)) /* space */
8079                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
8080                             break;
8081                         case 'h':
8082                             if (memEQ(posixcc, "grap", 4)) /* graph */
8083                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
8084                             break;
8085                         case 'i':
8086                             if (memEQ(posixcc, "asci", 4)) /* ascii */
8087                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
8088                             break;
8089                         case 'k':
8090                             if (memEQ(posixcc, "blan", 4)) /* blank */
8091                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
8092                             break;
8093                         case 'l':
8094                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8095                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
8096                             break;
8097                         case 'm':
8098                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
8099                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
8100                             break;
8101                         case 'r':
8102                             if (memEQ(posixcc, "lowe", 4)) /* lower */
8103                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8104                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
8105                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8106                             break;
8107                         case 't':
8108                             if (memEQ(posixcc, "digi", 4)) /* digit */
8109                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8110                             else if (memEQ(posixcc, "prin", 4)) /* print */
8111                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8112                             else if (memEQ(posixcc, "punc", 4)) /* punct */
8113                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8114                             break;
8115                         }
8116                         break;
8117                     case 6:
8118                         if (memEQ(posixcc, "xdigit", 6))
8119                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8120                         break;
8121                     }
8122
8123                     if (namedclass == OOB_NAMEDCLASS)
8124                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8125                                       t - s - 1, s + 1);
8126                     assert (posixcc[skip] == ':');
8127                     assert (posixcc[skip+1] == ']');
8128                 } else if (!SIZE_ONLY) {
8129                     /* [[=foo=]] and [[.foo.]] are still future. */
8130
8131                     /* adjust RExC_parse so the warning shows after
8132                        the class closes */
8133                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8134                         RExC_parse++;
8135                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8136                 }
8137             } else {
8138                 /* Maternal grandfather:
8139                  * "[:" ending in ":" but not in ":]" */
8140                 RExC_parse = s;
8141             }
8142         }
8143     }
8144
8145     return namedclass;
8146 }
8147
8148 STATIC void
8149 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8150 {
8151     dVAR;
8152
8153     PERL_ARGS_ASSERT_CHECKPOSIXCC;
8154
8155     if (POSIXCC(UCHARAT(RExC_parse))) {
8156         const char *s = RExC_parse;
8157         const char  c = *s++;
8158
8159         while (isALNUM(*s))
8160             s++;
8161         if (*s && c == *s && s[1] == ']') {
8162             ckWARN3reg(s+2,
8163                        "POSIX syntax [%c %c] belongs inside character classes",
8164                        c, c);
8165
8166             /* [[=foo=]] and [[.foo.]] are still future. */
8167             if (POSIXCC_NOTYET(c)) {
8168                 /* adjust RExC_parse so the error shows after
8169                    the class closes */
8170                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8171                     NOOP;
8172                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8173             }
8174         }
8175     }
8176 }
8177
8178 /* No locale test */
8179 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
8180 ANYOF_##NAME:                                           \
8181         for (value = 0; value < 256; value++)           \
8182             if (TEST)                                   \
8183                 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8184     yesno = '+';                                        \
8185     what = WORD;                                        \
8186     break;                                              \
8187 case ANYOF_N##NAME:                                     \
8188         for (value = 0; value < 256; value++)           \
8189             if (!TEST)                                  \
8190                 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8191     yesno = '!';                                        \
8192     what = WORD;                                        \
8193     break
8194
8195 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8196  * there are two tests passed in, to use depending on that. There aren't any
8197  * cases where the label is different from the name, so no need for that
8198  * parameter */
8199 #define _C_C_T_(NAME,TEST_8,TEST_7,WORD)       \
8200 ANYOF_##NAME:                                           \
8201     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
8202     else if (UNI_SEMANTICS) {                           \
8203         for (value = 0; value < 256; value++) {         \
8204             if (TEST_8) stored +=                       \
8205                       S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8206         }                                               \
8207     }                                                   \
8208     else {                                              \
8209         for (value = 0; value < 128; value++) {         \
8210             if (TEST_7) stored +=                       \
8211                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
8212         }                                               \
8213     }                                                   \
8214     yesno = '+';                                        \
8215     what = WORD;                                        \
8216     break;                                              \
8217 case ANYOF_N##NAME:                                     \
8218     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
8219     else if (UNI_SEMANTICS) {                           \
8220         for (value = 0; value < 256; value++) {         \
8221             if (! TEST_8) stored +=                     \
8222                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8223         }                                               \
8224     }                                                   \
8225     else {                                              \
8226         for (value = 0; value < 128; value++) {         \
8227             if (! TEST_7) stored +=                     \
8228                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8229         }                                               \
8230         for (value = 128; value < 256; value++) {         \
8231                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8232         }                                               \
8233     }                                                   \
8234     yesno = '!';                                        \
8235     what = WORD;                                        \
8236     break
8237
8238 /*
8239    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8240    so that it is possible to override the option here without having to
8241    rebuild the entire core. as we are required to do if we change regcomp.h
8242    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8243 */
8244 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8245 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8246 #endif
8247
8248 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8249 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8250 #else
8251 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8252 #endif
8253
8254 STATIC U8
8255 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8256 {
8257
8258     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8259      * Locale folding is done at run-time, so this function should not be
8260      * called for nodes that are for locales.
8261      *
8262      * This function simply sets the bit corresponding to the fold of the input
8263      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
8264      * 'F' is 'f'.
8265      *
8266      * It also sets any necessary flags, and returns the number of bits that
8267      * actually changed from 0 to 1 */
8268
8269     U8 stored = 0;
8270     U8 fold;
8271
8272     fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
8273                            : PL_fold[value];
8274
8275     /* It assumes the bit for 'value' has already been set */
8276     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8277         ANYOF_BITMAP_SET(node, fold);
8278         stored++;
8279     }
8280
8281     /* The fold of the German sharp s is two ASCII characters, so isn't in the
8282      * bitmap and doesn't have to be in utf8, but we only process it if unicode
8283      * semantics are called for */
8284     if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
8285         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
8286     }
8287     else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8288              || (! UNI_SEMANTICS
8289                  && ! isASCII(value)
8290                  && PL_fold_latin1[value] != value))
8291     {   /* A character that has a fold outside of Latin1 matches outside the
8292            bitmap, but only when the target string is utf8.  Similarly when we
8293            don't have unicode semantics for the above ASCII Latin-1 characters,
8294            and they have a fold, they should match if the target is utf8, and
8295            not otherwise */
8296         ANYOF_FLAGS(node) |= ANYOF_UTF8;
8297     }
8298
8299     return stored;
8300 }
8301
8302
8303 PERL_STATIC_INLINE U8
8304 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8305 {
8306     /* This inline function sets a bit in the bitmap if not already set, and if
8307      * appropriate, its fold, returning the number of bits that actually
8308      * changed from 0 to 1 */
8309
8310     U8 stored;
8311
8312     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
8313         return 0;
8314     }
8315
8316     ANYOF_BITMAP_SET(node, value);
8317     stored = 1;
8318
8319     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
8320         stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8321     }
8322
8323     return stored;
8324 }
8325
8326 /*
8327    parse a class specification and produce either an ANYOF node that
8328    matches the pattern or if the pattern matches a single char only and
8329    that char is < 256 and we are case insensitive then we produce an
8330    EXACT node instead.
8331 */
8332
8333 STATIC regnode *
8334 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8335 {
8336     dVAR;
8337     register UV nextvalue;
8338     register IV prevvalue = OOB_UNICODE;
8339     register IV range = 0;
8340     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8341     register regnode *ret;
8342     STRLEN numlen;
8343     IV namedclass;
8344     char *rangebegin = NULL;
8345     bool need_class = 0;
8346     SV *listsv = NULL;
8347     UV n;
8348     AV* unicode_alternate  = NULL;
8349 #ifdef EBCDIC
8350     UV literal_endpoint = 0;
8351 #endif
8352     UV stored = 0;  /* how many chars stored in the bitmap */
8353
8354     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8355         case we need to change the emitted regop to an EXACT. */
8356     const char * orig_parse = RExC_parse;
8357     GET_RE_DEBUG_FLAGS_DECL;
8358
8359     PERL_ARGS_ASSERT_REGCLASS;
8360 #ifndef DEBUGGING
8361     PERL_UNUSED_ARG(depth);
8362 #endif
8363
8364     DEBUG_PARSE("clas");
8365
8366     /* Assume we are going to generate an ANYOF node. */
8367     ret = reganode(pRExC_state, ANYOF, 0);
8368
8369     if (!SIZE_ONLY)
8370         ANYOF_FLAGS(ret) = 0;
8371
8372     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
8373         RExC_naughty++;
8374         RExC_parse++;
8375         if (!SIZE_ONLY)
8376             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8377     }
8378
8379     if (SIZE_ONLY) {
8380         RExC_size += ANYOF_SKIP;
8381 #ifdef ANYOF_ADD_LOC_SKIP
8382         if (LOC) {
8383             RExC_size += ANYOF_ADD_LOC_SKIP;
8384         }
8385 #endif
8386         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8387     }
8388     else {
8389         RExC_emit += ANYOF_SKIP;
8390         if (LOC) {
8391             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8392 #ifdef ANYOF_ADD_LOC_SKIP
8393             RExC_emit += ANYOF_ADD_LOC_SKIP;
8394 #endif
8395         }
8396         ANYOF_BITMAP_ZERO(ret);
8397         listsv = newSVpvs("# comment\n");
8398     }
8399
8400     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8401
8402     if (!SIZE_ONLY && POSIXCC(nextvalue))
8403         checkposixcc(pRExC_state);
8404
8405     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8406     if (UCHARAT(RExC_parse) == ']')
8407         goto charclassloop;
8408
8409 parseit:
8410     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8411
8412     charclassloop:
8413
8414         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8415
8416         if (!range)
8417             rangebegin = RExC_parse;
8418         if (UTF) {
8419             value = utf8n_to_uvchr((U8*)RExC_parse,
8420                                    RExC_end - RExC_parse,
8421                                    &numlen, UTF8_ALLOW_DEFAULT);
8422             RExC_parse += numlen;
8423         }
8424         else
8425             value = UCHARAT(RExC_parse++);
8426
8427         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8428         if (value == '[' && POSIXCC(nextvalue))
8429             namedclass = regpposixcc(pRExC_state, value);
8430         else if (value == '\\') {
8431             if (UTF) {
8432                 value = utf8n_to_uvchr((U8*)RExC_parse,
8433                                    RExC_end - RExC_parse,
8434                                    &numlen, UTF8_ALLOW_DEFAULT);
8435                 RExC_parse += numlen;
8436             }
8437             else
8438                 value = UCHARAT(RExC_parse++);
8439             /* Some compilers cannot handle switching on 64-bit integer
8440              * values, therefore value cannot be an UV.  Yes, this will
8441              * be a problem later if we want switch on Unicode.
8442              * A similar issue a little bit later when switching on
8443              * namedclass. --jhi */
8444             switch ((I32)value) {
8445             case 'w':   namedclass = ANYOF_ALNUM;       break;
8446             case 'W':   namedclass = ANYOF_NALNUM;      break;
8447             case 's':   namedclass = ANYOF_SPACE;       break;
8448             case 'S':   namedclass = ANYOF_NSPACE;      break;
8449             case 'd':   namedclass = ANYOF_DIGIT;       break;
8450             case 'D':   namedclass = ANYOF_NDIGIT;      break;
8451             case 'v':   namedclass = ANYOF_VERTWS;      break;
8452             case 'V':   namedclass = ANYOF_NVERTWS;     break;
8453             case 'h':   namedclass = ANYOF_HORIZWS;     break;
8454             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
8455             case 'N':  /* Handle \N{NAME} in class */
8456                 {
8457                     /* We only pay attention to the first char of
8458                     multichar strings being returned. I kinda wonder
8459                     if this makes sense as it does change the behaviour
8460                     from earlier versions, OTOH that behaviour was broken
8461                     as well. */
8462                     UV v; /* value is register so we cant & it /grrr */
8463                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8464                         goto parseit;
8465                     }
8466                     value= v;
8467                 }
8468                 break;
8469             case 'p':
8470             case 'P':
8471                 {
8472                 char *e;
8473                 if (RExC_parse >= RExC_end)
8474                     vFAIL2("Empty \\%c{}", (U8)value);
8475                 if (*RExC_parse == '{') {
8476                     const U8 c = (U8)value;
8477                     e = strchr(RExC_parse++, '}');
8478                     if (!e)
8479                         vFAIL2("Missing right brace on \\%c{}", c);
8480                     while (isSPACE(UCHARAT(RExC_parse)))
8481                         RExC_parse++;
8482                     if (e == RExC_parse)
8483                         vFAIL2("Empty \\%c{}", c);
8484                     n = e - RExC_parse;
8485                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8486                         n--;
8487                 }
8488                 else {
8489                     e = RExC_parse;
8490                     n = 1;
8491                 }
8492                 if (!SIZE_ONLY) {
8493                     if (UCHARAT(RExC_parse) == '^') {
8494                          RExC_parse++;
8495                          n--;
8496                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8497                          while (isSPACE(UCHARAT(RExC_parse))) {
8498                               RExC_parse++;
8499                               n--;
8500                          }
8501                     }
8502                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8503                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8504                 }
8505                 RExC_parse = e + 1;
8506
8507                 /* The \p could match something in the Latin1 range, hence
8508                  * something that isn't utf8 */
8509                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
8510                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8511                 }
8512                 break;
8513             case 'n':   value = '\n';                   break;
8514             case 'r':   value = '\r';                   break;
8515             case 't':   value = '\t';                   break;
8516             case 'f':   value = '\f';                   break;
8517             case 'b':   value = '\b';                   break;
8518             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8519             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8520             case 'o':
8521                 RExC_parse--;   /* function expects to be pointed at the 'o' */
8522                 {
8523                     const char* error_msg;
8524                     bool valid = grok_bslash_o(RExC_parse,
8525                                                &value,
8526                                                &numlen,
8527                                                &error_msg,
8528                                                SIZE_ONLY);
8529                     RExC_parse += numlen;
8530                     if (! valid) {
8531                         vFAIL(error_msg);
8532                     }
8533                 }
8534                 if (PL_encoding && value < 0x100) {
8535                     goto recode_encoding;
8536                 }
8537                 break;
8538             case 'x':
8539                 if (*RExC_parse == '{') {
8540                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8541                         | PERL_SCAN_DISALLOW_PREFIX;
8542                     char * const e = strchr(RExC_parse++, '}');
8543                     if (!e)
8544                         vFAIL("Missing right brace on \\x{}");
8545
8546                     numlen = e - RExC_parse;
8547                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8548                     RExC_parse = e + 1;
8549                 }
8550                 else {
8551                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8552                     numlen = 2;
8553                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8554                     RExC_parse += numlen;
8555                 }
8556                 if (PL_encoding && value < 0x100)
8557                     goto recode_encoding;
8558                 break;
8559             case 'c':
8560                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8561                 break;
8562             case '0': case '1': case '2': case '3': case '4':
8563             case '5': case '6': case '7':
8564                 {
8565                     /* Take 1-3 octal digits */
8566                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8567                     numlen = 3;
8568                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8569                     RExC_parse += numlen;
8570                     if (PL_encoding && value < 0x100)
8571                         goto recode_encoding;
8572                     break;
8573                 }
8574             recode_encoding:
8575                 {
8576                     SV* enc = PL_encoding;
8577                     value = reg_recode((const char)(U8)value, &enc);
8578                     if (!enc && SIZE_ONLY)
8579                         ckWARNreg(RExC_parse,
8580                                   "Invalid escape in the specified encoding");
8581                     break;
8582                 }
8583             default:
8584                 /* Allow \_ to not give an error */
8585                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8586                     ckWARN2reg(RExC_parse,
8587                                "Unrecognized escape \\%c in character class passed through",
8588                                (int)value);
8589                 }
8590                 break;
8591             }
8592         } /* end of \blah */
8593 #ifdef EBCDIC
8594         else
8595             literal_endpoint++;
8596 #endif
8597
8598         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8599
8600             /* What matches in a locale is not known until runtime, so need to
8601              * (one time per class) allocate extra space to pass to regexec.
8602              * The space will contain a bit for each named class that is to be
8603              * matched against.  This isn't needed for \p{} and pseudo-classes,
8604              * as they are not affected by locale, and hence are dealt with
8605              * separately */
8606             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8607                 need_class = 1;
8608                 if (SIZE_ONLY) {
8609 #ifdef ANYOF_CLASS_ADD_SKIP
8610                     RExC_size += ANYOF_CLASS_ADD_SKIP;
8611 #endif
8612                 }
8613                 else {
8614 #ifdef ANYOF_CLASS_ADD_SKIP
8615                     RExC_emit += ANYOF_CLASS_ADD_SKIP;
8616 #endif
8617                     ANYOF_CLASS_ZERO(ret);
8618                 }
8619                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8620             }
8621
8622             /* a bad range like a-\d, a-[:digit:] ? */
8623             if (range) {
8624                 if (!SIZE_ONLY) {
8625                     const int w =
8626                         RExC_parse >= rangebegin ?
8627                         RExC_parse - rangebegin : 0;
8628                     ckWARN4reg(RExC_parse,
8629                                "False [] range \"%*.*s\"",
8630                                w, w, rangebegin);
8631
8632                     if (prevvalue < 256) {
8633                         stored +=
8634                          S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
8635                         stored +=
8636                          S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8637                     }
8638                     else {
8639                         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8640                         Perl_sv_catpvf(aTHX_ listsv,
8641                            "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8642                     }
8643                 }
8644
8645                 range = 0; /* this was not a true range */
8646             }
8647
8648
8649
8650             if (!SIZE_ONLY) {
8651                 const char *what = NULL;
8652                 char yesno = 0;
8653
8654                 /* Possible truncation here but in some 64-bit environments
8655                  * the compiler gets heartburn about switch on 64-bit values.
8656                  * A similar issue a little earlier when switching on value.
8657                  * --jhi */
8658                 switch ((I32)namedclass) {
8659
8660                 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8661                 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8662                 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8663                 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8664                 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8665                 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8666                 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8667                 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8668                 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8669                 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
8670 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8671                 /* \s, \w match all unicode if utf8. */
8672                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8673                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8674 #else
8675                 /* \s, \w match ascii and locale only */
8676                 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8677                 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8678 #endif
8679                 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
8680                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8681                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8682                 case ANYOF_ASCII:
8683                     if (LOC)
8684                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8685                     else {
8686                         for (value = 0; value < 128; value++)
8687                             stored +=
8688                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8689                     }
8690                     yesno = '+';
8691                     what = NULL;        /* Doesn't match outside ascii, so
8692                                            don't want to add +utf8:: */
8693                     break;
8694                 case ANYOF_NASCII:
8695                     if (LOC)
8696                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8697                     else {
8698                         for (value = 128; value < 256; value++)
8699                             stored +=
8700                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8701                     }
8702                     yesno = '!';
8703                     what = "ASCII";
8704                     break;
8705                 case ANYOF_DIGIT:
8706                     if (LOC)
8707                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8708                     else {
8709                         /* consecutive digits assumed */
8710                         for (value = '0'; value <= '9'; value++)
8711                             stored +=
8712                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8713                     }
8714                     yesno = '+';
8715                     what = POSIX_CC_UNI_NAME("Digit");
8716                     break;
8717                 case ANYOF_NDIGIT:
8718                     if (LOC)
8719                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8720                     else {
8721                         /* consecutive digits assumed */
8722                         for (value = 0; value < '0'; value++)
8723                             stored +=
8724                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8725                         for (value = '9' + 1; value < 256; value++)
8726                             stored +=
8727                               S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8728                     }
8729                     yesno = '!';
8730                     what = POSIX_CC_UNI_NAME("Digit");
8731                     break;
8732                 case ANYOF_MAX:
8733                     /* this is to handle \p and \P */
8734                     break;
8735                 default:
8736                     vFAIL("Invalid [::] class");
8737                     break;
8738                 }
8739                 if (what) {
8740                     /* Strings such as "+utf8::isWord\n" */
8741                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8742                     ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8743                 }
8744
8745                 continue;
8746             }
8747         } /* end of namedclass \blah */
8748
8749         if (range) {
8750             if (prevvalue > (IV)value) /* b-a */ {
8751                 const int w = RExC_parse - rangebegin;
8752                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8753                 range = 0; /* not a valid range */
8754             }
8755         }
8756         else {
8757             prevvalue = value; /* save the beginning of the range */
8758             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8759                 RExC_parse[1] != ']') {
8760                 RExC_parse++;
8761
8762                 /* a bad range like \w-, [:word:]- ? */
8763                 if (namedclass > OOB_NAMEDCLASS) {
8764                     if (ckWARN(WARN_REGEXP)) {
8765                         const int w =
8766                             RExC_parse >= rangebegin ?
8767                             RExC_parse - rangebegin : 0;
8768                         vWARN4(RExC_parse,
8769                                "False [] range \"%*.*s\"",
8770                                w, w, rangebegin);
8771                     }
8772                     if (!SIZE_ONLY)
8773                         stored +=
8774                             S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8775                 } else
8776                     range = 1;  /* yeah, it's a range! */
8777                 continue;       /* but do it the next time */
8778             }
8779         }
8780
8781         /* now is the next time */
8782         if (!SIZE_ONLY) {
8783             if (prevvalue < 256) {
8784                 const IV ceilvalue = value < 256 ? value : 255;
8785                 IV i;
8786 #ifdef EBCDIC
8787                 /* In EBCDIC [\x89-\x91] should include
8788                  * the \x8e but [i-j] should not. */
8789                 if (literal_endpoint == 2 &&
8790                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8791                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8792                 {
8793                     if (isLOWER(prevvalue)) {
8794                         for (i = prevvalue; i <= ceilvalue; i++)
8795                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8796                                 stored +=
8797                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8798                             }
8799                     } else {
8800                         for (i = prevvalue; i <= ceilvalue; i++)
8801                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8802                                 stored +=
8803                                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8804                             }
8805                     }
8806                 }
8807                 else
8808 #endif
8809                       for (i = prevvalue; i <= ceilvalue; i++) {
8810                         stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8811                       }
8812           }
8813           if (value > 255 || UTF) {
8814                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8815                 const UV natvalue      = NATIVE_TO_UNI(value);
8816
8817                 /* If the code point requires utf8 to represent, and we are not
8818                  * folding, it can't match unless the target is in utf8.  Only
8819                  * a few code points above 255 fold to below it, so XXX an
8820                  * optimization would be to know which ones and set the flag
8821                  * appropriately. */
8822                 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8823                                     ? ANYOF_NONBITMAP
8824                                     : ANYOF_UTF8;
8825                 if (prevnatvalue < natvalue) { /* what about > ? */
8826                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8827                                    prevnatvalue, natvalue);
8828                 }
8829                 else if (prevnatvalue == natvalue) {
8830                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8831                     if (FOLD) {
8832                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8833                          STRLEN foldlen;
8834                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8835
8836 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8837                          if (RExC_precomp[0] == ':' &&
8838                              RExC_precomp[1] == '[' &&
8839                              (f == 0xDF || f == 0x92)) {
8840                              f = NATIVE_TO_UNI(f);
8841                         }
8842 #endif
8843                          /* If folding and foldable and a single
8844                           * character, insert also the folded version
8845                           * to the charclass. */
8846                          if (f != value) {
8847 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8848                              if ((RExC_precomp[0] == ':' &&
8849                                   RExC_precomp[1] == '[' &&
8850                                   (f == 0xA2 &&
8851                                    (value == 0xFB05 || value == 0xFB06))) ?
8852                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8853                                  foldlen == (STRLEN)UNISKIP(f) )
8854 #else
8855                               if (foldlen == (STRLEN)UNISKIP(f))
8856 #endif
8857                                   Perl_sv_catpvf(aTHX_ listsv,
8858                                                  "%04"UVxf"\n", f);
8859                               else {
8860                                   /* Any multicharacter foldings
8861                                    * require the following transform:
8862                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8863                                    * where E folds into "pq" and F folds
8864                                    * into "rst", all other characters
8865                                    * fold to single characters.  We save
8866                                    * away these multicharacter foldings,
8867                                    * to be later saved as part of the
8868                                    * additional "s" data. */
8869                                   SV *sv;
8870
8871                                   if (!unicode_alternate)
8872                                       unicode_alternate = newAV();
8873                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8874                                                      TRUE);
8875                                   av_push(unicode_alternate, sv);
8876                               }
8877                          }
8878
8879                          /* If folding and the value is one of the Greek
8880                           * sigmas insert a few more sigmas to make the
8881                           * folding rules of the sigmas to work right.
8882                           * Note that not all the possible combinations
8883                           * are handled here: some of them are handled
8884                           * by the standard folding rules, and some of
8885                           * them (literal or EXACTF cases) are handled
8886                           * during runtime in regexec.c:S_find_byclass(). */
8887                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8888                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8889                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8890                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8891                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8892                          }
8893                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8894                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8895                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8896                     }
8897                 }
8898             }
8899 #ifdef EBCDIC
8900             literal_endpoint = 0;
8901 #endif
8902         }
8903
8904         range = 0; /* this range (if it was one) is done now */
8905     }
8906
8907
8908
8909     if (SIZE_ONLY)
8910         return ret;
8911     /****** !SIZE_ONLY AFTER HERE *********/
8912
8913     /* Folding in the bitmap is taken care of above, but not for locale, for
8914      * which we have to wait to see what folding is in effect at runtime, and
8915      * for things not in the bitmap */
8916     if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) {
8917         ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8918     }
8919
8920     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that this doesn't
8921      * optimize locale.  Doing so perhaps could be done as long as there is
8922      * nothing like \w in it; some thought also would have to be given to the
8923      * interaction with above 0x100 chars */
8924     if ((ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8925         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8926             ANYOF_BITMAP(ret)[value] ^= 0xFF;
8927         stored = 256 - stored;
8928
8929         /* The inversion means that everything above 255 is matched */
8930         ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
8931     }
8932
8933     /* A single character class can be "optimized" into an EXACTish node.
8934      * Note that since we don't currently count how many characters there are
8935      * outside the bitmap, we are XXX missing optimization possibilities for
8936      * them.  This optimization can't happen unless this is a truly single
8937      * character class, which means that it can't be an inversion into a
8938      * many-character class, and there must be no possibility of there being
8939      * things outside the bitmap.  'stored' (only) for locales doesn't include
8940      * \w, etc, so have to make a special test that they aren't present
8941      *
8942      * Similarly A 2-character class of the very special form like [bB] can be
8943      * optimized into an EXACTFish node, but only for non-locales, and for
8944      * characters which only have the two folds; so things like 'fF' and 'Ii'
8945      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
8946      * FI'. */
8947     if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
8948         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8949                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
8950             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8951                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
8952                                  /* If the latest code point has a fold whose
8953                                   * bit is set, it must be the only other one */
8954                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
8955                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
8956     {
8957         /* Note that the information needed to decide to do this optimization
8958          * is not currently available until the 2nd pass, and that the actually
8959          * used EXACTish node takes less space than the calculated ANYOF node,
8960          * and hence the amount of space calculated in the first pass is larger
8961          * than actually used, so this optimization doesn't gain us any space.
8962          * But an EXACT node is faster than an ANYOF node, and can be combined
8963          * with any adjacent EXACT nodes later by the optimizer for further
8964          * gains.  The speed of executing an EXACTF is similar to an ANYOF
8965          * node, so the optimization advantage comes from the ability to join
8966          * it to adjacent EXACT nodes */
8967
8968         const char * cur_parse= RExC_parse;
8969         U8 op;
8970         RExC_emit = (regnode *)orig_emit;
8971         RExC_parse = (char *)orig_parse;
8972
8973         if (stored == 1) {
8974
8975             /* A locale node with one point can be folded; all the other cases
8976              * with folding will have two points, since we calculate them above
8977              */
8978             if (ANYOF_FLAGS(ret) & ANYOF_FOLD) {
8979                  op = EXACTFL;
8980             }
8981             else {
8982                 op = EXACT;
8983             }
8984         }   /* else 2 chars in the bit map: the folds of each other */
8985         else if (UNI_SEMANTICS || !isASCII(value)) {
8986
8987             /* To join adjacent nodes, they must be the exact EXACTish type.
8988              * Try to use the most likely type, by using EXACTFU if the regex
8989              * calls for them, or is required because the character is
8990              * non-ASCII */
8991             op = EXACTFU;
8992         }
8993         else {    /* Otherwise, more likely to be EXACTF type */
8994             op = EXACTF;
8995         }
8996
8997         ret = reg_node(pRExC_state, op);
8998         RExC_parse = (char *)cur_parse;
8999         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
9000             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
9001             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
9002             STR_LEN(ret)= 2;
9003             RExC_emit += STR_SZ(2);
9004         }
9005         else {
9006             *STRING(ret)= (char)value;
9007             STR_LEN(ret)= 1;
9008             RExC_emit += STR_SZ(1);
9009         }
9010         SvREFCNT_dec(listsv);
9011         return ret;
9012     }
9013
9014     {
9015         AV * const av = newAV();
9016         SV *rv;
9017         /* The 0th element stores the character class description
9018          * in its textual form: used later (regexec.c:Perl_regclass_swash())
9019          * to initialize the appropriate swash (which gets stored in
9020          * the 1st element), and also useful for dumping the regnode.
9021          * The 2nd element stores the multicharacter foldings,
9022          * used later (regexec.c:S_reginclass()). */
9023         av_store(av, 0, listsv);
9024         av_store(av, 1, NULL);
9025         av_store(av, 2, MUTABLE_SV(unicode_alternate));
9026         rv = newRV_noinc(MUTABLE_SV(av));
9027         n = add_data(pRExC_state, 1, "s");
9028         RExC_rxi->data->data[n] = (void*)rv;
9029         ARG_SET(ret, n);
9030     }
9031     return ret;
9032 }
9033 #undef _C_C_T_
9034
9035
9036 /* reg_skipcomment()
9037
9038    Absorbs an /x style # comments from the input stream.
9039    Returns true if there is more text remaining in the stream.
9040    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
9041    terminates the pattern without including a newline.
9042
9043    Note its the callers responsibility to ensure that we are
9044    actually in /x mode
9045
9046 */
9047
9048 STATIC bool
9049 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
9050 {
9051     bool ended = 0;
9052
9053     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
9054
9055     while (RExC_parse < RExC_end)
9056         if (*RExC_parse++ == '\n') {
9057             ended = 1;
9058             break;
9059         }
9060     if (!ended) {
9061         /* we ran off the end of the pattern without ending
9062            the comment, so we have to add an \n when wrapping */
9063         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9064         return 0;
9065     } else
9066         return 1;
9067 }
9068
9069 /* nextchar()
9070
9071    Advances the parse position, and optionally absorbs
9072    "whitespace" from the inputstream.
9073
9074    Without /x "whitespace" means (?#...) style comments only,
9075    with /x this means (?#...) and # comments and whitespace proper.
9076
9077    Returns the RExC_parse point from BEFORE the scan occurs.
9078
9079    This is the /x friendly way of saying RExC_parse++.
9080 */
9081
9082 STATIC char*
9083 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
9084 {
9085     char* const retval = RExC_parse++;
9086
9087     PERL_ARGS_ASSERT_NEXTCHAR;
9088
9089     for (;;) {
9090         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9091                 RExC_parse[2] == '#') {
9092             while (*RExC_parse != ')') {
9093                 if (RExC_parse == RExC_end)
9094                     FAIL("Sequence (?#... not terminated");
9095                 RExC_parse++;
9096             }
9097             RExC_parse++;
9098             continue;
9099         }
9100         if (RExC_flags & RXf_PMf_EXTENDED) {
9101             if (isSPACE(*RExC_parse)) {
9102                 RExC_parse++;
9103                 continue;
9104             }
9105             else if (*RExC_parse == '#') {
9106                 if ( reg_skipcomment( pRExC_state ) )
9107                     continue;
9108             }
9109         }
9110         return retval;
9111     }
9112 }
9113
9114 /*
9115 - reg_node - emit a node
9116 */
9117 STATIC regnode *                        /* Location. */
9118 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
9119 {
9120     dVAR;
9121     register regnode *ptr;
9122     regnode * const ret = RExC_emit;
9123     GET_RE_DEBUG_FLAGS_DECL;
9124
9125     PERL_ARGS_ASSERT_REG_NODE;
9126
9127     if (SIZE_ONLY) {
9128         SIZE_ALIGN(RExC_size);
9129         RExC_size += 1;
9130         return(ret);
9131     }
9132     if (RExC_emit >= RExC_emit_bound)
9133         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9134
9135     NODE_ALIGN_FILL(ret);
9136     ptr = ret;
9137     FILL_ADVANCE_NODE(ptr, op);
9138     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 1);
9139 #ifdef RE_TRACK_PATTERN_OFFSETS
9140     if (RExC_offsets) {         /* MJD */
9141         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
9142               "reg_node", __LINE__,
9143               PL_reg_name[op],
9144               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9145                 ? "Overwriting end of array!\n" : "OK",
9146               (UV)(RExC_emit - RExC_emit_start),
9147               (UV)(RExC_parse - RExC_start),
9148               (UV)RExC_offsets[0]));
9149         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
9150     }
9151 #endif
9152     RExC_emit = ptr;
9153     return(ret);
9154 }
9155
9156 /*
9157 - reganode - emit a node with an argument
9158 */
9159 STATIC regnode *                        /* Location. */
9160 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
9161 {
9162     dVAR;
9163     register regnode *ptr;
9164     regnode * const ret = RExC_emit;
9165     GET_RE_DEBUG_FLAGS_DECL;
9166
9167     PERL_ARGS_ASSERT_REGANODE;
9168
9169     if (SIZE_ONLY) {
9170         SIZE_ALIGN(RExC_size);
9171         RExC_size += 2;
9172         /*
9173            We can't do this:
9174
9175            assert(2==regarglen[op]+1);
9176
9177            Anything larger than this has to allocate the extra amount.
9178            If we changed this to be:
9179
9180            RExC_size += (1 + regarglen[op]);
9181
9182            then it wouldn't matter. Its not clear what side effect
9183            might come from that so its not done so far.
9184            -- dmq
9185         */
9186         return(ret);
9187     }
9188     if (RExC_emit >= RExC_emit_bound)
9189         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9190
9191     NODE_ALIGN_FILL(ret);
9192     ptr = ret;
9193     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
9194     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (ptr) - 2);
9195 #ifdef RE_TRACK_PATTERN_OFFSETS
9196     if (RExC_offsets) {         /* MJD */
9197         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9198               "reganode",
9199               __LINE__,
9200               PL_reg_name[op],
9201               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
9202               "Overwriting end of array!\n" : "OK",
9203               (UV)(RExC_emit - RExC_emit_start),
9204               (UV)(RExC_parse - RExC_start),
9205               (UV)RExC_offsets[0]));
9206         Set_Cur_Node_Offset;
9207     }
9208 #endif
9209     RExC_emit = ptr;
9210     return(ret);
9211 }
9212
9213 /*
9214 - reguni - emit (if appropriate) a Unicode character
9215 */
9216 STATIC STRLEN
9217 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
9218 {
9219     dVAR;
9220
9221     PERL_ARGS_ASSERT_REGUNI;
9222
9223     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
9224 }
9225
9226 /*
9227 - reginsert - insert an operator in front of already-emitted operand
9228 *
9229 * Means relocating the operand.
9230 */
9231 STATIC void
9232 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
9233 {
9234     dVAR;
9235     register regnode *src;
9236     register regnode *dst;
9237     register regnode *place;
9238     const int offset = regarglen[(U8)op];
9239     const int size = NODE_STEP_REGNODE + offset;
9240     GET_RE_DEBUG_FLAGS_DECL;
9241
9242     PERL_ARGS_ASSERT_REGINSERT;
9243     PERL_UNUSED_ARG(depth);
9244 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
9245     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9246     if (SIZE_ONLY) {
9247         RExC_size += size;
9248         return;
9249     }
9250
9251     src = RExC_emit;
9252     RExC_emit += size;
9253     dst = RExC_emit;
9254     if (RExC_open_parens) {
9255         int paren;
9256         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9257         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9258             if ( RExC_open_parens[paren] >= opnd ) {
9259                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9260                 RExC_open_parens[paren] += size;
9261             } else {
9262                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9263             }
9264             if ( RExC_close_parens[paren] >= opnd ) {
9265                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9266                 RExC_close_parens[paren] += size;
9267             } else {
9268                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9269             }
9270         }
9271     }
9272
9273     while (src > opnd) {
9274         StructCopy(--src, --dst, regnode);
9275 #ifdef RE_TRACK_PATTERN_OFFSETS
9276         if (RExC_offsets) {     /* MJD 20010112 */
9277             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9278                   "reg_insert",
9279                   __LINE__,
9280                   PL_reg_name[op],
9281                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9282                     ? "Overwriting end of array!\n" : "OK",
9283                   (UV)(src - RExC_emit_start),
9284                   (UV)(dst - RExC_emit_start),
9285                   (UV)RExC_offsets[0]));
9286             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9287             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9288         }
9289 #endif
9290     }
9291
9292
9293     place = opnd;               /* Op node, where operand used to be. */
9294 #ifdef RE_TRACK_PATTERN_OFFSETS
9295     if (RExC_offsets) {         /* MJD */
9296         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9297               "reginsert",
9298               __LINE__,
9299               PL_reg_name[op],
9300               (UV)(place - RExC_emit_start) > RExC_offsets[0]
9301               ? "Overwriting end of array!\n" : "OK",
9302               (UV)(place - RExC_emit_start),
9303               (UV)(RExC_parse - RExC_start),
9304               (UV)RExC_offsets[0]));
9305         Set_Node_Offset(place, RExC_parse);
9306         Set_Node_Length(place, 1);
9307     }
9308 #endif
9309     src = NEXTOPER(place);
9310     FILL_ADVANCE_NODE(place, op);
9311     REH_CALL_REGCOMP_HOOK(pRExC_state->rx, (place) - 1);
9312     Zero(src, offset, regnode);
9313 }
9314
9315 /*
9316 - regtail - set the next-pointer at the end of a node chain of p to val.
9317 - SEE ALSO: regtail_study
9318 */
9319 /* TODO: All three parms should be const */
9320 STATIC void
9321 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9322 {
9323     dVAR;
9324     register regnode *scan;
9325     GET_RE_DEBUG_FLAGS_DECL;
9326
9327     PERL_ARGS_ASSERT_REGTAIL;
9328 #ifndef DEBUGGING
9329     PERL_UNUSED_ARG(depth);
9330 #endif
9331
9332     if (SIZE_ONLY)
9333         return;
9334
9335     /* Find last node. */
9336     scan = p;
9337     for (;;) {
9338         regnode * const temp = regnext(scan);
9339         DEBUG_PARSE_r({
9340             SV * const mysv=sv_newmortal();
9341             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9342             regprop(RExC_rx, mysv, scan);
9343             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9344                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9345                     (temp == NULL ? "->" : ""),
9346                     (temp == NULL ? PL_reg_name[OP(val)] : "")
9347             );
9348         });
9349         if (temp == NULL)
9350             break;
9351         scan = temp;
9352     }
9353
9354     if (reg_off_by_arg[OP(scan)]) {
9355         ARG_SET(scan, val - scan);
9356     }
9357     else {
9358         NEXT_OFF(scan) = val - scan;
9359     }
9360 }
9361
9362 #ifdef DEBUGGING
9363 /*
9364 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9365 - Look for optimizable sequences at the same time.
9366 - currently only looks for EXACT chains.
9367
9368 This is expermental code. The idea is to use this routine to perform
9369 in place optimizations on branches and groups as they are constructed,
9370 with the long term intention of removing optimization from study_chunk so
9371 that it is purely analytical.
9372
9373 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9374 to control which is which.
9375
9376 */
9377 /* TODO: All four parms should be const */
9378
9379 STATIC U8
9380 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9381 {
9382     dVAR;
9383     register regnode *scan;
9384     U8 exact = PSEUDO;
9385 #ifdef EXPERIMENTAL_INPLACESCAN
9386     I32 min = 0;
9387 #endif
9388     GET_RE_DEBUG_FLAGS_DECL;
9389
9390     PERL_ARGS_ASSERT_REGTAIL_STUDY;
9391
9392
9393     if (SIZE_ONLY)
9394         return exact;
9395
9396     /* Find last node. */
9397
9398     scan = p;
9399     for (;;) {
9400         regnode * const temp = regnext(scan);
9401 #ifdef EXPERIMENTAL_INPLACESCAN
9402         if (PL_regkind[OP(scan)] == EXACT)
9403             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9404                 return EXACT;
9405 #endif
9406         if ( exact ) {
9407             switch (OP(scan)) {
9408                 case EXACT:
9409                 case EXACTF:
9410                 case EXACTFU:
9411                 case EXACTFL:
9412                         if( exact == PSEUDO )
9413                             exact= OP(scan);
9414                         else if ( exact != OP(scan) )
9415                             exact= 0;
9416                 case NOTHING:
9417                     break;
9418                 default:
9419                     exact= 0;
9420             }
9421         }
9422         DEBUG_PARSE_r({
9423             SV * const mysv=sv_newmortal();
9424             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9425             regprop(RExC_rx, mysv, scan);
9426             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9427                 SvPV_nolen_const(mysv),
9428                 REG_NODE_NUM(scan),
9429                 PL_reg_name[exact]);
9430         });
9431         if (temp == NULL)
9432             break;
9433         scan = temp;
9434     }
9435     DEBUG_PARSE_r({
9436         SV * const mysv_val=sv_newmortal();
9437         DEBUG_PARSE_MSG("");
9438         regprop(RExC_rx, mysv_val, val);
9439         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9440                       SvPV_nolen_const(mysv_val),
9441                       (IV)REG_NODE_NUM(val),
9442                       (IV)(val - scan)
9443         );
9444     });
9445     if (reg_off_by_arg[OP(scan)]) {
9446         ARG_SET(scan, val - scan);
9447     }
9448     else {
9449         NEXT_OFF(scan) = val - scan;
9450     }
9451
9452     return exact;
9453 }
9454 #endif
9455
9456 /*
9457  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9458  */
9459 #ifdef DEBUGGING
9460 static void
9461 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9462 {
9463     int bit;
9464     int set=0;
9465
9466     for (bit=0; bit<32; bit++) {
9467         if (flags & (1<<bit)) {
9468             if (!set++ && lead)
9469                 PerlIO_printf(Perl_debug_log, "%s",lead);
9470             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9471         }
9472     }
9473     if (lead)  {
9474         if (set)
9475             PerlIO_printf(Perl_debug_log, "\n");
9476         else
9477             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9478     }
9479 }
9480 #endif
9481
9482 void
9483 Perl_regdump(pTHX_ const regexp *r)
9484 {
9485 #ifdef DEBUGGING
9486     dVAR;
9487     SV * const sv = sv_newmortal();
9488     SV *dsv= sv_newmortal();
9489     RXi_GET_DECL(r,ri);
9490     GET_RE_DEBUG_FLAGS_DECL;
9491
9492     PERL_ARGS_ASSERT_REGDUMP;
9493
9494     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9495
9496     /* Header fields of interest. */
9497     if (r->anchored_substr) {
9498         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9499             RE_SV_DUMPLEN(r->anchored_substr), 30);
9500         PerlIO_printf(Perl_debug_log,
9501                       "anchored %s%s at %"IVdf" ",
9502                       s, RE_SV_TAIL(r->anchored_substr),
9503                       (IV)r->anchored_offset);
9504     } else if (r->anchored_utf8) {
9505         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9506             RE_SV_DUMPLEN(r->anchored_utf8), 30);
9507         PerlIO_printf(Perl_debug_log,
9508                       "anchored utf8 %s%s at %"IVdf" ",
9509                       s, RE_SV_TAIL(r->anchored_utf8),
9510                       (IV)r->anchored_offset);
9511     }
9512     if (r->float_substr) {
9513         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9514             RE_SV_DUMPLEN(r->float_substr), 30);
9515         PerlIO_printf(Perl_debug_log,
9516                       "floating %s%s at %"IVdf"..%"UVuf" ",
9517                       s, RE_SV_TAIL(r->float_substr),
9518                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9519     } else if (r->float_utf8) {
9520         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9521             RE_SV_DUMPLEN(r->float_utf8), 30);
9522         PerlIO_printf(Perl_debug_log,
9523                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9524                       s, RE_SV_TAIL(r->float_utf8),
9525                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9526     }
9527     if (r->check_substr || r->check_utf8)
9528         PerlIO_printf(Perl_debug_log,
9529                       (const char *)
9530                       (r->check_substr == r->float_substr
9531                        && r->check_utf8 == r->float_utf8
9532                        ? "(checking floating" : "(checking anchored"));
9533     if (r->extflags & RXf_NOSCAN)
9534         PerlIO_printf(Perl_debug_log, " noscan");
9535     if (r->extflags & RXf_CHECK_ALL)
9536         PerlIO_printf(Perl_debug_log, " isall");
9537     if (r->check_substr || r->check_utf8)
9538         PerlIO_printf(Perl_debug_log, ") ");
9539
9540     if (ri->regstclass) {
9541         regprop(r, sv, ri->regstclass);
9542         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9543     }
9544     if (r->extflags & RXf_ANCH) {
9545         PerlIO_printf(Perl_debug_log, "anchored");
9546         if (r->extflags & RXf_ANCH_BOL)
9547             PerlIO_printf(Perl_debug_log, "(BOL)");
9548         if (r->extflags & RXf_ANCH_MBOL)
9549             PerlIO_printf(Perl_debug_log, "(MBOL)");
9550         if (r->extflags & RXf_ANCH_SBOL)
9551             PerlIO_printf(Perl_debug_log, "(SBOL)");
9552         if (r->extflags & RXf_ANCH_GPOS)
9553             PerlIO_printf(Perl_debug_log, "(GPOS)");
9554         PerlIO_putc(Perl_debug_log, ' ');
9555     }
9556     if (r->extflags & RXf_GPOS_SEEN)
9557         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9558     if (r->intflags & PREGf_SKIP)
9559         PerlIO_printf(Perl_debug_log, "plus ");
9560     if (r->intflags & PREGf_IMPLICIT)
9561         PerlIO_printf(Perl_debug_log, "implicit ");
9562     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9563     if (r->extflags & RXf_EVAL_SEEN)
9564         PerlIO_printf(Perl_debug_log, "with eval ");
9565     PerlIO_printf(Perl_debug_log, "\n");
9566     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9567 #else
9568     PERL_ARGS_ASSERT_REGDUMP;
9569     PERL_UNUSED_CONTEXT;
9570     PERL_UNUSED_ARG(r);
9571 #endif  /* DEBUGGING */
9572 }
9573
9574 /*
9575 - regprop - printable representation of opcode
9576 */
9577 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9578 STMT_START { \
9579         if (do_sep) {                           \
9580             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9581             if (flags & ANYOF_INVERT)           \
9582                 /*make sure the invert info is in each */ \
9583                 sv_catpvs(sv, "^");             \
9584             do_sep = 0;                         \
9585         }                                       \
9586 } STMT_END
9587
9588 void
9589 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9590 {
9591 #ifdef DEBUGGING
9592     dVAR;
9593     register int k;
9594     RXi_GET_DECL(prog,progi);
9595     GET_RE_DEBUG_FLAGS_DECL;
9596
9597     PERL_ARGS_ASSERT_REGPROP;
9598
9599     sv_setpvs(sv, "");
9600
9601     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9602         /* It would be nice to FAIL() here, but this may be called from
9603            regexec.c, and it would be hard to supply pRExC_state. */
9604         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9605     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9606
9607     k = PL_regkind[OP(o)];
9608
9609     if (k == EXACT) {
9610         sv_catpvs(sv, " ");
9611         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9612          * is a crude hack but it may be the best for now since
9613          * we have no flag "this EXACTish node was UTF-8"
9614          * --jhi */
9615         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9616                   PERL_PV_ESCAPE_UNI_DETECT |
9617                   PERL_PV_PRETTY_ELLIPSES   |
9618                   PERL_PV_PRETTY_LTGT       |
9619                   PERL_PV_PRETTY_NOCLEAR
9620                   );
9621     } else if (k == TRIE) {
9622         /* print the details of the trie in dumpuntil instead, as
9623          * progi->data isn't available here */
9624         const char op = OP(o);
9625         const U32 n = ARG(o);
9626         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9627                (reg_ac_data *)progi->data->data[n] :
9628                NULL;
9629         const reg_trie_data * const trie
9630             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9631
9632         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9633         DEBUG_TRIE_COMPILE_r(
9634             Perl_sv_catpvf(aTHX_ sv,
9635                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9636                 (UV)trie->startstate,
9637                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9638                 (UV)trie->wordcount,
9639                 (UV)trie->minlen,
9640                 (UV)trie->maxlen,
9641                 (UV)TRIE_CHARCOUNT(trie),
9642                 (UV)trie->uniquecharcount
9643             )
9644         );
9645         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9646             int i;
9647             int rangestart = -1;
9648             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9649             sv_catpvs(sv, "[");
9650             for (i = 0; i <= 256; i++) {
9651                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9652                     if (rangestart == -1)
9653                         rangestart = i;
9654                 } else if (rangestart != -1) {
9655                     if (i <= rangestart + 3)
9656                         for (; rangestart < i; rangestart++)
9657                             put_byte(sv, rangestart);
9658                     else {
9659                         put_byte(sv, rangestart);
9660                         sv_catpvs(sv, "-");
9661                         put_byte(sv, i - 1);
9662                     }
9663                     rangestart = -1;
9664                 }
9665             }
9666             sv_catpvs(sv, "]");
9667         }
9668
9669     } else if (k == CURLY) {
9670         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9671             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9672         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9673     }
9674     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9675         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9676     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9677         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9678         if ( RXp_PAREN_NAMES(prog) ) {
9679             if ( k != REF || (OP(o) < NREF)) {
9680                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9681                 SV **name= av_fetch(list, ARG(o), 0 );
9682                 if (name)
9683                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9684             }
9685             else {
9686                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9687                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9688                 I32 *nums=(I32*)SvPVX(sv_dat);
9689                 SV **name= av_fetch(list, nums[0], 0 );
9690                 I32 n;
9691                 if (name) {
9692                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9693                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9694                                     (n ? "," : ""), (IV)nums[n]);
9695                     }
9696                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9697                 }
9698             }
9699         }
9700     } else if (k == GOSUB)
9701         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9702     else if (k == VERB) {
9703         if (!o->flags)
9704             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9705                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9706     } else if (k == LOGICAL)
9707         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9708     else if (k == FOLDCHAR)
9709         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9710     else if (k == ANYOF) {
9711         int i, rangestart = -1;
9712         const U8 flags = ANYOF_FLAGS(o);
9713         int do_sep = 0;
9714
9715         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9716         static const char * const anyofs[] = {
9717             "\\w",
9718             "\\W",
9719             "\\s",
9720             "\\S",
9721             "\\d",
9722             "\\D",
9723             "[:alnum:]",
9724             "[:^alnum:]",
9725             "[:alpha:]",
9726             "[:^alpha:]",
9727             "[:ascii:]",
9728             "[:^ascii:]",
9729             "[:cntrl:]",
9730             "[:^cntrl:]",
9731             "[:graph:]",
9732             "[:^graph:]",
9733             "[:lower:]",
9734             "[:^lower:]",
9735             "[:print:]",
9736             "[:^print:]",
9737             "[:punct:]",
9738             "[:^punct:]",
9739             "[:upper:]",
9740             "[:^upper:]",
9741             "[:xdigit:]",
9742             "[:^xdigit:]",
9743             "[:space:]",
9744             "[:^space:]",
9745             "[:blank:]",
9746             "[:^blank:]"
9747         };
9748
9749         if (flags & ANYOF_LOCALE)
9750             sv_catpvs(sv, "{loc}");
9751         if (flags & ANYOF_FOLD)
9752             sv_catpvs(sv, "{i}");
9753         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9754         if (flags & ANYOF_INVERT)
9755             sv_catpvs(sv, "^");
9756
9757         /* output what the standard cp 0-255 bitmap matches */
9758         for (i = 0; i <= 256; i++) {
9759             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9760                 if (rangestart == -1)
9761                     rangestart = i;
9762             } else if (rangestart != -1) {
9763                 if (i <= rangestart + 3)
9764                     for (; rangestart < i; rangestart++)
9765                         put_byte(sv, rangestart);
9766                 else {
9767                     put_byte(sv, rangestart);
9768                     sv_catpvs(sv, "-");
9769                     put_byte(sv, i - 1);
9770                 }
9771                 do_sep = 1;
9772                 rangestart = -1;
9773             }
9774         }
9775
9776         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9777         /* output any special charclass tests (used entirely under use locale) */
9778         if (ANYOF_CLASS_TEST_ANY_SET(o))
9779             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9780                 if (ANYOF_CLASS_TEST(o,i)) {
9781                     sv_catpv(sv, anyofs[i]);
9782                     do_sep = 1;
9783                 }
9784
9785         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9786
9787         /* output information about the unicode matching */
9788         if (flags & ANYOF_UNICODE_ALL)
9789             sv_catpvs(sv, "{unicode_all}");
9790         else if (flags & ANYOF_UTF8)
9791             sv_catpvs(sv, "{unicode}");
9792         if (flags & ANYOF_NONBITMAP_NON_UTF8)
9793             sv_catpvs(sv, "{outside bitmap}");
9794
9795         {
9796             SV *lv;
9797             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9798
9799             if (lv) {
9800                 if (sw) {
9801                     U8 s[UTF8_MAXBYTES_CASE+1];
9802
9803                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9804                         uvchr_to_utf8(s, i);
9805
9806                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9807                             if (rangestart == -1)
9808                                 rangestart = i;
9809                         } else if (rangestart != -1) {
9810                             if (i <= rangestart + 3)
9811                                 for (; rangestart < i; rangestart++) {
9812                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9813                                     U8 *p;
9814                                     for(p = s; p < e; p++)
9815                                         put_byte(sv, *p);
9816                                 }
9817                             else {
9818                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9819                                 U8 *p;
9820                                 for (p = s; p < e; p++)
9821                                     put_byte(sv, *p);
9822                                 sv_catpvs(sv, "-");
9823                                 e = uvchr_to_utf8(s, i-1);
9824                                 for (p = s; p < e; p++)
9825                                     put_byte(sv, *p);
9826                                 }
9827                                 rangestart = -1;
9828                             }
9829                         }
9830
9831                     sv_catpvs(sv, "..."); /* et cetera */
9832                 }
9833
9834                 {
9835                     char *s = savesvpv(lv);
9836                     char * const origs = s;
9837
9838                     while (*s && *s != '\n')
9839                         s++;
9840
9841                     if (*s == '\n') {
9842                         const char * const t = ++s;
9843
9844                         while (*s) {
9845                             if (*s == '\n')
9846                                 *s = ' ';
9847                             s++;
9848                         }
9849                         if (s[-1] == ' ')
9850                             s[-1] = 0;
9851
9852                         sv_catpv(sv, t);
9853                     }
9854
9855                     Safefree(origs);
9856                 }
9857             }
9858         }
9859
9860         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9861     }
9862     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9863         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9864 #else
9865     PERL_UNUSED_CONTEXT;
9866     PERL_UNUSED_ARG(sv);
9867     PERL_UNUSED_ARG(o);
9868     PERL_UNUSED_ARG(prog);
9869 #endif  /* DEBUGGING */
9870 }
9871
9872 SV *
9873 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9874 {                               /* Assume that RE_INTUIT is set */
9875     dVAR;
9876     struct regexp *const prog = (struct regexp *)SvANY(r);
9877     GET_RE_DEBUG_FLAGS_DECL;
9878
9879     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9880     PERL_UNUSED_CONTEXT;
9881
9882     DEBUG_COMPILE_r(
9883         {
9884             const char * const s = SvPV_nolen_const(prog->check_substr
9885                       ? prog->check_substr : prog->check_utf8);
9886
9887             if (!PL_colorset) reginitcolors();
9888             PerlIO_printf(Perl_debug_log,
9889                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9890                       PL_colors[4],
9891                       prog->check_substr ? "" : "utf8 ",
9892                       PL_colors[5],PL_colors[0],
9893                       s,
9894                       PL_colors[1],
9895                       (strlen(s) > 60 ? "..." : ""));
9896         } );
9897
9898     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9899 }
9900
9901 /*
9902    pregfree()
9903
9904    handles refcounting and freeing the perl core regexp structure. When
9905    it is necessary to actually free the structure the first thing it
9906    does is call the 'free' method of the regexp_engine associated to
9907    the regexp, allowing the handling of the void *pprivate; member
9908    first. (This routine is not overridable by extensions, which is why
9909    the extensions free is called first.)
9910
9911    See regdupe and regdupe_internal if you change anything here.
9912 */
9913 #ifndef PERL_IN_XSUB_RE
9914 void
9915 Perl_pregfree(pTHX_ REGEXP *r)
9916 {
9917     SvREFCNT_dec(r);
9918 }
9919
9920 void
9921 Perl_pregfree2(pTHX_ REGEXP *rx)
9922 {
9923     dVAR;
9924     struct regexp *const r = (struct regexp *)SvANY(rx);
9925     GET_RE_DEBUG_FLAGS_DECL;
9926
9927     PERL_ARGS_ASSERT_PREGFREE2;
9928
9929     if (r->mother_re) {
9930         ReREFCNT_dec(r->mother_re);
9931     } else {
9932         CALLREGFREE_PVT(rx); /* free the private data */
9933         SvREFCNT_dec(RXp_PAREN_NAMES(r));
9934     }
9935     if (r->substrs) {
9936         SvREFCNT_dec(r->anchored_substr);
9937         SvREFCNT_dec(r->anchored_utf8);
9938         SvREFCNT_dec(r->float_substr);
9939         SvREFCNT_dec(r->float_utf8);
9940         Safefree(r->substrs);
9941     }
9942     RX_MATCH_COPY_FREE(rx);
9943 #ifdef PERL_OLD_COPY_ON_WRITE
9944     SvREFCNT_dec(r->saved_copy);
9945 #endif
9946     Safefree(r->offs);
9947 }
9948
9949 /*  reg_temp_copy()
9950
9951     This is a hacky workaround to the structural issue of match results
9952     being stored in the regexp structure which is in turn stored in
9953     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9954     could be PL_curpm in multiple contexts, and could require multiple
9955     result sets being associated with the pattern simultaneously, such
9956     as when doing a recursive match with (??{$qr})
9957
9958     The solution is to make a lightweight copy of the regexp structure
9959     when a qr// is returned from the code executed by (??{$qr}) this
9960     lightweight copy doesnt actually own any of its data except for
9961     the starp/end and the actual regexp structure itself.
9962
9963 */
9964
9965
9966 REGEXP *
9967 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9968 {
9969     struct regexp *ret;
9970     struct regexp *const r = (struct regexp *)SvANY(rx);
9971     register const I32 npar = r->nparens+1;
9972
9973     PERL_ARGS_ASSERT_REG_TEMP_COPY;
9974
9975     if (!ret_x)
9976         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9977     ret = (struct regexp *)SvANY(ret_x);
9978
9979     (void)ReREFCNT_inc(rx);
9980     /* We can take advantage of the existing "copied buffer" mechanism in SVs
9981        by pointing directly at the buffer, but flagging that the allocated
9982        space in the copy is zero. As we've just done a struct copy, it's now
9983        a case of zero-ing that, rather than copying the current length.  */
9984     SvPV_set(ret_x, RX_WRAPPED(rx));
9985     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9986     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9987            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9988     SvLEN_set(ret_x, 0);
9989     SvSTASH_set(ret_x, NULL);
9990     SvMAGIC_set(ret_x, NULL);
9991     Newx(ret->offs, npar, regexp_paren_pair);
9992     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9993     if (r->substrs) {
9994         Newx(ret->substrs, 1, struct reg_substr_data);
9995         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9996
9997         SvREFCNT_inc_void(ret->anchored_substr);
9998         SvREFCNT_inc_void(ret->anchored_utf8);
9999         SvREFCNT_inc_void(ret->float_substr);
10000         SvREFCNT_inc_void(ret->float_utf8);
10001
10002         /* check_substr and check_utf8, if non-NULL, point to either their
10003            anchored or float namesakes, and don't hold a second reference.  */
10004     }
10005     RX_MATCH_COPIED_off(ret_x);
10006 #ifdef PERL_OLD_COPY_ON_WRITE
10007     ret->saved_copy = NULL;
10008 #endif
10009     ret->mother_re = rx;
10010
10011     return ret_x;
10012 }
10013 #endif
10014
10015 /* regfree_internal()
10016
10017    Free the private data in a regexp. This is overloadable by
10018    extensions. Perl takes care of the regexp structure in pregfree(),
10019    this covers the *pprivate pointer which technically perl doesn't
10020    know about, however of course we have to handle the
10021    regexp_internal structure when no extension is in use.
10022
10023    Note this is called before freeing anything in the regexp
10024    structure.
10025  */
10026
10027 void
10028 Perl_regfree_internal(pTHX_ REGEXP * const rx)
10029 {
10030     dVAR;
10031     struct regexp *const r = (struct regexp *)SvANY(rx);
10032     RXi_GET_DECL(r,ri);
10033     GET_RE_DEBUG_FLAGS_DECL;
10034
10035     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
10036
10037     DEBUG_COMPILE_r({
10038         if (!PL_colorset)
10039             reginitcolors();
10040         {
10041             SV *dsv= sv_newmortal();
10042             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
10043                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
10044             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
10045                 PL_colors[4],PL_colors[5],s);
10046         }
10047     });
10048 #ifdef RE_TRACK_PATTERN_OFFSETS
10049     if (ri->u.offsets)
10050         Safefree(ri->u.offsets);             /* 20010421 MJD */
10051 #endif
10052     if (ri->data) {
10053         int n = ri->data->count;
10054         PAD* new_comppad = NULL;
10055         PAD* old_comppad;
10056         PADOFFSET refcnt;
10057
10058         while (--n >= 0) {
10059           /* If you add a ->what type here, update the comment in regcomp.h */
10060             switch (ri->data->what[n]) {
10061             case 'a':
10062             case 's':
10063             case 'S':
10064             case 'u':
10065                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
10066                 break;
10067             case 'f':
10068                 Safefree(ri->data->data[n]);
10069                 break;
10070             case 'p':
10071                 new_comppad = MUTABLE_AV(ri->data->data[n]);
10072                 break;
10073             case 'o':
10074                 if (new_comppad == NULL)
10075                     Perl_croak(aTHX_ "panic: pregfree comppad");
10076                 PAD_SAVE_LOCAL(old_comppad,
10077                     /* Watch out for global destruction's random ordering. */
10078                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
10079                 );
10080                 OP_REFCNT_LOCK;
10081                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
10082                 OP_REFCNT_UNLOCK;
10083                 if (!refcnt)
10084                     op_free((OP_4tree*)ri->data->data[n]);
10085
10086                 PAD_RESTORE_LOCAL(old_comppad);
10087                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
10088                 new_comppad = NULL;
10089                 break;
10090             case 'n':
10091                 break;
10092             case 'T':
10093                 { /* Aho Corasick add-on structure for a trie node.
10094                      Used in stclass optimization only */
10095                     U32 refcount;
10096                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
10097                     OP_REFCNT_LOCK;
10098                     refcount = --aho->refcount;
10099                     OP_REFCNT_UNLOCK;
10100                     if ( !refcount ) {
10101                         PerlMemShared_free(aho->states);
10102                         PerlMemShared_free(aho->fail);
10103                          /* do this last!!!! */
10104                         PerlMemShared_free(ri->data->data[n]);
10105                         PerlMemShared_free(ri->regstclass);
10106                     }
10107                 }
10108                 break;
10109             case 't':
10110                 {
10111                     /* trie structure. */
10112                     U32 refcount;
10113                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
10114                     OP_REFCNT_LOCK;
10115                     refcount = --trie->refcount;
10116                     OP_REFCNT_UNLOCK;
10117                     if ( !refcount ) {
10118                         PerlMemShared_free(trie->charmap);
10119                         PerlMemShared_free(trie->states);
10120                         PerlMemShared_free(trie->trans);
10121                         if (trie->bitmap)
10122                             PerlMemShared_free(trie->bitmap);
10123                         if (trie->jump)
10124                             PerlMemShared_free(trie->jump);
10125                         PerlMemShared_free(trie->wordinfo);
10126                         /* do this last!!!! */
10127                         PerlMemShared_free(ri->data->data[n]);
10128                     }
10129                 }
10130                 break;
10131             default:
10132                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
10133             }
10134         }
10135         Safefree(ri->data->what);
10136         Safefree(ri->data);
10137     }
10138
10139     Safefree(ri);
10140 }
10141
10142 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10143 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
10144 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
10145
10146 /*
10147    re_dup - duplicate a regexp.
10148
10149    This routine is expected to clone a given regexp structure. It is only
10150    compiled under USE_ITHREADS.
10151
10152    After all of the core data stored in struct regexp is duplicated
10153    the regexp_engine.dupe method is used to copy any private data
10154    stored in the *pprivate pointer. This allows extensions to handle
10155    any duplication it needs to do.
10156
10157    See pregfree() and regfree_internal() if you change anything here.
10158 */
10159 #if defined(USE_ITHREADS)
10160 #ifndef PERL_IN_XSUB_RE
10161 void
10162 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
10163 {
10164     dVAR;
10165     I32 npar;
10166     const struct regexp *r = (const struct regexp *)SvANY(sstr);
10167     struct regexp *ret = (struct regexp *)SvANY(dstr);
10168
10169     PERL_ARGS_ASSERT_RE_DUP_GUTS;
10170
10171     npar = r->nparens+1;
10172     Newx(ret->offs, npar, regexp_paren_pair);
10173     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10174     if(ret->swap) {
10175         /* no need to copy these */
10176         Newx(ret->swap, npar, regexp_paren_pair);
10177     }
10178
10179     if (ret->substrs) {
10180         /* Do it this way to avoid reading from *r after the StructCopy().
10181            That way, if any of the sv_dup_inc()s dislodge *r from the L1
10182            cache, it doesn't matter.  */
10183         const bool anchored = r->check_substr
10184             ? r->check_substr == r->anchored_substr
10185             : r->check_utf8 == r->anchored_utf8;
10186         Newx(ret->substrs, 1, struct reg_substr_data);
10187         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10188
10189         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10190         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10191         ret->float_substr = sv_dup_inc(ret->float_substr, param);
10192         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
10193
10194         /* check_substr and check_utf8, if non-NULL, point to either their
10195            anchored or float namesakes, and don't hold a second reference.  */
10196
10197         if (ret->check_substr) {
10198             if (anchored) {
10199                 assert(r->check_utf8 == r->anchored_utf8);
10200                 ret->check_substr = ret->anchored_substr;
10201                 ret->check_utf8 = ret->anchored_utf8;
10202             } else {
10203                 assert(r->check_substr == r->float_substr);
10204                 assert(r->check_utf8 == r->float_utf8);
10205                 ret->check_substr = ret->float_substr;
10206                 ret->check_utf8 = ret->float_utf8;
10207             }
10208         } else if (ret->check_utf8) {
10209             if (anchored) {
10210                 ret->check_utf8 = ret->anchored_utf8;
10211             } else {
10212                 ret->check_utf8 = ret->float_utf8;
10213             }
10214         }
10215     }
10216
10217     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
10218
10219     if (ret->pprivate)
10220         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
10221
10222     if (RX_MATCH_COPIED(dstr))
10223         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
10224     else
10225         ret->subbeg = NULL;
10226 #ifdef PERL_OLD_COPY_ON_WRITE
10227     ret->saved_copy = NULL;
10228 #endif
10229
10230     if (ret->mother_re) {
10231         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10232             /* Our storage points directly to our mother regexp, but that's
10233                1: a buffer in a different thread
10234                2: something we no longer hold a reference on
10235                so we need to copy it locally.  */
10236             /* Note we need to sue SvCUR() on our mother_re, because it, in
10237                turn, may well be pointing to its own mother_re.  */
10238             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10239                                    SvCUR(ret->mother_re)+1));
10240             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10241         }
10242         ret->mother_re      = NULL;
10243     }
10244     ret->gofs = 0;
10245 }
10246 #endif /* PERL_IN_XSUB_RE */
10247
10248 /*
10249    regdupe_internal()
10250
10251    This is the internal complement to regdupe() which is used to copy
10252    the structure pointed to by the *pprivate pointer in the regexp.
10253    This is the core version of the extension overridable cloning hook.
10254    The regexp structure being duplicated will be copied by perl prior
10255    to this and will be provided as the regexp *r argument, however
10256    with the /old/ structures pprivate pointer value. Thus this routine
10257    may override any copying normally done by perl.
10258
10259    It returns a pointer to the new regexp_internal structure.
10260 */
10261
10262 void *
10263 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10264 {
10265     dVAR;
10266     struct regexp *const r = (struct regexp *)SvANY(rx);
10267     regexp_internal *reti;
10268     int len, npar;
10269     RXi_GET_DECL(r,ri);
10270
10271     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10272
10273     npar = r->nparens+1;
10274     len = ProgLen(ri);
10275
10276     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10277     Copy(ri->program, reti->program, len+1, regnode);
10278
10279
10280     reti->regstclass = NULL;
10281
10282     if (ri->data) {
10283         struct reg_data *d;
10284         const int count = ri->data->count;
10285         int i;
10286
10287         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10288                 char, struct reg_data);
10289         Newx(d->what, count, U8);
10290
10291         d->count = count;
10292         for (i = 0; i < count; i++) {
10293             d->what[i] = ri->data->what[i];
10294             switch (d->what[i]) {
10295                 /* legal options are one of: sSfpontTua
10296                    see also regcomp.h and pregfree() */
10297             case 'a': /* actually an AV, but the dup function is identical.  */
10298             case 's':
10299             case 'S':
10300             case 'p': /* actually an AV, but the dup function is identical.  */
10301             case 'u': /* actually an HV, but the dup function is identical.  */
10302                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10303                 break;
10304             case 'f':
10305                 /* This is cheating. */
10306                 Newx(d->data[i], 1, struct regnode_charclass_class);
10307                 StructCopy(ri->data->data[i], d->data[i],
10308                             struct regnode_charclass_class);
10309                 reti->regstclass = (regnode*)d->data[i];
10310                 break;
10311             case 'o':
10312                 /* Compiled op trees are readonly and in shared memory,
10313                    and can thus be shared without duplication. */
10314                 OP_REFCNT_LOCK;
10315                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10316                 OP_REFCNT_UNLOCK;
10317                 break;
10318             case 'T':
10319                 /* Trie stclasses are readonly and can thus be shared
10320                  * without duplication. We free the stclass in pregfree
10321                  * when the corresponding reg_ac_data struct is freed.
10322                  */
10323                 reti->regstclass= ri->regstclass;
10324                 /* Fall through */
10325             case 't':
10326                 OP_REFCNT_LOCK;
10327                 ((reg_trie_data*)ri->data->data[i])->refcount++;
10328                 OP_REFCNT_UNLOCK;
10329                 /* Fall through */
10330             case 'n':
10331                 d->data[i] = ri->data->data[i];
10332                 break;
10333             default:
10334                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10335             }
10336         }
10337
10338         reti->data = d;
10339     }
10340     else
10341         reti->data = NULL;
10342
10343     reti->name_list_idx = ri->name_list_idx;
10344
10345 #ifdef RE_TRACK_PATTERN_OFFSETS
10346     if (ri->u.offsets) {
10347         Newx(reti->u.offsets, 2*len+1, U32);
10348         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10349     }
10350 #else
10351     SetProgLen(reti,len);
10352 #endif
10353
10354     return (void*)reti;
10355 }
10356
10357 #endif    /* USE_ITHREADS */
10358
10359 #ifndef PERL_IN_XSUB_RE
10360
10361 /*
10362  - regnext - dig the "next" pointer out of a node
10363  */
10364 regnode *
10365 Perl_regnext(pTHX_ register regnode *p)
10366 {
10367     dVAR;
10368     register I32 offset;
10369
10370     if (!p)
10371         return(NULL);
10372
10373     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
10374         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10375     }
10376
10377     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10378     if (offset == 0)
10379         return(NULL);
10380
10381     return(p+offset);
10382 }
10383 #endif
10384
10385 STATIC void
10386 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10387 {
10388     va_list args;
10389     STRLEN l1 = strlen(pat1);
10390     STRLEN l2 = strlen(pat2);
10391     char buf[512];
10392     SV *msv;
10393     const char *message;
10394
10395     PERL_ARGS_ASSERT_RE_CROAK2;
10396
10397     if (l1 > 510)
10398         l1 = 510;
10399     if (l1 + l2 > 510)
10400         l2 = 510 - l1;
10401     Copy(pat1, buf, l1 , char);
10402     Copy(pat2, buf + l1, l2 , char);
10403     buf[l1 + l2] = '\n';
10404     buf[l1 + l2 + 1] = '\0';
10405 #ifdef I_STDARG
10406     /* ANSI variant takes additional second argument */
10407     va_start(args, pat2);
10408 #else
10409     va_start(args);
10410 #endif
10411     msv = vmess(buf, &args);
10412     va_end(args);
10413     message = SvPV_const(msv,l1);
10414     if (l1 > 512)
10415         l1 = 512;
10416     Copy(message, buf, l1 , char);
10417     buf[l1-1] = '\0';                   /* Overwrite \n */
10418     Perl_croak(aTHX_ "%s", buf);
10419 }
10420
10421 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
10422
10423 #ifndef PERL_IN_XSUB_RE
10424 void
10425 Perl_save_re_context(pTHX)
10426 {
10427     dVAR;
10428
10429     struct re_save_state *state;
10430
10431     SAVEVPTR(PL_curcop);
10432     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10433
10434     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10435     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10436     SSPUSHUV(SAVEt_RE_STATE);
10437
10438     Copy(&PL_reg_state, state, 1, struct re_save_state);
10439
10440     PL_reg_start_tmp = 0;
10441     PL_reg_start_tmpl = 0;
10442     PL_reg_oldsaved = NULL;
10443     PL_reg_oldsavedlen = 0;
10444     PL_reg_maxiter = 0;
10445     PL_reg_leftiter = 0;
10446     PL_reg_poscache = NULL;
10447     PL_reg_poscache_size = 0;
10448 #ifdef PERL_OLD_COPY_ON_WRITE
10449     PL_nrs = NULL;
10450 #endif
10451
10452     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10453     if (PL_curpm) {
10454         const REGEXP * const rx = PM_GETRE(PL_curpm);
10455         if (rx) {
10456             U32 i;
10457             for (i = 1; i <= RX_NPARENS(rx); i++) {
10458                 char digits[TYPE_CHARS(long)];
10459                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10460                 GV *const *const gvp
10461                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10462
10463                 if (gvp) {
10464                     GV * const gv = *gvp;
10465                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10466                         save_scalar(gv);
10467                 }
10468             }
10469         }
10470     }
10471 }
10472 #endif
10473
10474 static void
10475 clear_re(pTHX_ void *r)
10476 {
10477     dVAR;
10478     ReREFCNT_dec((REGEXP *)r);
10479 }
10480
10481 #ifdef DEBUGGING
10482
10483 STATIC void
10484 S_put_byte(pTHX_ SV *sv, int c)
10485 {
10486     PERL_ARGS_ASSERT_PUT_BYTE;
10487
10488     /* Our definition of isPRINT() ignores locales, so only bytes that are
10489        not part of UTF-8 are considered printable. I assume that the same
10490        holds for UTF-EBCDIC.
10491        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10492        which Wikipedia says:
10493
10494        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10495        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10496        identical, to the ASCII delete (DEL) or rubout control character.
10497        ) So the old condition can be simplified to !isPRINT(c)  */
10498     if (!isPRINT(c)) {
10499         if (c < 256) {
10500             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10501         }
10502         else {
10503             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10504         }
10505     }
10506     else {
10507         const char string = c;
10508         if (c == '-' || c == ']' || c == '\\' || c == '^')
10509             sv_catpvs(sv, "\\");
10510         sv_catpvn(sv, &string, 1);
10511     }
10512 }
10513
10514
10515 #define CLEAR_OPTSTART \
10516     if (optstart) STMT_START { \
10517             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10518             optstart=NULL; \
10519     } STMT_END
10520
10521 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10522
10523 STATIC const regnode *
10524 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10525             const regnode *last, const regnode *plast,
10526             SV* sv, I32 indent, U32 depth)
10527 {
10528     dVAR;
10529     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
10530     register const regnode *next;
10531     const regnode *optstart= NULL;
10532
10533     RXi_GET_DECL(r,ri);
10534     GET_RE_DEBUG_FLAGS_DECL;
10535
10536     PERL_ARGS_ASSERT_DUMPUNTIL;
10537
10538 #ifdef DEBUG_DUMPUNTIL
10539     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10540         last ? last-start : 0,plast ? plast-start : 0);
10541 #endif
10542
10543     if (plast && plast < last)
10544         last= plast;
10545
10546     while (PL_regkind[op] != END && (!last || node < last)) {
10547         /* While that wasn't END last time... */
10548         NODE_ALIGN(node);
10549         op = OP(node);
10550         if (op == CLOSE || op == WHILEM)
10551             indent--;
10552         next = regnext((regnode *)node);
10553
10554         /* Where, what. */
10555         if (OP(node) == OPTIMIZED) {
10556             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10557                 optstart = node;
10558             else
10559                 goto after_print;
10560         } else
10561             CLEAR_OPTSTART;
10562
10563         regprop(r, sv, node);
10564         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10565                       (int)(2*indent + 1), "", SvPVX_const(sv));
10566
10567         if (OP(node) != OPTIMIZED) {
10568             if (next == NULL)           /* Next ptr. */
10569                 PerlIO_printf(Perl_debug_log, " (0)");
10570             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10571                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10572             else
10573                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10574             (void)PerlIO_putc(Perl_debug_log, '\n');
10575         }
10576
10577       after_print:
10578         if (PL_regkind[(U8)op] == BRANCHJ) {
10579             assert(next);
10580             {
10581                 register const regnode *nnode = (OP(next) == LONGJMP
10582                                              ? regnext((regnode *)next)
10583                                              : next);
10584                 if (last && nnode > last)
10585                     nnode = last;
10586                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10587             }
10588         }
10589         else if (PL_regkind[(U8)op] == BRANCH) {
10590             assert(next);
10591             DUMPUNTIL(NEXTOPER(node), next);
10592         }
10593         else if ( PL_regkind[(U8)op]  == TRIE ) {
10594             const regnode *this_trie = node;
10595             const char op = OP(node);
10596             const U32 n = ARG(node);
10597             const reg_ac_data * const ac = op>=AHOCORASICK ?
10598                (reg_ac_data *)ri->data->data[n] :
10599                NULL;
10600             const reg_trie_data * const trie =
10601                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10602 #ifdef DEBUGGING
10603             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10604 #endif
10605             const regnode *nextbranch= NULL;
10606             I32 word_idx;
10607             sv_setpvs(sv, "");
10608             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10609                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10610
10611                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10612                    (int)(2*(indent+3)), "",
10613                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10614                             PL_colors[0], PL_colors[1],
10615                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10616                             PERL_PV_PRETTY_ELLIPSES    |
10617                             PERL_PV_PRETTY_LTGT
10618                             )
10619                             : "???"
10620                 );
10621                 if (trie->jump) {
10622                     U16 dist= trie->jump[word_idx+1];
10623                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10624                                   (UV)((dist ? this_trie + dist : next) - start));
10625                     if (dist) {
10626                         if (!nextbranch)
10627                             nextbranch= this_trie + trie->jump[0];
10628                         DUMPUNTIL(this_trie + dist, nextbranch);
10629                     }
10630                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10631                         nextbranch= regnext((regnode *)nextbranch);
10632                 } else {
10633                     PerlIO_printf(Perl_debug_log, "\n");
10634                 }
10635             }
10636             if (last && next > last)
10637                 node= last;
10638             else
10639                 node= next;
10640         }
10641         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10642             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10643                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10644         }
10645         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10646             assert(next);
10647             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10648         }
10649         else if ( op == PLUS || op == STAR) {
10650             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10651         }
10652         else if (op == ANYOF) {
10653             /* arglen 1 + class block */
10654             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
10655                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10656             node = NEXTOPER(node);
10657         }
10658         else if (PL_regkind[(U8)op] == EXACT) {
10659             /* Literal string, where present. */
10660             node += NODE_SZ_STR(node) - 1;
10661             node = NEXTOPER(node);
10662         }
10663         else {
10664             node = NEXTOPER(node);
10665             node += regarglen[(U8)op];
10666         }
10667         if (op == CURLYX || op == OPEN)
10668             indent++;
10669     }
10670     CLEAR_OPTSTART;
10671 #ifdef DEBUG_DUMPUNTIL
10672     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10673 #endif
10674     return node;
10675 }
10676
10677 #endif  /* DEBUGGING */
10678
10679 /*
10680  * Local variables:
10681  * c-indentation-style: bsd
10682  * c-basic-offset: 4
10683  * indent-tabs-mode: t
10684  * End:
10685  *
10686  * ex: set ts=8 sts=4 sw=4 noet:
10687  */