src/5013001/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to  pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #ifdef op
  89 #undef op
  90 #endif /* op */
  91
  92 #ifdef MSDOS
  93 #  if defined(BUGGY_MSC6)
  94  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  95 #    pragma optimize("a",off)
  96  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  97 #    pragma optimize("w",on )
  98 #  endif /* BUGGY_MSC6 */
  99 #endif /* MSDOS */
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105 typedef struct RExC_state_t {
 106     U32         flags;                  /* are we folding, multilining? */
 107     char        *precomp;               /* uncompiled string. */
 108     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 109     regexp      *rx;                    /* perl core regexp structure */
 110     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 111     char        *start;                 /* Start of input for compile */
 112     char        *end;                   /* End of input for compile */
 113     char        *parse;                 /* Input-scan pointer. */
 114     I32         whilem_seen;            /* number of WHILEM in this expr */
 115     regnode     *emit_start;            /* Start of emitted-code area */
 116     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 117     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 118     I32         naughty;                /* How bad is this pattern? */
 119     I32         sawback;                /* Did we see \1, ...? */
 120     U32         seen;
 121     I32         size;                   /* Code size. */
 122     I32         npar;                   /* Capture buffer count, (OPEN). */
 123     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 124     I32         nestroot;               /* root parens we are in - used by accept */
 125     I32         extralen;
 126     I32         seen_zerolen;
 127     I32         seen_evals;
 128     regnode     **open_parens;          /* pointers to open parens */
 129     regnode     **close_parens;         /* pointers to close parens */
 130     regnode     *opend;                 /* END node in program */
 131     I32         utf8;           /* whether the pattern is utf8 or not */
 132     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 133                                 /* XXX use this for future optimisation of case
 134                                  * where pattern must be upgraded to utf8. */
 135     HV          *paren_names;           /* Paren names */
 136
 137     regnode     **recurse;              /* Recurse regops */
 138     I32         recurse_count;          /* Number of recurse regops */
 139 #if ADD_TO_REGEXEC
 140     char        *starttry;              /* -Dr: where regtry was called. */
 141 #define RExC_starttry   (pRExC_state->starttry)
 142 #endif
 143 #ifdef DEBUGGING
 144     const char  *lastparse;
 145     I32         lastnum;
 146     AV          *paren_name_list;       /* idx -> name */
 147 #define RExC_lastparse  (pRExC_state->lastparse)
 148 #define RExC_lastnum    (pRExC_state->lastnum)
 149 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 150 #endif
 151 } RExC_state_t;
 152
 153 #define RExC_flags      (pRExC_state->flags)
 154 #define RExC_precomp    (pRExC_state->precomp)
 155 #define RExC_rx_sv      (pRExC_state->rx_sv)
 156 #define RExC_rx         (pRExC_state->rx)
 157 #define RExC_rxi        (pRExC_state->rxi)
 158 #define RExC_start      (pRExC_state->start)
 159 #define RExC_end        (pRExC_state->end)
 160 #define RExC_parse      (pRExC_state->parse)
 161 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 162 #ifdef RE_TRACK_PATTERN_OFFSETS
 163 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 164 #endif
 165 #define RExC_emit       (pRExC_state->emit)
 166 #define RExC_emit_start (pRExC_state->emit_start)
 167 #define RExC_emit_bound (pRExC_state->emit_bound)
 168 #define RExC_naughty    (pRExC_state->naughty)
 169 #define RExC_sawback    (pRExC_state->sawback)
 170 #define RExC_seen       (pRExC_state->seen)
 171 #define RExC_size       (pRExC_state->size)
 172 #define RExC_npar       (pRExC_state->npar)
 173 #define RExC_nestroot   (pRExC_state->nestroot)
 174 #define RExC_extralen   (pRExC_state->extralen)
 175 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 176 #define RExC_seen_evals (pRExC_state->seen_evals)
 177 #define RExC_utf8       (pRExC_state->utf8)
 178 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 179 #define RExC_open_parens        (pRExC_state->open_parens)
 180 #define RExC_close_parens       (pRExC_state->close_parens)
 181 #define RExC_opend      (pRExC_state->opend)
 182 #define RExC_paren_names        (pRExC_state->paren_names)
 183 #define RExC_recurse    (pRExC_state->recurse)
 184 #define RExC_recurse_count      (pRExC_state->recurse_count)
 185
 186
 187 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 188 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 189         ((*s) == '{' && regcurly(s)))
 190
 191 #ifdef SPSTART
 192 #undef SPSTART          /* dratted cpp namespace... */
 193 #endif
 194 /*
 195  * Flags to be passed up and down.
 196  */
 197 #define WORST           0       /* Worst case. */
 198 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 199 #define SIMPLE          0x02    /* Simple enough to be STAR/PLUS operand. */
 200 #define SPSTART         0x04    /* Starts with * or +. */
 201 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 202 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 203
 204 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 205
 206 /* whether trie related optimizations are enabled */
 207 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 208 #define TRIE_STUDY_OPT
 209 #define FULL_TRIE_STUDY
 210 #define TRIE_STCLASS
 211 #endif
 212
 213
 214
 215 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 216 #define PBITVAL(paren) (1 << ((paren) & 7))
 217 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 218 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 219 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 220
 221
 222 /* About scan_data_t.
 223
 224   During optimisation we recurse through the regexp program performing
 225   various inplace (keyhole style) optimisations. In addition study_chunk
 226   and scan_commit populate this data structure with information about
 227   what strings MUST appear in the pattern. We look for the longest
 228   string that must appear for at a fixed location, and we look for the
 229   longest string that may appear at a floating location. So for instance
 230   in the pattern:
 231
 232     /FOO[xX]A.*B[xX]BAR/
 233
 234   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 235   strings (because they follow a .* construct). study_chunk will identify
 236   both FOO and BAR as being the longest fixed and floating strings respectively.
 237
 238   The strings can be composites, for instance
 239
 240      /(f)(o)(o)/
 241
 242   will result in a composite fixed substring 'foo'.
 243
 244   For each string some basic information is maintained:
 245
 246   - offset or min_offset
 247     This is the position the string must appear at, or not before.
 248     It also implicitly (when combined with minlenp) tells us how many
 249     character must match before the string we are searching.
 250     Likewise when combined with minlenp and the length of the string
 251     tells us how many characters must appear after the string we have
 252     found.
 253
 254   - max_offset
 255     Only used for floating strings. This is the rightmost point that
 256     the string can appear at. Ifset to I32 max it indicates that the
 257     string can occur infinitely far to the right.
 258
 259   - minlenp
 260     A pointer to the minimum length of the pattern that the string
 261     was found inside. This is important as in the case of positive
 262     lookahead or positive lookbehind we can have multiple patterns
 263     involved. Consider
 264
 265     /(?=FOO).*F/
 266
 267     The minimum length of the pattern overall is 3, the minimum length
 268     of the lookahead part is 3, but the minimum length of the part that
 269     will actually match is 1. So 'FOO's minimum length is 3, but the
 270     minimum length for the F is 1. This is important as the minimum length
 271     is used to determine offsets in front of and behind the string being
 272     looked for.  Since strings can be composites this is the length of the
 273     pattern at the time it was commited with a scan_commit. Note that
 274     the length is calculated by study_chunk, so that the minimum lengths
 275     are not known until the full pattern has been compiled, thus the
 276     pointer to the value.
 277
 278   - lookbehind
 279
 280     In the case of lookbehind the string being searched for can be
 281     offset past the start point of the final matching string.
 282     If this value was just blithely removed from the min_offset it would
 283     invalidate some of the calculations for how many chars must match
 284     before or after (as they are derived from min_offset and minlen and
 285     the length of the string being searched for).
 286     When the final pattern is compiled and the data is moved from the
 287     scan_data_t structure into the regexp structure the information
 288     about lookbehind is factored in, with the information that would
 289     have been lost precalculated in the end_shift field for the
 290     associated string.
 291
 292   The fields pos_min and pos_delta are used to store the minimum offset
 293   and the delta to the maximum offset at the current point in the pattern.
 294
 295 */
 296
 297 typedef struct scan_data_t {
 298     /*I32 len_min;      unused */
 299     /*I32 len_delta;    unused */
 300     I32 pos_min;
 301     I32 pos_delta;
 302     SV *last_found;
 303     I32 last_end;           /* min value, <0 unless valid. */
 304     I32 last_start_min;
 305     I32 last_start_max;
 306     SV **longest;           /* Either &l_fixed, or &l_float. */
 307     SV *longest_fixed;      /* longest fixed string found in pattern */
 308     I32 offset_fixed;       /* offset where it starts */
 309     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 310     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 311     SV *longest_float;      /* longest floating string found in pattern */
 312     I32 offset_float_min;   /* earliest point in string it can appear */
 313     I32 offset_float_max;   /* latest point in string it can appear */
 314     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 315     I32 lookbehind_float;   /* is the position of the string modified by LB */
 316     I32 flags;
 317     I32 whilem_c;
 318     I32 *last_closep;
 319     struct regnode_charclass_class *start_class;
 320 } scan_data_t;
 321
 322 /*
 323  * Forward declarations for pregcomp()'s friends.
 324  */
 325
 326 static const scan_data_t zero_scan_data =
 327   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 328
 329 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 330 #define SF_BEFORE_SEOL          0x0001
 331 #define SF_BEFORE_MEOL          0x0002
 332 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 333 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 334
 335 #ifdef NO_UNARY_PLUS
 336 #  define SF_FIX_SHIFT_EOL      (0+2)
 337 #  define SF_FL_SHIFT_EOL               (0+4)
 338 #else
 339 #  define SF_FIX_SHIFT_EOL      (+2)
 340 #  define SF_FL_SHIFT_EOL               (+4)
 341 #endif
 342
 343 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 344 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 345
 346 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 347 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 348 #define SF_IS_INF               0x0040
 349 #define SF_HAS_PAR              0x0080
 350 #define SF_IN_PAR               0x0100
 351 #define SF_HAS_EVAL             0x0200
 352 #define SCF_DO_SUBSTR           0x0400
 353 #define SCF_DO_STCLASS_AND      0x0800
 354 #define SCF_DO_STCLASS_OR       0x1000
 355 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 356 #define SCF_WHILEM_VISITED_POS  0x2000
 357
 358 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 359 #define SCF_SEEN_ACCEPT         0x8000
 360
 361 #define UTF (RExC_utf8 != 0)
 362 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
 363 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
 364
 365 #define OOB_UNICODE             12345678
 366 #define OOB_NAMEDCLASS          -1
 367
 368 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 369 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 370
 371
 372 /* length of regex to show in messages that don't mark a position within */
 373 #define RegexLengthToShowInErrorMessages 127
 374
 375 /*
 376  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 377  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 378  * op/pragma/warn/regcomp.
 379  */
 380 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 381 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 382
 383 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 384
 385 /*
 386  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 387  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 388  * "...".
 389  */
 390 #define _FAIL(code) STMT_START {                                        \
 391     const char *ellipses = "";                                          \
 392     IV len = RExC_end - RExC_precomp;                                   \
 393                                                                         \
 394     if (!SIZE_ONLY)                                                     \
 395         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 396     if (len > RegexLengthToShowInErrorMessages) {                       \
 397         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 398         len = RegexLengthToShowInErrorMessages - 10;                    \
 399         ellipses = "...";                                               \
 400     }                                                                   \
 401     code;                                                               \
 402 } STMT_END
 403
 404 #define FAIL(msg) _FAIL(                            \
 405     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 406             msg, (int)len, RExC_precomp, ellipses))
 407
 408 #define FAIL2(msg,arg) _FAIL(                       \
 409     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 410             arg, (int)len, RExC_precomp, ellipses))
 411
 412 /*
 413  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 414  */
 415 #define Simple_vFAIL(m) STMT_START {                                    \
 416     const IV offset = RExC_parse - RExC_precomp;                        \
 417     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 418             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 419 } STMT_END
 420
 421 /*
 422  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 423  */
 424 #define vFAIL(m) STMT_START {                           \
 425     if (!SIZE_ONLY)                                     \
 426         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 427     Simple_vFAIL(m);                                    \
 428 } STMT_END
 429
 430 /*
 431  * Like Simple_vFAIL(), but accepts two arguments.
 432  */
 433 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 434     const IV offset = RExC_parse - RExC_precomp;                        \
 435     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 436             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 437 } STMT_END
 438
 439 /*
 440  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 441  */
 442 #define vFAIL2(m,a1) STMT_START {                       \
 443     if (!SIZE_ONLY)                                     \
 444         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 445     Simple_vFAIL2(m, a1);                               \
 446 } STMT_END
 447
 448
 449 /*
 450  * Like Simple_vFAIL(), but accepts three arguments.
 451  */
 452 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 453     const IV offset = RExC_parse - RExC_precomp;                \
 454     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 455             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 456 } STMT_END
 457
 458 /*
 459  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 460  */
 461 #define vFAIL3(m,a1,a2) STMT_START {                    \
 462     if (!SIZE_ONLY)                                     \
 463         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 464     Simple_vFAIL3(m, a1, a2);                           \
 465 } STMT_END
 466
 467 /*
 468  * Like Simple_vFAIL(), but accepts four arguments.
 469  */
 470 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 471     const IV offset = RExC_parse - RExC_precomp;                \
 472     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 473             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 474 } STMT_END
 475
 476 #define ckWARNreg(loc,m) STMT_START {                                   \
 477     const IV offset = loc - RExC_precomp;                               \
 478     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 479             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 480 } STMT_END
 481
 482 #define ckWARNregdep(loc,m) STMT_START {                                \
 483     const IV offset = loc - RExC_precomp;                               \
 484     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 485             m REPORT_LOCATION,                                          \
 486             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 487 } STMT_END
 488
 489 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 490     const IV offset = loc - RExC_precomp;                               \
 491     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 492             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 493 } STMT_END
 494
 495 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 496     const IV offset = loc - RExC_precomp;                               \
 497     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 498             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 499 } STMT_END
 500
 501 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 502     const IV offset = loc - RExC_precomp;                               \
 503     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 504             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 505 } STMT_END
 506
 507 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 508     const IV offset = loc - RExC_precomp;                               \
 509     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 510             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 511 } STMT_END
 512
 513 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 514     const IV offset = loc - RExC_precomp;                               \
 515     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 516             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 517 } STMT_END
 518
 519 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 520     const IV offset = loc - RExC_precomp;                               \
 521     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 522             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 523 } STMT_END
 524
 525
 526 /* Allow for side effects in s */
 527 #define REGC(c,s) STMT_START {                  \
 528     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 529 } STMT_END
 530
 531 /* Macros for recording node offsets.   20001227 mjd@plover.com
 532  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 533  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 534  * Element 0 holds the number n.
 535  * Position is 1 indexed.
 536  */
 537 #ifndef RE_TRACK_PATTERN_OFFSETS
 538 #define Set_Node_Offset_To_R(node,byte)
 539 #define Set_Node_Offset(node,byte)
 540 #define Set_Cur_Node_Offset
 541 #define Set_Node_Length_To_R(node,len)
 542 #define Set_Node_Length(node,len)
 543 #define Set_Node_Cur_Length(node)
 544 #define Node_Offset(n)
 545 #define Node_Length(n)
 546 #define Set_Node_Offset_Length(node,offset,len)
 547 #define ProgLen(ri) ri->u.proglen
 548 #define SetProgLen(ri,x) ri->u.proglen = x
 549 #else
 550 #define ProgLen(ri) ri->u.offsets[0]
 551 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 552 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 553     if (! SIZE_ONLY) {                                                  \
 554         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 555                     __LINE__, (int)(node), (int)(byte)));               \
 556         if((node) < 0) {                                                \
 557             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 558         } else {                                                        \
 559             RExC_offsets[2*(node)-1] = (byte);                          \
 560         }                                                               \
 561     }                                                                   \
 562 } STMT_END
 563
 564 #define Set_Node_Offset(node,byte) \
 565     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 566 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 567
 568 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 569     if (! SIZE_ONLY) {                                                  \
 570         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 571                 __LINE__, (int)(node), (int)(len)));                    \
 572         if((node) < 0) {                                                \
 573             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 574         } else {                                                        \
 575             RExC_offsets[2*(node)] = (len);                             \
 576         }                                                               \
 577     }                                                                   \
 578 } STMT_END
 579
 580 #define Set_Node_Length(node,len) \
 581     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 582 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 583 #define Set_Node_Cur_Length(node) \
 584     Set_Node_Length(node, RExC_parse - parse_start)
 585
 586 /* Get offsets and lengths */
 587 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 588 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 589
 590 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 591     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 592     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 593 } STMT_END
 594 #endif
 595
 596 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 597 #define EXPERIMENTAL_INPLACESCAN
 598 #endif /*RE_TRACK_PATTERN_OFFSETS*/
 599
 600 #define DEBUG_STUDYDATA(str,data,depth)                              \
 601 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 602     PerlIO_printf(Perl_debug_log,                                    \
 603         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 604         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 605         (int)(depth)*2, "",                                          \
 606         (IV)((data)->pos_min),                                       \
 607         (IV)((data)->pos_delta),                                     \
 608         (UV)((data)->flags),                                         \
 609         (IV)((data)->whilem_c),                                      \
 610         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 611         is_inf ? "INF " : ""                                         \
 612     );                                                               \
 613     if ((data)->last_found)                                          \
 614         PerlIO_printf(Perl_debug_log,                                \
 615             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 616             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 617             SvPVX_const((data)->last_found),                         \
 618             (IV)((data)->last_end),                                  \
 619             (IV)((data)->last_start_min),                            \
 620             (IV)((data)->last_start_max),                            \
 621             ((data)->longest &&                                      \
 622              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 623             SvPVX_const((data)->longest_fixed),                      \
 624             (IV)((data)->offset_fixed),                              \
 625             ((data)->longest &&                                      \
 626              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 627             SvPVX_const((data)->longest_float),                      \
 628             (IV)((data)->offset_float_min),                          \
 629             (IV)((data)->offset_float_max)                           \
 630         );                                                           \
 631     PerlIO_printf(Perl_debug_log,"\n");                              \
 632 });
 633
 634 static void clear_re(pTHX_ void *r);
 635
 636 /* Mark that we cannot extend a found fixed substring at this point.
 637    Update the longest found anchored substring and the longest found
 638    floating substrings if needed. */
 639
 640 STATIC void
 641 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 642 {
 643     const STRLEN l = CHR_SVLEN(data->last_found);
 644     const STRLEN old_l = CHR_SVLEN(*data->longest);
 645     GET_RE_DEBUG_FLAGS_DECL;
 646
 647     PERL_ARGS_ASSERT_SCAN_COMMIT;
 648
 649     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 650         SvSetMagicSV(*data->longest, data->last_found);
 651         if (*data->longest == data->longest_fixed) {
 652             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 653             if (data->flags & SF_BEFORE_EOL)
 654                 data->flags
 655                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 656             else
 657                 data->flags &= ~SF_FIX_BEFORE_EOL;
 658             data->minlen_fixed=minlenp;
 659             data->lookbehind_fixed=0;
 660         }
 661         else { /* *data->longest == data->longest_float */
 662             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 663             data->offset_float_max = (l
 664                                       ? data->last_start_max
 665                                       : data->pos_min + data->pos_delta);
 666             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 667                 data->offset_float_max = I32_MAX;
 668             if (data->flags & SF_BEFORE_EOL)
 669                 data->flags
 670                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 671             else
 672                 data->flags &= ~SF_FL_BEFORE_EOL;
 673             data->minlen_float=minlenp;
 674             data->lookbehind_float=0;
 675         }
 676     }
 677     SvCUR_set(data->last_found, 0);
 678     {
 679         SV * const sv = data->last_found;
 680         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 681             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 682             if (mg)
 683                 mg->mg_len = 0;
 684         }
 685     }
 686     data->last_end = -1;
 687     data->flags &= ~SF_BEFORE_EOL;
 688     DEBUG_STUDYDATA("commit: ",data,0);
 689 }
 690
 691 /* Can match anything (initialization) */
 692 STATIC void
 693 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 694 {
 695     PERL_ARGS_ASSERT_CL_ANYTHING;
 696
 697     ANYOF_CLASS_ZERO(cl);
 698     ANYOF_BITMAP_SETALL(cl);
 699     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 700     if (LOC)
 701         cl->flags |= ANYOF_LOCALE;
 702 }
 703
 704 /* Can match anything (initialization) */
 705 STATIC int
 706 S_cl_is_anything(const struct regnode_charclass_class *cl)
 707 {
 708     int value;
 709
 710     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 711
 712     for (value = 0; value <= ANYOF_MAX; value += 2)
 713         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 714             return 1;
 715     if (!(cl->flags & ANYOF_UNICODE_ALL))
 716         return 0;
 717     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 718         return 0;
 719     return 1;
 720 }
 721
 722 /* Can match anything (initialization) */
 723 STATIC void
 724 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 725 {
 726     PERL_ARGS_ASSERT_CL_INIT;
 727
 728     Zero(cl, 1, struct regnode_charclass_class);
 729     cl->type = ANYOF;
 730     cl_anything(pRExC_state, cl);
 731 }
 732
 733 STATIC void
 734 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 735 {
 736     PERL_ARGS_ASSERT_CL_INIT_ZERO;
 737
 738     Zero(cl, 1, struct regnode_charclass_class);
 739     cl->type = ANYOF;
 740     cl_anything(pRExC_state, cl);
 741     if (LOC)
 742         cl->flags |= ANYOF_LOCALE;
 743 }
 744
 745 /* 'And' a given class with another one.  Can create false positives */
 746 /* We assume that cl is not inverted */
 747 STATIC void
 748 S_cl_and(struct regnode_charclass_class *cl,
 749         const struct regnode_charclass_class *and_with)
 750 {
 751     PERL_ARGS_ASSERT_CL_AND;
 752
 753     assert(and_with->type == ANYOF);
 754     if (!(and_with->flags & ANYOF_CLASS)
 755         && !(cl->flags & ANYOF_CLASS)
 756         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 757         && !(and_with->flags & ANYOF_FOLD)
 758         && !(cl->flags & ANYOF_FOLD)) {
 759         int i;
 760
 761         if (and_with->flags & ANYOF_INVERT)
 762             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 763                 cl->bitmap[i] &= ~and_with->bitmap[i];
 764         else
 765             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 766                 cl->bitmap[i] &= and_with->bitmap[i];
 767     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 768     if (!(and_with->flags & ANYOF_EOS))
 769         cl->flags &= ~ANYOF_EOS;
 770
 771     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
 772         !(and_with->flags & ANYOF_INVERT)) {
 773         cl->flags &= ~ANYOF_UNICODE_ALL;
 774         cl->flags |= ANYOF_UNICODE;
 775         ARG_SET(cl, ARG(and_with));
 776     }
 777     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 778         !(and_with->flags & ANYOF_INVERT))
 779         cl->flags &= ~ANYOF_UNICODE_ALL;
 780     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
 781         !(and_with->flags & ANYOF_INVERT))
 782         cl->flags &= ~ANYOF_UNICODE;
 783 }
 784
 785 /* 'OR' a given class with another one.  Can create false positives */
 786 /* We assume that cl is not inverted */
 787 STATIC void
 788 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 789 {
 790     PERL_ARGS_ASSERT_CL_OR;
 791
 792     if (or_with->flags & ANYOF_INVERT) {
 793         /* We do not use
 794          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 795          *   <= (B1 | !B2) | (CL1 | !CL2)
 796          * which is wasteful if CL2 is small, but we ignore CL2:
 797          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 798          * XXXX Can we handle case-fold?  Unclear:
 799          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 800          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 801          */
 802         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 803              && !(or_with->flags & ANYOF_FOLD)
 804              && !(cl->flags & ANYOF_FOLD) ) {
 805             int i;
 806
 807             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 808                 cl->bitmap[i] |= ~or_with->bitmap[i];
 809         } /* XXXX: logic is complicated otherwise */
 810         else {
 811             cl_anything(pRExC_state, cl);
 812         }
 813     } else {
 814         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 815         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 816              && (!(or_with->flags & ANYOF_FOLD)
 817                  || (cl->flags & ANYOF_FOLD)) ) {
 818             int i;
 819
 820             /* OR char bitmap and class bitmap separately */
 821             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 822                 cl->bitmap[i] |= or_with->bitmap[i];
 823             if (or_with->flags & ANYOF_CLASS) {
 824                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 825                     cl->classflags[i] |= or_with->classflags[i];
 826                 cl->flags |= ANYOF_CLASS;
 827             }
 828         }
 829         else { /* XXXX: logic is complicated, leave it along for a moment. */
 830             cl_anything(pRExC_state, cl);
 831         }
 832     }
 833     if (or_with->flags & ANYOF_EOS)
 834         cl->flags |= ANYOF_EOS;
 835
 836     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
 837         ARG(cl) != ARG(or_with)) {
 838         cl->flags |= ANYOF_UNICODE_ALL;
 839         cl->flags &= ~ANYOF_UNICODE;
 840     }
 841     if (or_with->flags & ANYOF_UNICODE_ALL) {
 842         cl->flags |= ANYOF_UNICODE_ALL;
 843         cl->flags &= ~ANYOF_UNICODE;
 844     }
 845 }
 846
 847 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 848 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 849 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 850 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 851
 852
 853 #ifdef DEBUGGING
 854 /*
 855    dump_trie(trie,widecharmap,revcharmap)
 856    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 857    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 858
 859    These routines dump out a trie in a somewhat readable format.
 860    The _interim_ variants are used for debugging the interim
 861    tables that are used to generate the final compressed
 862    representation which is what dump_trie expects.
 863
 864    Part of the reason for their existance is to provide a form
 865    of documentation as to how the different representations function.
 866
 867 */
 868
 869 /*
 870   Dumps the final compressed table form of the trie to Perl_debug_log.
 871   Used for debugging make_trie().
 872 */
 873
 874 STATIC void
 875 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 876             AV *revcharmap, U32 depth)
 877 {
 878     U32 state;
 879     SV *sv=sv_newmortal();
 880     int colwidth= widecharmap ? 6 : 4;
 881     U16 word;
 882     GET_RE_DEBUG_FLAGS_DECL;
 883
 884     PERL_ARGS_ASSERT_DUMP_TRIE;
 885
 886     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 887         (int)depth * 2 + 2,"",
 888         "Match","Base","Ofs" );
 889
 890     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 891         SV ** const tmp = av_fetch( revcharmap, state, 0);
 892         if ( tmp ) {
 893             PerlIO_printf( Perl_debug_log, "%*s",
 894                 colwidth,
 895                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 896                             PL_colors[0], PL_colors[1],
 897                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 898                             PERL_PV_ESCAPE_FIRSTCHAR
 899                 )
 900             );
 901         }
 902     }
 903     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 904         (int)depth * 2 + 2,"");
 905
 906     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 907         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 908     PerlIO_printf( Perl_debug_log, "\n");
 909
 910     for( state = 1 ; state < trie->statecount ; state++ ) {
 911         const U32 base = trie->states[ state ].trans.base;
 912
 913         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 914
 915         if ( trie->states[ state ].wordnum ) {
 916             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 917         } else {
 918             PerlIO_printf( Perl_debug_log, "%6s", "" );
 919         }
 920
 921         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 922
 923         if ( base ) {
 924             U32 ofs = 0;
 925
 926             while( ( base + ofs  < trie->uniquecharcount ) ||
 927                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 928                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 929                     ofs++;
 930
 931             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 932
 933             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 934                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 935                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 936                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 937                 {
 938                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 939                     colwidth,
 940                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 941                 } else {
 942                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 943                 }
 944             }
 945
 946             PerlIO_printf( Perl_debug_log, "]");
 947
 948         }
 949         PerlIO_printf( Perl_debug_log, "\n" );
 950     }
 951     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
 952     for (word=1; word <= trie->wordcount; word++) {
 953         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
 954             (int)word, (int)(trie->wordinfo[word].prev),
 955             (int)(trie->wordinfo[word].len));
 956     }
 957     PerlIO_printf(Perl_debug_log, "\n" );
 958 }
 959 /*
 960   Dumps a fully constructed but uncompressed trie in list form.
 961   List tries normally only are used for construction when the number of
 962   possible chars (trie->uniquecharcount) is very high.
 963   Used for debugging make_trie().
 964 */
 965 STATIC void
 966 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 967                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
 968                          U32 depth)
 969 {
 970     U32 state;
 971     SV *sv=sv_newmortal();
 972     int colwidth= widecharmap ? 6 : 4;
 973     GET_RE_DEBUG_FLAGS_DECL;
 974
 975     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 976
 977     /* print out the table precompression.  */
 978     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
 979         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
 980         "------:-----+-----------------\n" );
 981
 982     for( state=1 ; state < next_alloc ; state ++ ) {
 983         U16 charid;
 984
 985         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
 986             (int)depth * 2 + 2,"", (UV)state  );
 987         if ( ! trie->states[ state ].wordnum ) {
 988             PerlIO_printf( Perl_debug_log, "%5s| ","");
 989         } else {
 990             PerlIO_printf( Perl_debug_log, "W%4x| ",
 991                 trie->states[ state ].wordnum
 992             );
 993         }
 994         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
 995             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
 996             if ( tmp ) {
 997                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
 998                     colwidth,
 999                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1000                             PL_colors[0], PL_colors[1],
1001                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1002                             PERL_PV_ESCAPE_FIRSTCHAR
1003                     ) ,
1004                     TRIE_LIST_ITEM(state,charid).forid,
1005                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1006                 );
1007                 if (!(charid % 10))
1008                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1009                         (int)((depth * 2) + 14), "");
1010             }
1011         }
1012         PerlIO_printf( Perl_debug_log, "\n");
1013     }
1014 }
1015
1016 /*
1017   Dumps a fully constructed but uncompressed trie in table form.
1018   This is the normal DFA style state transition table, with a few
1019   twists to facilitate compression later.
1020   Used for debugging make_trie().
1021 */
1022 STATIC void
1023 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1024                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1025                           U32 depth)
1026 {
1027     U32 state;
1028     U16 charid;
1029     SV *sv=sv_newmortal();
1030     int colwidth= widecharmap ? 6 : 4;
1031     GET_RE_DEBUG_FLAGS_DECL;
1032
1033     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1034
1035     /*
1036        print out the table precompression so that we can do a visual check
1037        that they are identical.
1038      */
1039
1040     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1041
1042     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1043         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1044         if ( tmp ) {
1045             PerlIO_printf( Perl_debug_log, "%*s",
1046                 colwidth,
1047                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1048                             PL_colors[0], PL_colors[1],
1049                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1050                             PERL_PV_ESCAPE_FIRSTCHAR
1051                 )
1052             );
1053         }
1054     }
1055
1056     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1057
1058     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1059         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1060     }
1061
1062     PerlIO_printf( Perl_debug_log, "\n" );
1063
1064     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1065
1066         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1067             (int)depth * 2 + 2,"",
1068             (UV)TRIE_NODENUM( state ) );
1069
1070         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1071             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1072             if (v)
1073                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1074             else
1075                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1076         }
1077         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1078             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1079         } else {
1080             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1081             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1082         }
1083     }
1084 }
1085
1086 #endif
1087
1088
1089 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1090   startbranch: the first branch in the whole branch sequence
1091   first      : start branch of sequence of branch-exact nodes.
1092                May be the same as startbranch
1093   last       : Thing following the last branch.
1094                May be the same as tail.
1095   tail       : item following the branch sequence
1096   count      : words in the sequence
1097   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1098   depth      : indent depth
1099
1100 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1101
1102 A trie is an N'ary tree where the branches are determined by digital
1103 decomposition of the key. IE, at the root node you look up the 1st character and
1104 follow that branch repeat until you find the end of the branches. Nodes can be
1105 marked as "accepting" meaning they represent a complete word. Eg:
1106
1107   /he|she|his|hers/
1108
1109 would convert into the following structure. Numbers represent states, letters
1110 following numbers represent valid transitions on the letter from that state, if
1111 the number is in square brackets it represents an accepting state, otherwise it
1112 will be in parenthesis.
1113
1114       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1115       |    |
1116       |   (2)
1117       |    |
1118      (1)   +-i->(6)-+-s->[7]
1119       |
1120       +-s->(3)-+-h->(4)-+-e->[5]
1121
1122       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1123
1124 This shows that when matching against the string 'hers' we will begin at state 1
1125 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1126 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1127 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1128 single traverse. We store a mapping from accepting to state to which word was
1129 matched, and then when we have multiple possibilities we try to complete the
1130 rest of the regex in the order in which they occured in the alternation.
1131
1132 The only prior NFA like behaviour that would be changed by the TRIE support is
1133 the silent ignoring of duplicate alternations which are of the form:
1134
1135  / (DUPE|DUPE) X? (?{ ... }) Y /x
1136
1137 Thus EVAL blocks follwing a trie may be called a different number of times with
1138 and without the optimisation. With the optimisations dupes will be silently
1139 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1140 the following demonstrates:
1141
1142  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1143
1144 which prints out 'word' three times, but
1145
1146  'words'=~/(word|word|word)(?{ print $1 })S/
1147
1148 which doesnt print it out at all. This is due to other optimisations kicking in.
1149
1150 Example of what happens on a structural level:
1151
1152 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1153
1154    1: CURLYM[1] {1,32767}(18)
1155    5:   BRANCH(8)
1156    6:     EXACT <ac>(16)
1157    8:   BRANCH(11)
1158    9:     EXACT <ad>(16)
1159   11:   BRANCH(14)
1160   12:     EXACT <ab>(16)
1161   16:   SUCCEED(0)
1162   17:   NOTHING(18)
1163   18: END(0)
1164
1165 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1166 and should turn into:
1167
1168    1: CURLYM[1] {1,32767}(18)
1169    5:   TRIE(16)
1170         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1171           <ac>
1172           <ad>
1173           <ab>
1174   16:   SUCCEED(0)
1175   17:   NOTHING(18)
1176   18: END(0)
1177
1178 Cases where tail != last would be like /(?foo|bar)baz/:
1179
1180    1: BRANCH(4)
1181    2:   EXACT <foo>(8)
1182    4: BRANCH(7)
1183    5:   EXACT <bar>(8)
1184    7: TAIL(8)
1185    8: EXACT <baz>(10)
1186   10: END(0)
1187
1188 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1189 and would end up looking like:
1190
1191     1: TRIE(8)
1192       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1193         <foo>
1194         <bar>
1195    7: TAIL(8)
1196    8: EXACT <baz>(10)
1197   10: END(0)
1198
1199     d = uvuni_to_utf8_flags(d, uv, 0);
1200
1201 is the recommended Unicode-aware way of saying
1202
1203     *(d++) = uv;
1204 */
1205
1206 #define TRIE_STORE_REVCHAR                                                 \
1207     STMT_START {                                                           \
1208         if (UTF) {                                                         \
1209             SV *zlopp = newSV(2);                                          \
1210             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1211             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1212             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1213             SvPOK_on(zlopp);                                               \
1214             SvUTF8_on(zlopp);                                              \
1215             av_push(revcharmap, zlopp);                                    \
1216         } else {                                                           \
1217             char ooooff = (char)uvc;                                               \
1218             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1219         }                                                                  \
1220         } STMT_END
1221
1222 #define TRIE_READ_CHAR STMT_START {                                           \
1223     wordlen++;                                                                \
1224     if ( UTF ) {                                                              \
1225         if ( folder ) {                                                       \
1226             if ( foldlen > 0 ) {                                              \
1227                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1228                foldlen -= len;                                                \
1229                scan += len;                                                   \
1230                len = 0;                                                       \
1231             } else {                                                          \
1232                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1233                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1234                 foldlen -= UNISKIP( uvc );                                    \
1235                 scan = foldbuf + UNISKIP( uvc );                              \
1236             }                                                                 \
1237         } else {                                                              \
1238             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1239         }                                                                     \
1240     } else {                                                                  \
1241         uvc = (U32)*uc;                                                       \
1242         len = 1;                                                              \
1243     }                                                                         \
1244 } STMT_END
1245
1246
1247
1248 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1249     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1250         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1251         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1252     }                                                           \
1253     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1254     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1255     TRIE_LIST_CUR( state )++;                                   \
1256 } STMT_END
1257
1258 #define TRIE_LIST_NEW(state) STMT_START {                       \
1259     Newxz( trie->states[ state ].trans.list,               \
1260         4, reg_trie_trans_le );                                 \
1261      TRIE_LIST_CUR( state ) = 1;                                \
1262      TRIE_LIST_LEN( state ) = 4;                                \
1263 } STMT_END
1264
1265 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1266     U16 dupe= trie->states[ state ].wordnum;                    \
1267     regnode * const noper_next = regnext( noper );              \
1268                                                                 \
1269     DEBUG_r({                                                   \
1270         /* store the word for dumping */                        \
1271         SV* tmp;                                                \
1272         if (OP(noper) != NOTHING)                               \
1273             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1274         else                                                    \
1275             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1276         av_push( trie_words, tmp );                             \
1277     });                                                         \
1278                                                                 \
1279     curword++;                                                  \
1280     trie->wordinfo[curword].prev   = 0;                         \
1281     trie->wordinfo[curword].len    = wordlen;                   \
1282     trie->wordinfo[curword].accept = state;                     \
1283                                                                 \
1284     if ( noper_next < tail ) {                                  \
1285         if (!trie->jump)                                        \
1286             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1287         trie->jump[curword] = (U16)(noper_next - convert);      \
1288         if (!jumper)                                            \
1289             jumper = noper_next;                                \
1290         if (!nextbranch)                                        \
1291             nextbranch= regnext(cur);                           \
1292     }                                                           \
1293                                                                 \
1294     if ( dupe ) {                                               \
1295         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1296         /* chain, so that when the bits of chain are later    */\
1297         /* linked together, the dups appear in the chain      */\
1298         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1299         trie->wordinfo[dupe].prev = curword;                    \
1300     } else {                                                    \
1301         /* we haven't inserted this word yet.                */ \
1302         trie->states[ state ].wordnum = curword;                \
1303     }                                                           \
1304 } STMT_END
1305
1306
1307 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1308      ( ( base + charid >=  ucharcount                                   \
1309          && base + charid < ubound                                      \
1310          && state == trie->trans[ base - ucharcount + charid ].check    \
1311          && trie->trans[ base - ucharcount + charid ].next )            \
1312            ? trie->trans[ base - ucharcount + charid ].next             \
1313            : ( state==1 ? special : 0 )                                 \
1314       )
1315
1316 #define MADE_TRIE       1
1317 #define MADE_JUMP_TRIE  2
1318 #define MADE_EXACT_TRIE 4
1319
1320 STATIC I32
1321 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1322 {
1323     dVAR;
1324     /* first pass, loop through and scan words */
1325     reg_trie_data *trie;
1326     HV *widecharmap = NULL;
1327     AV *revcharmap = newAV();
1328     regnode *cur;
1329     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1330     STRLEN len = 0;
1331     UV uvc = 0;
1332     U16 curword = 0;
1333     U32 next_alloc = 0;
1334     regnode *jumper = NULL;
1335     regnode *nextbranch = NULL;
1336     regnode *convert = NULL;
1337     U32 *prev_states; /* temp array mapping each state to previous one */
1338     /* we just use folder as a flag in utf8 */
1339     const U8 * const folder = ( flags == EXACTF
1340                        ? PL_fold
1341                        : ( flags == EXACTFL
1342                            ? PL_fold_locale
1343                            : NULL
1344                          )
1345                      );
1346
1347 #ifdef DEBUGGING
1348     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1349     AV *trie_words = NULL;
1350     /* along with revcharmap, this only used during construction but both are
1351      * useful during debugging so we store them in the struct when debugging.
1352      */
1353 #else
1354     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1355     STRLEN trie_charcount=0;
1356 #endif
1357     SV *re_trie_maxbuff;
1358     GET_RE_DEBUG_FLAGS_DECL;
1359
1360     PERL_ARGS_ASSERT_MAKE_TRIE;
1361 #ifndef DEBUGGING
1362     PERL_UNUSED_ARG(depth);
1363 #endif
1364
1365     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1366     trie->refcount = 1;
1367     trie->startstate = 1;
1368     trie->wordcount = word_count;
1369     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1370     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1371     if (!(UTF && folder))
1372         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1373     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1374                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1375
1376     DEBUG_r({
1377         trie_words = newAV();
1378     });
1379
1380     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1381     if (!SvIOK(re_trie_maxbuff)) {
1382         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1383     }
1384     DEBUG_OPTIMISE_r({
1385                 PerlIO_printf( Perl_debug_log,
1386                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1387                   (int)depth * 2 + 2, "",
1388                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1389                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1390                   (int)depth);
1391     });
1392
1393    /* Find the node we are going to overwrite */
1394     if ( first == startbranch && OP( last ) != BRANCH ) {
1395         /* whole branch chain */
1396         convert = first;
1397     } else {
1398         /* branch sub-chain */
1399         convert = NEXTOPER( first );
1400     }
1401
1402     /*  -- First loop and Setup --
1403
1404        We first traverse the branches and scan each word to determine if it
1405        contains widechars, and how many unique chars there are, this is
1406        important as we have to build a table with at least as many columns as we
1407        have unique chars.
1408
1409        We use an array of integers to represent the character codes 0..255
1410        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1411        native representation of the character value as the key and IV's for the
1412        coded index.
1413
1414        *TODO* If we keep track of how many times each character is used we can
1415        remap the columns so that the table compression later on is more
1416        efficient in terms of memory by ensuring most common value is in the
1417        middle and the least common are on the outside.  IMO this would be better
1418        than a most to least common mapping as theres a decent chance the most
1419        common letter will share a node with the least common, meaning the node
1420        will not be compressable. With a middle is most common approach the worst
1421        case is when we have the least common nodes twice.
1422
1423      */
1424
1425     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1426         regnode * const noper = NEXTOPER( cur );
1427         const U8 *uc = (U8*)STRING( noper );
1428         const U8 * const e  = uc + STR_LEN( noper );
1429         STRLEN foldlen = 0;
1430         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1431         const U8 *scan = (U8*)NULL;
1432         U32 wordlen      = 0;         /* required init */
1433         STRLEN chars = 0;
1434         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1435
1436         if (OP(noper) == NOTHING) {
1437             trie->minlen= 0;
1438             continue;
1439         }
1440         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1441             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1442                                           regardless of encoding */
1443
1444         for ( ; uc < e ; uc += len ) {
1445             TRIE_CHARCOUNT(trie)++;
1446             TRIE_READ_CHAR;
1447             chars++;
1448             if ( uvc < 256 ) {
1449                 if ( !trie->charmap[ uvc ] ) {
1450                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1451                     if ( folder )
1452                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1453                     TRIE_STORE_REVCHAR;
1454                 }
1455                 if ( set_bit ) {
1456                     /* store the codepoint in the bitmap, and if its ascii
1457                        also store its folded equivelent. */
1458                     TRIE_BITMAP_SET(trie,uvc);
1459
1460                     /* store the folded codepoint */
1461                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1462
1463                     if ( !UTF ) {
1464                         /* store first byte of utf8 representation of
1465                            codepoints in the 127 < uvc < 256 range */
1466                         if (127 < uvc && uvc < 192) {
1467                             TRIE_BITMAP_SET(trie,194);
1468                         } else if (191 < uvc ) {
1469                             TRIE_BITMAP_SET(trie,195);
1470                         /* && uvc < 256 -- we know uvc is < 256 already */
1471                         }
1472                     }
1473                     set_bit = 0; /* We've done our bit :-) */
1474                 }
1475             } else {
1476                 SV** svpp;
1477                 if ( !widecharmap )
1478                     widecharmap = newHV();
1479
1480                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1481
1482                 if ( !svpp )
1483                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1484
1485                 if ( !SvTRUE( *svpp ) ) {
1486                     sv_setiv( *svpp, ++trie->uniquecharcount );
1487                     TRIE_STORE_REVCHAR;
1488                 }
1489             }
1490         }
1491         if( cur == first ) {
1492             trie->minlen=chars;
1493             trie->maxlen=chars;
1494         } else if (chars < trie->minlen) {
1495             trie->minlen=chars;
1496         } else if (chars > trie->maxlen) {
1497             trie->maxlen=chars;
1498         }
1499
1500     } /* end first pass */
1501     DEBUG_TRIE_COMPILE_r(
1502         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1503                 (int)depth * 2 + 2,"",
1504                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1505                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1506                 (int)trie->minlen, (int)trie->maxlen )
1507     );
1508
1509     /*
1510         We now know what we are dealing with in terms of unique chars and
1511         string sizes so we can calculate how much memory a naive
1512         representation using a flat table  will take. If it's over a reasonable
1513         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1514         conservative but potentially much slower representation using an array
1515         of lists.
1516
1517         At the end we convert both representations into the same compressed
1518         form that will be used in regexec.c for matching with. The latter
1519         is a form that cannot be used to construct with but has memory
1520         properties similar to the list form and access properties similar
1521         to the table form making it both suitable for fast searches and
1522         small enough that its feasable to store for the duration of a program.
1523
1524         See the comment in the code where the compressed table is produced
1525         inplace from the flat tabe representation for an explanation of how
1526         the compression works.
1527
1528     */
1529
1530
1531     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1532     prev_states[1] = 0;
1533
1534     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1535         /*
1536             Second Pass -- Array Of Lists Representation
1537
1538             Each state will be represented by a list of charid:state records
1539             (reg_trie_trans_le) the first such element holds the CUR and LEN
1540             points of the allocated array. (See defines above).
1541
1542             We build the initial structure using the lists, and then convert
1543             it into the compressed table form which allows faster lookups
1544             (but cant be modified once converted).
1545         */
1546
1547         STRLEN transcount = 1;
1548
1549         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1550             "%*sCompiling trie using list compiler\n",
1551             (int)depth * 2 + 2, ""));
1552
1553         trie->states = (reg_trie_state *)
1554             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1555                                   sizeof(reg_trie_state) );
1556         TRIE_LIST_NEW(1);
1557         next_alloc = 2;
1558
1559         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1560
1561             regnode * const noper = NEXTOPER( cur );
1562             U8 *uc           = (U8*)STRING( noper );
1563             const U8 * const e = uc + STR_LEN( noper );
1564             U32 state        = 1;         /* required init */
1565             U16 charid       = 0;         /* sanity init */
1566             U8 *scan         = (U8*)NULL; /* sanity init */
1567             STRLEN foldlen   = 0;         /* required init */
1568             U32 wordlen      = 0;         /* required init */
1569             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1570
1571             if (OP(noper) != NOTHING) {
1572                 for ( ; uc < e ; uc += len ) {
1573
1574                     TRIE_READ_CHAR;
1575
1576                     if ( uvc < 256 ) {
1577                         charid = trie->charmap[ uvc ];
1578                     } else {
1579                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1580                         if ( !svpp ) {
1581                             charid = 0;
1582                         } else {
1583                             charid=(U16)SvIV( *svpp );
1584                         }
1585                     }
1586                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1587                     if ( charid ) {
1588
1589                         U16 check;
1590                         U32 newstate = 0;
1591
1592                         charid--;
1593                         if ( !trie->states[ state ].trans.list ) {
1594                             TRIE_LIST_NEW( state );
1595                         }
1596                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1597                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1598                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1599                                 break;
1600                             }
1601                         }
1602                         if ( ! newstate ) {
1603                             newstate = next_alloc++;
1604                             prev_states[newstate] = state;
1605                             TRIE_LIST_PUSH( state, charid, newstate );
1606                             transcount++;
1607                         }
1608                         state = newstate;
1609                     } else {
1610                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1611                     }
1612                 }
1613             }
1614             TRIE_HANDLE_WORD(state);
1615
1616         } /* end second pass */
1617
1618         /* next alloc is the NEXT state to be allocated */
1619         trie->statecount = next_alloc;
1620         trie->states = (reg_trie_state *)
1621             PerlMemShared_realloc( trie->states,
1622                                    next_alloc
1623                                    * sizeof(reg_trie_state) );
1624
1625         /* and now dump it out before we compress it */
1626         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1627                                                          revcharmap, next_alloc,
1628                                                          depth+1)
1629         );
1630
1631         trie->trans = (reg_trie_trans *)
1632             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1633         {
1634             U32 state;
1635             U32 tp = 0;
1636             U32 zp = 0;
1637
1638
1639             for( state=1 ; state < next_alloc ; state ++ ) {
1640                 U32 base=0;
1641
1642                 /*
1643                 DEBUG_TRIE_COMPILE_MORE_r(
1644                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1645                 );
1646                 */
1647
1648                 if (trie->states[state].trans.list) {
1649                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1650                     U16 maxid=minid;
1651                     U16 idx;
1652
1653                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1654                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1655                         if ( forid < minid ) {
1656                             minid=forid;
1657                         } else if ( forid > maxid ) {
1658                             maxid=forid;
1659                         }
1660                     }
1661                     if ( transcount < tp + maxid - minid + 1) {
1662                         transcount *= 2;
1663                         trie->trans = (reg_trie_trans *)
1664                             PerlMemShared_realloc( trie->trans,
1665                                                      transcount
1666                                                      * sizeof(reg_trie_trans) );
1667                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1668                     }
1669                     base = trie->uniquecharcount + tp - minid;
1670                     if ( maxid == minid ) {
1671                         U32 set = 0;
1672                         for ( ; zp < tp ; zp++ ) {
1673                             if ( ! trie->trans[ zp ].next ) {
1674                                 base = trie->uniquecharcount + zp - minid;
1675                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1676                                 trie->trans[ zp ].check = state;
1677                                 set = 1;
1678                                 break;
1679                             }
1680                         }
1681                         if ( !set ) {
1682                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1683                             trie->trans[ tp ].check = state;
1684                             tp++;
1685                             zp = tp;
1686                         }
1687                     } else {
1688                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1689                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1690                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1691                             trie->trans[ tid ].check = state;
1692                         }
1693                         tp += ( maxid - minid + 1 );
1694                     }
1695                     Safefree(trie->states[ state ].trans.list);
1696                 }
1697                 /*
1698                 DEBUG_TRIE_COMPILE_MORE_r(
1699                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1700                 );
1701                 */
1702                 trie->states[ state ].trans.base=base;
1703             }
1704             trie->lasttrans = tp + 1;
1705         }
1706     } else {
1707         /*
1708            Second Pass -- Flat Table Representation.
1709
1710            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1711            We know that we will need Charcount+1 trans at most to store the data
1712            (one row per char at worst case) So we preallocate both structures
1713            assuming worst case.
1714
1715            We then construct the trie using only the .next slots of the entry
1716            structs.
1717
1718            We use the .check field of the first entry of the node  temporarily to
1719            make compression both faster and easier by keeping track of how many non
1720            zero fields are in the node.
1721
1722            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1723            transition.
1724
1725            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1726            number representing the first entry of the node, and state as a
1727            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1728            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1729            are 2 entrys per node. eg:
1730
1731              A B       A B
1732           1. 2 4    1. 3 7
1733           2. 0 3    3. 0 5
1734           3. 0 0    5. 0 0
1735           4. 0 0    7. 0 0
1736
1737            The table is internally in the right hand, idx form. However as we also
1738            have to deal with the states array which is indexed by nodenum we have to
1739            use TRIE_NODENUM() to convert.
1740
1741         */
1742         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1743             "%*sCompiling trie using table compiler\n",
1744             (int)depth * 2 + 2, ""));
1745
1746         trie->trans = (reg_trie_trans *)
1747             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1748                                   * trie->uniquecharcount + 1,
1749                                   sizeof(reg_trie_trans) );
1750         trie->states = (reg_trie_state *)
1751             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1752                                   sizeof(reg_trie_state) );
1753         next_alloc = trie->uniquecharcount + 1;
1754
1755
1756         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1757
1758             regnode * const noper   = NEXTOPER( cur );
1759             const U8 *uc     = (U8*)STRING( noper );
1760             const U8 * const e = uc + STR_LEN( noper );
1761
1762             U32 state        = 1;         /* required init */
1763
1764             U16 charid       = 0;         /* sanity init */
1765             U32 accept_state = 0;         /* sanity init */
1766             U8 *scan         = (U8*)NULL; /* sanity init */
1767
1768             STRLEN foldlen   = 0;         /* required init */
1769             U32 wordlen      = 0;         /* required init */
1770             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1771
1772             if ( OP(noper) != NOTHING ) {
1773                 for ( ; uc < e ; uc += len ) {
1774
1775                     TRIE_READ_CHAR;
1776
1777                     if ( uvc < 256 ) {
1778                         charid = trie->charmap[ uvc ];
1779                     } else {
1780                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1781                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1782                     }
1783                     if ( charid ) {
1784                         charid--;
1785                         if ( !trie->trans[ state + charid ].next ) {
1786                             trie->trans[ state + charid ].next = next_alloc;
1787                             trie->trans[ state ].check++;
1788                             prev_states[TRIE_NODENUM(next_alloc)]
1789                                     = TRIE_NODENUM(state);
1790                             next_alloc += trie->uniquecharcount;
1791                         }
1792                         state = trie->trans[ state + charid ].next;
1793                     } else {
1794                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1795                     }
1796                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1797                 }
1798             }
1799             accept_state = TRIE_NODENUM( state );
1800             TRIE_HANDLE_WORD(accept_state);
1801
1802         } /* end second pass */
1803
1804         /* and now dump it out before we compress it */
1805         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1806                                                           revcharmap,
1807                                                           next_alloc, depth+1));
1808
1809         {
1810         /*
1811            * Inplace compress the table.*
1812
1813            For sparse data sets the table constructed by the trie algorithm will
1814            be mostly 0/FAIL transitions or to put it another way mostly empty.
1815            (Note that leaf nodes will not contain any transitions.)
1816
1817            This algorithm compresses the tables by eliminating most such
1818            transitions, at the cost of a modest bit of extra work during lookup:
1819
1820            - Each states[] entry contains a .base field which indicates the
1821            index in the state[] array wheres its transition data is stored.
1822
1823            - If .base is 0 there are no  valid transitions from that node.
1824
1825            - If .base is nonzero then charid is added to it to find an entry in
1826            the trans array.
1827
1828            -If trans[states[state].base+charid].check!=state then the
1829            transition is taken to be a 0/Fail transition. Thus if there are fail
1830            transitions at the front of the node then the .base offset will point
1831            somewhere inside the previous nodes data (or maybe even into a node
1832            even earlier), but the .check field determines if the transition is
1833            valid.
1834
1835            XXX - wrong maybe?
1836            The following process inplace converts the table to the compressed
1837            table: We first do not compress the root node 1,and mark its all its
1838            .check pointers as 1 and set its .base pointer as 1 as well. This
1839            allows to do a DFA construction from the compressed table later, and
1840            ensures that any .base pointers we calculate later are greater than
1841            0.
1842
1843            - We set 'pos' to indicate the first entry of the second node.
1844
1845            - We then iterate over the columns of the node, finding the first and
1846            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1847            and set the .check pointers accordingly, and advance pos
1848            appropriately and repreat for the next node. Note that when we copy
1849            the next pointers we have to convert them from the original
1850            NODEIDX form to NODENUM form as the former is not valid post
1851            compression.
1852
1853            - If a node has no transitions used we mark its base as 0 and do not
1854            advance the pos pointer.
1855
1856            - If a node only has one transition we use a second pointer into the
1857            structure to fill in allocated fail transitions from other states.
1858            This pointer is independent of the main pointer and scans forward
1859            looking for null transitions that are allocated to a state. When it
1860            finds one it writes the single transition into the "hole".  If the
1861            pointer doesnt find one the single transition is appended as normal.
1862
1863            - Once compressed we can Renew/realloc the structures to release the
1864            excess space.
1865
1866            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1867            specifically Fig 3.47 and the associated pseudocode.
1868
1869            demq
1870         */
1871         const U32 laststate = TRIE_NODENUM( next_alloc );
1872         U32 state, charid;
1873         U32 pos = 0, zp=0;
1874         trie->statecount = laststate;
1875
1876         for ( state = 1 ; state < laststate ; state++ ) {
1877             U8 flag = 0;
1878             const U32 stateidx = TRIE_NODEIDX( state );
1879             const U32 o_used = trie->trans[ stateidx ].check;
1880             U32 used = trie->trans[ stateidx ].check;
1881             trie->trans[ stateidx ].check = 0;
1882
1883             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1884                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1885                     if ( trie->trans[ stateidx + charid ].next ) {
1886                         if (o_used == 1) {
1887                             for ( ; zp < pos ; zp++ ) {
1888                                 if ( ! trie->trans[ zp ].next ) {
1889                                     break;
1890                                 }
1891                             }
1892                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1893                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1894                             trie->trans[ zp ].check = state;
1895                             if ( ++zp > pos ) pos = zp;
1896                             break;
1897                         }
1898                         used--;
1899                     }
1900                     if ( !flag ) {
1901                         flag = 1;
1902                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1903                     }
1904                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1905                     trie->trans[ pos ].check = state;
1906                     pos++;
1907                 }
1908             }
1909         }
1910         trie->lasttrans = pos + 1;
1911         trie->states = (reg_trie_state *)
1912             PerlMemShared_realloc( trie->states, laststate
1913                                    * sizeof(reg_trie_state) );
1914         DEBUG_TRIE_COMPILE_MORE_r(
1915                 PerlIO_printf( Perl_debug_log,
1916                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1917                     (int)depth * 2 + 2,"",
1918                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1919                     (IV)next_alloc,
1920                     (IV)pos,
1921                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1922             );
1923
1924         } /* end table compress */
1925     }
1926     DEBUG_TRIE_COMPILE_MORE_r(
1927             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1928                 (int)depth * 2 + 2, "",
1929                 (UV)trie->statecount,
1930                 (UV)trie->lasttrans)
1931     );
1932     /* resize the trans array to remove unused space */
1933     trie->trans = (reg_trie_trans *)
1934         PerlMemShared_realloc( trie->trans, trie->lasttrans
1935                                * sizeof(reg_trie_trans) );
1936
1937     {   /* Modify the program and insert the new TRIE node*/
1938         U8 nodetype =(U8)(flags & 0xFF);
1939         char *str=NULL;
1940
1941 #ifdef DEBUGGING
1942         regnode *optimize = NULL;
1943 #ifdef RE_TRACK_PATTERN_OFFSETS
1944
1945         U32 mjd_offset = 0;
1946         U32 mjd_nodelen = 0;
1947 #endif /* RE_TRACK_PATTERN_OFFSETS */
1948 #endif /* DEBUGGING */
1949         /*
1950            This means we convert either the first branch or the first Exact,
1951            depending on whether the thing following (in 'last') is a branch
1952            or not and whther first is the startbranch (ie is it a sub part of
1953            the alternation or is it the whole thing.)
1954            Assuming its a sub part we conver the EXACT otherwise we convert
1955            the whole branch sequence, including the first.
1956          */
1957         /* Find the node we are going to overwrite */
1958         if ( first != startbranch || OP( last ) == BRANCH ) {
1959             /* branch sub-chain */
1960             NEXT_OFF( first ) = (U16)(last - first);
1961 #ifdef RE_TRACK_PATTERN_OFFSETS
1962             DEBUG_r({
1963                 mjd_offset= Node_Offset((convert));
1964                 mjd_nodelen= Node_Length((convert));
1965             });
1966 #endif
1967             /* whole branch chain */
1968         }
1969 #ifdef RE_TRACK_PATTERN_OFFSETS
1970         else {
1971             DEBUG_r({
1972                 const  regnode *nop = NEXTOPER( convert );
1973                 mjd_offset= Node_Offset((nop));
1974                 mjd_nodelen= Node_Length((nop));
1975             });
1976         }
1977         DEBUG_OPTIMISE_r(
1978             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1979                 (int)depth * 2 + 2, "",
1980                 (UV)mjd_offset, (UV)mjd_nodelen)
1981         );
1982 #endif
1983         /* But first we check to see if there is a common prefix we can
1984            split out as an EXACT and put in front of the TRIE node.  */
1985         trie->startstate= 1;
1986         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
1987             U32 state;
1988             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1989                 U32 ofs = 0;
1990                 I32 idx = -1;
1991                 U32 count = 0;
1992                 const U32 base = trie->states[ state ].trans.base;
1993
1994                 if ( trie->states[state].wordnum )
1995                         count = 1;
1996
1997                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1998                     if ( ( base + ofs >= trie->uniquecharcount ) &&
1999                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2000                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2001                     {
2002                         if ( ++count > 1 ) {
2003                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2004                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2005                             if ( state == 1 ) break;
2006                             if ( count == 2 ) {
2007                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2008                                 DEBUG_OPTIMISE_r(
2009                                     PerlIO_printf(Perl_debug_log,
2010                                         "%*sNew Start State=%"UVuf" Class: [",
2011                                         (int)depth * 2 + 2, "",
2012                                         (UV)state));
2013                                 if (idx >= 0) {
2014                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2015                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2016
2017                                     TRIE_BITMAP_SET(trie,*ch);
2018                                     if ( folder )
2019                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2020                                     DEBUG_OPTIMISE_r(
2021                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2022                                     );
2023                                 }
2024                             }
2025                             TRIE_BITMAP_SET(trie,*ch);
2026                             if ( folder )
2027                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2028                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2029                         }
2030                         idx = ofs;
2031                     }
2032                 }
2033                 if ( count == 1 ) {
2034                     SV **tmp = av_fetch( revcharmap, idx, 0);
2035                     STRLEN len;
2036                     char *ch = SvPV( *tmp, len );
2037                     DEBUG_OPTIMISE_r({
2038                         SV *sv=sv_newmortal();
2039                         PerlIO_printf( Perl_debug_log,
2040                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2041                             (int)depth * 2 + 2, "",
2042                             (UV)state, (UV)idx,
2043                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2044                                 PL_colors[0], PL_colors[1],
2045                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2046                                 PERL_PV_ESCAPE_FIRSTCHAR
2047                             )
2048                         );
2049                     });
2050                     if ( state==1 ) {
2051                         OP( convert ) = nodetype;
2052                         str=STRING(convert);
2053                         STR_LEN(convert)=0;
2054                     }
2055                     STR_LEN(convert) += len;
2056                     while (len--)
2057                         *str++ = *ch++;
2058                 } else {
2059 #ifdef DEBUGGING
2060                     if (state>1)
2061                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2062 #endif
2063                     break;
2064                 }
2065             }
2066             trie->prefixlen = (state-1);
2067             if (str) {
2068                 regnode *n = convert+NODE_SZ_STR(convert);
2069                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2070                 trie->startstate = state;
2071                 trie->minlen -= (state - 1);
2072                 trie->maxlen -= (state - 1);
2073 #ifdef DEBUGGING
2074                /* At least the UNICOS C compiler choked on this
2075                 * being argument to DEBUG_r(), so let's just have
2076                 * it right here. */
2077                if (
2078 #ifdef PERL_EXT_RE_BUILD
2079                    1
2080 #else
2081                    DEBUG_r_TEST
2082 #endif
2083                    ) {
2084                    regnode *fix = convert;
2085                    U32 word = trie->wordcount;
2086                    mjd_nodelen++;
2087                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2088                    while( ++fix < n ) {
2089                        Set_Node_Offset_Length(fix, 0, 0);
2090                    }
2091                    while (word--) {
2092                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2093                        if (tmp) {
2094                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2095                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2096                            else
2097                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2098                        }
2099                    }
2100                }
2101 #endif
2102                 if (trie->maxlen) {
2103                     convert = n;
2104                 } else {
2105                     NEXT_OFF(convert) = (U16)(tail - convert);
2106                     DEBUG_r(optimize= n);
2107                 }
2108             }
2109         }
2110         if (!jumper)
2111             jumper = last;
2112         if ( trie->maxlen ) {
2113             NEXT_OFF( convert ) = (U16)(tail - convert);
2114             ARG_SET( convert, data_slot );
2115             /* Store the offset to the first unabsorbed branch in
2116                jump[0], which is otherwise unused by the jump logic.
2117                We use this when dumping a trie and during optimisation. */
2118             if (trie->jump)
2119                 trie->jump[0] = (U16)(nextbranch - convert);
2120
2121             /* XXXX */
2122             if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2123                  ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2124             {
2125                 OP( convert ) = TRIEC;
2126                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2127                 PerlMemShared_free(trie->bitmap);
2128                 trie->bitmap= NULL;
2129             } else
2130                 OP( convert ) = TRIE;
2131
2132             /* store the type in the flags */
2133             convert->flags = nodetype;
2134             DEBUG_r({
2135             optimize = convert
2136                       + NODE_STEP_REGNODE
2137                       + regarglen[ OP( convert ) ];
2138             });
2139             /* XXX We really should free up the resource in trie now,
2140                    as we won't use them - (which resources?) dmq */
2141         }
2142         /* needed for dumping*/
2143         DEBUG_r(if (optimize) {
2144             regnode *opt = convert;
2145
2146             while ( ++opt < optimize) {
2147                 Set_Node_Offset_Length(opt,0,0);
2148             }
2149             /*
2150                 Try to clean up some of the debris left after the
2151                 optimisation.
2152              */
2153             while( optimize < jumper ) {
2154                 mjd_nodelen += Node_Length((optimize));
2155                 OP( optimize ) = OPTIMIZED;
2156                 Set_Node_Offset_Length(optimize,0,0);
2157                 optimize++;
2158             }
2159             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2160         });
2161     } /* end node insert */
2162
2163     /*  Finish populating the prev field of the wordinfo array.  Walk back
2164      *  from each accept state until we find another accept state, and if
2165      *  so, point the first word's .prev field at the second word. If the
2166      *  second already has a .prev field set, stop now. This will be the
2167      *  case either if we've already processed that word's accept state,
2168      *  or that that state had multiple words, and the overspill words
2169      *  were already linked up earlier.
2170      */
2171     {
2172         U16 word;
2173         U32 state;
2174         U16 prev;
2175
2176         for (word=1; word <= trie->wordcount; word++) {
2177             prev = 0;
2178             if (trie->wordinfo[word].prev)
2179                 continue;
2180             state = trie->wordinfo[word].accept;
2181             while (state) {
2182                 state = prev_states[state];
2183                 if (!state)
2184                     break;
2185                 prev = trie->states[state].wordnum;
2186                 if (prev)
2187                     break;
2188             }
2189             trie->wordinfo[word].prev = prev;
2190         }
2191         Safefree(prev_states);
2192     }
2193
2194
2195     /* and now dump out the compressed format */
2196     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2197
2198     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2199 #ifdef DEBUGGING
2200     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2201     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2202 #else
2203     SvREFCNT_dec(revcharmap);
2204 #endif
2205     return trie->jump
2206            ? MADE_JUMP_TRIE
2207            : trie->startstate>1
2208              ? MADE_EXACT_TRIE
2209              : MADE_TRIE;
2210 }
2211
2212 STATIC void
2213 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2214 {
2215 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2216
2217    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2218    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2219    ISBN 0-201-10088-6
2220
2221    We find the fail state for each state in the trie, this state is the longest proper
2222    suffix of the current states 'word' that is also a proper prefix of another word in our
2223    trie. State 1 represents the word '' and is the thus the default fail state. This allows
2224    the DFA not to have to restart after its tried and failed a word at a given point, it
2225    simply continues as though it had been matching the other word in the first place.
2226    Consider
2227       'abcdgu'=~/abcdefg|cdgu/
2228    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2229    fail, which would bring use to the state representing 'd' in the second word where we would
2230    try 'g' and succeed, prodceding to match 'cdgu'.
2231  */
2232  /* add a fail transition */
2233     const U32 trie_offset = ARG(source);
2234     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2235     U32 *q;
2236     const U32 ucharcount = trie->uniquecharcount;
2237     const U32 numstates = trie->statecount;
2238     const U32 ubound = trie->lasttrans + ucharcount;
2239     U32 q_read = 0;
2240     U32 q_write = 0;
2241     U32 charid;
2242     U32 base = trie->states[ 1 ].trans.base;
2243     U32 *fail;
2244     reg_ac_data *aho;
2245     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2246     GET_RE_DEBUG_FLAGS_DECL;
2247
2248     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2249 #ifndef DEBUGGING
2250     PERL_UNUSED_ARG(depth);
2251 #endif
2252
2253
2254     ARG_SET( stclass, data_slot );
2255     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2256     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2257     aho->trie=trie_offset;
2258     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2259     Copy( trie->states, aho->states, numstates, reg_trie_state );
2260     Newxz( q, numstates, U32);
2261     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2262     aho->refcount = 1;
2263     fail = aho->fail;
2264     /* initialize fail[0..1] to be 1 so that we always have
2265        a valid final fail state */
2266     fail[ 0 ] = fail[ 1 ] = 1;
2267
2268     for ( charid = 0; charid < ucharcount ; charid++ ) {
2269         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2270         if ( newstate ) {
2271             q[ q_write ] = newstate;
2272             /* set to point at the root */
2273             fail[ q[ q_write++ ] ]=1;
2274         }
2275     }
2276     while ( q_read < q_write) {
2277         const U32 cur = q[ q_read++ % numstates ];
2278         base = trie->states[ cur ].trans.base;
2279
2280         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2281             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2282             if (ch_state) {
2283                 U32 fail_state = cur;
2284                 U32 fail_base;
2285                 do {
2286                     fail_state = fail[ fail_state ];
2287                     fail_base = aho->states[ fail_state ].trans.base;
2288                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2289
2290                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2291                 fail[ ch_state ] = fail_state;
2292                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2293                 {
2294                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2295                 }
2296                 q[ q_write++ % numstates] = ch_state;
2297             }
2298         }
2299     }
2300     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2301        when we fail in state 1, this allows us to use the
2302        charclass scan to find a valid start char. This is based on the principle
2303        that theres a good chance the string being searched contains lots of stuff
2304        that cant be a start char.
2305      */
2306     fail[ 0 ] = fail[ 1 ] = 0;
2307     DEBUG_TRIE_COMPILE_r({
2308         PerlIO_printf(Perl_debug_log,
2309                       "%*sStclass Failtable (%"UVuf" states): 0",
2310                       (int)(depth * 2), "", (UV)numstates
2311         );
2312         for( q_read=1; q_read<numstates; q_read++ ) {
2313             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2314         }
2315         PerlIO_printf(Perl_debug_log, "\n");
2316     });
2317     Safefree(q);
2318     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2319 }
2320
2321
2322 /*
2323  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2324  * These need to be revisited when a newer toolchain becomes available.
2325  */
2326 #if defined(__sparc64__) && defined(__GNUC__)
2327 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2328 #       undef  SPARC64_GCC_WORKAROUND
2329 #       define SPARC64_GCC_WORKAROUND 1
2330 #   endif
2331 #endif
2332
2333 #define DEBUG_PEEP(str,scan,depth) \
2334     DEBUG_OPTIMISE_r({if (scan){ \
2335        SV * const mysv=sv_newmortal(); \
2336        regnode *Next = regnext(scan); \
2337        regprop(RExC_rx, mysv, scan); \
2338        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2339        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2340        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2341    }});
2342
2343
2344
2345
2346
2347 #define JOIN_EXACT(scan,min,flags) \
2348     if (PL_regkind[OP(scan)] == EXACT) \
2349         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2350
2351 STATIC U32
2352 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2353     /* Merge several consecutive EXACTish nodes into one. */
2354     regnode *n = regnext(scan);
2355     U32 stringok = 1;
2356     regnode *next = scan + NODE_SZ_STR(scan);
2357     U32 merged = 0;
2358     U32 stopnow = 0;
2359 #ifdef DEBUGGING
2360     regnode *stop = scan;
2361     GET_RE_DEBUG_FLAGS_DECL;
2362 #else
2363     PERL_UNUSED_ARG(depth);
2364 #endif
2365
2366     PERL_ARGS_ASSERT_JOIN_EXACT;
2367 #ifndef EXPERIMENTAL_INPLACESCAN
2368     PERL_UNUSED_ARG(flags);
2369     PERL_UNUSED_ARG(val);
2370 #endif
2371     DEBUG_PEEP("join",scan,depth);
2372
2373     /* Skip NOTHING, merge EXACT*. */
2374     while (n &&
2375            ( PL_regkind[OP(n)] == NOTHING ||
2376              (stringok && (OP(n) == OP(scan))))
2377            && NEXT_OFF(n)
2378            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2379
2380         if (OP(n) == TAIL || n > next)
2381             stringok = 0;
2382         if (PL_regkind[OP(n)] == NOTHING) {
2383             DEBUG_PEEP("skip:",n,depth);
2384             NEXT_OFF(scan) += NEXT_OFF(n);
2385             next = n + NODE_STEP_REGNODE;
2386 #ifdef DEBUGGING
2387             if (stringok)
2388                 stop = n;
2389 #endif
2390             n = regnext(n);
2391         }
2392         else if (stringok) {
2393             const unsigned int oldl = STR_LEN(scan);
2394             regnode * const nnext = regnext(n);
2395
2396             DEBUG_PEEP("merg",n,depth);
2397
2398             merged++;
2399             if (oldl + STR_LEN(n) > U8_MAX)
2400                 break;
2401             NEXT_OFF(scan) += NEXT_OFF(n);
2402             STR_LEN(scan) += STR_LEN(n);
2403             next = n + NODE_SZ_STR(n);
2404             /* Now we can overwrite *n : */
2405             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2406 #ifdef DEBUGGING
2407             stop = next - 1;
2408 #endif
2409             n = nnext;
2410             if (stopnow) break;
2411         }
2412
2413 #ifdef EXPERIMENTAL_INPLACESCAN
2414         if (flags && !NEXT_OFF(n)) {
2415             DEBUG_PEEP("atch", val, depth);
2416             if (reg_off_by_arg[OP(n)]) {
2417                 ARG_SET(n, val - n);
2418             }
2419             else {
2420                 NEXT_OFF(n) = val - n;
2421             }
2422             stopnow = 1;
2423         }
2424 #endif
2425     }
2426
2427     if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2428     /*
2429     Two problematic code points in Unicode casefolding of EXACT nodes:
2430
2431     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2432     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2433
2434     which casefold to
2435
2436     Unicode                      UTF-8
2437
2438     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2439     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2440
2441     This means that in case-insensitive matching (or "loose matching",
2442     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2443     length of the above casefolded versions) can match a target string
2444     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2445     This would rather mess up the minimum length computation.
2446
2447     What we'll do is to look for the tail four bytes, and then peek
2448     at the preceding two bytes to see whether we need to decrease
2449     the minimum length by four (six minus two).
2450
2451     Thanks to the design of UTF-8, there cannot be false matches:
2452     A sequence of valid UTF-8 bytes cannot be a subsequence of
2453     another valid sequence of UTF-8 bytes.
2454
2455     */
2456          char * const s0 = STRING(scan), *s, *t;
2457          char * const s1 = s0 + STR_LEN(scan) - 1;
2458          char * const s2 = s1 - 4;
2459 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2460          const char t0[] = "\xaf\x49\xaf\x42";
2461 #else
2462          const char t0[] = "\xcc\x88\xcc\x81";
2463 #endif
2464          const char * const t1 = t0 + 3;
2465
2466          for (s = s0 + 2;
2467               s < s2 && (t = ninstr(s, s1, t0, t1));
2468               s = t + 4) {
2469 #ifdef EBCDIC
2470               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2471                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2472 #else
2473               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2474                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2475 #endif
2476                    *min -= 4;
2477          }
2478     }
2479
2480 #ifdef DEBUGGING
2481     /* Allow dumping */
2482     n = scan + NODE_SZ_STR(scan);
2483     while (n <= stop) {
2484         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2485             OP(n) = OPTIMIZED;
2486             NEXT_OFF(n) = 0;
2487         }
2488         n++;
2489     }
2490 #endif
2491     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2492     return stopnow;
2493 }
2494
2495 /* REx optimizer.  Converts nodes into quickier variants "in place".
2496    Finds fixed substrings.  */
2497
2498 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2499    to the position after last scanned or to NULL. */
2500
2501 #define INIT_AND_WITHP \
2502     assert(!and_withp); \
2503     Newx(and_withp,1,struct regnode_charclass_class); \
2504     SAVEFREEPV(and_withp)
2505
2506 /* this is a chain of data about sub patterns we are processing that
2507    need to be handled seperately/specially in study_chunk. Its so
2508    we can simulate recursion without losing state.  */
2509 struct scan_frame;
2510 typedef struct scan_frame {
2511     regnode *last;  /* last node to process in this frame */
2512     regnode *next;  /* next node to process when last is reached */
2513     struct scan_frame *prev; /*previous frame*/
2514     I32 stop; /* what stopparen do we use */
2515 } scan_frame;
2516
2517
2518 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2519
2520 #define CASE_SYNST_FNC(nAmE)                                       \
2521 case nAmE:                                                         \
2522     if (flags & SCF_DO_STCLASS_AND) {                              \
2523             for (value = 0; value < 256; value++)                  \
2524                 if (!is_ ## nAmE ## _cp(value))                       \
2525                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2526     }                                                              \
2527     else {                                                         \
2528             for (value = 0; value < 256; value++)                  \
2529                 if (is_ ## nAmE ## _cp(value))                        \
2530                     ANYOF_BITMAP_SET(data->start_class, value);    \
2531     }                                                              \
2532     break;                                                         \
2533 case N ## nAmE:                                                    \
2534     if (flags & SCF_DO_STCLASS_AND) {                              \
2535             for (value = 0; value < 256; value++)                   \
2536                 if (is_ ## nAmE ## _cp(value))                         \
2537                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2538     }                                                               \
2539     else {                                                          \
2540             for (value = 0; value < 256; value++)                   \
2541                 if (!is_ ## nAmE ## _cp(value))                        \
2542                     ANYOF_BITMAP_SET(data->start_class, value);     \
2543     }                                                               \
2544     break
2545
2546
2547
2548 STATIC I32
2549 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2550                         I32 *minlenp, I32 *deltap,
2551                         regnode *last,
2552                         scan_data_t *data,
2553                         I32 stopparen,
2554                         U8* recursed,
2555                         struct regnode_charclass_class *and_withp,
2556                         U32 flags, U32 depth)
2557                         /* scanp: Start here (read-write). */
2558                         /* deltap: Write maxlen-minlen here. */
2559                         /* last: Stop before this one. */
2560                         /* data: string data about the pattern */
2561                         /* stopparen: treat close N as END */
2562                         /* recursed: which subroutines have we recursed into */
2563                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2564 {
2565     dVAR;
2566     I32 min = 0, pars = 0, code;
2567     regnode *scan = *scanp, *next;
2568     I32 delta = 0;
2569     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2570     int is_inf_internal = 0;            /* The studied chunk is infinite */
2571     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2572     scan_data_t data_fake;
2573     SV *re_trie_maxbuff = NULL;
2574     regnode *first_non_open = scan;
2575     I32 stopmin = I32_MAX;
2576     scan_frame *frame = NULL;
2577     GET_RE_DEBUG_FLAGS_DECL;
2578
2579     PERL_ARGS_ASSERT_STUDY_CHUNK;
2580
2581 #ifdef DEBUGGING
2582     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2583 #endif
2584
2585     if ( depth == 0 ) {
2586         while (first_non_open && OP(first_non_open) == OPEN)
2587             first_non_open=regnext(first_non_open);
2588     }
2589
2590
2591   fake_study_recurse:
2592     while ( scan && OP(scan) != END && scan < last ){
2593         /* Peephole optimizer: */
2594         DEBUG_STUDYDATA("Peep:", data,depth);
2595         DEBUG_PEEP("Peep",scan,depth);
2596         JOIN_EXACT(scan,&min,0);
2597
2598         /* Follow the next-chain of the current node and optimize
2599            away all the NOTHINGs from it.  */
2600         if (OP(scan) != CURLYX) {
2601             const int max = (reg_off_by_arg[OP(scan)]
2602                        ? I32_MAX
2603                        /* I32 may be smaller than U16 on CRAYs! */
2604                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2605             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2606             int noff;
2607             regnode *n = scan;
2608
2609             /* Skip NOTHING and LONGJMP. */
2610             while ((n = regnext(n))
2611                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2612                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2613                    && off + noff < max)
2614                 off += noff;
2615             if (reg_off_by_arg[OP(scan)])
2616                 ARG(scan) = off;
2617             else
2618                 NEXT_OFF(scan) = off;
2619         }
2620
2621
2622
2623         /* The principal pseudo-switch.  Cannot be a switch, since we
2624            look into several different things.  */
2625         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2626                    || OP(scan) == IFTHEN) {
2627             next = regnext(scan);
2628             code = OP(scan);
2629             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2630
2631             if (OP(next) == code || code == IFTHEN) {
2632                 /* NOTE - There is similar code to this block below for handling
2633                    TRIE nodes on a re-study.  If you change stuff here check there
2634                    too. */
2635                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2636                 struct regnode_charclass_class accum;
2637                 regnode * const startbranch=scan;
2638
2639                 if (flags & SCF_DO_SUBSTR)
2640                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2641                 if (flags & SCF_DO_STCLASS)
2642                     cl_init_zero(pRExC_state, &accum);
2643
2644                 while (OP(scan) == code) {
2645                     I32 deltanext, minnext, f = 0, fake;
2646                     struct regnode_charclass_class this_class;
2647
2648                     num++;
2649                     data_fake.flags = 0;
2650                     if (data) {
2651                         data_fake.whilem_c = data->whilem_c;
2652                         data_fake.last_closep = data->last_closep;
2653                     }
2654                     else
2655                         data_fake.last_closep = &fake;
2656
2657                     data_fake.pos_delta = delta;
2658                     next = regnext(scan);
2659                     scan = NEXTOPER(scan);
2660                     if (code != BRANCH)
2661                         scan = NEXTOPER(scan);
2662                     if (flags & SCF_DO_STCLASS) {
2663                         cl_init(pRExC_state, &this_class);
2664                         data_fake.start_class = &this_class;
2665                         f = SCF_DO_STCLASS_AND;
2666                     }
2667                     if (flags & SCF_WHILEM_VISITED_POS)
2668                         f |= SCF_WHILEM_VISITED_POS;
2669
2670                     /* we suppose the run is continuous, last=next...*/
2671                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2672                                           next, &data_fake,
2673                                           stopparen, recursed, NULL, f,depth+1);
2674                     if (min1 > minnext)
2675                         min1 = minnext;
2676                     if (max1 < minnext + deltanext)
2677                         max1 = minnext + deltanext;
2678                     if (deltanext == I32_MAX)
2679                         is_inf = is_inf_internal = 1;
2680                     scan = next;
2681                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2682                         pars++;
2683                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2684                         if ( stopmin > minnext)
2685                             stopmin = min + min1;
2686                         flags &= ~SCF_DO_SUBSTR;
2687                         if (data)
2688                             data->flags |= SCF_SEEN_ACCEPT;
2689                     }
2690                     if (data) {
2691                         if (data_fake.flags & SF_HAS_EVAL)
2692                             data->flags |= SF_HAS_EVAL;
2693                         data->whilem_c = data_fake.whilem_c;
2694                     }
2695                     if (flags & SCF_DO_STCLASS)
2696                         cl_or(pRExC_state, &accum, &this_class);
2697                 }
2698                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2699                     min1 = 0;
2700                 if (flags & SCF_DO_SUBSTR) {
2701                     data->pos_min += min1;
2702                     data->pos_delta += max1 - min1;
2703                     if (max1 != min1 || is_inf)
2704                         data->longest = &(data->longest_float);
2705                 }
2706                 min += min1;
2707                 delta += max1 - min1;
2708                 if (flags & SCF_DO_STCLASS_OR) {
2709                     cl_or(pRExC_state, data->start_class, &accum);
2710                     if (min1) {
2711                         cl_and(data->start_class, and_withp);
2712                         flags &= ~SCF_DO_STCLASS;
2713                     }
2714                 }
2715                 else if (flags & SCF_DO_STCLASS_AND) {
2716                     if (min1) {
2717                         cl_and(data->start_class, &accum);
2718                         flags &= ~SCF_DO_STCLASS;
2719                     }
2720                     else {
2721                         /* Switch to OR mode: cache the old value of
2722                          * data->start_class */
2723                         INIT_AND_WITHP;
2724                         StructCopy(data->start_class, and_withp,
2725                                    struct regnode_charclass_class);
2726                         flags &= ~SCF_DO_STCLASS_AND;
2727                         StructCopy(&accum, data->start_class,
2728                                    struct regnode_charclass_class);
2729                         flags |= SCF_DO_STCLASS_OR;
2730                         data->start_class->flags |= ANYOF_EOS;
2731                     }
2732                 }
2733
2734                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2735                 /* demq.
2736
2737                    Assuming this was/is a branch we are dealing with: 'scan' now
2738                    points at the item that follows the branch sequence, whatever
2739                    it is. We now start at the beginning of the sequence and look
2740                    for subsequences of
2741
2742                    BRANCH->EXACT=>x1
2743                    BRANCH->EXACT=>x2
2744                    tail
2745
2746                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2747
2748                    If we can find such a subseqence we need to turn the first
2749                    element into a trie and then add the subsequent branch exact
2750                    strings to the trie.
2751
2752                    We have two cases
2753
2754                      1. patterns where the whole set of branch can be converted.
2755
2756                      2. patterns where only a subset can be converted.
2757
2758                    In case 1 we can replace the whole set with a single regop
2759                    for the trie. In case 2 we need to keep the start and end
2760                    branchs so
2761
2762                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2763                      becomes BRANCH TRIE; BRANCH X;
2764
2765                   There is an additional case, that being where there is a
2766                   common prefix, which gets split out into an EXACT like node
2767                   preceding the TRIE node.
2768
2769                   If x(1..n)==tail then we can do a simple trie, if not we make
2770                   a "jump" trie, such that when we match the appropriate word
2771                   we "jump" to the appopriate tail node. Essentailly we turn
2772                   a nested if into a case structure of sorts.
2773
2774                 */
2775
2776                     int made=0;
2777                     if (!re_trie_maxbuff) {
2778                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2779                         if (!SvIOK(re_trie_maxbuff))
2780                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2781                     }
2782                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2783                         regnode *cur;
2784                         regnode *first = (regnode *)NULL;
2785                         regnode *last = (regnode *)NULL;
2786                         regnode *tail = scan;
2787                         U8 optype = 0;
2788                         U32 count=0;
2789
2790 #ifdef DEBUGGING
2791                         SV * const mysv = sv_newmortal();       /* for dumping */
2792 #endif
2793                         /* var tail is used because there may be a TAIL
2794                            regop in the way. Ie, the exacts will point to the
2795                            thing following the TAIL, but the last branch will
2796                            point at the TAIL. So we advance tail. If we
2797                            have nested (?:) we may have to move through several
2798                            tails.
2799                          */
2800
2801                         while ( OP( tail ) == TAIL ) {
2802                             /* this is the TAIL generated by (?:) */
2803                             tail = regnext( tail );
2804                         }
2805
2806
2807                         DEBUG_OPTIMISE_r({
2808                             regprop(RExC_rx, mysv, tail );
2809                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2810                                 (int)depth * 2 + 2, "",
2811                                 "Looking for TRIE'able sequences. Tail node is: ",
2812                                 SvPV_nolen_const( mysv )
2813                             );
2814                         });
2815
2816                         /*
2817
2818                            step through the branches, cur represents each
2819                            branch, noper is the first thing to be matched
2820                            as part of that branch and noper_next is the
2821                            regnext() of that node. if noper is an EXACT
2822                            and noper_next is the same as scan (our current
2823                            position in the regex) then the EXACT branch is
2824                            a possible optimization target. Once we have
2825                            two or more consequetive such branches we can
2826                            create a trie of the EXACT's contents and stich
2827                            it in place. If the sequence represents all of
2828                            the branches we eliminate the whole thing and
2829                            replace it with a single TRIE. If it is a
2830                            subsequence then we need to stitch it in. This
2831                            means the first branch has to remain, and needs
2832                            to be repointed at the item on the branch chain
2833                            following the last branch optimized. This could
2834                            be either a BRANCH, in which case the
2835                            subsequence is internal, or it could be the
2836                            item following the branch sequence in which
2837                            case the subsequence is at the end.
2838
2839                         */
2840
2841                         /* dont use tail as the end marker for this traverse */
2842                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2843                             regnode * const noper = NEXTOPER( cur );
2844 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2845                             regnode * const noper_next = regnext( noper );
2846 #endif
2847
2848                             DEBUG_OPTIMISE_r({
2849                                 regprop(RExC_rx, mysv, cur);
2850                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2851                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2852
2853                                 regprop(RExC_rx, mysv, noper);
2854                                 PerlIO_printf( Perl_debug_log, " -> %s",
2855                                     SvPV_nolen_const(mysv));
2856
2857                                 if ( noper_next ) {
2858                                   regprop(RExC_rx, mysv, noper_next );
2859                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2860                                     SvPV_nolen_const(mysv));
2861                                 }
2862                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2863                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2864                             });
2865                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2866                                          : PL_regkind[ OP( noper ) ] == EXACT )
2867                                   || OP(noper) == NOTHING )
2868 #ifdef NOJUMPTRIE
2869                                   && noper_next == tail
2870 #endif
2871                                   && count < U16_MAX)
2872                             {
2873                                 count++;
2874                                 if ( !first || optype == NOTHING ) {
2875                                     if (!first) first = cur;
2876                                     optype = OP( noper );
2877                                 } else {
2878                                     last = cur;
2879                                 }
2880                             } else {
2881 /*
2882     Currently we do not believe that the trie logic can
2883     handle case insensitive matching properly when the
2884     pattern is not unicode (thus forcing unicode semantics).
2885
2886     If/when this is fixed the following define can be swapped
2887     in below to fully enable trie logic.
2888
2889 #define TRIE_TYPE_IS_SAFE 1
2890
2891 */
2892 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2893
2894                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2895                                     make_trie( pRExC_state,
2896                                             startbranch, first, cur, tail, count,
2897                                             optype, depth+1 );
2898                                 }
2899                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2900 #ifdef NOJUMPTRIE
2901                                      && noper_next == tail
2902 #endif
2903                                 ){
2904                                     count = 1;
2905                                     first = cur;
2906                                     optype = OP( noper );
2907                                 } else {
2908                                     count = 0;
2909                                     first = NULL;
2910                                     optype = 0;
2911                                 }
2912                                 last = NULL;
2913                             }
2914                         }
2915                         DEBUG_OPTIMISE_r({
2916                             regprop(RExC_rx, mysv, cur);
2917                             PerlIO_printf( Perl_debug_log,
2918                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2919                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2920
2921                         });
2922
2923                         if ( last && TRIE_TYPE_IS_SAFE ) {
2924                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2925 #ifdef TRIE_STUDY_OPT
2926                             if ( ((made == MADE_EXACT_TRIE &&
2927                                  startbranch == first)
2928                                  || ( first_non_open == first )) &&
2929                                  depth==0 ) {
2930                                 flags |= SCF_TRIE_RESTUDY;
2931                                 if ( startbranch == first
2932                                      && scan == tail )
2933                                 {
2934                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2935                                 }
2936                             }
2937 #endif
2938                         }
2939                     }
2940
2941                 } /* do trie */
2942
2943             }
2944             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2945                 scan = NEXTOPER(NEXTOPER(scan));
2946             } else                      /* single branch is optimized. */
2947                 scan = NEXTOPER(scan);
2948             continue;
2949         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2950             scan_frame *newframe = NULL;
2951             I32 paren;
2952             regnode *start;
2953             regnode *end;
2954
2955             if (OP(scan) != SUSPEND) {
2956             /* set the pointer */
2957                 if (OP(scan) == GOSUB) {
2958                     paren = ARG(scan);
2959                     RExC_recurse[ARG2L(scan)] = scan;
2960                     start = RExC_open_parens[paren-1];
2961                     end   = RExC_close_parens[paren-1];
2962                 } else {
2963                     paren = 0;
2964                     start = RExC_rxi->program + 1;
2965                     end   = RExC_opend;
2966                 }
2967                 if (!recursed) {
2968                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2969                     SAVEFREEPV(recursed);
2970                 }
2971                 if (!PAREN_TEST(recursed,paren+1)) {
2972                     PAREN_SET(recursed,paren+1);
2973                     Newx(newframe,1,scan_frame);
2974                 } else {
2975                     if (flags & SCF_DO_SUBSTR) {
2976                         SCAN_COMMIT(pRExC_state,data,minlenp);
2977                         data->longest = &(data->longest_float);
2978                     }
2979                     is_inf = is_inf_internal = 1;
2980                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2981                         cl_anything(pRExC_state, data->start_class);
2982                     flags &= ~SCF_DO_STCLASS;
2983                 }
2984             } else {
2985                 Newx(newframe,1,scan_frame);
2986                 paren = stopparen;
2987                 start = scan+2;
2988                 end = regnext(scan);
2989             }
2990             if (newframe) {
2991                 assert(start);
2992                 assert(end);
2993                 SAVEFREEPV(newframe);
2994                 newframe->next = regnext(scan);
2995                 newframe->last = last;
2996                 newframe->stop = stopparen;
2997                 newframe->prev = frame;
2998
2999                 frame = newframe;
3000                 scan =  start;
3001                 stopparen = paren;
3002                 last = end;
3003
3004                 continue;
3005             }
3006         }
3007         else if (OP(scan) == EXACT) {
3008             I32 l = STR_LEN(scan);
3009             UV uc;
3010             if (UTF) {
3011                 const U8 * const s = (U8*)STRING(scan);
3012                 l = utf8_length(s, s + l);
3013                 uc = utf8_to_uvchr(s, NULL);
3014             } else {
3015                 uc = *((U8*)STRING(scan));
3016             }
3017             min += l;
3018             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3019                 /* The code below prefers earlier match for fixed
3020                    offset, later match for variable offset.  */
3021                 if (data->last_end == -1) { /* Update the start info. */
3022                     data->last_start_min = data->pos_min;
3023                     data->last_start_max = is_inf
3024                         ? I32_MAX : data->pos_min + data->pos_delta;
3025                 }
3026                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3027                 if (UTF)
3028                     SvUTF8_on(data->last_found);
3029                 {
3030                     SV * const sv = data->last_found;
3031                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3032                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3033                     if (mg && mg->mg_len >= 0)
3034                         mg->mg_len += utf8_length((U8*)STRING(scan),
3035                                                   (U8*)STRING(scan)+STR_LEN(scan));
3036                 }
3037                 data->last_end = data->pos_min + l;
3038                 data->pos_min += l; /* As in the first entry. */
3039                 data->flags &= ~SF_BEFORE_EOL;
3040             }
3041             if (flags & SCF_DO_STCLASS_AND) {
3042                 /* Check whether it is compatible with what we know already! */
3043                 int compat = 1;
3044
3045                 if (uc >= 0x100 ||
3046                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3047                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3048                     && (!(data->start_class->flags & ANYOF_FOLD)
3049                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3050                     )
3051                     compat = 0;
3052                 ANYOF_CLASS_ZERO(data->start_class);
3053                 ANYOF_BITMAP_ZERO(data->start_class);
3054                 if (compat)
3055                     ANYOF_BITMAP_SET(data->start_class, uc);
3056                 data->start_class->flags &= ~ANYOF_EOS;
3057                 if (uc < 0x100)
3058                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3059             }
3060             else if (flags & SCF_DO_STCLASS_OR) {
3061                 /* false positive possible if the class is case-folded */
3062                 if (uc < 0x100)
3063                     ANYOF_BITMAP_SET(data->start_class, uc);
3064                 else
3065                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3066                 data->start_class->flags &= ~ANYOF_EOS;
3067                 cl_and(data->start_class, and_withp);
3068             }
3069             flags &= ~SCF_DO_STCLASS;
3070         }
3071         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3072             I32 l = STR_LEN(scan);
3073             UV uc = *((U8*)STRING(scan));
3074
3075             /* Search for fixed substrings supports EXACT only. */
3076             if (flags & SCF_DO_SUBSTR) {
3077                 assert(data);
3078                 SCAN_COMMIT(pRExC_state, data, minlenp);
3079             }
3080             if (UTF) {
3081                 const U8 * const s = (U8 *)STRING(scan);
3082                 l = utf8_length(s, s + l);
3083                 uc = utf8_to_uvchr(s, NULL);
3084             }
3085             min += l;
3086             if (flags & SCF_DO_SUBSTR)
3087                 data->pos_min += l;
3088             if (flags & SCF_DO_STCLASS_AND) {
3089                 /* Check whether it is compatible with what we know already! */
3090                 int compat = 1;
3091
3092                 if (uc >= 0x100 ||
3093                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3094                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3095                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3096                     compat = 0;
3097                 ANYOF_CLASS_ZERO(data->start_class);
3098                 ANYOF_BITMAP_ZERO(data->start_class);
3099                 if (compat) {
3100                     ANYOF_BITMAP_SET(data->start_class, uc);
3101                     data->start_class->flags &= ~ANYOF_EOS;
3102                     data->start_class->flags |= ANYOF_FOLD;
3103                     if (OP(scan) == EXACTFL)
3104                         data->start_class->flags |= ANYOF_LOCALE;
3105                 }
3106             }
3107             else if (flags & SCF_DO_STCLASS_OR) {
3108                 if (data->start_class->flags & ANYOF_FOLD) {
3109                     /* false positive possible if the class is case-folded.
3110                        Assume that the locale settings are the same... */
3111                     if (uc < 0x100)
3112                         ANYOF_BITMAP_SET(data->start_class, uc);
3113                     data->start_class->flags &= ~ANYOF_EOS;
3114                 }
3115                 cl_and(data->start_class, and_withp);
3116             }
3117             flags &= ~SCF_DO_STCLASS;
3118         }
3119         else if (strchr((const char*)PL_varies,OP(scan))) {
3120             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3121             I32 f = flags, pos_before = 0;
3122             regnode * const oscan = scan;
3123             struct regnode_charclass_class this_class;
3124             struct regnode_charclass_class *oclass = NULL;
3125             I32 next_is_eval = 0;
3126
3127             switch (PL_regkind[OP(scan)]) {
3128             case WHILEM:                /* End of (?:...)* . */
3129                 scan = NEXTOPER(scan);
3130                 goto finish;
3131             case PLUS:
3132                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3133                     next = NEXTOPER(scan);
3134                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3135                         mincount = 1;
3136                         maxcount = REG_INFTY;
3137                         next = regnext(scan);
3138                         scan = NEXTOPER(scan);
3139                         goto do_curly;
3140                     }
3141                 }
3142                 if (flags & SCF_DO_SUBSTR)
3143                     data->pos_min++;
3144                 min++;
3145                 /* Fall through. */
3146             case STAR:
3147                 if (flags & SCF_DO_STCLASS) {
3148                     mincount = 0;
3149                     maxcount = REG_INFTY;
3150                     next = regnext(scan);
3151                     scan = NEXTOPER(scan);
3152                     goto do_curly;
3153                 }
3154                 is_inf = is_inf_internal = 1;
3155                 scan = regnext(scan);
3156                 if (flags & SCF_DO_SUBSTR) {
3157                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3158                     data->longest = &(data->longest_float);
3159                 }
3160                 goto optimize_curly_tail;
3161             case CURLY:
3162                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3163                     && (scan->flags == stopparen))
3164                 {
3165                     mincount = 1;
3166                     maxcount = 1;
3167                 } else {
3168                     mincount = ARG1(scan);
3169                     maxcount = ARG2(scan);
3170                 }
3171                 next = regnext(scan);
3172                 if (OP(scan) == CURLYX) {
3173                     I32 lp = (data ? *(data->last_closep) : 0);
3174                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3175                 }
3176                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3177                 next_is_eval = (OP(scan) == EVAL);
3178               do_curly:
3179                 if (flags & SCF_DO_SUBSTR) {
3180                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3181                     pos_before = data->pos_min;
3182                 }
3183                 if (data) {
3184                     fl = data->flags;
3185                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3186                     if (is_inf)
3187                         data->flags |= SF_IS_INF;
3188                 }
3189                 if (flags & SCF_DO_STCLASS) {
3190                     cl_init(pRExC_state, &this_class);
3191                     oclass = data->start_class;
3192                     data->start_class = &this_class;
3193                     f |= SCF_DO_STCLASS_AND;
3194                     f &= ~SCF_DO_STCLASS_OR;
3195                 }
3196                 /* These are the cases when once a subexpression
3197                    fails at a particular position, it cannot succeed
3198                    even after backtracking at the enclosing scope.
3199
3200                    XXXX what if minimal match and we are at the
3201                         initial run of {n,m}? */
3202                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3203                     f &= ~SCF_WHILEM_VISITED_POS;
3204
3205                 /* This will finish on WHILEM, setting scan, or on NULL: */
3206                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3207                                       last, data, stopparen, recursed, NULL,
3208                                       (mincount == 0
3209                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3210
3211                 if (flags & SCF_DO_STCLASS)
3212                     data->start_class = oclass;
3213                 if (mincount == 0 || minnext == 0) {
3214                     if (flags & SCF_DO_STCLASS_OR) {
3215                         cl_or(pRExC_state, data->start_class, &this_class);
3216                     }
3217                     else if (flags & SCF_DO_STCLASS_AND) {
3218                         /* Switch to OR mode: cache the old value of
3219                          * data->start_class */
3220                         INIT_AND_WITHP;
3221                         StructCopy(data->start_class, and_withp,
3222                                    struct regnode_charclass_class);
3223                         flags &= ~SCF_DO_STCLASS_AND;
3224                         StructCopy(&this_class, data->start_class,
3225                                    struct regnode_charclass_class);
3226                         flags |= SCF_DO_STCLASS_OR;
3227                         data->start_class->flags |= ANYOF_EOS;
3228                     }
3229                 } else {                /* Non-zero len */
3230                     if (flags & SCF_DO_STCLASS_OR) {
3231                         cl_or(pRExC_state, data->start_class, &this_class);
3232                         cl_and(data->start_class, and_withp);
3233                     }
3234                     else if (flags & SCF_DO_STCLASS_AND)
3235                         cl_and(data->start_class, &this_class);
3236                     flags &= ~SCF_DO_STCLASS;
3237                 }
3238                 if (!scan)              /* It was not CURLYX, but CURLY. */
3239                     scan = next;
3240                 if ( /* ? quantifier ok, except for (?{ ... }) */
3241                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3242                     && (minnext == 0) && (deltanext == 0)
3243                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3244                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3245                 {
3246                     ckWARNreg(RExC_parse,
3247                               "Quantifier unexpected on zero-length expression");
3248                 }
3249
3250                 min += minnext * mincount;
3251                 is_inf_internal |= ((maxcount == REG_INFTY
3252                                      && (minnext + deltanext) > 0)
3253                                     || deltanext == I32_MAX);
3254                 is_inf |= is_inf_internal;
3255                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3256
3257                 /* Try powerful optimization CURLYX => CURLYN. */
3258                 if (  OP(oscan) == CURLYX && data
3259                       && data->flags & SF_IN_PAR
3260                       && !(data->flags & SF_HAS_EVAL)
3261                       && !deltanext && minnext == 1 ) {
3262                     /* Try to optimize to CURLYN.  */
3263                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3264                     regnode * const nxt1 = nxt;
3265 #ifdef DEBUGGING
3266                     regnode *nxt2;
3267 #endif
3268
3269                     /* Skip open. */
3270                     nxt = regnext(nxt);
3271                     if (!strchr((const char*)PL_simple,OP(nxt))
3272                         && !(PL_regkind[OP(nxt)] == EXACT
3273                              && STR_LEN(nxt) == 1))
3274                         goto nogo;
3275 #ifdef DEBUGGING
3276                     nxt2 = nxt;
3277 #endif
3278                     nxt = regnext(nxt);
3279                     if (OP(nxt) != CLOSE)
3280                         goto nogo;
3281                     if (RExC_open_parens) {
3282                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3283                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3284                     }
3285                     /* Now we know that nxt2 is the only contents: */
3286                     oscan->flags = (U8)ARG(nxt);
3287                     OP(oscan) = CURLYN;
3288                     OP(nxt1) = NOTHING; /* was OPEN. */
3289
3290 #ifdef DEBUGGING
3291                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3292                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3293                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3294                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3295                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3296                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3297 #endif
3298                 }
3299               nogo:
3300
3301                 /* Try optimization CURLYX => CURLYM. */
3302                 if (  OP(oscan) == CURLYX && data
3303                       && !(data->flags & SF_HAS_PAR)
3304                       && !(data->flags & SF_HAS_EVAL)
3305                       && !deltanext     /* atom is fixed width */
3306                       && minnext != 0   /* CURLYM can't handle zero width */
3307                 ) {
3308                     /* XXXX How to optimize if data == 0? */
3309                     /* Optimize to a simpler form.  */
3310                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3311                     regnode *nxt2;
3312
3313                     OP(oscan) = CURLYM;
3314                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3315                             && (OP(nxt2) != WHILEM))
3316                         nxt = nxt2;
3317                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3318                     /* Need to optimize away parenths. */
3319                     if (data->flags & SF_IN_PAR) {
3320                         /* Set the parenth number.  */
3321                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3322
3323                         if (OP(nxt) != CLOSE)
3324                             FAIL("Panic opt close");
3325                         oscan->flags = (U8)ARG(nxt);
3326                         if (RExC_open_parens) {
3327                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3328                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3329                         }
3330                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3331                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3332
3333 #ifdef DEBUGGING
3334                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3335                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3336                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3337                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3338 #endif
3339 #if 0
3340                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3341                             regnode *nnxt = regnext(nxt1);
3342
3343                             if (nnxt == nxt) {
3344                                 if (reg_off_by_arg[OP(nxt1)])
3345                                     ARG_SET(nxt1, nxt2 - nxt1);
3346                                 else if (nxt2 - nxt1 < U16_MAX)
3347                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3348                                 else
3349                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3350                             }
3351                             nxt1 = nnxt;
3352                         }
3353 #endif
3354                         /* Optimize again: */
3355                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3356                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3357                     }
3358                     else
3359                         oscan->flags = 0;
3360                 }
3361                 else if ((OP(oscan) == CURLYX)
3362                          && (flags & SCF_WHILEM_VISITED_POS)
3363                          /* See the comment on a similar expression above.
3364                             However, this time it not a subexpression
3365                             we care about, but the expression itself. */
3366                          && (maxcount == REG_INFTY)
3367                          && data && ++data->whilem_c < 16) {
3368                     /* This stays as CURLYX, we can put the count/of pair. */
3369                     /* Find WHILEM (as in regexec.c) */
3370                     regnode *nxt = oscan + NEXT_OFF(oscan);
3371
3372                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3373                         nxt += ARG(nxt);
3374                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3375                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3376                 }
3377                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3378                     pars++;
3379                 if (flags & SCF_DO_SUBSTR) {
3380                     SV *last_str = NULL;
3381                     int counted = mincount != 0;
3382
3383                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3384 #if defined(SPARC64_GCC_WORKAROUND)
3385                         I32 b = 0;
3386                         STRLEN l = 0;
3387                         const char *s = NULL;
3388                         I32 old = 0;
3389
3390                         if (pos_before >= data->last_start_min)
3391                             b = pos_before;
3392                         else
3393                             b = data->last_start_min;
3394
3395                         l = 0;
3396                         s = SvPV_const(data->last_found, l);
3397                         old = b - data->last_start_min;
3398
3399 #else
3400                         I32 b = pos_before >= data->last_start_min
3401                             ? pos_before : data->last_start_min;
3402                         STRLEN l;
3403                         const char * const s = SvPV_const(data->last_found, l);
3404                         I32 old = b - data->last_start_min;
3405 #endif
3406
3407                         if (UTF)
3408                             old = utf8_hop((U8*)s, old) - (U8*)s;
3409
3410                         l -= old;
3411                         /* Get the added string: */
3412                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3413                         if (deltanext == 0 && pos_before == b) {
3414                             /* What was added is a constant string */
3415                             if (mincount > 1) {
3416                                 SvGROW(last_str, (mincount * l) + 1);
3417                                 repeatcpy(SvPVX(last_str) + l,
3418                                           SvPVX_const(last_str), l, mincount - 1);
3419                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3420                                 /* Add additional parts. */
3421                                 SvCUR_set(data->last_found,
3422                                           SvCUR(data->last_found) - l);
3423                                 sv_catsv(data->last_found, last_str);
3424                                 {
3425                                     SV * sv = data->last_found;
3426                                     MAGIC *mg =
3427                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3428                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3429                                     if (mg && mg->mg_len >= 0)
3430                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3431                                 }
3432                                 data->last_end += l * (mincount - 1);
3433                             }
3434                         } else {
3435                             /* start offset must point into the last copy */
3436                             data->last_start_min += minnext * (mincount - 1);
3437                             data->last_start_max += is_inf ? I32_MAX
3438                                 : (maxcount - 1) * (minnext + data->pos_delta);
3439                         }
3440                     }
3441                     /* It is counted once already... */
3442                     data->pos_min += minnext * (mincount - counted);
3443                     data->pos_delta += - counted * deltanext +
3444                         (minnext + deltanext) * maxcount - minnext * mincount;
3445                     if (mincount != maxcount) {
3446                          /* Cannot extend fixed substrings found inside
3447                             the group.  */
3448                         SCAN_COMMIT(pRExC_state,data,minlenp);
3449                         if (mincount && last_str) {
3450                             SV * const sv = data->last_found;
3451                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3452                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3453
3454                             if (mg)
3455                                 mg->mg_len = -1;
3456                             sv_setsv(sv, last_str);
3457                             data->last_end = data->pos_min;
3458                             data->last_start_min =
3459                                 data->pos_min - CHR_SVLEN(last_str);
3460                             data->last_start_max = is_inf
3461                                 ? I32_MAX
3462                                 : data->pos_min + data->pos_delta
3463                                 - CHR_SVLEN(last_str);
3464                         }
3465                         data->longest = &(data->longest_float);
3466                     }
3467                     SvREFCNT_dec(last_str);
3468                 }
3469                 if (data && (fl & SF_HAS_EVAL))
3470                     data->flags |= SF_HAS_EVAL;
3471               optimize_curly_tail:
3472                 if (OP(oscan) != CURLYX) {
3473                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3474                            && NEXT_OFF(next))
3475                         NEXT_OFF(oscan) += NEXT_OFF(next);
3476                 }
3477                 continue;
3478             default:                    /* REF and CLUMP only? */
3479                 if (flags & SCF_DO_SUBSTR) {
3480                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3481                     data->longest = &(data->longest_float);
3482                 }
3483                 is_inf = is_inf_internal = 1;
3484                 if (flags & SCF_DO_STCLASS_OR)
3485                     cl_anything(pRExC_state, data->start_class);
3486                 flags &= ~SCF_DO_STCLASS;
3487                 break;
3488             }
3489         }
3490         else if (OP(scan) == LNBREAK) {
3491             if (flags & SCF_DO_STCLASS) {
3492                 int value = 0;
3493                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3494                 if (flags & SCF_DO_STCLASS_AND) {
3495                     for (value = 0; value < 256; value++)
3496                         if (!is_VERTWS_cp(value))
3497                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3498                 }
3499                 else {
3500                     for (value = 0; value < 256; value++)
3501                         if (is_VERTWS_cp(value))
3502                             ANYOF_BITMAP_SET(data->start_class, value);
3503                 }
3504                 if (flags & SCF_DO_STCLASS_OR)
3505                     cl_and(data->start_class, and_withp);
3506                 flags &= ~SCF_DO_STCLASS;
3507             }
3508             min += 1;
3509             delta += 1;
3510             if (flags & SCF_DO_SUBSTR) {
3511                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3512                 data->pos_min += 1;
3513                 data->pos_delta += 1;
3514                 data->longest = &(data->longest_float);
3515             }
3516
3517         }
3518         else if (OP(scan) == FOLDCHAR) {
3519             int d = ARG(scan)==0xDF ? 1 : 2;
3520             flags &= ~SCF_DO_STCLASS;
3521             min += 1;
3522             delta += d;
3523             if (flags & SCF_DO_SUBSTR) {
3524                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3525                 data->pos_min += 1;
3526                 data->pos_delta += d;
3527                 data->longest = &(data->longest_float);
3528             }
3529         }
3530         else if (strchr((const char*)PL_simple,OP(scan))) {
3531             int value = 0;
3532
3533             if (flags & SCF_DO_SUBSTR) {
3534                 SCAN_COMMIT(pRExC_state,data,minlenp);
3535                 data->pos_min++;
3536             }
3537             min++;
3538             if (flags & SCF_DO_STCLASS) {
3539                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3540
3541                 /* Some of the logic below assumes that switching
3542                    locale on will only add false positives. */
3543                 switch (PL_regkind[OP(scan)]) {
3544                 case SANY:
3545                 default:
3546                   do_default:
3547                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3548                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3549                         cl_anything(pRExC_state, data->start_class);
3550                     break;
3551                 case REG_ANY:
3552                     if (OP(scan) == SANY)
3553                         goto do_default;
3554                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3555                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3556                                  || (data->start_class->flags & ANYOF_CLASS));
3557                         cl_anything(pRExC_state, data->start_class);
3558                     }
3559                     if (flags & SCF_DO_STCLASS_AND || !value)
3560                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3561                     break;
3562                 case ANYOF:
3563                     if (flags & SCF_DO_STCLASS_AND)
3564                         cl_and(data->start_class,
3565                                (struct regnode_charclass_class*)scan);
3566                     else
3567                         cl_or(pRExC_state, data->start_class,
3568                               (struct regnode_charclass_class*)scan);
3569                     break;
3570                 case ALNUM:
3571                     if (flags & SCF_DO_STCLASS_AND) {
3572                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3573                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3574                             for (value = 0; value < 256; value++)
3575                                 if (!isALNUM(value))
3576                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3577                         }
3578                     }
3579                     else {
3580                         if (data->start_class->flags & ANYOF_LOCALE)
3581                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3582                         else {
3583                             for (value = 0; value < 256; value++)
3584                                 if (isALNUM(value))
3585                                     ANYOF_BITMAP_SET(data->start_class, value);
3586                         }
3587                     }
3588                     break;
3589                 case ALNUML:
3590                     if (flags & SCF_DO_STCLASS_AND) {
3591                         if (data->start_class->flags & ANYOF_LOCALE)
3592                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3593                     }
3594                     else {
3595                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3596                         data->start_class->flags |= ANYOF_LOCALE;
3597                     }
3598                     break;
3599                 case NALNUM:
3600                     if (flags & SCF_DO_STCLASS_AND) {
3601                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3602                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3603                             for (value = 0; value < 256; value++)
3604                                 if (isALNUM(value))
3605                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3606                         }
3607                     }
3608                     else {
3609                         if (data->start_class->flags & ANYOF_LOCALE)
3610                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3611                         else {
3612                             for (value = 0; value < 256; value++)
3613                                 if (!isALNUM(value))
3614                                     ANYOF_BITMAP_SET(data->start_class, value);
3615                         }
3616                     }
3617                     break;
3618                 case NALNUML:
3619                     if (flags & SCF_DO_STCLASS_AND) {
3620                         if (data->start_class->flags & ANYOF_LOCALE)
3621                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3622                     }
3623                     else {
3624                         data->start_class->flags |= ANYOF_LOCALE;
3625                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3626                     }
3627                     break;
3628                 case SPACE:
3629                     if (flags & SCF_DO_STCLASS_AND) {
3630                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3631                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3632                             for (value = 0; value < 256; value++)
3633                                 if (!isSPACE(value))
3634                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3635                         }
3636                     }
3637                     else {
3638                         if (data->start_class->flags & ANYOF_LOCALE)
3639                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3640                         else {
3641                             for (value = 0; value < 256; value++)
3642                                 if (isSPACE(value))
3643                                     ANYOF_BITMAP_SET(data->start_class, value);
3644                         }
3645                     }
3646                     break;
3647                 case SPACEL:
3648                     if (flags & SCF_DO_STCLASS_AND) {
3649                         if (data->start_class->flags & ANYOF_LOCALE)
3650                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3651                     }
3652                     else {
3653                         data->start_class->flags |= ANYOF_LOCALE;
3654                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3655                     }
3656                     break;
3657                 case NSPACE:
3658                     if (flags & SCF_DO_STCLASS_AND) {
3659                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3660                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3661                             for (value = 0; value < 256; value++)
3662                                 if (isSPACE(value))
3663                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3664                         }
3665                     }
3666                     else {
3667                         if (data->start_class->flags & ANYOF_LOCALE)
3668                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3669                         else {
3670                             for (value = 0; value < 256; value++)
3671                                 if (!isSPACE(value))
3672                                     ANYOF_BITMAP_SET(data->start_class, value);
3673                         }
3674                     }
3675                     break;
3676                 case NSPACEL:
3677                     if (flags & SCF_DO_STCLASS_AND) {
3678                         if (data->start_class->flags & ANYOF_LOCALE) {
3679                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3680                             for (value = 0; value < 256; value++)
3681                                 if (!isSPACE(value))
3682                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3683                         }
3684                     }
3685                     else {
3686                         data->start_class->flags |= ANYOF_LOCALE;
3687                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3688                     }
3689                     break;
3690                 case DIGIT:
3691                     if (flags & SCF_DO_STCLASS_AND) {
3692                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3693                         for (value = 0; value < 256; value++)
3694                             if (!isDIGIT(value))
3695                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3696                     }
3697                     else {
3698                         if (data->start_class->flags & ANYOF_LOCALE)
3699                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3700                         else {
3701                             for (value = 0; value < 256; value++)
3702                                 if (isDIGIT(value))
3703                                     ANYOF_BITMAP_SET(data->start_class, value);
3704                         }
3705                     }
3706                     break;
3707                 case NDIGIT:
3708                     if (flags & SCF_DO_STCLASS_AND) {
3709                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3710                         for (value = 0; value < 256; value++)
3711                             if (isDIGIT(value))
3712                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3713                     }
3714                     else {
3715                         if (data->start_class->flags & ANYOF_LOCALE)
3716                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3717                         else {
3718                             for (value = 0; value < 256; value++)
3719                                 if (!isDIGIT(value))
3720                                     ANYOF_BITMAP_SET(data->start_class, value);
3721                         }
3722                     }
3723                     break;
3724                 CASE_SYNST_FNC(VERTWS);
3725                 CASE_SYNST_FNC(HORIZWS);
3726
3727                 }
3728                 if (flags & SCF_DO_STCLASS_OR)
3729                     cl_and(data->start_class, and_withp);
3730                 flags &= ~SCF_DO_STCLASS;
3731             }
3732         }
3733         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3734             data->flags |= (OP(scan) == MEOL
3735                             ? SF_BEFORE_MEOL
3736                             : SF_BEFORE_SEOL);
3737         }
3738         else if (  PL_regkind[OP(scan)] == BRANCHJ
3739                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3740                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3741                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3742             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3743                 || OP(scan) == UNLESSM )
3744             {
3745                 /* Negative Lookahead/lookbehind
3746                    In this case we can't do fixed string optimisation.
3747                 */
3748
3749                 I32 deltanext, minnext, fake = 0;
3750                 regnode *nscan;
3751                 struct regnode_charclass_class intrnl;
3752                 int f = 0;
3753
3754                 data_fake.flags = 0;
3755                 if (data) {
3756                     data_fake.whilem_c = data->whilem_c;
3757                     data_fake.last_closep = data->last_closep;
3758                 }
3759                 else
3760                     data_fake.last_closep = &fake;
3761                 data_fake.pos_delta = delta;
3762                 if ( flags & SCF_DO_STCLASS && !scan->flags
3763                      && OP(scan) == IFMATCH ) { /* Lookahead */
3764                     cl_init(pRExC_state, &intrnl);
3765                     data_fake.start_class = &intrnl;
3766                     f |= SCF_DO_STCLASS_AND;
3767                 }
3768                 if (flags & SCF_WHILEM_VISITED_POS)
3769                     f |= SCF_WHILEM_VISITED_POS;
3770                 next = regnext(scan);
3771                 nscan = NEXTOPER(NEXTOPER(scan));
3772                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3773                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3774                 if (scan->flags) {
3775                     if (deltanext) {
3776                         FAIL("Variable length lookbehind not implemented");
3777                     }
3778                     else if (minnext > (I32)U8_MAX) {
3779                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3780                     }
3781                     scan->flags = (U8)minnext;
3782                 }
3783                 if (data) {
3784                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3785                         pars++;
3786                     if (data_fake.flags & SF_HAS_EVAL)
3787                         data->flags |= SF_HAS_EVAL;
3788                     data->whilem_c = data_fake.whilem_c;
3789                 }
3790                 if (f & SCF_DO_STCLASS_AND) {
3791                     if (flags & SCF_DO_STCLASS_OR) {
3792                         /* OR before, AND after: ideally we would recurse with
3793                          * data_fake to get the AND applied by study of the
3794                          * remainder of the pattern, and then derecurse;
3795                          * *** HACK *** for now just treat as "no information".
3796                          * See [perl #56690].
3797                          */
3798                         cl_init(pRExC_state, data->start_class);
3799                     }  else {
3800                         /* AND before and after: combine and continue */
3801                         const int was = (data->start_class->flags & ANYOF_EOS);
3802
3803                         cl_and(data->start_class, &intrnl);
3804                         if (was)
3805                             data->start_class->flags |= ANYOF_EOS;
3806                     }
3807                 }
3808             }
3809 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3810             else {
3811                 /* Positive Lookahead/lookbehind
3812                    In this case we can do fixed string optimisation,
3813                    but we must be careful about it. Note in the case of
3814                    lookbehind the positions will be offset by the minimum
3815                    length of the pattern, something we won't know about
3816                    until after the recurse.
3817                 */
3818                 I32 deltanext, fake = 0;
3819                 regnode *nscan;
3820                 struct regnode_charclass_class intrnl;
3821                 int f = 0;
3822                 /* We use SAVEFREEPV so that when the full compile
3823                     is finished perl will clean up the allocated
3824                     minlens when its all done. This was we don't
3825                     have to worry about freeing them when we know
3826                     they wont be used, which would be a pain.
3827                  */
3828                 I32 *minnextp;
3829                 Newx( minnextp, 1, I32 );
3830                 SAVEFREEPV(minnextp);
3831
3832                 if (data) {
3833                     StructCopy(data, &data_fake, scan_data_t);
3834                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3835                         f |= SCF_DO_SUBSTR;
3836                         if (scan->flags)
3837                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3838                         data_fake.last_found=newSVsv(data->last_found);
3839                     }
3840                 }
3841                 else
3842                     data_fake.last_closep = &fake;
3843                 data_fake.flags = 0;
3844                 data_fake.pos_delta = delta;
3845                 if (is_inf)
3846                     data_fake.flags |= SF_IS_INF;
3847                 if ( flags & SCF_DO_STCLASS && !scan->flags
3848                      && OP(scan) == IFMATCH ) { /* Lookahead */
3849                     cl_init(pRExC_state, &intrnl);
3850                     data_fake.start_class = &intrnl;
3851                     f |= SCF_DO_STCLASS_AND;
3852                 }
3853                 if (flags & SCF_WHILEM_VISITED_POS)
3854                     f |= SCF_WHILEM_VISITED_POS;
3855                 next = regnext(scan);
3856                 nscan = NEXTOPER(NEXTOPER(scan));
3857
3858                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3859                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3860                 if (scan->flags) {
3861                     if (deltanext) {
3862                         FAIL("Variable length lookbehind not implemented");
3863                     }
3864                     else if (*minnextp > (I32)U8_MAX) {
3865                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3866                     }
3867                     scan->flags = (U8)*minnextp;
3868                 }
3869
3870                 *minnextp += min;
3871
3872                 if (f & SCF_DO_STCLASS_AND) {
3873                     const int was = (data->start_class->flags & ANYOF_EOS);
3874
3875                     cl_and(data->start_class, &intrnl);
3876                     if (was)
3877                         data->start_class->flags |= ANYOF_EOS;
3878                 }
3879                 if (data) {
3880                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3881                         pars++;
3882                     if (data_fake.flags & SF_HAS_EVAL)
3883                         data->flags |= SF_HAS_EVAL;
3884                     data->whilem_c = data_fake.whilem_c;
3885                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3886                         if (RExC_rx->minlen<*minnextp)
3887                             RExC_rx->minlen=*minnextp;
3888                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3889                         SvREFCNT_dec(data_fake.last_found);
3890
3891                         if ( data_fake.minlen_fixed != minlenp )
3892                         {
3893                             data->offset_fixed= data_fake.offset_fixed;
3894                             data->minlen_fixed= data_fake.minlen_fixed;
3895                             data->lookbehind_fixed+= scan->flags;
3896                         }
3897                         if ( data_fake.minlen_float != minlenp )
3898                         {
3899                             data->minlen_float= data_fake.minlen_float;
3900                             data->offset_float_min=data_fake.offset_float_min;
3901                             data->offset_float_max=data_fake.offset_float_max;
3902                             data->lookbehind_float+= scan->flags;
3903                         }
3904                     }
3905                 }
3906
3907
3908             }
3909 #endif
3910         }
3911         else if (OP(scan) == OPEN) {
3912             if (stopparen != (I32)ARG(scan))
3913                 pars++;
3914         }
3915         else if (OP(scan) == CLOSE) {
3916             if (stopparen == (I32)ARG(scan)) {
3917                 break;
3918             }
3919             if ((I32)ARG(scan) == is_par) {
3920                 next = regnext(scan);
3921
3922                 if ( next && (OP(next) != WHILEM) && next < last)
3923                     is_par = 0;         /* Disable optimization */
3924             }
3925             if (data)
3926                 *(data->last_closep) = ARG(scan);
3927         }
3928         else if (OP(scan) == EVAL) {
3929                 if (data)
3930                     data->flags |= SF_HAS_EVAL;
3931         }
3932         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3933             if (flags & SCF_DO_SUBSTR) {
3934                 SCAN_COMMIT(pRExC_state,data,minlenp);
3935                 flags &= ~SCF_DO_SUBSTR;
3936             }
3937             if (data && OP(scan)==ACCEPT) {
3938                 data->flags |= SCF_SEEN_ACCEPT;
3939                 if (stopmin > min)
3940                     stopmin = min;
3941             }
3942         }
3943         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3944         {
3945                 if (flags & SCF_DO_SUBSTR) {
3946                     SCAN_COMMIT(pRExC_state,data,minlenp);
3947                     data->longest = &(data->longest_float);
3948                 }
3949                 is_inf = is_inf_internal = 1;
3950                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3951                     cl_anything(pRExC_state, data->start_class);
3952                 flags &= ~SCF_DO_STCLASS;
3953         }
3954         else if (OP(scan) == GPOS) {
3955             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3956                 !(delta || is_inf || (data && data->pos_delta)))
3957             {
3958                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3959                     RExC_rx->extflags |= RXf_ANCH_GPOS;
3960                 if (RExC_rx->gofs < (U32)min)
3961                     RExC_rx->gofs = min;
3962             } else {
3963                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3964                 RExC_rx->gofs = 0;
3965             }
3966         }
3967 #ifdef TRIE_STUDY_OPT
3968 #ifdef FULL_TRIE_STUDY
3969         else if (PL_regkind[OP(scan)] == TRIE) {
3970             /* NOTE - There is similar code to this block above for handling
3971                BRANCH nodes on the initial study.  If you change stuff here
3972                check there too. */
3973             regnode *trie_node= scan;
3974             regnode *tail= regnext(scan);
3975             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3976             I32 max1 = 0, min1 = I32_MAX;
3977             struct regnode_charclass_class accum;
3978
3979             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3980                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3981             if (flags & SCF_DO_STCLASS)
3982                 cl_init_zero(pRExC_state, &accum);
3983
3984             if (!trie->jump) {
3985                 min1= trie->minlen;
3986                 max1= trie->maxlen;
3987             } else {
3988                 const regnode *nextbranch= NULL;
3989                 U32 word;
3990
3991                 for ( word=1 ; word <= trie->wordcount ; word++)
3992                 {
3993                     I32 deltanext=0, minnext=0, f = 0, fake;
3994                     struct regnode_charclass_class this_class;
3995
3996                     data_fake.flags = 0;
3997                     if (data) {
3998                         data_fake.whilem_c = data->whilem_c;
3999                         data_fake.last_closep = data->last_closep;
4000                     }
4001                     else
4002                         data_fake.last_closep = &fake;
4003                     data_fake.pos_delta = delta;
4004                     if (flags & SCF_DO_STCLASS) {
4005                         cl_init(pRExC_state, &this_class);
4006                         data_fake.start_class = &this_class;
4007                         f = SCF_DO_STCLASS_AND;
4008                     }
4009                     if (flags & SCF_WHILEM_VISITED_POS)
4010                         f |= SCF_WHILEM_VISITED_POS;
4011
4012                     if (trie->jump[word]) {
4013                         if (!nextbranch)
4014                             nextbranch = trie_node + trie->jump[0];
4015                         scan= trie_node + trie->jump[word];
4016                         /* We go from the jump point to the branch that follows
4017                            it. Note this means we need the vestigal unused branches
4018                            even though they arent otherwise used.
4019                          */
4020                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4021                             &deltanext, (regnode *)nextbranch, &data_fake,
4022                             stopparen, recursed, NULL, f,depth+1);
4023                     }
4024                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4025                         nextbranch= regnext((regnode*)nextbranch);
4026
4027                     if (min1 > (I32)(minnext + trie->minlen))
4028                         min1 = minnext + trie->minlen;
4029                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4030                         max1 = minnext + deltanext + trie->maxlen;
4031                     if (deltanext == I32_MAX)
4032                         is_inf = is_inf_internal = 1;
4033
4034                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4035                         pars++;
4036                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4037                         if ( stopmin > min + min1)
4038                             stopmin = min + min1;
4039                         flags &= ~SCF_DO_SUBSTR;
4040                         if (data)
4041                             data->flags |= SCF_SEEN_ACCEPT;
4042                     }
4043                     if (data) {
4044                         if (data_fake.flags & SF_HAS_EVAL)
4045                             data->flags |= SF_HAS_EVAL;
4046                         data->whilem_c = data_fake.whilem_c;
4047                     }
4048                     if (flags & SCF_DO_STCLASS)
4049                         cl_or(pRExC_state, &accum, &this_class);
4050                 }
4051             }
4052             if (flags & SCF_DO_SUBSTR) {
4053                 data->pos_min += min1;
4054                 data->pos_delta += max1 - min1;
4055                 if (max1 != min1 || is_inf)
4056                     data->longest = &(data->longest_float);
4057             }
4058             min += min1;
4059             delta += max1 - min1;
4060             if (flags & SCF_DO_STCLASS_OR) {
4061                 cl_or(pRExC_state, data->start_class, &accum);
4062                 if (min1) {
4063                     cl_and(data->start_class, and_withp);
4064                     flags &= ~SCF_DO_STCLASS;
4065                 }
4066             }
4067             else if (flags & SCF_DO_STCLASS_AND) {
4068                 if (min1) {
4069                     cl_and(data->start_class, &accum);
4070                     flags &= ~SCF_DO_STCLASS;
4071                 }
4072                 else {
4073                     /* Switch to OR mode: cache the old value of
4074                      * data->start_class */
4075                     INIT_AND_WITHP;
4076                     StructCopy(data->start_class, and_withp,
4077                                struct regnode_charclass_class);
4078                     flags &= ~SCF_DO_STCLASS_AND;
4079                     StructCopy(&accum, data->start_class,
4080                                struct regnode_charclass_class);
4081                     flags |= SCF_DO_STCLASS_OR;
4082                     data->start_class->flags |= ANYOF_EOS;
4083                 }
4084             }
4085             scan= tail;
4086             continue;
4087         }
4088 #else
4089         else if (PL_regkind[OP(scan)] == TRIE) {
4090             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4091             U8*bang=NULL;
4092
4093             min += trie->minlen;
4094             delta += (trie->maxlen - trie->minlen);
4095             flags &= ~SCF_DO_STCLASS; /* xxx */
4096             if (flags & SCF_DO_SUBSTR) {
4097                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4098                 data->pos_min += trie->minlen;
4099                 data->pos_delta += (trie->maxlen - trie->minlen);
4100                 if (trie->maxlen != trie->minlen)
4101                     data->longest = &(data->longest_float);
4102             }
4103             if (trie->jump) /* no more substrings -- for now /grr*/
4104                 flags &= ~SCF_DO_SUBSTR;
4105         }
4106 #endif /* old or new */
4107 #endif /* TRIE_STUDY_OPT */
4108
4109         /* Else: zero-length, ignore. */
4110         scan = regnext(scan);
4111     }
4112     if (frame) {
4113         last = frame->last;
4114         scan = frame->next;
4115         stopparen = frame->stop;
4116         frame = frame->prev;
4117         goto fake_study_recurse;
4118     }
4119
4120   finish:
4121     assert(!frame);
4122     DEBUG_STUDYDATA("pre-fin:",data,depth);
4123
4124     *scanp = scan;
4125     *deltap = is_inf_internal ? I32_MAX : delta;
4126     if (flags & SCF_DO_SUBSTR && is_inf)
4127         data->pos_delta = I32_MAX - data->pos_min;
4128     if (is_par > (I32)U8_MAX)
4129         is_par = 0;
4130     if (is_par && pars==1 && data) {
4131         data->flags |= SF_IN_PAR;
4132         data->flags &= ~SF_HAS_PAR;
4133     }
4134     else if (pars && data) {
4135         data->flags |= SF_HAS_PAR;
4136         data->flags &= ~SF_IN_PAR;
4137     }
4138     if (flags & SCF_DO_STCLASS_OR)
4139         cl_and(data->start_class, and_withp);
4140     if (flags & SCF_TRIE_RESTUDY)
4141         data->flags |=  SCF_TRIE_RESTUDY;
4142
4143     DEBUG_STUDYDATA("post-fin:",data,depth);
4144
4145     return min < stopmin ? min : stopmin;
4146 }
4147
4148 STATIC U32
4149 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4150 {
4151     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4152
4153     PERL_ARGS_ASSERT_ADD_DATA;
4154
4155     Renewc(RExC_rxi->data,
4156            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4157            char, struct reg_data);
4158     if(count)
4159         Renew(RExC_rxi->data->what, count + n, U8);
4160     else
4161         Newx(RExC_rxi->data->what, n, U8);
4162     RExC_rxi->data->count = count + n;
4163     Copy(s, RExC_rxi->data->what + count, n, U8);
4164     return count;
4165 }
4166
4167 /*XXX: todo make this not included in a non debugging perl */
4168 #ifndef PERL_IN_XSUB_RE
4169 void
4170 Perl_reginitcolors(pTHX)
4171 {
4172     dVAR;
4173     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4174     if (s) {
4175         char *t = savepv(s);
4176         int i = 0;
4177         PL_colors[0] = t;
4178         while (++i < 6) {
4179             t = strchr(t, '\t');
4180             if (t) {
4181                 *t = '\0';
4182                 PL_colors[i] = ++t;
4183             }
4184             else
4185                 PL_colors[i] = t = (char *)"";
4186         }
4187     } else {
4188         int i = 0;
4189         while (i < 6)
4190             PL_colors[i++] = (char *)"";
4191     }
4192     PL_colorset = 1;
4193 }
4194 #endif
4195
4196
4197 #ifdef TRIE_STUDY_OPT
4198 #define CHECK_RESTUDY_GOTO                                  \
4199         if (                                                \
4200               (data.flags & SCF_TRIE_RESTUDY)               \
4201               && ! restudied++                              \
4202         )     goto reStudy
4203 #else
4204 #define CHECK_RESTUDY_GOTO
4205 #endif
4206
4207 /*
4208  - pregcomp - compile a regular expression into internal code
4209  *
4210  * We can't allocate space until we know how big the compiled form will be,
4211  * but we can't compile it (and thus know how big it is) until we've got a
4212  * place to put the code.  So we cheat:  we compile it twice, once with code
4213  * generation turned off and size counting turned on, and once "for real".
4214  * This also means that we don't allocate space until we are sure that the
4215  * thing really will compile successfully, and we never have to move the
4216  * code and thus invalidate pointers into it.  (Note that it has to be in
4217  * one piece because free() must be able to free it all.) [NB: not true in perl]
4218  *
4219  * Beware that the optimization-preparation code in here knows about some
4220  * of the structure of the compiled regexp.  [I'll say.]
4221  */
4222
4223
4224
4225 #ifndef PERL_IN_XSUB_RE
4226 #define RE_ENGINE_PTR &PL_core_reg_engine
4227 #else
4228 extern const struct regexp_engine my_reg_engine;
4229 #define RE_ENGINE_PTR &my_reg_engine
4230 #endif
4231
4232 #ifndef PERL_IN_XSUB_RE
4233 REGEXP *
4234 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4235 {
4236     dVAR;
4237     HV * const table = GvHV(PL_hintgv);
4238
4239     PERL_ARGS_ASSERT_PREGCOMP;
4240
4241     /* Dispatch a request to compile a regexp to correct
4242        regexp engine. */
4243     if (table) {
4244         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4245         GET_RE_DEBUG_FLAGS_DECL;
4246         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4247             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4248             DEBUG_COMPILE_r({
4249                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4250                     SvIV(*ptr));
4251             });
4252             return CALLREGCOMP_ENG(eng, pattern, flags);
4253         }
4254     }
4255     return Perl_re_compile(aTHX_ pattern, flags);
4256 }
4257 #endif
4258
4259 REGEXP *
4260 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4261 {
4262     dVAR;
4263     REGEXP *rx;
4264     struct regexp *r;
4265     register regexp_internal *ri;
4266     STRLEN plen;
4267     char  *exp = SvPV(pattern, plen);
4268     char* xend = exp + plen;
4269     regnode *scan;
4270     I32 flags;
4271     I32 minlen = 0;
4272     I32 sawplus = 0;
4273     I32 sawopen = 0;
4274     scan_data_t data;
4275     RExC_state_t RExC_state;
4276     RExC_state_t * const pRExC_state = &RExC_state;
4277 #ifdef TRIE_STUDY_OPT
4278     int restudied= 0;
4279     RExC_state_t copyRExC_state;
4280 #endif
4281     GET_RE_DEBUG_FLAGS_DECL;
4282
4283     PERL_ARGS_ASSERT_RE_COMPILE;
4284
4285     DEBUG_r(if (!PL_colorset) reginitcolors());
4286
4287     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4288
4289     DEBUG_COMPILE_r({
4290         SV *dsv= sv_newmortal();
4291         RE_PV_QUOTED_DECL(s, RExC_utf8,
4292             dsv, exp, plen, 60);
4293         PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4294                        PL_colors[4],PL_colors[5],s);
4295     });
4296
4297 redo_first_pass:
4298     RExC_precomp = exp;
4299     RExC_flags = pm_flags;
4300     RExC_sawback = 0;
4301
4302     RExC_seen = 0;
4303     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4304     RExC_seen_evals = 0;
4305     RExC_extralen = 0;
4306
4307     /* First pass: determine size, legality. */
4308     RExC_parse = exp;
4309     RExC_start = exp;
4310     RExC_end = xend;
4311     RExC_naughty = 0;
4312     RExC_npar = 1;
4313     RExC_nestroot = 0;
4314     RExC_size = 0L;
4315     RExC_emit = &PL_regdummy;
4316     RExC_whilem_seen = 0;
4317     RExC_open_parens = NULL;
4318     RExC_close_parens = NULL;
4319     RExC_opend = NULL;
4320     RExC_paren_names = NULL;
4321 #ifdef DEBUGGING
4322     RExC_paren_name_list = NULL;
4323 #endif
4324     RExC_recurse = NULL;
4325     RExC_recurse_count = 0;
4326
4327 #if 0 /* REGC() is (currently) a NOP at the first pass.
4328        * Clever compilers notice this and complain. --jhi */
4329     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4330 #endif
4331     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4332     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4333         RExC_precomp = NULL;
4334         return(NULL);
4335     }
4336     if (RExC_utf8 && !RExC_orig_utf8) {
4337         /* It's possible to write a regexp in ascii that represents Unicode
4338         codepoints outside of the byte range, such as via \x{100}. If we
4339         detect such a sequence we have to convert the entire pattern to utf8
4340         and then recompile, as our sizing calculation will have been based
4341         on 1 byte == 1 character, but we will need to use utf8 to encode
4342         at least some part of the pattern, and therefore must convert the whole
4343         thing.
4344         XXX: somehow figure out how to make this less expensive...
4345         -- dmq */
4346         STRLEN len = plen;
4347         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4348             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4349         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4350         xend = exp + len;
4351         RExC_orig_utf8 = RExC_utf8;
4352         SAVEFREEPV(exp);
4353         goto redo_first_pass;
4354     }
4355     DEBUG_PARSE_r({
4356         PerlIO_printf(Perl_debug_log,
4357             "Required size %"IVdf" nodes\n"
4358             "Starting second pass (creation)\n",
4359             (IV)RExC_size);
4360         RExC_lastnum=0;
4361         RExC_lastparse=NULL;
4362     });
4363     /* Small enough for pointer-storage convention?
4364        If extralen==0, this means that we will not need long jumps. */
4365     if (RExC_size >= 0x10000L && RExC_extralen)
4366         RExC_size += RExC_extralen;
4367     else
4368         RExC_extralen = 0;
4369     if (RExC_whilem_seen > 15)
4370         RExC_whilem_seen = 15;
4371
4372     /* Allocate space and zero-initialize. Note, the two step process
4373        of zeroing when in debug mode, thus anything assigned has to
4374        happen after that */
4375     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4376     r = (struct regexp*)SvANY(rx);
4377     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4378          char, regexp_internal);
4379     if ( r == NULL || ri == NULL )
4380         FAIL("Regexp out of space");
4381 #ifdef DEBUGGING
4382     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4383     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4384 #else
4385     /* bulk initialize base fields with 0. */
4386     Zero(ri, sizeof(regexp_internal), char);
4387 #endif
4388
4389     /* non-zero initialization begins here */
4390     RXi_SET( r, ri );
4391     r->engine= RE_ENGINE_PTR;
4392     r->extflags = pm_flags;
4393     {
4394         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4395         bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4396         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4397         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4398                             >> RXf_PMf_STD_PMMOD_SHIFT);
4399         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4400         char *p;
4401         const STRLEN wraplen = plen + has_minus + has_p + has_runon
4402             + (sizeof(STD_PAT_MODS) - 1)
4403             + (sizeof("(?:)") - 1);
4404
4405         p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4406         SvCUR_set(rx, wraplen);
4407         SvPOK_on(rx);
4408         SvFLAGS(rx) |= SvUTF8(pattern);
4409         *p++='('; *p++='?';
4410         if (has_p)
4411             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4412         {
4413             char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4414             char *colon = r + 1;
4415             char ch;
4416
4417             while((ch = *fptr++)) {
4418                 if(reganch & 1)
4419                     *p++ = ch;
4420                 else
4421                     *r-- = ch;
4422                 reganch >>= 1;
4423             }
4424             if(has_minus) {
4425                 *r = '-';
4426                 p = colon;
4427             }
4428         }
4429
4430         *p++ = ':';
4431         Copy(RExC_precomp, p, plen, char);
4432         assert ((RX_WRAPPED(rx) - p) < 16);
4433         r->pre_prefix = p - RX_WRAPPED(rx);
4434         p += plen;
4435         if (has_runon)
4436             *p++ = '\n';
4437         *p++ = ')';
4438         *p = 0;
4439     }
4440
4441     r->intflags = 0;
4442     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4443
4444     if (RExC_seen & REG_SEEN_RECURSE) {
4445         Newxz(RExC_open_parens, RExC_npar,regnode *);
4446         SAVEFREEPV(RExC_open_parens);
4447         Newxz(RExC_close_parens,RExC_npar,regnode *);
4448         SAVEFREEPV(RExC_close_parens);
4449     }
4450
4451     /* Useful during FAIL. */
4452 #ifdef RE_TRACK_PATTERN_OFFSETS
4453     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4454     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4455                           "%s %"UVuf" bytes for offset annotations.\n",
4456                           ri->u.offsets ? "Got" : "Couldn't get",
4457                           (UV)((2*RExC_size+1) * sizeof(U32))));
4458 #endif
4459     SetProgLen(ri,RExC_size);
4460     RExC_rx_sv = rx;
4461     RExC_rx = r;
4462     RExC_rxi = ri;
4463     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4464
4465     /* Second pass: emit code. */
4466     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4467     RExC_parse = exp;
4468     RExC_end = xend;
4469     RExC_naughty = 0;
4470     RExC_npar = 1;
4471     RExC_emit_start = ri->program;
4472     RExC_emit = ri->program;
4473     RExC_emit_bound = ri->program + RExC_size + 1;
4474
4475     /* Store the count of eval-groups for security checks: */
4476     RExC_rx->seen_evals = RExC_seen_evals;
4477     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4478     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4479         ReREFCNT_dec(rx);
4480         return(NULL);
4481     }
4482     /* XXXX To minimize changes to RE engine we always allocate
4483        3-units-long substrs field. */
4484     Newx(r->substrs, 1, struct reg_substr_data);
4485     if (RExC_recurse_count) {
4486         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4487         SAVEFREEPV(RExC_recurse);
4488     }
4489
4490 reStudy:
4491     r->minlen = minlen = sawplus = sawopen = 0;
4492     Zero(r->substrs, 1, struct reg_substr_data);
4493
4494 #ifdef TRIE_STUDY_OPT
4495     if (!restudied) {
4496         StructCopy(&zero_scan_data, &data, scan_data_t);
4497         copyRExC_state = RExC_state;
4498     } else {
4499         U32 seen=RExC_seen;
4500         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4501
4502         RExC_state = copyRExC_state;
4503         if (seen & REG_TOP_LEVEL_BRANCHES)
4504             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4505         else
4506             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4507         if (data.last_found) {
4508             SvREFCNT_dec(data.longest_fixed);
4509             SvREFCNT_dec(data.longest_float);
4510             SvREFCNT_dec(data.last_found);
4511         }
4512         StructCopy(&zero_scan_data, &data, scan_data_t);
4513     }
4514 #else
4515     StructCopy(&zero_scan_data, &data, scan_data_t);
4516 #endif
4517
4518     /* Dig out information for optimizations. */
4519     r->extflags = RExC_flags; /* was pm_op */
4520     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4521
4522     if (UTF)
4523         SvUTF8_on(rx);  /* Unicode in it? */
4524     ri->regstclass = NULL;
4525     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4526         r->intflags |= PREGf_NAUGHTY;
4527     scan = ri->program + 1;             /* First BRANCH. */
4528
4529     /* testing for BRANCH here tells us whether there is "must appear"
4530        data in the pattern. If there is then we can use it for optimisations */
4531     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4532         I32 fake;
4533         STRLEN longest_float_length, longest_fixed_length;
4534         struct regnode_charclass_class ch_class; /* pointed to by data */
4535         int stclass_flag;
4536         I32 last_close = 0; /* pointed to by data */
4537         regnode *first= scan;
4538         regnode *first_next= regnext(first);
4539
4540         /*
4541          * Skip introductions and multiplicators >= 1
4542          * so that we can extract the 'meat' of the pattern that must
4543          * match in the large if() sequence following.
4544          * NOTE that EXACT is NOT covered here, as it is normally
4545          * picked up by the optimiser separately.
4546          *
4547          * This is unfortunate as the optimiser isnt handling lookahead
4548          * properly currently.
4549          *
4550          */
4551         while ((OP(first) == OPEN && (sawopen = 1)) ||
4552                /* An OR of *one* alternative - should not happen now. */
4553             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4554             /* for now we can't handle lookbehind IFMATCH*/
4555             (OP(first) == IFMATCH && !first->flags) ||
4556             (OP(first) == PLUS) ||
4557             (OP(first) == MINMOD) ||
4558                /* An {n,m} with n>0 */
4559             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4560             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4561         {
4562                 /*
4563                  * the only op that could be a regnode is PLUS, all the rest
4564                  * will be regnode_1 or regnode_2.
4565                  *
4566                  */
4567                 if (OP(first) == PLUS)
4568                     sawplus = 1;
4569                 else
4570                     first += regarglen[OP(first)];
4571
4572                 first = NEXTOPER(first);
4573                 first_next= regnext(first);
4574         }
4575
4576         /* Starting-point info. */
4577       again:
4578         DEBUG_PEEP("first:",first,0);
4579         /* Ignore EXACT as we deal with it later. */
4580         if (PL_regkind[OP(first)] == EXACT) {
4581             if (OP(first) == EXACT)
4582                 NOOP;   /* Empty, get anchored substr later. */
4583             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4584                 ri->regstclass = first;
4585         }
4586 #ifdef TRIE_STCLASS
4587         else if (PL_regkind[OP(first)] == TRIE &&
4588                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4589         {
4590             regnode *trie_op;
4591             /* this can happen only on restudy */
4592             if ( OP(first) == TRIE ) {
4593                 struct regnode_1 *trieop = (struct regnode_1 *)
4594                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4595                 StructCopy(first,trieop,struct regnode_1);
4596                 trie_op=(regnode *)trieop;
4597             } else {
4598                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4599                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4600                 StructCopy(first,trieop,struct regnode_charclass);
4601                 trie_op=(regnode *)trieop;
4602             }
4603             OP(trie_op)+=2;
4604             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4605             ri->regstclass = trie_op;
4606         }
4607 #endif
4608         else if (strchr((const char*)PL_simple,OP(first)))
4609             ri->regstclass = first;
4610         else if (PL_regkind[OP(first)] == BOUND ||
4611                  PL_regkind[OP(first)] == NBOUND)
4612             ri->regstclass = first;
4613         else if (PL_regkind[OP(first)] == BOL) {
4614             r->extflags |= (OP(first) == MBOL
4615                            ? RXf_ANCH_MBOL
4616                            : (OP(first) == SBOL
4617                               ? RXf_ANCH_SBOL
4618                               : RXf_ANCH_BOL));
4619             first = NEXTOPER(first);
4620             goto again;
4621         }
4622         else if (OP(first) == GPOS) {
4623             r->extflags |= RXf_ANCH_GPOS;
4624             first = NEXTOPER(first);
4625             goto again;
4626         }
4627         else if ((!sawopen || !RExC_sawback) &&
4628             (OP(first) == STAR &&
4629             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4630             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4631         {
4632             /* turn .* into ^.* with an implied $*=1 */
4633             const int type =
4634                 (OP(NEXTOPER(first)) == REG_ANY)
4635                     ? RXf_ANCH_MBOL
4636                     : RXf_ANCH_SBOL;
4637             r->extflags |= type;
4638             r->intflags |= PREGf_IMPLICIT;
4639             first = NEXTOPER(first);
4640             goto again;
4641         }
4642         if (sawplus && (!sawopen || !RExC_sawback)
4643             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4644             /* x+ must match at the 1st pos of run of x's */
4645             r->intflags |= PREGf_SKIP;
4646
4647         /* Scan is after the zeroth branch, first is atomic matcher. */
4648 #ifdef TRIE_STUDY_OPT
4649         DEBUG_PARSE_r(
4650             if (!restudied)
4651                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4652                               (IV)(first - scan + 1))
4653         );
4654 #else
4655         DEBUG_PARSE_r(
4656             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4657                 (IV)(first - scan + 1))
4658         );
4659 #endif
4660
4661
4662         /*
4663         * If there's something expensive in the r.e., find the
4664         * longest literal string that must appear and make it the
4665         * regmust.  Resolve ties in favor of later strings, since
4666         * the regstart check works with the beginning of the r.e.
4667         * and avoiding duplication strengthens checking.  Not a
4668         * strong reason, but sufficient in the absence of others.
4669         * [Now we resolve ties in favor of the earlier string if
4670         * it happens that c_offset_min has been invalidated, since the
4671         * earlier string may buy us something the later one won't.]
4672         */
4673
4674         data.longest_fixed = newSVpvs("");
4675         data.longest_float = newSVpvs("");
4676         data.last_found = newSVpvs("");
4677         data.longest = &(data.longest_fixed);
4678         first = scan;
4679         if (!ri->regstclass) {
4680             cl_init(pRExC_state, &ch_class);
4681             data.start_class = &ch_class;
4682             stclass_flag = SCF_DO_STCLASS_AND;
4683         } else                          /* XXXX Check for BOUND? */
4684             stclass_flag = 0;
4685         data.last_closep = &last_close;
4686
4687         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4688             &data, -1, NULL, NULL,
4689             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4690
4691
4692         CHECK_RESTUDY_GOTO;
4693
4694
4695         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4696              && data.last_start_min == 0 && data.last_end > 0
4697              && !RExC_seen_zerolen
4698              && !(RExC_seen & REG_SEEN_VERBARG)
4699              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4700             r->extflags |= RXf_CHECK_ALL;
4701         scan_commit(pRExC_state, &data,&minlen,0);
4702         SvREFCNT_dec(data.last_found);
4703
4704         /* Note that code very similar to this but for anchored string
4705            follows immediately below, changes may need to be made to both.
4706            Be careful.
4707          */
4708         longest_float_length = CHR_SVLEN(data.longest_float);
4709         if (longest_float_length
4710             || (data.flags & SF_FL_BEFORE_EOL
4711                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4712                     || (RExC_flags & RXf_PMf_MULTILINE))))
4713         {
4714             I32 t,ml;
4715
4716             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4717                 && data.offset_fixed == data.offset_float_min
4718                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4719                     goto remove_float;          /* As in (a)+. */
4720
4721             /* copy the information about the longest float from the reg_scan_data
4722                over to the program. */
4723             if (SvUTF8(data.longest_float)) {
4724                 r->float_utf8 = data.longest_float;
4725                 r->float_substr = NULL;
4726             } else {
4727                 r->float_substr = data.longest_float;
4728                 r->float_utf8 = NULL;
4729             }
4730             /* float_end_shift is how many chars that must be matched that
4731                follow this item. We calculate it ahead of time as once the
4732                lookbehind offset is added in we lose the ability to correctly
4733                calculate it.*/
4734             ml = data.minlen_float ? *(data.minlen_float)
4735                                    : (I32)longest_float_length;
4736             r->float_end_shift = ml - data.offset_float_min
4737                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4738                 + data.lookbehind_float;
4739             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4740             r->float_max_offset = data.offset_float_max;
4741             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4742                 r->float_max_offset -= data.lookbehind_float;
4743
4744             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4745                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4746                            || (RExC_flags & RXf_PMf_MULTILINE)));
4747             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4748         }
4749         else {
4750           remove_float:
4751             r->float_substr = r->float_utf8 = NULL;
4752             SvREFCNT_dec(data.longest_float);
4753             longest_float_length = 0;
4754         }
4755
4756         /* Note that code very similar to this but for floating string
4757            is immediately above, changes may need to be made to both.
4758            Be careful.
4759          */
4760         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4761         if (longest_fixed_length
4762             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4763                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4764                     || (RExC_flags & RXf_PMf_MULTILINE))))
4765         {
4766             I32 t,ml;
4767
4768             /* copy the information about the longest fixed
4769                from the reg_scan_data over to the program. */
4770             if (SvUTF8(data.longest_fixed)) {
4771                 r->anchored_utf8 = data.longest_fixed;
4772                 r->anchored_substr = NULL;
4773             } else {
4774                 r->anchored_substr = data.longest_fixed;
4775                 r->anchored_utf8 = NULL;
4776             }
4777             /* fixed_end_shift is how many chars that must be matched that
4778                follow this item. We calculate it ahead of time as once the
4779                lookbehind offset is added in we lose the ability to correctly
4780                calculate it.*/
4781             ml = data.minlen_fixed ? *(data.minlen_fixed)
4782                                    : (I32)longest_fixed_length;
4783             r->anchored_end_shift = ml - data.offset_fixed
4784                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4785                 + data.lookbehind_fixed;
4786             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4787
4788             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4789                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4790                      || (RExC_flags & RXf_PMf_MULTILINE)));
4791             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4792         }
4793         else {
4794             r->anchored_substr = r->anchored_utf8 = NULL;
4795             SvREFCNT_dec(data.longest_fixed);
4796             longest_fixed_length = 0;
4797         }
4798         if (ri->regstclass
4799             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4800             ri->regstclass = NULL;
4801         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4802             && stclass_flag
4803             && !(data.start_class->flags & ANYOF_EOS)
4804             && !cl_is_anything(data.start_class))
4805         {
4806             const U32 n = add_data(pRExC_state, 1, "f");
4807
4808             Newx(RExC_rxi->data->data[n], 1,
4809                 struct regnode_charclass_class);
4810             StructCopy(data.start_class,
4811                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4812                        struct regnode_charclass_class);
4813             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4814             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4815             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4816                       regprop(r, sv, (regnode*)data.start_class);
4817                       PerlIO_printf(Perl_debug_log,
4818                                     "synthetic stclass \"%s\".\n",
4819                                     SvPVX_const(sv));});
4820         }
4821
4822         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4823         if (longest_fixed_length > longest_float_length) {
4824             r->check_end_shift = r->anchored_end_shift;
4825             r->check_substr = r->anchored_substr;
4826             r->check_utf8 = r->anchored_utf8;
4827             r->check_offset_min = r->check_offset_max = r->anchored_offset;
4828             if (r->extflags & RXf_ANCH_SINGLE)
4829                 r->extflags |= RXf_NOSCAN;
4830         }
4831         else {
4832             r->check_end_shift = r->float_end_shift;
4833             r->check_substr = r->float_substr;
4834             r->check_utf8 = r->float_utf8;
4835             r->check_offset_min = r->float_min_offset;
4836             r->check_offset_max = r->float_max_offset;
4837         }
4838         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4839            This should be changed ASAP!  */
4840         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4841             r->extflags |= RXf_USE_INTUIT;
4842             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4843                 r->extflags |= RXf_INTUIT_TAIL;
4844         }
4845         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4846         if ( (STRLEN)minlen < longest_float_length )
4847             minlen= longest_float_length;
4848         if ( (STRLEN)minlen < longest_fixed_length )
4849             minlen= longest_fixed_length;
4850         */
4851     }
4852     else {
4853         /* Several toplevels. Best we can is to set minlen. */
4854         I32 fake;
4855         struct regnode_charclass_class ch_class;
4856         I32 last_close = 0;
4857
4858         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4859
4860         scan = ri->program + 1;
4861         cl_init(pRExC_state, &ch_class);
4862         data.start_class = &ch_class;
4863         data.last_closep = &last_close;
4864
4865
4866         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4867             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4868
4869         CHECK_RESTUDY_GOTO;
4870
4871         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4872                 = r->float_substr = r->float_utf8 = NULL;
4873         if (!(data.start_class->flags & ANYOF_EOS)
4874             && !cl_is_anything(data.start_class))
4875         {
4876             const U32 n = add_data(pRExC_state, 1, "f");
4877
4878             Newx(RExC_rxi->data->data[n], 1,
4879                 struct regnode_charclass_class);
4880             StructCopy(data.start_class,
4881                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4882                        struct regnode_charclass_class);
4883             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4884             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4885             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4886                       regprop(r, sv, (regnode*)data.start_class);
4887                       PerlIO_printf(Perl_debug_log,
4888                                     "synthetic stclass \"%s\".\n",
4889                                     SvPVX_const(sv));});
4890         }
4891     }
4892
4893     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4894        the "real" pattern. */
4895     DEBUG_OPTIMISE_r({
4896         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4897                       (IV)minlen, (IV)r->minlen);
4898     });
4899     r->minlenret = minlen;
4900     if (r->minlen < minlen)
4901         r->minlen = minlen;
4902
4903     if (RExC_seen & REG_SEEN_GPOS)
4904         r->extflags |= RXf_GPOS_SEEN;
4905     if (RExC_seen & REG_SEEN_LOOKBEHIND)
4906         r->extflags |= RXf_LOOKBEHIND_SEEN;
4907     if (RExC_seen & REG_SEEN_EVAL)
4908         r->extflags |= RXf_EVAL_SEEN;
4909     if (RExC_seen & REG_SEEN_CANY)
4910         r->extflags |= RXf_CANY_SEEN;
4911     if (RExC_seen & REG_SEEN_VERBARG)
4912         r->intflags |= PREGf_VERBARG_SEEN;
4913     if (RExC_seen & REG_SEEN_CUTGROUP)
4914         r->intflags |= PREGf_CUTGROUP_SEEN;
4915     if (RExC_paren_names)
4916         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4917     else
4918         RXp_PAREN_NAMES(r) = NULL;
4919
4920 #ifdef STUPID_PATTERN_CHECKS
4921     if (RX_PRELEN(rx) == 0)
4922         r->extflags |= RXf_NULL;
4923     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4924         /* XXX: this should happen BEFORE we compile */
4925         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4926     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4927         r->extflags |= RXf_WHITE;
4928     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4929         r->extflags |= RXf_START_ONLY;
4930 #else
4931     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4932             /* XXX: this should happen BEFORE we compile */
4933             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4934     else {
4935         regnode *first = ri->program + 1;
4936         U8 fop = OP(first);
4937         U8 nop = OP(NEXTOPER(first));
4938
4939         if (PL_regkind[fop] == NOTHING && nop == END)
4940             r->extflags |= RXf_NULL;
4941         else if (PL_regkind[fop] == BOL && nop == END)
4942             r->extflags |= RXf_START_ONLY;
4943         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4944             r->extflags |= RXf_WHITE;
4945     }
4946 #endif
4947 #ifdef DEBUGGING
4948     if (RExC_paren_names) {
4949         ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4950         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4951     } else
4952 #endif
4953         ri->name_list_idx = 0;
4954
4955     if (RExC_recurse_count) {
4956         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4957             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4958             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4959         }
4960     }
4961     Newxz(r->offs, RExC_npar, regexp_paren_pair);
4962     /* assume we don't need to swap parens around before we match */
4963
4964     DEBUG_DUMP_r({
4965         PerlIO_printf(Perl_debug_log,"Final program:\n");
4966         regdump(r);
4967     });
4968 #ifdef RE_TRACK_PATTERN_OFFSETS
4969     DEBUG_OFFSETS_r(if (ri->u.offsets) {
4970         const U32 len = ri->u.offsets[0];
4971         U32 i;
4972         GET_RE_DEBUG_FLAGS_DECL;
4973         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4974         for (i = 1; i <= len; i++) {
4975             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4976                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4977                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4978             }
4979         PerlIO_printf(Perl_debug_log, "\n");
4980     });
4981 #endif
4982     return rx;
4983 }
4984
4985 #undef RE_ENGINE_PTR
4986
4987
4988 SV*
4989 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4990                     const U32 flags)
4991 {
4992     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4993
4994     PERL_UNUSED_ARG(value);
4995
4996     if (flags & RXapif_FETCH) {
4997         return reg_named_buff_fetch(rx, key, flags);
4998     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4999         Perl_croak(aTHX_ "%s", PL_no_modify);
5000         return NULL;
5001     } else if (flags & RXapif_EXISTS) {
5002         return reg_named_buff_exists(rx, key, flags)
5003             ? &PL_sv_yes
5004             : &PL_sv_no;
5005     } else if (flags & RXapif_REGNAMES) {
5006         return reg_named_buff_all(rx, flags);
5007     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5008         return reg_named_buff_scalar(rx, flags);
5009     } else {
5010         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5011         return NULL;
5012     }
5013 }
5014
5015 SV*
5016 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5017                          const U32 flags)
5018 {
5019     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5020     PERL_UNUSED_ARG(lastkey);
5021
5022     if (flags & RXapif_FIRSTKEY)
5023         return reg_named_buff_firstkey(rx, flags);
5024     else if (flags & RXapif_NEXTKEY)
5025         return reg_named_buff_nextkey(rx, flags);
5026     else {
5027         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5028         return NULL;
5029     }
5030 }
5031
5032 SV*
5033 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5034                           const U32 flags)
5035 {
5036     AV *retarray = NULL;
5037     SV *ret;
5038     struct regexp *const rx = (struct regexp *)SvANY(r);
5039
5040     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5041
5042     if (flags & RXapif_ALL)
5043         retarray=newAV();
5044
5045     if (rx && RXp_PAREN_NAMES(rx)) {
5046         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5047         if (he_str) {
5048             IV i;
5049             SV* sv_dat=HeVAL(he_str);
5050             I32 *nums=(I32*)SvPVX(sv_dat);
5051             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5052                 if ((I32)(rx->nparens) >= nums[i]
5053                     && rx->offs[nums[i]].start != -1
5054                     && rx->offs[nums[i]].end != -1)
5055                 {
5056                     ret = newSVpvs("");
5057                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5058                     if (!retarray)
5059                         return ret;
5060                 } else {
5061                     ret = newSVsv(&PL_sv_undef);
5062                 }
5063                 if (retarray)
5064                     av_push(retarray, ret);
5065             }
5066             if (retarray)
5067                 return newRV_noinc(MUTABLE_SV(retarray));
5068         }
5069     }
5070     return NULL;
5071 }
5072
5073 bool
5074 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5075                            const U32 flags)
5076 {
5077     struct regexp *const rx = (struct regexp *)SvANY(r);
5078
5079     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5080
5081     if (rx && RXp_PAREN_NAMES(rx)) {
5082         if (flags & RXapif_ALL) {
5083             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5084         } else {
5085             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5086             if (sv) {
5087                 SvREFCNT_dec(sv);
5088                 return TRUE;
5089             } else {
5090                 return FALSE;
5091             }
5092         }
5093     } else {
5094         return FALSE;
5095     }
5096 }
5097
5098 SV*
5099 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5100 {
5101     struct regexp *const rx = (struct regexp *)SvANY(r);
5102
5103     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5104
5105     if ( rx && RXp_PAREN_NAMES(rx) ) {
5106         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5107
5108         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5109     } else {
5110         return FALSE;
5111     }
5112 }
5113
5114 SV*
5115 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5116 {
5117     struct regexp *const rx = (struct regexp *)SvANY(r);
5118     GET_RE_DEBUG_FLAGS_DECL;
5119
5120     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5121
5122     if (rx && RXp_PAREN_NAMES(rx)) {
5123         HV *hv = RXp_PAREN_NAMES(rx);
5124         HE *temphe;
5125         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5126             IV i;
5127             IV parno = 0;
5128             SV* sv_dat = HeVAL(temphe);
5129             I32 *nums = (I32*)SvPVX(sv_dat);
5130             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5131                 if ((I32)(rx->lastparen) >= nums[i] &&
5132                     rx->offs[nums[i]].start != -1 &&
5133                     rx->offs[nums[i]].end != -1)
5134                 {
5135                     parno = nums[i];
5136                     break;
5137                 }
5138             }
5139             if (parno || flags & RXapif_ALL) {
5140                 return newSVhek(HeKEY_hek(temphe));
5141             }
5142         }
5143     }
5144     return NULL;
5145 }
5146
5147 SV*
5148 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5149 {
5150     SV *ret;
5151     AV *av;
5152     I32 length;
5153     struct regexp *const rx = (struct regexp *)SvANY(r);
5154
5155     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5156
5157     if (rx && RXp_PAREN_NAMES(rx)) {
5158         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5159             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5160         } else if (flags & RXapif_ONE) {
5161             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5162             av = MUTABLE_AV(SvRV(ret));
5163             length = av_len(av);
5164             SvREFCNT_dec(ret);
5165             return newSViv(length + 1);
5166         } else {
5167             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5168             return NULL;
5169         }
5170     }
5171     return &PL_sv_undef;
5172 }
5173
5174 SV*
5175 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5176 {
5177     struct regexp *const rx = (struct regexp *)SvANY(r);
5178     AV *av = newAV();
5179
5180     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5181
5182     if (rx && RXp_PAREN_NAMES(rx)) {
5183         HV *hv= RXp_PAREN_NAMES(rx);
5184         HE *temphe;
5185         (void)hv_iterinit(hv);
5186         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5187             IV i;
5188             IV parno = 0;
5189             SV* sv_dat = HeVAL(temphe);
5190             I32 *nums = (I32*)SvPVX(sv_dat);
5191             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5192                 if ((I32)(rx->lastparen) >= nums[i] &&
5193                     rx->offs[nums[i]].start != -1 &&
5194                     rx->offs[nums[i]].end != -1)
5195                 {
5196                     parno = nums[i];
5197                     break;
5198                 }
5199             }
5200             if (parno || flags & RXapif_ALL) {
5201                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5202             }
5203         }
5204     }
5205
5206     return newRV_noinc(MUTABLE_SV(av));
5207 }
5208
5209 void
5210 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5211                              SV * const sv)
5212 {
5213     struct regexp *const rx = (struct regexp *)SvANY(r);
5214     char *s = NULL;
5215     I32 i = 0;
5216     I32 s1, t1;
5217
5218     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5219
5220     if (!rx->subbeg) {
5221         sv_setsv(sv,&PL_sv_undef);
5222         return;
5223     }
5224     else
5225     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5226         /* $` */
5227         i = rx->offs[0].start;
5228         s = rx->subbeg;
5229     }
5230     else
5231     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5232         /* $' */
5233         s = rx->subbeg + rx->offs[0].end;
5234         i = rx->sublen - rx->offs[0].end;
5235     }
5236     else
5237     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5238         (s1 = rx->offs[paren].start) != -1 &&
5239         (t1 = rx->offs[paren].end) != -1)
5240     {
5241         /* $& $1 ... */
5242         i = t1 - s1;
5243         s = rx->subbeg + s1;
5244     } else {
5245         sv_setsv(sv,&PL_sv_undef);
5246         return;
5247     }
5248     assert(rx->sublen >= (s - rx->subbeg) + i );
5249     if (i >= 0) {
5250         const int oldtainted = PL_tainted;
5251         TAINT_NOT;
5252         sv_setpvn(sv, s, i);
5253         PL_tainted = oldtainted;
5254         if ( (rx->extflags & RXf_CANY_SEEN)
5255             ? (RXp_MATCH_UTF8(rx)
5256                         && (!i || is_utf8_string((U8*)s, i)))
5257             : (RXp_MATCH_UTF8(rx)) )
5258         {
5259             SvUTF8_on(sv);
5260         }
5261         else
5262             SvUTF8_off(sv);
5263         if (PL_tainting) {
5264             if (RXp_MATCH_TAINTED(rx)) {
5265                 if (SvTYPE(sv) >= SVt_PVMG) {
5266                     MAGIC* const mg = SvMAGIC(sv);
5267                     MAGIC* mgt;
5268                     PL_tainted = 1;
5269                     SvMAGIC_set(sv, mg->mg_moremagic);
5270                     SvTAINT(sv);
5271                     if ((mgt = SvMAGIC(sv))) {
5272                         mg->mg_moremagic = mgt;
5273                         SvMAGIC_set(sv, mg);
5274                     }
5275                 } else {
5276                     PL_tainted = 1;
5277                     SvTAINT(sv);
5278                 }
5279             } else
5280                 SvTAINTED_off(sv);
5281         }
5282     } else {
5283         sv_setsv(sv,&PL_sv_undef);
5284         return;
5285     }
5286 }
5287
5288 void
5289 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5290                                                          SV const * const value)
5291 {
5292     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5293
5294     PERL_UNUSED_ARG(rx);
5295     PERL_UNUSED_ARG(paren);
5296     PERL_UNUSED_ARG(value);
5297
5298     if (!PL_localizing)
5299         Perl_croak(aTHX_ "%s", PL_no_modify);
5300 }
5301
5302 I32
5303 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5304                               const I32 paren)
5305 {
5306     struct regexp *const rx = (struct regexp *)SvANY(r);
5307     I32 i;
5308     I32 s1, t1;
5309
5310     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5311
5312     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5313         switch (paren) {
5314       /* $` / ${^PREMATCH} */
5315       case RX_BUFF_IDX_PREMATCH:
5316         if (rx->offs[0].start != -1) {
5317                         i = rx->offs[0].start;
5318                         if (i > 0) {
5319                                 s1 = 0;
5320                                 t1 = i;
5321                                 goto getlen;
5322                         }
5323             }
5324         return 0;
5325       /* $' / ${^POSTMATCH} */
5326       case RX_BUFF_IDX_POSTMATCH:
5327             if (rx->offs[0].end != -1) {
5328                         i = rx->sublen - rx->offs[0].end;
5329                         if (i > 0) {
5330                                 s1 = rx->offs[0].end;
5331                                 t1 = rx->sublen;
5332                                 goto getlen;
5333                         }
5334             }
5335         return 0;
5336       /* $& / ${^MATCH}, $1, $2, ... */
5337       default:
5338             if (paren <= (I32)rx->nparens &&
5339             (s1 = rx->offs[paren].start) != -1 &&
5340             (t1 = rx->offs[paren].end) != -1)
5341             {
5342             i = t1 - s1;
5343             goto getlen;
5344         } else {
5345             if (ckWARN(WARN_UNINITIALIZED))
5346                 report_uninit((const SV *)sv);
5347             return 0;
5348         }
5349     }
5350   getlen:
5351     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5352         const char * const s = rx->subbeg + s1;
5353         const U8 *ep;
5354         STRLEN el;
5355
5356         i = t1 - s1;
5357         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5358                         i = el;
5359     }
5360     return i;
5361 }
5362
5363 SV*
5364 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5365 {
5366     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5367         PERL_UNUSED_ARG(rx);
5368         if (0)
5369             return NULL;
5370         else
5371             return newSVpvs("Regexp");
5372 }
5373
5374 /* Scans the name of a named buffer from the pattern.
5375  * If flags is REG_RSN_RETURN_NULL returns null.
5376  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5377  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5378  * to the parsed name as looked up in the RExC_paren_names hash.
5379  * If there is an error throws a vFAIL().. type exception.
5380  */
5381
5382 #define REG_RSN_RETURN_NULL    0
5383 #define REG_RSN_RETURN_NAME    1
5384 #define REG_RSN_RETURN_DATA    2
5385
5386 STATIC SV*
5387 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5388 {
5389     char *name_start = RExC_parse;
5390
5391     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5392
5393     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5394          /* skip IDFIRST by using do...while */
5395         if (UTF)
5396             do {
5397                 RExC_parse += UTF8SKIP(RExC_parse);
5398             } while (isALNUM_utf8((U8*)RExC_parse));
5399         else
5400             do {
5401                 RExC_parse++;
5402             } while (isALNUM(*RExC_parse));
5403     }
5404
5405     if ( flags ) {
5406         SV* sv_name
5407             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5408                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5409         if ( flags == REG_RSN_RETURN_NAME)
5410             return sv_name;
5411         else if (flags==REG_RSN_RETURN_DATA) {
5412             HE *he_str = NULL;
5413             SV *sv_dat = NULL;
5414             if ( ! sv_name )      /* should not happen*/
5415                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5416             if (RExC_paren_names)
5417                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5418             if ( he_str )
5419                 sv_dat = HeVAL(he_str);
5420             if ( ! sv_dat )
5421                 vFAIL("Reference to nonexistent named group");
5422             return sv_dat;
5423         }
5424         else {
5425             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5426         }
5427         /* NOT REACHED */
5428     }
5429     return NULL;
5430 }
5431
5432 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5433     int rem=(int)(RExC_end - RExC_parse);                       \
5434     int cut;                                                    \
5435     int num;                                                    \
5436     int iscut=0;                                                \
5437     if (rem>10) {                                               \
5438         rem=10;                                                 \
5439         iscut=1;                                                \
5440     }                                                           \
5441     cut=10-rem;                                                 \
5442     if (RExC_lastparse!=RExC_parse)                             \
5443         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5444             rem, RExC_parse,                                    \
5445             cut + 4,                                            \
5446             iscut ? "..." : "<"                                 \
5447         );                                                      \
5448     else                                                        \
5449         PerlIO_printf(Perl_debug_log,"%16s","");                \
5450                                                                 \
5451     if (SIZE_ONLY)                                              \
5452        num = RExC_size + 1;                                     \
5453     else                                                        \
5454        num=REG_NODE_NUM(RExC_emit);                             \
5455     if (RExC_lastnum!=num)                                      \
5456        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5457     else                                                        \
5458        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5459     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5460         (int)((depth*2)), "",                                   \
5461         (funcname)                                              \
5462     );                                                          \
5463     RExC_lastnum=num;                                           \
5464     RExC_lastparse=RExC_parse;                                  \
5465 })
5466
5467
5468
5469 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5470     DEBUG_PARSE_MSG((funcname));                            \
5471     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5472 })
5473 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5474     DEBUG_PARSE_MSG((funcname));                            \
5475     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5476 })
5477 /*
5478  - reg - regular expression, i.e. main body or parenthesized thing
5479  *
5480  * Caller must absorb opening parenthesis.
5481  *
5482  * Combining parenthesis handling with the base level of regular expression
5483  * is a trifle forced, but the need to tie the tails of the branches to what
5484  * follows makes it hard to avoid.
5485  */
5486 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5487 #ifdef DEBUGGING
5488 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5489 #else
5490 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5491 #endif
5492
5493 STATIC regnode *
5494 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5495     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5496 {
5497     dVAR;
5498     register regnode *ret;              /* Will be the head of the group. */
5499     register regnode *br;
5500     register regnode *lastbr;
5501     register regnode *ender = NULL;
5502     register I32 parno = 0;
5503     I32 flags;
5504     U32 oregflags = RExC_flags;
5505     bool have_branch = 0;
5506     bool is_open = 0;
5507     I32 freeze_paren = 0;
5508     I32 after_freeze = 0;
5509
5510     /* for (?g), (?gc), and (?o) warnings; warning
5511        about (?c) will warn about (?g) -- japhy    */
5512
5513 #define WASTED_O  0x01
5514 #define WASTED_G  0x02
5515 #define WASTED_C  0x04
5516 #define WASTED_GC (0x02|0x04)
5517     I32 wastedflags = 0x00;
5518
5519     char * parse_start = RExC_parse; /* MJD */
5520     char * const oregcomp_parse = RExC_parse;
5521
5522     GET_RE_DEBUG_FLAGS_DECL;
5523
5524     PERL_ARGS_ASSERT_REG;
5525     DEBUG_PARSE("reg ");
5526
5527     *flagp = 0;                         /* Tentatively. */
5528
5529
5530     /* Make an OPEN node, if parenthesized. */
5531     if (paren) {
5532         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5533             char *start_verb = RExC_parse;
5534             STRLEN verb_len = 0;
5535             char *start_arg = NULL;
5536             unsigned char op = 0;
5537             int argok = 1;
5538             int internal_argval = 0; /* internal_argval is only useful if !argok */
5539             while ( *RExC_parse && *RExC_parse != ')' ) {
5540                 if ( *RExC_parse == ':' ) {
5541                     start_arg = RExC_parse + 1;
5542                     break;
5543                 }
5544                 RExC_parse++;
5545             }
5546             ++start_verb;
5547             verb_len = RExC_parse - start_verb;
5548             if ( start_arg ) {
5549                 RExC_parse++;
5550                 while ( *RExC_parse && *RExC_parse != ')' )
5551                     RExC_parse++;
5552                 if ( *RExC_parse != ')' )
5553                     vFAIL("Unterminated verb pattern argument");
5554                 if ( RExC_parse == start_arg )
5555                     start_arg = NULL;
5556             } else {
5557                 if ( *RExC_parse != ')' )
5558                     vFAIL("Unterminated verb pattern");
5559             }
5560
5561             switch ( *start_verb ) {
5562             case 'A':  /* (*ACCEPT) */
5563                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5564                     op = ACCEPT;
5565                     internal_argval = RExC_nestroot;
5566                 }
5567                 break;
5568             case 'C':  /* (*COMMIT) */
5569                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5570                     op = COMMIT;
5571                 break;
5572             case 'F':  /* (*FAIL) */
5573                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5574                     op = OPFAIL;
5575                     argok = 0;
5576                 }
5577                 break;
5578             case ':':  /* (*:NAME) */
5579             case 'M':  /* (*MARK:NAME) */
5580                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5581                     op = MARKPOINT;
5582                     argok = -1;
5583                 }
5584                 break;
5585             case 'P':  /* (*PRUNE) */
5586                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5587                     op = PRUNE;
5588                 break;
5589             case 'S':   /* (*SKIP) */
5590                 if ( memEQs(start_verb,verb_len,"SKIP") )
5591                     op = SKIP;
5592                 break;
5593             case 'T':  /* (*THEN) */
5594                 /* [19:06] <TimToady> :: is then */
5595                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5596                     op = CUTGROUP;
5597                     RExC_seen |= REG_SEEN_CUTGROUP;
5598                 }
5599                 break;
5600             }
5601             if ( ! op ) {
5602                 RExC_parse++;
5603                 vFAIL3("Unknown verb pattern '%.*s'",
5604                     verb_len, start_verb);
5605             }
5606             if ( argok ) {
5607                 if ( start_arg && internal_argval ) {
5608                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5609                         verb_len, start_verb);
5610                 } else if ( argok < 0 && !start_arg ) {
5611                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5612                         verb_len, start_verb);
5613                 } else {
5614                     ret = reganode(pRExC_state, op, internal_argval);
5615                     if ( ! internal_argval && ! SIZE_ONLY ) {
5616                         if (start_arg) {
5617                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5618                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5619                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5620                             ret->flags = 0;
5621                         } else {
5622                             ret->flags = 1;
5623                         }
5624                     }
5625                 }
5626                 if (!internal_argval)
5627                     RExC_seen |= REG_SEEN_VERBARG;
5628             } else if ( start_arg ) {
5629                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5630                         verb_len, start_verb);
5631             } else {
5632                 ret = reg_node(pRExC_state, op);
5633             }
5634             nextchar(pRExC_state);
5635             return ret;
5636         } else
5637         if (*RExC_parse == '?') { /* (?...) */
5638             bool is_logical = 0;
5639             const char * const seqstart = RExC_parse;
5640
5641             RExC_parse++;
5642             paren = *RExC_parse++;
5643             ret = NULL;                 /* For look-ahead/behind. */
5644             switch (paren) {
5645
5646             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5647                 paren = *RExC_parse++;
5648                 if ( paren == '<')         /* (?P<...>) named capture */
5649                     goto named_capture;
5650                 else if (paren == '>') {   /* (?P>name) named recursion */
5651                     goto named_recursion;
5652                 }
5653                 else if (paren == '=') {   /* (?P=...)  named backref */
5654                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5655                        you change this make sure you change that */
5656                     char* name_start = RExC_parse;
5657                     U32 num = 0;
5658                     SV *sv_dat = reg_scan_name(pRExC_state,
5659                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5660                     if (RExC_parse == name_start || *RExC_parse != ')')
5661                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5662
5663                     if (!SIZE_ONLY) {
5664                         num = add_data( pRExC_state, 1, "S" );
5665                         RExC_rxi->data->data[num]=(void*)sv_dat;
5666                         SvREFCNT_inc_simple_void(sv_dat);
5667                     }
5668                     RExC_sawback = 1;
5669                     ret = reganode(pRExC_state,
5670                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5671                            num);
5672                     *flagp |= HASWIDTH;
5673
5674                     Set_Node_Offset(ret, parse_start+1);
5675                     Set_Node_Cur_Length(ret); /* MJD */
5676
5677                     nextchar(pRExC_state);
5678                     return ret;
5679                 }
5680                 RExC_parse++;
5681                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5682                 /*NOTREACHED*/
5683             case '<':           /* (?<...) */
5684                 if (*RExC_parse == '!')
5685                     paren = ',';
5686                 else if (*RExC_parse != '=')
5687               named_capture:
5688                 {               /* (?<...>) */
5689                     char *name_start;
5690                     SV *svname;
5691                     paren= '>';
5692             case '\'':          /* (?'...') */
5693                     name_start= RExC_parse;
5694                     svname = reg_scan_name(pRExC_state,
5695                         SIZE_ONLY ?  /* reverse test from the others */
5696                         REG_RSN_RETURN_NAME :
5697                         REG_RSN_RETURN_NULL);
5698                     if (RExC_parse == name_start) {
5699                         RExC_parse++;
5700                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5701                         /*NOTREACHED*/
5702                     }
5703                     if (*RExC_parse != paren)
5704                         vFAIL2("Sequence (?%c... not terminated",
5705                             paren=='>' ? '<' : paren);
5706                     if (SIZE_ONLY) {
5707                         HE *he_str;
5708                         SV *sv_dat = NULL;
5709                         if (!svname) /* shouldnt happen */
5710                             Perl_croak(aTHX_
5711                                 "panic: reg_scan_name returned NULL");
5712                         if (!RExC_paren_names) {
5713                             RExC_paren_names= newHV();
5714                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5715 #ifdef DEBUGGING
5716                             RExC_paren_name_list= newAV();
5717                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5718 #endif
5719                         }
5720                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5721                         if ( he_str )
5722                             sv_dat = HeVAL(he_str);
5723                         if ( ! sv_dat ) {
5724                             /* croak baby croak */
5725                             Perl_croak(aTHX_
5726                                 "panic: paren_name hash element allocation failed");
5727                         } else if ( SvPOK(sv_dat) ) {
5728                             /* (?|...) can mean we have dupes so scan to check
5729                                its already been stored. Maybe a flag indicating
5730                                we are inside such a construct would be useful,
5731                                but the arrays are likely to be quite small, so
5732                                for now we punt -- dmq */
5733                             IV count = SvIV(sv_dat);
5734                             I32 *pv = (I32*)SvPVX(sv_dat);
5735                             IV i;
5736                             for ( i = 0 ; i < count ; i++ ) {
5737                                 if ( pv[i] == RExC_npar ) {
5738                                     count = 0;
5739                                     break;
5740                                 }
5741                             }
5742                             if ( count ) {
5743                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5744                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5745                                 pv[count] = RExC_npar;
5746                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5747                             }
5748                         } else {
5749                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5750                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5751                             SvIOK_on(sv_dat);
5752                             SvIV_set(sv_dat, 1);
5753                         }
5754 #ifdef DEBUGGING
5755                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5756                             SvREFCNT_dec(svname);
5757 #endif
5758
5759                         /*sv_dump(sv_dat);*/
5760                     }
5761                     nextchar(pRExC_state);
5762                     paren = 1;
5763                     goto capturing_parens;
5764                 }
5765                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5766                 RExC_parse++;
5767             case '=':           /* (?=...) */
5768                 RExC_seen_zerolen++;
5769                         break;
5770             case '!':           /* (?!...) */
5771                 RExC_seen_zerolen++;
5772                 if (*RExC_parse == ')') {
5773                     ret=reg_node(pRExC_state, OPFAIL);
5774                     nextchar(pRExC_state);
5775                     return ret;
5776                 }
5777                 break;
5778             case '|':           /* (?|...) */
5779                 /* branch reset, behave like a (?:...) except that
5780                    buffers in alternations share the same numbers */
5781                 paren = ':';
5782                 after_freeze = freeze_paren = RExC_npar;
5783                 break;
5784             case ':':           /* (?:...) */
5785             case '>':           /* (?>...) */
5786                 break;
5787             case '$':           /* (?$...) */
5788             case '@':           /* (?@...) */
5789                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5790                 break;
5791             case '#':           /* (?#...) */
5792                 while (*RExC_parse && *RExC_parse != ')')
5793                     RExC_parse++;
5794                 if (*RExC_parse != ')')
5795                     FAIL("Sequence (?#... not terminated");
5796                 nextchar(pRExC_state);
5797                 *flagp = TRYAGAIN;
5798                 return NULL;
5799             case '0' :           /* (?0) */
5800             case 'R' :           /* (?R) */
5801                 if (*RExC_parse != ')')
5802                     FAIL("Sequence (?R) not terminated");
5803                 ret = reg_node(pRExC_state, GOSTART);
5804                 *flagp |= POSTPONED;
5805                 nextchar(pRExC_state);
5806                 return ret;
5807                 /*notreached*/
5808             { /* named and numeric backreferences */
5809                 I32 num;
5810             case '&':            /* (?&NAME) */
5811                 parse_start = RExC_parse - 1;
5812               named_recursion:
5813                 {
5814                     SV *sv_dat = reg_scan_name(pRExC_state,
5815                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5816                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5817                 }
5818                 goto gen_recurse_regop;
5819                 /* NOT REACHED */
5820             case '+':
5821                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5822                     RExC_parse++;
5823                     vFAIL("Illegal pattern");
5824                 }
5825                 goto parse_recursion;
5826                 /* NOT REACHED*/
5827             case '-': /* (?-1) */
5828                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5829                     RExC_parse--; /* rewind to let it be handled later */
5830                     goto parse_flags;
5831                 }
5832                 /*FALLTHROUGH */
5833             case '1': case '2': case '3': case '4': /* (?1) */
5834             case '5': case '6': case '7': case '8': case '9':
5835                 RExC_parse--;
5836               parse_recursion:
5837                 num = atoi(RExC_parse);
5838                 parse_start = RExC_parse - 1; /* MJD */
5839                 if (*RExC_parse == '-')
5840                     RExC_parse++;
5841                 while (isDIGIT(*RExC_parse))
5842                         RExC_parse++;
5843                 if (*RExC_parse!=')')
5844                     vFAIL("Expecting close bracket");
5845
5846               gen_recurse_regop:
5847                 if ( paren == '-' ) {
5848                     /*
5849                     Diagram of capture buffer numbering.
5850                     Top line is the normal capture buffer numbers
5851                     Botton line is the negative indexing as from
5852                     the X (the (?-2))
5853
5854                     +   1 2    3 4 5 X          6 7
5855                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5856                     -   5 4    3 2 1 X          x x
5857
5858                     */
5859                     num = RExC_npar + num;
5860                     if (num < 1)  {
5861                         RExC_parse++;
5862                         vFAIL("Reference to nonexistent group");
5863                     }
5864                 } else if ( paren == '+' ) {
5865                     num = RExC_npar + num - 1;
5866                 }
5867
5868                 ret = reganode(pRExC_state, GOSUB, num);
5869                 if (!SIZE_ONLY) {
5870                     if (num > (I32)RExC_rx->nparens) {
5871                         RExC_parse++;
5872                         vFAIL("Reference to nonexistent group");
5873                     }
5874                     ARG2L_SET( ret, RExC_recurse_count++);
5875                     RExC_emit++;
5876                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5877                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5878                 } else {
5879                     RExC_size++;
5880                 }
5881                 RExC_seen |= REG_SEEN_RECURSE;
5882                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5883                 Set_Node_Offset(ret, parse_start); /* MJD */
5884
5885                 *flagp |= POSTPONED;
5886                 nextchar(pRExC_state);
5887                 return ret;
5888             } /* named and numeric backreferences */
5889             /* NOT REACHED */
5890
5891             case '?':           /* (??...) */
5892                 is_logical = 1;
5893                 if (*RExC_parse != '{') {
5894                     RExC_parse++;
5895                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5896                     /*NOTREACHED*/
5897                 }
5898                 *flagp |= POSTPONED;
5899                 paren = *RExC_parse++;
5900                 /* FALL THROUGH */
5901             case '{':           /* (?{...}) */
5902             {
5903                 I32 count = 1;
5904                 U32 n = 0;
5905                 char c;
5906                 char *s = RExC_parse;
5907
5908                 RExC_seen_zerolen++;
5909                 RExC_seen |= REG_SEEN_EVAL;
5910                 while (count && (c = *RExC_parse)) {
5911                     if (c == '\\') {
5912                         if (RExC_parse[1])
5913                             RExC_parse++;
5914                     }
5915                     else if (c == '{')
5916                         count++;
5917                     else if (c == '}')
5918                         count--;
5919                     RExC_parse++;
5920                 }
5921                 if (*RExC_parse != ')') {
5922                     RExC_parse = s;
5923                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5924                 }
5925                 if (!SIZE_ONLY) {
5926                     PAD *pad;
5927                     OP_4tree *sop, *rop;
5928                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5929
5930                     ENTER;
5931                     Perl_save_re_context(aTHX);
5932                     rop = sv_compile_2op(sv, &sop, "re", &pad);
5933                     sop->op_private |= OPpREFCOUNTED;
5934                     /* re_dup will OpREFCNT_inc */
5935                     OpREFCNT_set(sop, 1);
5936                     LEAVE;
5937
5938                     n = add_data(pRExC_state, 3, "nop");
5939                     RExC_rxi->data->data[n] = (void*)rop;
5940                     RExC_rxi->data->data[n+1] = (void*)sop;
5941                     RExC_rxi->data->data[n+2] = (void*)pad;
5942                     SvREFCNT_dec(sv);
5943                 }
5944                 else {                                          /* First pass */
5945                     if (PL_reginterp_cnt < ++RExC_seen_evals
5946                         && IN_PERL_RUNTIME)
5947                         /* No compiled RE interpolated, has runtime
5948                            components ===> unsafe.  */
5949                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
5950                     if (PL_tainting && PL_tainted)
5951                         FAIL("Eval-group in insecure regular expression");
5952 #if PERL_VERSION > 8
5953                     if (IN_PERL_COMPILETIME)
5954                         PL_cv_has_eval = 1;
5955 #endif
5956                 }
5957
5958                 nextchar(pRExC_state);
5959                 if (is_logical) {
5960                     ret = reg_node(pRExC_state, LOGICAL);
5961                     if (!SIZE_ONLY)
5962                         ret->flags = 2;
5963                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5964                     /* deal with the length of this later - MJD */
5965                     return ret;
5966                 }
5967                 ret = reganode(pRExC_state, EVAL, n);
5968                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5969                 Set_Node_Offset(ret, parse_start);
5970                 return ret;
5971             }
5972             case '(':           /* (?(?{...})...) and (?(?=...)...) */
5973             {
5974                 int is_define= 0;
5975                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
5976                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5977                         || RExC_parse[1] == '<'
5978                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
5979                         I32 flag;
5980
5981                         ret = reg_node(pRExC_state, LOGICAL);
5982                         if (!SIZE_ONLY)
5983                             ret->flags = 1;
5984                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5985                         goto insert_if;
5986                     }
5987                 }
5988                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
5989                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5990                 {
5991                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
5992                     char *name_start= RExC_parse++;
5993                     U32 num = 0;
5994                     SV *sv_dat=reg_scan_name(pRExC_state,
5995                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5996                     if (RExC_parse == name_start || *RExC_parse != ch)
5997                         vFAIL2("Sequence (?(%c... not terminated",
5998                             (ch == '>' ? '<' : ch));
5999                     RExC_parse++;
6000                     if (!SIZE_ONLY) {
6001                         num = add_data( pRExC_state, 1, "S" );
6002                         RExC_rxi->data->data[num]=(void*)sv_dat;
6003                         SvREFCNT_inc_simple_void(sv_dat);
6004                     }
6005                     ret = reganode(pRExC_state,NGROUPP,num);
6006                     goto insert_if_check_paren;
6007                 }
6008                 else if (RExC_parse[0] == 'D' &&
6009                          RExC_parse[1] == 'E' &&
6010                          RExC_parse[2] == 'F' &&
6011                          RExC_parse[3] == 'I' &&
6012                          RExC_parse[4] == 'N' &&
6013                          RExC_parse[5] == 'E')
6014                 {
6015                     ret = reganode(pRExC_state,DEFINEP,0);
6016                     RExC_parse +=6 ;
6017                     is_define = 1;
6018                     goto insert_if_check_paren;
6019                 }
6020                 else if (RExC_parse[0] == 'R') {
6021                     RExC_parse++;
6022                     parno = 0;
6023                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6024                         parno = atoi(RExC_parse++);
6025                         while (isDIGIT(*RExC_parse))
6026                             RExC_parse++;
6027                     } else if (RExC_parse[0] == '&') {
6028                         SV *sv_dat;
6029                         RExC_parse++;
6030                         sv_dat = reg_scan_name(pRExC_state,
6031                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6032                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6033                     }
6034                     ret = reganode(pRExC_state,INSUBP,parno);
6035                     goto insert_if_check_paren;
6036                 }
6037                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6038                     /* (?(1)...) */
6039                     char c;
6040                     parno = atoi(RExC_parse++);
6041
6042                     while (isDIGIT(*RExC_parse))
6043                         RExC_parse++;
6044                     ret = reganode(pRExC_state, GROUPP, parno);
6045
6046                  insert_if_check_paren:
6047                     if ((c = *nextchar(pRExC_state)) != ')')
6048                         vFAIL("Switch condition not recognized");
6049                   insert_if:
6050                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6051                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6052                     if (br == NULL)
6053                         br = reganode(pRExC_state, LONGJMP, 0);
6054                     else
6055                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6056                     c = *nextchar(pRExC_state);
6057                     if (flags&HASWIDTH)
6058                         *flagp |= HASWIDTH;
6059                     if (c == '|') {
6060                         if (is_define)
6061                             vFAIL("(?(DEFINE)....) does not allow branches");
6062                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6063                         regbranch(pRExC_state, &flags, 1,depth+1);
6064                         REGTAIL(pRExC_state, ret, lastbr);
6065                         if (flags&HASWIDTH)
6066                             *flagp |= HASWIDTH;
6067                         c = *nextchar(pRExC_state);
6068                     }
6069                     else
6070                         lastbr = NULL;
6071                     if (c != ')')
6072                         vFAIL("Switch (?(condition)... contains too many branches");
6073                     ender = reg_node(pRExC_state, TAIL);
6074                     REGTAIL(pRExC_state, br, ender);
6075                     if (lastbr) {
6076                         REGTAIL(pRExC_state, lastbr, ender);
6077                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6078                     }
6079                     else
6080                         REGTAIL(pRExC_state, ret, ender);
6081                     RExC_size++; /* XXX WHY do we need this?!!
6082                                     For large programs it seems to be required
6083                                     but I can't figure out why. -- dmq*/
6084                     return ret;
6085                 }
6086                 else {
6087                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6088                 }
6089             }
6090             case 0:
6091                 RExC_parse--; /* for vFAIL to print correctly */
6092                 vFAIL("Sequence (? incomplete");
6093                 break;
6094             default:
6095                 --RExC_parse;
6096                 parse_flags:      /* (?i) */
6097             {
6098                 U32 posflags = 0, negflags = 0;
6099                 U32 *flagsp = &posflags;
6100
6101                 while (*RExC_parse) {
6102                     /* && strchr("iogcmsx", *RExC_parse) */
6103                     /* (?g), (?gc) and (?o) are useless here
6104                        and must be globally applied -- japhy */
6105                     switch (*RExC_parse) {
6106                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6107                     case ONCE_PAT_MOD: /* 'o' */
6108                     case GLOBAL_PAT_MOD: /* 'g' */
6109                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6110                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6111                             if (! (wastedflags & wflagbit) ) {
6112                                 wastedflags |= wflagbit;
6113                                 vWARN5(
6114                                     RExC_parse + 1,
6115                                     "Useless (%s%c) - %suse /%c modifier",
6116                                     flagsp == &negflags ? "?-" : "?",
6117                                     *RExC_parse,
6118                                     flagsp == &negflags ? "don't " : "",
6119                                     *RExC_parse
6120                                 );
6121                             }
6122                         }
6123                         break;
6124
6125                     case CONTINUE_PAT_MOD: /* 'c' */
6126                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6127                             if (! (wastedflags & WASTED_C) ) {
6128                                 wastedflags |= WASTED_GC;
6129                                 vWARN3(
6130                                     RExC_parse + 1,
6131                                     "Useless (%sc) - %suse /gc modifier",
6132                                     flagsp == &negflags ? "?-" : "?",
6133                                     flagsp == &negflags ? "don't " : ""
6134                                 );
6135                             }
6136                         }
6137                         break;
6138                     case KEEPCOPY_PAT_MOD: /* 'p' */
6139                         if (flagsp == &negflags) {
6140                             if (SIZE_ONLY)
6141                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6142                         } else {
6143                             *flagsp |= RXf_PMf_KEEPCOPY;
6144                         }
6145                         break;
6146                     case '-':
6147                         if (flagsp == &negflags) {
6148                             RExC_parse++;
6149                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6150                             /*NOTREACHED*/
6151                         }
6152                         flagsp = &negflags;
6153                         wastedflags = 0;  /* reset so (?g-c) warns twice */
6154                         break;
6155                     case ':':
6156                         paren = ':';
6157                         /*FALLTHROUGH*/
6158                     case ')':
6159                         RExC_flags |= posflags;
6160                         RExC_flags &= ~negflags;
6161                         if (paren != ':') {
6162                             oregflags |= posflags;
6163                             oregflags &= ~negflags;
6164                         }
6165                         nextchar(pRExC_state);
6166                         if (paren != ':') {
6167                             *flagp = TRYAGAIN;
6168                             return NULL;
6169                         } else {
6170                             ret = NULL;
6171                             goto parse_rest;
6172                         }
6173                         /*NOTREACHED*/
6174                     default:
6175                         RExC_parse++;
6176                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6177                         /*NOTREACHED*/
6178                     }
6179                     ++RExC_parse;
6180                 }
6181             }} /* one for the default block, one for the switch */
6182         }
6183         else {                  /* (...) */
6184           capturing_parens:
6185             parno = RExC_npar;
6186             RExC_npar++;
6187
6188             ret = reganode(pRExC_state, OPEN, parno);
6189             if (!SIZE_ONLY ){
6190                 if (!RExC_nestroot)
6191                     RExC_nestroot = parno;
6192                 if (RExC_seen & REG_SEEN_RECURSE
6193                     && !RExC_open_parens[parno-1])
6194                 {
6195                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6196                         "Setting open paren #%"IVdf" to %d\n",
6197                         (IV)parno, REG_NODE_NUM(ret)));
6198                     RExC_open_parens[parno-1]= ret;
6199                 }
6200             }
6201             Set_Node_Length(ret, 1); /* MJD */
6202             Set_Node_Offset(ret, RExC_parse); /* MJD */
6203             is_open = 1;
6204         }
6205     }
6206     else                        /* ! paren */
6207         ret = NULL;
6208
6209    parse_rest:
6210     /* Pick up the branches, linking them together. */
6211     parse_start = RExC_parse;   /* MJD */
6212     br = regbranch(pRExC_state, &flags, 1,depth+1);
6213
6214     if (freeze_paren) {
6215         if (RExC_npar > after_freeze)
6216             after_freeze = RExC_npar;
6217         RExC_npar = freeze_paren;
6218     }
6219
6220     /*     branch_len = (paren != 0); */
6221
6222     if (br == NULL)
6223         return(NULL);
6224     if (*RExC_parse == '|') {
6225         if (!SIZE_ONLY && RExC_extralen) {
6226             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6227         }
6228         else {                  /* MJD */
6229             reginsert(pRExC_state, BRANCH, br, depth+1);
6230             Set_Node_Length(br, paren != 0);
6231             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6232         }
6233         have_branch = 1;
6234         if (SIZE_ONLY)
6235             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6236     }
6237     else if (paren == ':') {
6238         *flagp |= flags&SIMPLE;
6239     }
6240     if (is_open) {                              /* Starts with OPEN. */
6241         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6242     }
6243     else if (paren != '?')              /* Not Conditional */
6244         ret = br;
6245     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6246     lastbr = br;
6247     while (*RExC_parse == '|') {
6248         if (!SIZE_ONLY && RExC_extralen) {
6249             ender = reganode(pRExC_state, LONGJMP,0);
6250             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6251         }
6252         if (SIZE_ONLY)
6253             RExC_extralen += 2;         /* Account for LONGJMP. */
6254         nextchar(pRExC_state);
6255         if (freeze_paren) {
6256             if (RExC_npar > after_freeze)
6257                 after_freeze = RExC_npar;
6258             RExC_npar = freeze_paren;
6259         }
6260         br = regbranch(pRExC_state, &flags, 0, depth+1);
6261
6262         if (br == NULL)
6263             return(NULL);
6264         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6265         lastbr = br;
6266         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6267     }
6268
6269     if (have_branch || paren != ':') {
6270         /* Make a closing node, and hook it on the end. */
6271         switch (paren) {
6272         case ':':
6273             ender = reg_node(pRExC_state, TAIL);
6274             break;
6275         case 1:
6276             ender = reganode(pRExC_state, CLOSE, parno);
6277             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6278                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6279                         "Setting close paren #%"IVdf" to %d\n",
6280                         (IV)parno, REG_NODE_NUM(ender)));
6281                 RExC_close_parens[parno-1]= ender;
6282                 if (RExC_nestroot == parno)
6283                     RExC_nestroot = 0;
6284             }
6285             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6286             Set_Node_Length(ender,1); /* MJD */
6287             break;
6288         case '<':
6289         case ',':
6290         case '=':
6291         case '!':
6292             *flagp &= ~HASWIDTH;
6293             /* FALL THROUGH */
6294         case '>':
6295             ender = reg_node(pRExC_state, SUCCEED);
6296             break;
6297         case 0:
6298             ender = reg_node(pRExC_state, END);
6299             if (!SIZE_ONLY) {
6300                 assert(!RExC_opend); /* there can only be one! */
6301                 RExC_opend = ender;
6302             }
6303             break;
6304         }
6305         REGTAIL(pRExC_state, lastbr, ender);
6306
6307         if (have_branch && !SIZE_ONLY) {
6308             if (depth==1)
6309                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6310
6311             /* Hook the tails of the branches to the closing node. */
6312             for (br = ret; br; br = regnext(br)) {
6313                 const U8 op = PL_regkind[OP(br)];
6314                 if (op == BRANCH) {
6315                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6316                 }
6317                 else if (op == BRANCHJ) {
6318                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6319                 }
6320             }
6321         }
6322     }
6323
6324     {
6325         const char *p;
6326         static const char parens[] = "=!<,>";
6327
6328         if (paren && (p = strchr(parens, paren))) {
6329             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6330             int flag = (p - parens) > 1;
6331
6332             if (paren == '>')
6333                 node = SUSPEND, flag = 0;
6334             reginsert(pRExC_state, node,ret, depth+1);
6335             Set_Node_Cur_Length(ret);
6336             Set_Node_Offset(ret, parse_start + 1);
6337             ret->flags = flag;
6338             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6339         }
6340     }
6341
6342     /* Check for proper termination. */
6343     if (paren) {
6344         RExC_flags = oregflags;
6345         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6346             RExC_parse = oregcomp_parse;
6347             vFAIL("Unmatched (");
6348         }
6349     }
6350     else if (!paren && RExC_parse < RExC_end) {
6351         if (*RExC_parse == ')') {
6352             RExC_parse++;
6353             vFAIL("Unmatched )");
6354         }
6355         else
6356             FAIL("Junk on end of regexp");      /* "Can't happen". */
6357         /* NOTREACHED */
6358     }
6359     if (after_freeze)
6360         RExC_npar = after_freeze;
6361     return(ret);
6362 }
6363
6364 /*
6365  - regbranch - one alternative of an | operator
6366  *
6367  * Implements the concatenation operator.
6368  */
6369 STATIC regnode *
6370 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6371 {
6372     dVAR;
6373     register regnode *ret;
6374     register regnode *chain = NULL;
6375     register regnode *latest;
6376     I32 flags = 0, c = 0;
6377     GET_RE_DEBUG_FLAGS_DECL;
6378
6379     PERL_ARGS_ASSERT_REGBRANCH;
6380
6381     DEBUG_PARSE("brnc");
6382
6383     if (first)
6384         ret = NULL;
6385     else {
6386         if (!SIZE_ONLY && RExC_extralen)
6387             ret = reganode(pRExC_state, BRANCHJ,0);
6388         else {
6389             ret = reg_node(pRExC_state, BRANCH);
6390             Set_Node_Length(ret, 1);
6391         }
6392     }
6393
6394     if (!first && SIZE_ONLY)
6395         RExC_extralen += 1;                     /* BRANCHJ */
6396
6397     *flagp = WORST;                     /* Tentatively. */
6398
6399     RExC_parse--;
6400     nextchar(pRExC_state);
6401     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6402         flags &= ~TRYAGAIN;
6403         latest = regpiece(pRExC_state, &flags,depth+1);
6404         if (latest == NULL) {
6405             if (flags & TRYAGAIN)
6406                 continue;
6407             return(NULL);
6408         }
6409         else if (ret == NULL)
6410             ret = latest;
6411         *flagp |= flags&(HASWIDTH|POSTPONED);
6412         if (chain == NULL)      /* First piece. */
6413             *flagp |= flags&SPSTART;
6414         else {
6415             RExC_naughty++;
6416             REGTAIL(pRExC_state, chain, latest);
6417         }
6418         chain = latest;
6419         c++;
6420     }
6421     if (chain == NULL) {        /* Loop ran zero times. */
6422         chain = reg_node(pRExC_state, NOTHING);
6423         if (ret == NULL)
6424             ret = chain;
6425     }
6426     if (c == 1) {
6427         *flagp |= flags&SIMPLE;
6428     }
6429
6430     return ret;
6431 }
6432
6433 /*
6434  - regpiece - something followed by possible [*+?]
6435  *
6436  * Note that the branching code sequences used for ? and the general cases
6437  * of * and + are somewhat optimized:  they use the same NOTHING node as
6438  * both the endmarker for their branch list and the body of the last branch.
6439  * It might seem that this node could be dispensed with entirely, but the
6440  * endmarker role is not redundant.
6441  */
6442 STATIC regnode *
6443 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6444 {
6445     dVAR;
6446     register regnode *ret;
6447     register char op;
6448     register char *next;
6449     I32 flags;
6450     const char * const origparse = RExC_parse;
6451     I32 min;
6452     I32 max = REG_INFTY;
6453     char *parse_start;
6454     const char *maxpos = NULL;
6455     GET_RE_DEBUG_FLAGS_DECL;
6456
6457     PERL_ARGS_ASSERT_REGPIECE;
6458
6459     DEBUG_PARSE("piec");
6460
6461     ret = regatom(pRExC_state, &flags,depth+1);
6462     if (ret == NULL) {
6463         if (flags & TRYAGAIN)
6464             *flagp |= TRYAGAIN;
6465         return(NULL);
6466     }
6467
6468     op = *RExC_parse;
6469
6470     if (op == '{' && regcurly(RExC_parse)) {
6471         maxpos = NULL;
6472         parse_start = RExC_parse; /* MJD */
6473         next = RExC_parse + 1;
6474         while (isDIGIT(*next) || *next == ',') {
6475             if (*next == ',') {
6476                 if (maxpos)
6477                     break;
6478                 else
6479                     maxpos = next;
6480             }
6481             next++;
6482         }
6483         if (*next == '}') {             /* got one */
6484             if (!maxpos)
6485                 maxpos = next;
6486             RExC_parse++;
6487             min = atoi(RExC_parse);
6488             if (*maxpos == ',')
6489                 maxpos++;
6490             else
6491                 maxpos = RExC_parse;
6492             max = atoi(maxpos);
6493             if (!max && *maxpos != '0')
6494                 max = REG_INFTY;                /* meaning "infinity" */
6495             else if (max >= REG_INFTY)
6496                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6497             RExC_parse = next;
6498             nextchar(pRExC_state);
6499
6500         do_curly:
6501             if ((flags&SIMPLE)) {
6502                 RExC_naughty += 2 + RExC_naughty / 2;
6503                 reginsert(pRExC_state, CURLY, ret, depth+1);
6504                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6505                 Set_Node_Cur_Length(ret);
6506             }
6507             else {
6508                 regnode * const w = reg_node(pRExC_state, WHILEM);
6509
6510                 w->flags = 0;
6511                 REGTAIL(pRExC_state, ret, w);
6512                 if (!SIZE_ONLY && RExC_extralen) {
6513                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6514                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6515                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6516                 }
6517                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6518                                 /* MJD hk */
6519                 Set_Node_Offset(ret, parse_start+1);
6520                 Set_Node_Length(ret,
6521                                 op == '{' ? (RExC_parse - parse_start) : 1);
6522
6523                 if (!SIZE_ONLY && RExC_extralen)
6524                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6525                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6526                 if (SIZE_ONLY)
6527                     RExC_whilem_seen++, RExC_extralen += 3;
6528                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6529             }
6530             ret->flags = 0;
6531
6532             if (min > 0)
6533                 *flagp = WORST;
6534             if (max > 0)
6535                 *flagp |= HASWIDTH;
6536             if (max < min)
6537                 vFAIL("Can't do {n,m} with n > m");
6538             if (!SIZE_ONLY) {
6539                 ARG1_SET(ret, (U16)min);
6540                 ARG2_SET(ret, (U16)max);
6541             }
6542
6543             goto nest_check;
6544         }
6545     }
6546
6547     if (!ISMULT1(op)) {
6548         *flagp = flags;
6549         return(ret);
6550     }
6551
6552 #if 0                           /* Now runtime fix should be reliable. */
6553
6554     /* if this is reinstated, don't forget to put this back into perldiag:
6555
6556             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6557
6558            (F) The part of the regexp subject to either the * or + quantifier
6559            could match an empty string. The {#} shows in the regular
6560            expression about where the problem was discovered.
6561
6562     */
6563
6564     if (!(flags&HASWIDTH) && op != '?')
6565       vFAIL("Regexp *+ operand could be empty");
6566 #endif
6567
6568     parse_start = RExC_parse;
6569     nextchar(pRExC_state);
6570
6571     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6572
6573     if (op == '*' && (flags&SIMPLE)) {
6574         reginsert(pRExC_state, STAR, ret, depth+1);
6575         ret->flags = 0;
6576         RExC_naughty += 4;
6577     }
6578     else if (op == '*') {
6579         min = 0;
6580         goto do_curly;
6581     }
6582     else if (op == '+' && (flags&SIMPLE)) {
6583         reginsert(pRExC_state, PLUS, ret, depth+1);
6584         ret->flags = 0;
6585         RExC_naughty += 3;
6586     }
6587     else if (op == '+') {
6588         min = 1;
6589         goto do_curly;
6590     }
6591     else if (op == '?') {
6592         min = 0; max = 1;
6593         goto do_curly;
6594     }
6595   nest_check:
6596     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6597         ckWARN3reg(RExC_parse,
6598                    "%.*s matches null string many times",
6599                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6600                    origparse);
6601     }
6602
6603     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6604         nextchar(pRExC_state);
6605         reginsert(pRExC_state, MINMOD, ret, depth+1);
6606         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6607     }
6608 #ifndef REG_ALLOW_MINMOD_SUSPEND
6609     else
6610 #endif
6611     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6612         regnode *ender;
6613         nextchar(pRExC_state);
6614         ender = reg_node(pRExC_state, SUCCEED);
6615         REGTAIL(pRExC_state, ret, ender);
6616         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6617         ret->flags = 0;
6618         ender = reg_node(pRExC_state, TAIL);
6619         REGTAIL(pRExC_state, ret, ender);
6620         /*ret= ender;*/
6621     }
6622
6623     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6624         RExC_parse++;
6625         vFAIL("Nested quantifiers");
6626     }
6627
6628     return(ret);
6629 }
6630
6631
6632 /* reg_namedseq(pRExC_state,UVp)
6633
6634    This is expected to be called by a parser routine that has
6635    recognized '\N' and needs to handle the rest. RExC_parse is
6636    expected to point at the first char following the N at the time
6637    of the call.
6638
6639    The \N may be inside (indicated by valuep not being NULL) or outside a
6640    character class.
6641
6642    \N may begin either a named sequence, or if outside a character class, mean
6643    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6644    attempted to decide which, and in the case of a named sequence converted it
6645    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6646    where c1... are the characters in the sequence.  For single-quoted regexes,
6647    the tokenizer passes the \N sequence through unchanged; this code will not
6648    attempt to determine this nor expand those.  The net effect is that if the
6649    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6650    signals that this \N occurrence means to match a non-newline.
6651
6652    Only the \N{U+...} form should occur in a character class, for the same
6653    reason that '.' inside a character class means to just match a period: it
6654    just doesn't make sense.
6655
6656    If valuep is non-null then it is assumed that we are parsing inside
6657    of a charclass definition and the first codepoint in the resolved
6658    string is returned via *valuep and the routine will return NULL.
6659    In this mode if a multichar string is returned from the charnames
6660    handler, a warning will be issued, and only the first char in the
6661    sequence will be examined. If the string returned is zero length
6662    then the value of *valuep is undefined and NON-NULL will
6663    be returned to indicate failure. (This will NOT be a valid pointer
6664    to a regnode.)
6665
6666    If valuep is null then it is assumed that we are parsing normal text and a
6667    new EXACT node is inserted into the program containing the resolved string,
6668    and a pointer to the new node is returned.  But if the string is zero length
6669    a NOTHING node is emitted instead.
6670
6671    On success RExC_parse is set to the char following the endbrace.
6672    Parsing failures will generate a fatal error via vFAIL(...)
6673  */
6674 STATIC regnode *
6675 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6676 {
6677     char * endbrace;    /* '}' following the name */
6678     regnode *ret = NULL;
6679 #ifdef DEBUGGING
6680     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
6681 #endif
6682     char* p;
6683
6684     GET_RE_DEBUG_FLAGS_DECL;
6685
6686     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6687
6688     GET_RE_DEBUG_FLAGS;
6689
6690     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6691      * modifier.  The other meaning does not */
6692     p = (RExC_flags & RXf_PMf_EXTENDED)
6693         ? regwhite( pRExC_state, RExC_parse )
6694         : RExC_parse;
6695
6696     /* Disambiguate between \N meaning a named character versus \N meaning
6697      * [^\n].  The former is assumed when it can't be the latter. */
6698     if (*p != '{' || regcurly(p)) {
6699         RExC_parse = p;
6700         if (valuep) {
6701             /* no bare \N in a charclass */
6702             vFAIL("\\N in a character class must be a named character: \\N{...}");
6703         }
6704         nextchar(pRExC_state);
6705         ret = reg_node(pRExC_state, REG_ANY);
6706         *flagp |= HASWIDTH|SIMPLE;
6707         RExC_naughty++;
6708         RExC_parse--;
6709         Set_Node_Length(ret, 1); /* MJD */
6710         return ret;
6711     }
6712
6713     /* Here, we have decided it should be a named sequence */
6714
6715     /* The test above made sure that the next real character is a '{', but
6716      * under the /x modifier, it could be separated by space (or a comment and
6717      * \n) and this is not allowed (for consistency with \x{...} and the
6718      * tokenizer handling of \N{NAME}). */
6719     if (*RExC_parse != '{') {
6720         vFAIL("Missing braces on \\N{}");
6721     }
6722
6723     RExC_parse++;       /* Skip past the '{' */
6724
6725     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6726         || ! (endbrace == RExC_parse            /* nothing between the {} */
6727               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
6728                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6729     {
6730         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
6731         vFAIL("\\N{NAME} must be resolved by the lexer");
6732     }
6733
6734     if (endbrace == RExC_parse) {   /* empty: \N{} */
6735         if (! valuep) {
6736             RExC_parse = endbrace + 1;
6737             return reg_node(pRExC_state,NOTHING);
6738         }
6739
6740         if (SIZE_ONLY) {
6741             ckWARNreg(RExC_parse,
6742                     "Ignoring zero length \\N{} in character class"
6743             );
6744             RExC_parse = endbrace + 1;
6745         }
6746         *valuep = 0;
6747         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6748     }
6749
6750     RExC_utf8 = 1;      /* named sequences imply Unicode semantics */
6751     RExC_parse += 2;    /* Skip past the 'U+' */
6752
6753     if (valuep) {   /* In a bracketed char class */
6754         /* We only pay attention to the first char of
6755         multichar strings being returned. I kinda wonder
6756         if this makes sense as it does change the behaviour
6757         from earlier versions, OTOH that behaviour was broken
6758         as well. XXX Solution is to recharacterize as
6759         [rest-of-class]|multi1|multi2... */
6760
6761         STRLEN length_of_hex;
6762         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6763             | PERL_SCAN_DISALLOW_PREFIX
6764             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6765
6766         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6767         if (endchar < endbrace) {
6768             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6769         }
6770
6771         length_of_hex = (STRLEN)(endchar - RExC_parse);
6772         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6773
6774         /* The tokenizer should have guaranteed validity, but it's possible to
6775          * bypass it by using single quoting, so check */
6776         if (length_of_hex == 0
6777             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6778         {
6779             RExC_parse += length_of_hex;        /* Includes all the valid */
6780             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
6781                             ? UTF8SKIP(RExC_parse)
6782                             : 1;
6783             /* Guard against malformed utf8 */
6784             if (RExC_parse >= endchar) RExC_parse = endchar;
6785             vFAIL("Invalid hexadecimal number in \\N{U+...}");
6786         }
6787
6788         RExC_parse = endbrace + 1;
6789         if (endchar == endbrace) return NULL;
6790
6791         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
6792     }
6793     else {      /* Not a char class */
6794         char *s;            /* String to put in generated EXACT node */
6795         STRLEN len = 0;     /* Its current length */
6796         char *endchar;      /* Points to '.' or '}' ending cur char in the input
6797                                stream */
6798
6799         ret = reg_node(pRExC_state,
6800                         (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6801         s= STRING(ret);
6802
6803         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
6804          * the input which is of the form now 'c1.c2.c3...}' until find the
6805          * ending brace or exeed length 255.  The characters that exceed this
6806          * limit are dropped.  The limit could be relaxed should it become
6807          * desirable by reparsing this as (?:\N{NAME}), so could generate
6808          * multiple EXACT nodes, as is done for just regular input.  But this
6809          * is primarily a named character, and not intended to be a huge long
6810          * string, so 255 bytes should be good enough */
6811         while (1) {
6812             STRLEN length_of_hex;
6813             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6814                             | PERL_SCAN_DISALLOW_PREFIX
6815                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6816             UV cp;  /* Ord of current character */
6817
6818             /* Code points are separated by dots.  If none, there is only one
6819              * code point, and is terminated by the brace */
6820             endchar = RExC_parse + strcspn(RExC_parse, ".}");
6821
6822             /* The values are Unicode even on EBCDIC machines */
6823             length_of_hex = (STRLEN)(endchar - RExC_parse);
6824             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6825             if ( length_of_hex == 0
6826                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6827             {
6828                 RExC_parse += length_of_hex;        /* Includes all the valid */
6829                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
6830                                 ? UTF8SKIP(RExC_parse)
6831                                 : 1;
6832                 /* Guard against malformed utf8 */
6833                 if (RExC_parse >= endchar) RExC_parse = endchar;
6834                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
6835             }
6836
6837             if (! FOLD) {       /* Not folding, just append to the string */
6838                 STRLEN unilen;
6839
6840                 /* Quit before adding this character if would exceed limit */
6841                 if (len + UNISKIP(cp) > U8_MAX) break;
6842
6843                 unilen = reguni(pRExC_state, cp, s);
6844                 if (unilen > 0) {
6845                     s   += unilen;
6846                     len += unilen;
6847                 }
6848             } else {    /* Folding, output the folded equivalent */
6849                 STRLEN foldlen,numlen;
6850                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6851                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6852
6853                 /* Quit before exceeding size limit */
6854                 if (len + foldlen > U8_MAX) break;
6855
6856                 for (foldbuf = tmpbuf;
6857                     foldlen;
6858                     foldlen -= numlen)
6859                 {
6860                     cp = utf8_to_uvchr(foldbuf, &numlen);
6861                     if (numlen > 0) {
6862                         const STRLEN unilen = reguni(pRExC_state, cp, s);
6863                         s       += unilen;
6864                         len     += unilen;
6865                         /* In EBCDIC the numlen and unilen can differ. */
6866                         foldbuf += numlen;
6867                         if (numlen >= foldlen)
6868                             break;
6869                     }
6870                     else
6871                         break; /* "Can't happen." */
6872                 }
6873             }
6874
6875             /* Point to the beginning of the next character in the sequence. */
6876             RExC_parse = endchar + 1;
6877
6878             /* Quit if no more characters */
6879             if (RExC_parse >= endbrace) break;
6880         }
6881
6882
6883         if (SIZE_ONLY) {
6884             if (RExC_parse < endbrace) {
6885                 ckWARNreg(RExC_parse - 1,
6886                           "Using just the first characters returned by \\N{}");
6887             }
6888
6889             RExC_size += STR_SZ(len);
6890         } else {
6891             STR_LEN(ret) = len;
6892             RExC_emit += STR_SZ(len);
6893         }
6894
6895         RExC_parse = endbrace + 1;
6896
6897         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6898                                with malformed in t/re/pat_advanced.t */
6899         RExC_parse --;
6900         Set_Node_Cur_Length(ret); /* MJD */
6901         nextchar(pRExC_state);
6902     }
6903
6904     return ret;
6905 }
6906
6907
6908 /*
6909  * reg_recode
6910  *
6911  * It returns the code point in utf8 for the value in *encp.
6912  *    value: a code value in the source encoding
6913  *    encp:  a pointer to an Encode object
6914  *
6915  * If the result from Encode is not a single character,
6916  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6917  */
6918 STATIC UV
6919 S_reg_recode(pTHX_ const char value, SV **encp)
6920 {
6921     STRLEN numlen = 1;
6922     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6923     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6924     const STRLEN newlen = SvCUR(sv);
6925     UV uv = UNICODE_REPLACEMENT;
6926
6927     PERL_ARGS_ASSERT_REG_RECODE;
6928
6929     if (newlen)
6930         uv = SvUTF8(sv)
6931              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6932              : *(U8*)s;
6933
6934     if (!newlen || numlen != newlen) {
6935         uv = UNICODE_REPLACEMENT;
6936         *encp = NULL;
6937     }
6938     return uv;
6939 }
6940
6941
6942 /*
6943  - regatom - the lowest level
6944
6945    Try to identify anything special at the start of the pattern. If there
6946    is, then handle it as required. This may involve generating a single regop,
6947    such as for an assertion; or it may involve recursing, such as to
6948    handle a () structure.
6949
6950    If the string doesn't start with something special then we gobble up
6951    as much literal text as we can.
6952
6953    Once we have been able to handle whatever type of thing started the
6954    sequence, we return.
6955
6956    Note: we have to be careful with escapes, as they can be both literal
6957    and special, and in the case of \10 and friends can either, depending
6958    on context. Specifically there are two seperate switches for handling
6959    escape sequences, with the one for handling literal escapes requiring
6960    a dummy entry for all of the special escapes that are actually handled
6961    by the other.
6962 */
6963
6964 STATIC regnode *
6965 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6966 {
6967     dVAR;
6968     register regnode *ret = NULL;
6969     I32 flags;
6970     char *parse_start = RExC_parse;
6971     GET_RE_DEBUG_FLAGS_DECL;
6972     DEBUG_PARSE("atom");
6973     *flagp = WORST;             /* Tentatively. */
6974
6975     PERL_ARGS_ASSERT_REGATOM;
6976
6977 tryagain:
6978     switch ((U8)*RExC_parse) {
6979     case '^':
6980         RExC_seen_zerolen++;
6981         nextchar(pRExC_state);
6982         if (RExC_flags & RXf_PMf_MULTILINE)
6983             ret = reg_node(pRExC_state, MBOL);
6984         else if (RExC_flags & RXf_PMf_SINGLELINE)
6985             ret = reg_node(pRExC_state, SBOL);
6986         else
6987             ret = reg_node(pRExC_state, BOL);
6988         Set_Node_Length(ret, 1); /* MJD */
6989         break;
6990     case '$':
6991         nextchar(pRExC_state);
6992         if (*RExC_parse)
6993             RExC_seen_zerolen++;
6994         if (RExC_flags & RXf_PMf_MULTILINE)
6995             ret = reg_node(pRExC_state, MEOL);
6996         else if (RExC_flags & RXf_PMf_SINGLELINE)
6997             ret = reg_node(pRExC_state, SEOL);
6998         else
6999             ret = reg_node(pRExC_state, EOL);
7000         Set_Node_Length(ret, 1); /* MJD */
7001         break;
7002     case '.':
7003         nextchar(pRExC_state);
7004         if (RExC_flags & RXf_PMf_SINGLELINE)
7005             ret = reg_node(pRExC_state, SANY);
7006         else
7007             ret = reg_node(pRExC_state, REG_ANY);
7008         *flagp |= HASWIDTH|SIMPLE;
7009         RExC_naughty++;
7010         Set_Node_Length(ret, 1); /* MJD */
7011         break;
7012     case '[':
7013     {
7014         char * const oregcomp_parse = ++RExC_parse;
7015         ret = regclass(pRExC_state,depth+1);
7016         if (*RExC_parse != ']') {
7017             RExC_parse = oregcomp_parse;
7018             vFAIL("Unmatched [");
7019         }
7020         nextchar(pRExC_state);
7021         *flagp |= HASWIDTH|SIMPLE;
7022         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7023         break;
7024     }
7025     case '(':
7026         nextchar(pRExC_state);
7027         ret = reg(pRExC_state, 1, &flags,depth+1);
7028         if (ret == NULL) {
7029                 if (flags & TRYAGAIN) {
7030                     if (RExC_parse == RExC_end) {
7031                          /* Make parent create an empty node if needed. */
7032                         *flagp |= TRYAGAIN;
7033                         return(NULL);
7034                     }
7035                     goto tryagain;
7036                 }
7037                 return(NULL);
7038         }
7039         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7040         break;
7041     case '|':
7042     case ')':
7043         if (flags & TRYAGAIN) {
7044             *flagp |= TRYAGAIN;
7045             return NULL;
7046         }
7047         vFAIL("Internal urp");
7048                                 /* Supposed to be caught earlier. */
7049         break;
7050     case '{':
7051         if (!regcurly(RExC_parse)) {
7052             RExC_parse++;
7053             goto defchar;
7054         }
7055         /* FALL THROUGH */
7056     case '?':
7057     case '+':
7058     case '*':
7059         RExC_parse++;
7060         vFAIL("Quantifier follows nothing");
7061         break;
7062     case 0xDF:
7063     case 0xC3:
7064     case 0xCE:
7065         do_foldchar:
7066         if (!LOC && FOLD) {
7067             U32 len,cp;
7068             len=0; /* silence a spurious compiler warning */
7069             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7070                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7071                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7072                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7073                 Set_Node_Length(ret, 1); /* MJD */
7074                 nextchar(pRExC_state); /* kill whitespace under /x */
7075                 return ret;
7076             }
7077         }
7078         goto outer_default;
7079     case '\\':
7080         /* Special Escapes
7081
7082            This switch handles escape sequences that resolve to some kind
7083            of special regop and not to literal text. Escape sequnces that
7084            resolve to literal text are handled below in the switch marked
7085            "Literal Escapes".
7086
7087            Every entry in this switch *must* have a corresponding entry
7088            in the literal escape switch. However, the opposite is not
7089            required, as the default for this switch is to jump to the
7090            literal text handling code.
7091         */
7092         switch ((U8)*++RExC_parse) {
7093         case 0xDF:
7094         case 0xC3:
7095         case 0xCE:
7096                    goto do_foldchar;
7097         /* Special Escapes */
7098         case 'A':
7099             RExC_seen_zerolen++;
7100             ret = reg_node(pRExC_state, SBOL);
7101             *flagp |= SIMPLE;
7102             goto finish_meta_pat;
7103         case 'G':
7104             ret = reg_node(pRExC_state, GPOS);
7105             RExC_seen |= REG_SEEN_GPOS;
7106             *flagp |= SIMPLE;
7107             goto finish_meta_pat;
7108         case 'K':
7109             RExC_seen_zerolen++;
7110             ret = reg_node(pRExC_state, KEEPS);
7111             *flagp |= SIMPLE;
7112             /* XXX:dmq : disabling in-place substitution seems to
7113              * be necessary here to avoid cases of memory corruption, as
7114              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7115              */
7116             RExC_seen |= REG_SEEN_LOOKBEHIND;
7117             goto finish_meta_pat;
7118         case 'Z':
7119             ret = reg_node(pRExC_state, SEOL);
7120             *flagp |= SIMPLE;
7121             RExC_seen_zerolen++;                /* Do not optimize RE away */
7122             goto finish_meta_pat;
7123         case 'z':
7124             ret = reg_node(pRExC_state, EOS);
7125             *flagp |= SIMPLE;
7126             RExC_seen_zerolen++;                /* Do not optimize RE away */
7127             goto finish_meta_pat;
7128         case 'C':
7129             ret = reg_node(pRExC_state, CANY);
7130             RExC_seen |= REG_SEEN_CANY;
7131             *flagp |= HASWIDTH|SIMPLE;
7132             goto finish_meta_pat;
7133         case 'X':
7134             ret = reg_node(pRExC_state, CLUMP);
7135             *flagp |= HASWIDTH;
7136             goto finish_meta_pat;
7137         case 'w':
7138             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
7139             *flagp |= HASWIDTH|SIMPLE;
7140             goto finish_meta_pat;
7141         case 'W':
7142             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
7143             *flagp |= HASWIDTH|SIMPLE;
7144             goto finish_meta_pat;
7145         case 'b':
7146             RExC_seen_zerolen++;
7147             RExC_seen |= REG_SEEN_LOOKBEHIND;
7148             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
7149             *flagp |= SIMPLE;
7150             goto finish_meta_pat;
7151         case 'B':
7152             RExC_seen_zerolen++;
7153             RExC_seen |= REG_SEEN_LOOKBEHIND;
7154             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
7155             *flagp |= SIMPLE;
7156             goto finish_meta_pat;
7157         case 's':
7158             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
7159             *flagp |= HASWIDTH|SIMPLE;
7160             goto finish_meta_pat;
7161         case 'S':
7162             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
7163             *flagp |= HASWIDTH|SIMPLE;
7164             goto finish_meta_pat;
7165         case 'd':
7166             ret = reg_node(pRExC_state, DIGIT);
7167             *flagp |= HASWIDTH|SIMPLE;
7168             goto finish_meta_pat;
7169         case 'D':
7170             ret = reg_node(pRExC_state, NDIGIT);
7171             *flagp |= HASWIDTH|SIMPLE;
7172             goto finish_meta_pat;
7173         case 'R':
7174             ret = reg_node(pRExC_state, LNBREAK);
7175             *flagp |= HASWIDTH|SIMPLE;
7176             goto finish_meta_pat;
7177         case 'h':
7178             ret = reg_node(pRExC_state, HORIZWS);
7179             *flagp |= HASWIDTH|SIMPLE;
7180             goto finish_meta_pat;
7181         case 'H':
7182             ret = reg_node(pRExC_state, NHORIZWS);
7183             *flagp |= HASWIDTH|SIMPLE;
7184             goto finish_meta_pat;
7185         case 'v':
7186             ret = reg_node(pRExC_state, VERTWS);
7187             *flagp |= HASWIDTH|SIMPLE;
7188             goto finish_meta_pat;
7189         case 'V':
7190             ret = reg_node(pRExC_state, NVERTWS);
7191             *flagp |= HASWIDTH|SIMPLE;
7192          finish_meta_pat:
7193             nextchar(pRExC_state);
7194             Set_Node_Length(ret, 2); /* MJD */
7195             break;
7196         case 'p':
7197         case 'P':
7198             {
7199                 char* const oldregxend = RExC_end;
7200 #ifdef DEBUGGING
7201                 char* parse_start = RExC_parse - 2;
7202 #endif
7203
7204                 if (RExC_parse[1] == '{') {
7205                   /* a lovely hack--pretend we saw [\pX] instead */
7206                     RExC_end = strchr(RExC_parse, '}');
7207                     if (!RExC_end) {
7208                         const U8 c = (U8)*RExC_parse;
7209                         RExC_parse += 2;
7210                         RExC_end = oldregxend;
7211                         vFAIL2("Missing right brace on \\%c{}", c);
7212                     }
7213                     RExC_end++;
7214                 }
7215                 else {
7216                     RExC_end = RExC_parse + 2;
7217                     if (RExC_end > oldregxend)
7218                         RExC_end = oldregxend;
7219                 }
7220                 RExC_parse--;
7221
7222                 ret = regclass(pRExC_state,depth+1);
7223
7224                 RExC_end = oldregxend;
7225                 RExC_parse--;
7226
7227                 Set_Node_Offset(ret, parse_start + 2);
7228                 Set_Node_Cur_Length(ret);
7229                 nextchar(pRExC_state);
7230                 *flagp |= HASWIDTH|SIMPLE;
7231             }
7232             break;
7233         case 'N':
7234             /* Handle \N and \N{NAME} here and not below because it can be
7235             multicharacter. join_exact() will join them up later on.
7236             Also this makes sure that things like /\N{BLAH}+/ and
7237             \N{BLAH} being multi char Just Happen. dmq*/
7238             ++RExC_parse;
7239             ret= reg_namedseq(pRExC_state, NULL, flagp);
7240             break;
7241         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7242         parse_named_seq:
7243         {
7244             char ch= RExC_parse[1];
7245             if (ch != '<' && ch != '\'' && ch != '{') {
7246                 RExC_parse++;
7247                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7248             } else {
7249                 /* this pretty much dupes the code for (?P=...) in reg(), if
7250                    you change this make sure you change that */
7251                 char* name_start = (RExC_parse += 2);
7252                 U32 num = 0;
7253                 SV *sv_dat = reg_scan_name(pRExC_state,
7254                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7255                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7256                 if (RExC_parse == name_start || *RExC_parse != ch)
7257                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7258
7259                 if (!SIZE_ONLY) {
7260                     num = add_data( pRExC_state, 1, "S" );
7261                     RExC_rxi->data->data[num]=(void*)sv_dat;
7262                     SvREFCNT_inc_simple_void(sv_dat);
7263                 }
7264
7265                 RExC_sawback = 1;
7266                 ret = reganode(pRExC_state,
7267                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7268                            num);
7269                 *flagp |= HASWIDTH;
7270
7271                 /* override incorrect value set in reganode MJD */
7272                 Set_Node_Offset(ret, parse_start+1);
7273                 Set_Node_Cur_Length(ret); /* MJD */
7274                 nextchar(pRExC_state);
7275
7276             }
7277             break;
7278         }
7279         case 'g':
7280         case '1': case '2': case '3': case '4':
7281         case '5': case '6': case '7': case '8': case '9':
7282             {
7283                 I32 num;
7284                 bool isg = *RExC_parse == 'g';
7285                 bool isrel = 0;
7286                 bool hasbrace = 0;
7287                 if (isg) {
7288                     RExC_parse++;
7289                     if (*RExC_parse == '{') {
7290                         RExC_parse++;
7291                         hasbrace = 1;
7292                     }
7293                     if (*RExC_parse == '-') {
7294                         RExC_parse++;
7295                         isrel = 1;
7296                     }
7297                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7298                         if (isrel) RExC_parse--;
7299                         RExC_parse -= 2;
7300                         goto parse_named_seq;
7301                 }   }
7302                 num = atoi(RExC_parse);
7303                 if (isg && num == 0)
7304                     vFAIL("Reference to invalid group 0");
7305                 if (isrel) {
7306                     num = RExC_npar - num;
7307                     if (num < 1)
7308                         vFAIL("Reference to nonexistent or unclosed group");
7309                 }
7310                 if (!isg && num > 9 && num >= RExC_npar)
7311                     goto defchar;
7312                 else {
7313                     char * const parse_start = RExC_parse - 1; /* MJD */
7314                     while (isDIGIT(*RExC_parse))
7315                         RExC_parse++;
7316                     if (parse_start == RExC_parse - 1)
7317                         vFAIL("Unterminated \\g... pattern");
7318                     if (hasbrace) {
7319                         if (*RExC_parse != '}')
7320                             vFAIL("Unterminated \\g{...} pattern");
7321                         RExC_parse++;
7322                     }
7323                     if (!SIZE_ONLY) {
7324                         if (num > (I32)RExC_rx->nparens)
7325                             vFAIL("Reference to nonexistent group");
7326                     }
7327                     RExC_sawback = 1;
7328                     ret = reganode(pRExC_state,
7329                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7330                                    num);
7331                     *flagp |= HASWIDTH;
7332
7333                     /* override incorrect value set in reganode MJD */
7334                     Set_Node_Offset(ret, parse_start+1);
7335                     Set_Node_Cur_Length(ret); /* MJD */
7336                     RExC_parse--;
7337                     nextchar(pRExC_state);
7338                 }
7339             }
7340             break;
7341         case '\0':
7342             if (RExC_parse >= RExC_end)
7343                 FAIL("Trailing \\");
7344             /* FALL THROUGH */
7345         default:
7346             /* Do not generate "unrecognized" warnings here, we fall
7347                back into the quick-grab loop below */
7348             parse_start--;
7349             goto defchar;
7350         }
7351         break;
7352
7353     case '#':
7354         if (RExC_flags & RXf_PMf_EXTENDED) {
7355             if ( reg_skipcomment( pRExC_state ) )
7356                 goto tryagain;
7357         }
7358         /* FALL THROUGH */
7359
7360     default:
7361         outer_default:{
7362             register STRLEN len;
7363             register UV ender;
7364             register char *p;
7365             char *s;
7366             STRLEN foldlen;
7367             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7368
7369             parse_start = RExC_parse - 1;
7370
7371             RExC_parse++;
7372
7373         defchar:
7374             ender = 0;
7375             ret = reg_node(pRExC_state,
7376                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7377             s = STRING(ret);
7378             for (len = 0, p = RExC_parse - 1;
7379               len < 127 && p < RExC_end;
7380               len++)
7381             {
7382                 char * const oldp = p;
7383
7384                 if (RExC_flags & RXf_PMf_EXTENDED)
7385                     p = regwhite( pRExC_state, p );
7386                 switch ((U8)*p) {
7387                 case 0xDF:
7388                 case 0xC3:
7389                 case 0xCE:
7390                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7391                                 goto normal_default;
7392                 case '^':
7393                 case '$':
7394                 case '.':
7395                 case '[':
7396                 case '(':
7397                 case ')':
7398                 case '|':
7399                     goto loopdone;
7400                 case '\\':
7401                     /* Literal Escapes Switch
7402
7403                        This switch is meant to handle escape sequences that
7404                        resolve to a literal character.
7405
7406                        Every escape sequence that represents something
7407                        else, like an assertion or a char class, is handled
7408                        in the switch marked 'Special Escapes' above in this
7409                        routine, but also has an entry here as anything that
7410                        isn't explicitly mentioned here will be treated as
7411                        an unescaped equivalent literal.
7412                     */
7413
7414                     switch ((U8)*++p) {
7415                     /* These are all the special escapes. */
7416                     case 0xDF:
7417                     case 0xC3:
7418                     case 0xCE:
7419                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7420                                 goto normal_default;
7421                     case 'A':             /* Start assertion */
7422                     case 'b': case 'B':   /* Word-boundary assertion*/
7423                     case 'C':             /* Single char !DANGEROUS! */
7424                     case 'd': case 'D':   /* digit class */
7425                     case 'g': case 'G':   /* generic-backref, pos assertion */
7426                     case 'h': case 'H':   /* HORIZWS */
7427                     case 'k': case 'K':   /* named backref, keep marker */
7428                     case 'N':             /* named char sequence */
7429                     case 'p': case 'P':   /* Unicode property */
7430                               case 'R':   /* LNBREAK */
7431                     case 's': case 'S':   /* space class */
7432                     case 'v': case 'V':   /* VERTWS */
7433                     case 'w': case 'W':   /* word class */
7434                     case 'X':             /* eXtended Unicode "combining character sequence" */
7435                     case 'z': case 'Z':   /* End of line/string assertion */
7436                         --p;
7437                         goto loopdone;
7438
7439                     /* Anything after here is an escape that resolves to a
7440                        literal. (Except digits, which may or may not)
7441                      */
7442                     case 'n':
7443                         ender = '\n';
7444                         p++;
7445                         break;
7446                     case 'r':
7447                         ender = '\r';
7448                         p++;
7449                         break;
7450                     case 't':
7451                         ender = '\t';
7452                         p++;
7453                         break;
7454                     case 'f':
7455                         ender = '\f';
7456                         p++;
7457                         break;
7458                     case 'e':
7459                           ender = ASCII_TO_NATIVE('\033');
7460                         p++;
7461                         break;
7462                     case 'a':
7463                           ender = ASCII_TO_NATIVE('\007');
7464                         p++;
7465                         break;
7466                     case 'x':
7467                         if (*++p == '{') {
7468                             char* const e = strchr(p, '}');
7469
7470                             if (!e) {
7471                                 RExC_parse = p + 1;
7472                                 vFAIL("Missing right brace on \\x{}");
7473                             }
7474                             else {
7475                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7476                                     | PERL_SCAN_DISALLOW_PREFIX;
7477                                 STRLEN numlen = e - p - 1;
7478                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7479                                 if (ender > 0xff)
7480                                     RExC_utf8 = 1;
7481                                 p = e + 1;
7482                             }
7483                         }
7484                         else {
7485                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7486                             STRLEN numlen = 2;
7487                             ender = grok_hex(p, &numlen, &flags, NULL);
7488                             p += numlen;
7489                         }
7490                         if (PL_encoding && ender < 0x100)
7491                             goto recode_encoding;
7492                         break;
7493                     case 'c':
7494                         p++;
7495                         ender = grok_bslash_c(*p++, SIZE_ONLY);
7496                         break;
7497                     case '0': case '1': case '2': case '3':case '4':
7498                     case '5': case '6': case '7': case '8':case '9':
7499                         if (*p == '0' ||
7500                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7501                             I32 flags = 0;
7502                             STRLEN numlen = 3;
7503                             ender = grok_oct(p, &numlen, &flags, NULL);
7504
7505                             /* An octal above 0xff is interpreted differently
7506                              * depending on if the re is in utf8 or not.  If it
7507                              * is in utf8, the value will be itself, otherwise
7508                              * it is interpreted as modulo 0x100.  It has been
7509                              * decided to discourage the use of octal above the
7510                              * single-byte range.  For now, warn only when
7511                              * it ends up modulo */
7512                             if (SIZE_ONLY && ender >= 0x100
7513                                     && ! UTF && ! PL_encoding) {
7514                                 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
7515                             }
7516                             p += numlen;
7517                         }
7518                         else {
7519                             --p;
7520                             goto loopdone;
7521                         }
7522                         if (PL_encoding && ender < 0x100)
7523                             goto recode_encoding;
7524                         break;
7525                     recode_encoding:
7526                         {
7527                             SV* enc = PL_encoding;
7528                             ender = reg_recode((const char)(U8)ender, &enc);
7529                             if (!enc && SIZE_ONLY)
7530                                 ckWARNreg(p, "Invalid escape in the specified encoding");
7531                             RExC_utf8 = 1;
7532                         }
7533                         break;
7534                     case '\0':
7535                         if (p >= RExC_end)
7536                             FAIL("Trailing \\");
7537                         /* FALL THROUGH */
7538                     default:
7539                         if (!SIZE_ONLY&& isALPHA(*p))
7540                             ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7541                         goto normal_default;
7542                     }
7543                     break;
7544                 default:
7545                   normal_default:
7546                     if (UTF8_IS_START(*p) && UTF) {
7547                         STRLEN numlen;
7548                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7549                                                &numlen, UTF8_ALLOW_DEFAULT);
7550                         p += numlen;
7551                     }
7552                     else
7553                         ender = *p++;
7554                     break;
7555                 }
7556                 if ( RExC_flags & RXf_PMf_EXTENDED)
7557                     p = regwhite( pRExC_state, p );
7558                 if (UTF && FOLD) {
7559                     /* Prime the casefolded buffer. */
7560                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7561                 }
7562                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7563                     if (len)
7564                         p = oldp;
7565                     else if (UTF) {
7566                          if (FOLD) {
7567                               /* Emit all the Unicode characters. */
7568                               STRLEN numlen;
7569                               for (foldbuf = tmpbuf;
7570                                    foldlen;
7571                                    foldlen -= numlen) {
7572                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7573                                    if (numlen > 0) {
7574                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7575                                         s       += unilen;
7576                                         len     += unilen;
7577                                         /* In EBCDIC the numlen
7578                                          * and unilen can differ. */
7579                                         foldbuf += numlen;
7580                                         if (numlen >= foldlen)
7581                                              break;
7582                                    }
7583                                    else
7584                                         break; /* "Can't happen." */
7585                               }
7586                          }
7587                          else {
7588                               const STRLEN unilen = reguni(pRExC_state, ender, s);
7589                               if (unilen > 0) {
7590                                    s   += unilen;
7591                                    len += unilen;
7592                               }
7593                          }
7594                     }
7595                     else {
7596                         len++;
7597                         REGC((char)ender, s++);
7598                     }
7599                     break;
7600                 }
7601                 if (UTF) {
7602                      if (FOLD) {
7603                           /* Emit all the Unicode characters. */
7604                           STRLEN numlen;
7605                           for (foldbuf = tmpbuf;
7606                                foldlen;
7607                                foldlen -= numlen) {
7608                                ender = utf8_to_uvchr(foldbuf, &numlen);
7609                                if (numlen > 0) {
7610                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
7611                                     len     += unilen;
7612                                     s       += unilen;
7613                                     /* In EBCDIC the numlen
7614                                      * and unilen can differ. */
7615                                     foldbuf += numlen;
7616                                     if (numlen >= foldlen)
7617                                          break;
7618                                }
7619                                else
7620                                     break;
7621                           }
7622                      }
7623                      else {
7624                           const STRLEN unilen = reguni(pRExC_state, ender, s);
7625                           if (unilen > 0) {
7626                                s   += unilen;
7627                                len += unilen;
7628                           }
7629                      }
7630                      len--;
7631                 }
7632                 else
7633                     REGC((char)ender, s++);
7634             }
7635         loopdone:
7636             RExC_parse = p - 1;
7637             Set_Node_Cur_Length(ret); /* MJD */
7638             nextchar(pRExC_state);
7639             {
7640                 /* len is STRLEN which is unsigned, need to copy to signed */
7641                 IV iv = len;
7642                 if (iv < 0)
7643                     vFAIL("Internal disaster");
7644             }
7645             if (len > 0)
7646                 *flagp |= HASWIDTH;
7647             if (len == 1 && UNI_IS_INVARIANT(ender))
7648                 *flagp |= SIMPLE;
7649
7650             if (SIZE_ONLY)
7651                 RExC_size += STR_SZ(len);
7652             else {
7653                 STR_LEN(ret) = len;
7654                 RExC_emit += STR_SZ(len);
7655             }
7656         }
7657         break;
7658     }
7659
7660     return(ret);
7661 }
7662
7663 STATIC char *
7664 S_regwhite( RExC_state_t *pRExC_state, char *p )
7665 {
7666     const char *e = RExC_end;
7667
7668     PERL_ARGS_ASSERT_REGWHITE;
7669
7670     while (p < e) {
7671         if (isSPACE(*p))
7672             ++p;
7673         else if (*p == '#') {
7674             bool ended = 0;
7675             do {
7676                 if (*p++ == '\n') {
7677                     ended = 1;
7678                     break;
7679                 }
7680             } while (p < e);
7681             if (!ended)
7682                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7683         }
7684         else
7685             break;
7686     }
7687     return p;
7688 }
7689
7690 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7691    Character classes ([:foo:]) can also be negated ([:^foo:]).
7692    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7693    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7694    but trigger failures because they are currently unimplemented. */
7695
7696 #define POSIXCC_DONE(c)   ((c) == ':')
7697 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7698 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7699
7700 STATIC I32
7701 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7702 {
7703     dVAR;
7704     I32 namedclass = OOB_NAMEDCLASS;
7705
7706     PERL_ARGS_ASSERT_REGPPOSIXCC;
7707
7708     if (value == '[' && RExC_parse + 1 < RExC_end &&
7709         /* I smell either [: or [= or [. -- POSIX has been here, right? */
7710         POSIXCC(UCHARAT(RExC_parse))) {
7711         const char c = UCHARAT(RExC_parse);
7712         char* const s = RExC_parse++;
7713
7714         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7715             RExC_parse++;
7716         if (RExC_parse == RExC_end)
7717             /* Grandfather lone [:, [=, [. */
7718             RExC_parse = s;
7719         else {
7720             const char* const t = RExC_parse++; /* skip over the c */
7721             assert(*t == c);
7722
7723             if (UCHARAT(RExC_parse) == ']') {
7724                 const char *posixcc = s + 1;
7725                 RExC_parse++; /* skip over the ending ] */
7726
7727                 if (*s == ':') {
7728                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7729                     const I32 skip = t - posixcc;
7730
7731                     /* Initially switch on the length of the name.  */
7732                     switch (skip) {
7733                     case 4:
7734                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7735                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7736                         break;
7737                     case 5:
7738                         /* Names all of length 5.  */
7739                         /* alnum alpha ascii blank cntrl digit graph lower
7740                            print punct space upper  */
7741                         /* Offset 4 gives the best switch position.  */
7742                         switch (posixcc[4]) {
7743                         case 'a':
7744                             if (memEQ(posixcc, "alph", 4)) /* alpha */
7745                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7746                             break;
7747                         case 'e':
7748                             if (memEQ(posixcc, "spac", 4)) /* space */
7749                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7750                             break;
7751                         case 'h':
7752                             if (memEQ(posixcc, "grap", 4)) /* graph */
7753                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7754                             break;
7755                         case 'i':
7756                             if (memEQ(posixcc, "asci", 4)) /* ascii */
7757                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7758                             break;
7759                         case 'k':
7760                             if (memEQ(posixcc, "blan", 4)) /* blank */
7761                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7762                             break;
7763                         case 'l':
7764                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7765                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7766                             break;
7767                         case 'm':
7768                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
7769                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7770                             break;
7771                         case 'r':
7772                             if (memEQ(posixcc, "lowe", 4)) /* lower */
7773                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7774                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
7775                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7776                             break;
7777                         case 't':
7778                             if (memEQ(posixcc, "digi", 4)) /* digit */
7779                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7780                             else if (memEQ(posixcc, "prin", 4)) /* print */
7781                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7782                             else if (memEQ(posixcc, "punc", 4)) /* punct */
7783                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7784                             break;
7785                         }
7786                         break;
7787                     case 6:
7788                         if (memEQ(posixcc, "xdigit", 6))
7789                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7790                         break;
7791                     }
7792
7793                     if (namedclass == OOB_NAMEDCLASS)
7794                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7795                                       t - s - 1, s + 1);
7796                     assert (posixcc[skip] == ':');
7797                     assert (posixcc[skip+1] == ']');
7798                 } else if (!SIZE_ONLY) {
7799                     /* [[=foo=]] and [[.foo.]] are still future. */
7800
7801                     /* adjust RExC_parse so the warning shows after
7802                        the class closes */
7803                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7804                         RExC_parse++;
7805                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7806                 }
7807             } else {
7808                 /* Maternal grandfather:
7809                  * "[:" ending in ":" but not in ":]" */
7810                 RExC_parse = s;
7811             }
7812         }
7813     }
7814
7815     return namedclass;
7816 }
7817
7818 STATIC void
7819 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7820 {
7821     dVAR;
7822
7823     PERL_ARGS_ASSERT_CHECKPOSIXCC;
7824
7825     if (POSIXCC(UCHARAT(RExC_parse))) {
7826         const char *s = RExC_parse;
7827         const char  c = *s++;
7828
7829         while (isALNUM(*s))
7830             s++;
7831         if (*s && c == *s && s[1] == ']') {
7832             ckWARN3reg(s+2,
7833                        "POSIX syntax [%c %c] belongs inside character classes",
7834                        c, c);
7835
7836             /* [[=foo=]] and [[.foo.]] are still future. */
7837             if (POSIXCC_NOTYET(c)) {
7838                 /* adjust RExC_parse so the error shows after
7839                    the class closes */
7840                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7841                     NOOP;
7842                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7843             }
7844         }
7845     }
7846 }
7847
7848
7849 #define _C_C_T_(NAME,TEST,WORD)                         \
7850 ANYOF_##NAME:                                           \
7851     if (LOC)                                            \
7852         ANYOF_CLASS_SET(ret, ANYOF_##NAME);             \
7853     else {                                              \
7854         for (value = 0; value < 256; value++)           \
7855             if (TEST)                                   \
7856                 ANYOF_BITMAP_SET(ret, value);           \
7857     }                                                   \
7858     yesno = '+';                                        \
7859     what = WORD;                                        \
7860     break;                                              \
7861 case ANYOF_N##NAME:                                     \
7862     if (LOC)                                            \
7863         ANYOF_CLASS_SET(ret, ANYOF_N##NAME);            \
7864     else {                                              \
7865         for (value = 0; value < 256; value++)           \
7866             if (!TEST)                                  \
7867                 ANYOF_BITMAP_SET(ret, value);           \
7868     }                                                   \
7869     yesno = '!';                                        \
7870     what = WORD;                                        \
7871     break
7872
7873 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
7874 ANYOF_##NAME:                                           \
7875         for (value = 0; value < 256; value++)           \
7876             if (TEST)                                   \
7877                 ANYOF_BITMAP_SET(ret, value);           \
7878     yesno = '+';                                        \
7879     what = WORD;                                        \
7880     break;                                              \
7881 case ANYOF_N##NAME:                                     \
7882         for (value = 0; value < 256; value++)           \
7883             if (!TEST)                                  \
7884                 ANYOF_BITMAP_SET(ret, value);           \
7885     yesno = '!';                                        \
7886     what = WORD;                                        \
7887     break
7888
7889 /*
7890    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7891    so that it is possible to override the option here without having to
7892    rebuild the entire core. as we are required to do if we change regcomp.h
7893    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7894 */
7895 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7896 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7897 #endif
7898
7899 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7900 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7901 #else
7902 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7903 #endif
7904
7905 /*
7906    parse a class specification and produce either an ANYOF node that
7907    matches the pattern or if the pattern matches a single char only and
7908    that char is < 256 and we are case insensitive then we produce an
7909    EXACT node instead.
7910 */
7911
7912 STATIC regnode *
7913 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7914 {
7915     dVAR;
7916     register UV nextvalue;
7917     register IV prevvalue = OOB_UNICODE;
7918     register IV range = 0;
7919     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7920     register regnode *ret;
7921     STRLEN numlen;
7922     IV namedclass;
7923     char *rangebegin = NULL;
7924     bool need_class = 0;
7925     SV *listsv = NULL;
7926     UV n;
7927     bool optimize_invert   = TRUE;
7928     AV* unicode_alternate  = NULL;
7929 #ifdef EBCDIC
7930     UV literal_endpoint = 0;
7931 #endif
7932     UV stored = 0;  /* number of chars stored in the class */
7933
7934     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7935         case we need to change the emitted regop to an EXACT. */
7936     const char * orig_parse = RExC_parse;
7937     GET_RE_DEBUG_FLAGS_DECL;
7938
7939     PERL_ARGS_ASSERT_REGCLASS;
7940 #ifndef DEBUGGING
7941     PERL_UNUSED_ARG(depth);
7942 #endif
7943
7944     DEBUG_PARSE("clas");
7945
7946     /* Assume we are going to generate an ANYOF node. */
7947     ret = reganode(pRExC_state, ANYOF, 0);
7948
7949     if (!SIZE_ONLY)
7950         ANYOF_FLAGS(ret) = 0;
7951
7952     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
7953         RExC_naughty++;
7954         RExC_parse++;
7955         if (!SIZE_ONLY)
7956             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7957     }
7958
7959     if (SIZE_ONLY) {
7960         RExC_size += ANYOF_SKIP;
7961         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7962     }
7963     else {
7964         RExC_emit += ANYOF_SKIP;
7965         if (FOLD)
7966             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7967         if (LOC)
7968             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7969         ANYOF_BITMAP_ZERO(ret);
7970         listsv = newSVpvs("# comment\n");
7971     }
7972
7973     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7974
7975     if (!SIZE_ONLY && POSIXCC(nextvalue))
7976         checkposixcc(pRExC_state);
7977
7978     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7979     if (UCHARAT(RExC_parse) == ']')
7980         goto charclassloop;
7981
7982 parseit:
7983     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7984
7985     charclassloop:
7986
7987         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7988
7989         if (!range)
7990             rangebegin = RExC_parse;
7991         if (UTF) {
7992             value = utf8n_to_uvchr((U8*)RExC_parse,
7993                                    RExC_end - RExC_parse,
7994                                    &numlen, UTF8_ALLOW_DEFAULT);
7995             RExC_parse += numlen;
7996         }
7997         else
7998             value = UCHARAT(RExC_parse++);
7999
8000         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8001         if (value == '[' && POSIXCC(nextvalue))
8002             namedclass = regpposixcc(pRExC_state, value);
8003         else if (value == '\\') {
8004             if (UTF) {
8005                 value = utf8n_to_uvchr((U8*)RExC_parse,
8006                                    RExC_end - RExC_parse,
8007                                    &numlen, UTF8_ALLOW_DEFAULT);
8008                 RExC_parse += numlen;
8009             }
8010             else
8011                 value = UCHARAT(RExC_parse++);
8012             /* Some compilers cannot handle switching on 64-bit integer
8013              * values, therefore value cannot be an UV.  Yes, this will
8014              * be a problem later if we want switch on Unicode.
8015              * A similar issue a little bit later when switching on
8016              * namedclass. --jhi */
8017             switch ((I32)value) {
8018             case 'w':   namedclass = ANYOF_ALNUM;       break;
8019             case 'W':   namedclass = ANYOF_NALNUM;      break;
8020             case 's':   namedclass = ANYOF_SPACE;       break;
8021             case 'S':   namedclass = ANYOF_NSPACE;      break;
8022             case 'd':   namedclass = ANYOF_DIGIT;       break;
8023             case 'D':   namedclass = ANYOF_NDIGIT;      break;
8024             case 'v':   namedclass = ANYOF_VERTWS;      break;
8025             case 'V':   namedclass = ANYOF_NVERTWS;     break;
8026             case 'h':   namedclass = ANYOF_HORIZWS;     break;
8027             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
8028             case 'N':  /* Handle \N{NAME} in class */
8029                 {
8030                     /* We only pay attention to the first char of
8031                     multichar strings being returned. I kinda wonder
8032                     if this makes sense as it does change the behaviour
8033                     from earlier versions, OTOH that behaviour was broken
8034                     as well. */
8035                     UV v; /* value is register so we cant & it /grrr */
8036                     if (reg_namedseq(pRExC_state, &v, NULL)) {
8037                         goto parseit;
8038                     }
8039                     value= v;
8040                 }
8041                 break;
8042             case 'p':
8043             case 'P':
8044                 {
8045                 char *e;
8046                 if (RExC_parse >= RExC_end)
8047                     vFAIL2("Empty \\%c{}", (U8)value);
8048                 if (*RExC_parse == '{') {
8049                     const U8 c = (U8)value;
8050                     e = strchr(RExC_parse++, '}');
8051                     if (!e)
8052                         vFAIL2("Missing right brace on \\%c{}", c);
8053                     while (isSPACE(UCHARAT(RExC_parse)))
8054                         RExC_parse++;
8055                     if (e == RExC_parse)
8056                         vFAIL2("Empty \\%c{}", c);
8057                     n = e - RExC_parse;
8058                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8059                         n--;
8060                 }
8061                 else {
8062                     e = RExC_parse;
8063                     n = 1;
8064                 }
8065                 if (!SIZE_ONLY) {
8066                     if (UCHARAT(RExC_parse) == '^') {
8067                          RExC_parse++;
8068                          n--;
8069                          value = value == 'p' ? 'P' : 'p'; /* toggle */
8070                          while (isSPACE(UCHARAT(RExC_parse))) {
8071                               RExC_parse++;
8072                               n--;
8073                          }
8074                     }
8075                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8076                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8077                 }
8078                 RExC_parse = e + 1;
8079                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8080                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
8081                 }
8082                 break;
8083             case 'n':   value = '\n';                   break;
8084             case 'r':   value = '\r';                   break;
8085             case 't':   value = '\t';                   break;
8086             case 'f':   value = '\f';                   break;
8087             case 'b':   value = '\b';                   break;
8088             case 'e':   value = ASCII_TO_NATIVE('\033');break;
8089             case 'a':   value = ASCII_TO_NATIVE('\007');break;
8090             case 'x':
8091                 if (*RExC_parse == '{') {
8092                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8093                         | PERL_SCAN_DISALLOW_PREFIX;
8094                     char * const e = strchr(RExC_parse++, '}');
8095                     if (!e)
8096                         vFAIL("Missing right brace on \\x{}");
8097
8098                     numlen = e - RExC_parse;
8099                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8100                     RExC_parse = e + 1;
8101                 }
8102                 else {
8103                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8104                     numlen = 2;
8105                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8106                     RExC_parse += numlen;
8107                 }
8108                 if (PL_encoding && value < 0x100)
8109                     goto recode_encoding;
8110                 break;
8111             case 'c':
8112                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8113                 break;
8114             case '0': case '1': case '2': case '3': case '4':
8115             case '5': case '6': case '7': case '8': case '9':
8116                 {
8117                     I32 flags = 0;
8118                     numlen = 3;
8119                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8120                     RExC_parse += numlen;
8121                     if (PL_encoding && value < 0x100)
8122                         goto recode_encoding;
8123                     break;
8124                 }
8125             recode_encoding:
8126                 {
8127                     SV* enc = PL_encoding;
8128                     value = reg_recode((const char)(U8)value, &enc);
8129                     if (!enc && SIZE_ONLY)
8130                         ckWARNreg(RExC_parse,
8131                                   "Invalid escape in the specified encoding");
8132                     break;
8133                 }
8134             default:
8135                 if (!SIZE_ONLY && isALPHA(value))
8136                     ckWARN2reg(RExC_parse,
8137                                "Unrecognized escape \\%c in character class passed through",
8138                                (int)value);
8139                 break;
8140             }
8141         } /* end of \blah */
8142 #ifdef EBCDIC
8143         else
8144             literal_endpoint++;
8145 #endif
8146
8147         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8148
8149             if (!SIZE_ONLY && !need_class)
8150                 ANYOF_CLASS_ZERO(ret);
8151
8152             need_class = 1;
8153
8154             /* a bad range like a-\d, a-[:digit:] ? */
8155             if (range) {
8156                 if (!SIZE_ONLY) {
8157                     const int w =
8158                         RExC_parse >= rangebegin ?
8159                         RExC_parse - rangebegin : 0;
8160                     ckWARN4reg(RExC_parse,
8161                                "False [] range \"%*.*s\"",
8162                                w, w, rangebegin);
8163
8164                     if (prevvalue < 256) {
8165                         ANYOF_BITMAP_SET(ret, prevvalue);
8166                         ANYOF_BITMAP_SET(ret, '-');
8167                     }
8168                     else {
8169                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8170                         Perl_sv_catpvf(aTHX_ listsv,
8171                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8172                     }
8173                 }
8174
8175                 range = 0; /* this was not a true range */
8176             }
8177
8178
8179
8180             if (!SIZE_ONLY) {
8181                 const char *what = NULL;
8182                 char yesno = 0;
8183
8184                 if (namedclass > OOB_NAMEDCLASS)
8185                     optimize_invert = FALSE;
8186                 /* Possible truncation here but in some 64-bit environments
8187                  * the compiler gets heartburn about switch on 64-bit values.
8188                  * A similar issue a little earlier when switching on value.
8189                  * --jhi */
8190                 switch ((I32)namedclass) {
8191
8192                 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8193                 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8194                 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8195                 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8196                 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8197                 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8198                 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8199                 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8200                 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8201                 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8202 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8203                 case _C_C_T_(ALNUM, isALNUM(value), "Word");
8204                 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8205 #else
8206                 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8207                 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8208 #endif
8209                 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8210                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8211                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8212                 case ANYOF_ASCII:
8213                     if (LOC)
8214                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8215                     else {
8216 #ifndef EBCDIC
8217                         for (value = 0; value < 128; value++)
8218                             ANYOF_BITMAP_SET(ret, value);
8219 #else  /* EBCDIC */
8220                         for (value = 0; value < 256; value++) {
8221                             if (isASCII(value))
8222                                 ANYOF_BITMAP_SET(ret, value);
8223                         }
8224 #endif /* EBCDIC */
8225                     }
8226                     yesno = '+';
8227                     what = "ASCII";
8228                     break;
8229                 case ANYOF_NASCII:
8230                     if (LOC)
8231                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8232                     else {
8233 #ifndef EBCDIC
8234                         for (value = 128; value < 256; value++)
8235                             ANYOF_BITMAP_SET(ret, value);
8236 #else  /* EBCDIC */
8237                         for (value = 0; value < 256; value++) {
8238                             if (!isASCII(value))
8239                                 ANYOF_BITMAP_SET(ret, value);
8240                         }
8241 #endif /* EBCDIC */
8242                     }
8243                     yesno = '!';
8244                     what = "ASCII";
8245                     break;
8246                 case ANYOF_DIGIT:
8247                     if (LOC)
8248                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8249                     else {
8250                         /* consecutive digits assumed */
8251                         for (value = '0'; value <= '9'; value++)
8252                             ANYOF_BITMAP_SET(ret, value);
8253                     }
8254                     yesno = '+';
8255                     what = POSIX_CC_UNI_NAME("Digit");
8256                     break;
8257                 case ANYOF_NDIGIT:
8258                     if (LOC)
8259                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8260                     else {
8261                         /* consecutive digits assumed */
8262                         for (value = 0; value < '0'; value++)
8263                             ANYOF_BITMAP_SET(ret, value);
8264                         for (value = '9' + 1; value < 256; value++)
8265                             ANYOF_BITMAP_SET(ret, value);
8266                     }
8267                     yesno = '!';
8268                     what = POSIX_CC_UNI_NAME("Digit");
8269                     break;
8270                 case ANYOF_MAX:
8271                     /* this is to handle \p and \P */
8272                     break;
8273                 default:
8274                     vFAIL("Invalid [::] class");
8275                     break;
8276                 }
8277                 if (what) {
8278                     /* Strings such as "+utf8::isWord\n" */
8279                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8280                 }
8281                 if (LOC)
8282                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8283                 continue;
8284             }
8285         } /* end of namedclass \blah */
8286
8287         if (range) {
8288             if (prevvalue > (IV)value) /* b-a */ {
8289                 const int w = RExC_parse - rangebegin;
8290                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8291                 range = 0; /* not a valid range */
8292             }
8293         }
8294         else {
8295             prevvalue = value; /* save the beginning of the range */
8296             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8297                 RExC_parse[1] != ']') {
8298                 RExC_parse++;
8299
8300                 /* a bad range like \w-, [:word:]- ? */
8301                 if (namedclass > OOB_NAMEDCLASS) {
8302                     if (ckWARN(WARN_REGEXP)) {
8303                         const int w =
8304                             RExC_parse >= rangebegin ?
8305                             RExC_parse - rangebegin : 0;
8306                         vWARN4(RExC_parse,
8307                                "False [] range \"%*.*s\"",
8308                                w, w, rangebegin);
8309                     }
8310                     if (!SIZE_ONLY)
8311                         ANYOF_BITMAP_SET(ret, '-');
8312                 } else
8313                     range = 1;  /* yeah, it's a range! */
8314                 continue;       /* but do it the next time */
8315             }
8316         }
8317
8318         /* now is the next time */
8319         /*stored += (value - prevvalue + 1);*/
8320         if (!SIZE_ONLY) {
8321             if (prevvalue < 256) {
8322                 const IV ceilvalue = value < 256 ? value : 255;
8323                 IV i;
8324 #ifdef EBCDIC
8325                 /* In EBCDIC [\x89-\x91] should include
8326                  * the \x8e but [i-j] should not. */
8327                 if (literal_endpoint == 2 &&
8328                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8329                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8330                 {
8331                     if (isLOWER(prevvalue)) {
8332                         for (i = prevvalue; i <= ceilvalue; i++)
8333                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8334                                 stored++;
8335                                 ANYOF_BITMAP_SET(ret, i);
8336                             }
8337                     } else {
8338                         for (i = prevvalue; i <= ceilvalue; i++)
8339                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8340                                 stored++;
8341                                 ANYOF_BITMAP_SET(ret, i);
8342                             }
8343                     }
8344                 }
8345                 else
8346 #endif
8347                       for (i = prevvalue; i <= ceilvalue; i++) {
8348                         if (!ANYOF_BITMAP_TEST(ret,i)) {
8349                             stored++;
8350                             ANYOF_BITMAP_SET(ret, i);
8351                         }
8352                       }
8353           }
8354           if (value > 255 || UTF) {
8355                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8356                 const UV natvalue      = NATIVE_TO_UNI(value);
8357                 stored+=2; /* can't optimize this class */
8358                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8359                 if (prevnatvalue < natvalue) { /* what about > ? */
8360                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8361                                    prevnatvalue, natvalue);
8362                 }
8363                 else if (prevnatvalue == natvalue) {
8364                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8365                     if (FOLD) {
8366                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8367                          STRLEN foldlen;
8368                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8369
8370 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8371                          if (RExC_precomp[0] == ':' &&
8372                              RExC_precomp[1] == '[' &&
8373                              (f == 0xDF || f == 0x92)) {
8374                              f = NATIVE_TO_UNI(f);
8375                         }
8376 #endif
8377                          /* If folding and foldable and a single
8378                           * character, insert also the folded version
8379                           * to the charclass. */
8380                          if (f != value) {
8381 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8382                              if ((RExC_precomp[0] == ':' &&
8383                                   RExC_precomp[1] == '[' &&
8384                                   (f == 0xA2 &&
8385                                    (value == 0xFB05 || value == 0xFB06))) ?
8386                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8387                                  foldlen == (STRLEN)UNISKIP(f) )
8388 #else
8389                               if (foldlen == (STRLEN)UNISKIP(f))
8390 #endif
8391                                   Perl_sv_catpvf(aTHX_ listsv,
8392                                                  "%04"UVxf"\n", f);
8393                               else {
8394                                   /* Any multicharacter foldings
8395                                    * require the following transform:
8396                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8397                                    * where E folds into "pq" and F folds
8398                                    * into "rst", all other characters
8399                                    * fold to single characters.  We save
8400                                    * away these multicharacter foldings,
8401                                    * to be later saved as part of the
8402                                    * additional "s" data. */
8403                                   SV *sv;
8404
8405                                   if (!unicode_alternate)
8406                                       unicode_alternate = newAV();
8407                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8408                                                      TRUE);
8409                                   av_push(unicode_alternate, sv);
8410                               }
8411                          }
8412
8413                          /* If folding and the value is one of the Greek
8414                           * sigmas insert a few more sigmas to make the
8415                           * folding rules of the sigmas to work right.
8416                           * Note that not all the possible combinations
8417                           * are handled here: some of them are handled
8418                           * by the standard folding rules, and some of
8419                           * them (literal or EXACTF cases) are handled
8420                           * during runtime in regexec.c:S_find_byclass(). */
8421                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8422                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8423                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8424                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8425                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8426                          }
8427                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8428                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8429                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8430                     }
8431                 }
8432             }
8433 #ifdef EBCDIC
8434             literal_endpoint = 0;
8435 #endif
8436         }
8437
8438         range = 0; /* this range (if it was one) is done now */
8439     }
8440
8441     if (need_class) {
8442         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8443         if (SIZE_ONLY)
8444             RExC_size += ANYOF_CLASS_ADD_SKIP;
8445         else
8446             RExC_emit += ANYOF_CLASS_ADD_SKIP;
8447     }
8448
8449
8450     if (SIZE_ONLY)
8451         return ret;
8452     /****** !SIZE_ONLY AFTER HERE *********/
8453
8454     if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8455         && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8456     ) {
8457         /* optimize single char class to an EXACT node
8458            but *only* when its not a UTF/high char  */
8459         const char * cur_parse= RExC_parse;
8460         RExC_emit = (regnode *)orig_emit;
8461         RExC_parse = (char *)orig_parse;
8462         ret = reg_node(pRExC_state,
8463                        (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8464         RExC_parse = (char *)cur_parse;
8465         *STRING(ret)= (char)value;
8466         STR_LEN(ret)= 1;
8467         RExC_emit += STR_SZ(1);
8468         SvREFCNT_dec(listsv);
8469         return ret;
8470     }
8471     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8472     if ( /* If the only flag is folding (plus possibly inversion). */
8473         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8474        ) {
8475         for (value = 0; value < 256; ++value) {
8476             if (ANYOF_BITMAP_TEST(ret, value)) {
8477                 UV fold = PL_fold[value];
8478
8479                 if (fold != value)
8480                     ANYOF_BITMAP_SET(ret, fold);
8481             }
8482         }
8483         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8484     }
8485
8486     /* optimize inverted simple patterns (e.g. [^a-z]) */
8487     if (optimize_invert &&
8488         /* If the only flag is inversion. */
8489         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8490         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8491             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8492         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8493     }
8494     {
8495         AV * const av = newAV();
8496         SV *rv;
8497         /* The 0th element stores the character class description
8498          * in its textual form: used later (regexec.c:Perl_regclass_swash())
8499          * to initialize the appropriate swash (which gets stored in
8500          * the 1st element), and also useful for dumping the regnode.
8501          * The 2nd element stores the multicharacter foldings,
8502          * used later (regexec.c:S_reginclass()). */
8503         av_store(av, 0, listsv);
8504         av_store(av, 1, NULL);
8505         av_store(av, 2, MUTABLE_SV(unicode_alternate));
8506         rv = newRV_noinc(MUTABLE_SV(av));
8507         n = add_data(pRExC_state, 1, "s");
8508         RExC_rxi->data->data[n] = (void*)rv;
8509         ARG_SET(ret, n);
8510     }
8511     return ret;
8512 }
8513 #undef _C_C_T_
8514
8515
8516 /* reg_skipcomment()
8517
8518    Absorbs an /x style # comments from the input stream.
8519    Returns true if there is more text remaining in the stream.
8520    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8521    terminates the pattern without including a newline.
8522
8523    Note its the callers responsibility to ensure that we are
8524    actually in /x mode
8525
8526 */
8527
8528 STATIC bool
8529 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8530 {
8531     bool ended = 0;
8532
8533     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8534
8535     while (RExC_parse < RExC_end)
8536         if (*RExC_parse++ == '\n') {
8537             ended = 1;
8538             break;
8539         }
8540     if (!ended) {
8541         /* we ran off the end of the pattern without ending
8542            the comment, so we have to add an \n when wrapping */
8543         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8544         return 0;
8545     } else
8546         return 1;
8547 }
8548
8549 /* nextchar()
8550
8551    Advance that parse position, and optionall absorbs
8552    "whitespace" from the inputstream.
8553
8554    Without /x "whitespace" means (?#...) style comments only,
8555    with /x this means (?#...) and # comments and whitespace proper.
8556
8557    Returns the RExC_parse point from BEFORE the scan occurs.
8558
8559    This is the /x friendly way of saying RExC_parse++.
8560 */
8561
8562 STATIC char*
8563 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8564 {
8565     char* const retval = RExC_parse++;
8566
8567     PERL_ARGS_ASSERT_NEXTCHAR;
8568
8569     for (;;) {
8570         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8571                 RExC_parse[2] == '#') {
8572             while (*RExC_parse != ')') {
8573                 if (RExC_parse == RExC_end)
8574                     FAIL("Sequence (?#... not terminated");
8575                 RExC_parse++;
8576             }
8577             RExC_parse++;
8578             continue;
8579         }
8580         if (RExC_flags & RXf_PMf_EXTENDED) {
8581             if (isSPACE(*RExC_parse)) {
8582                 RExC_parse++;
8583                 continue;
8584             }
8585             else if (*RExC_parse == '#') {
8586                 if ( reg_skipcomment( pRExC_state ) )
8587                     continue;
8588             }
8589         }
8590         return retval;
8591     }
8592 }
8593
8594 /*
8595 - reg_node - emit a node
8596 */
8597 STATIC regnode *                        /* Location. */
8598 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8599 {
8600     dVAR;
8601     register regnode *ptr;
8602     regnode * const ret = RExC_emit;
8603     GET_RE_DEBUG_FLAGS_DECL;
8604
8605     PERL_ARGS_ASSERT_REG_NODE;
8606
8607     if (SIZE_ONLY) {
8608         SIZE_ALIGN(RExC_size);
8609         RExC_size += 1;
8610         return(ret);
8611     }
8612     if (RExC_emit >= RExC_emit_bound)
8613         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8614
8615     NODE_ALIGN_FILL(ret);
8616     ptr = ret;
8617     FILL_ADVANCE_NODE(ptr, op);
8618     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
8619 #ifdef RE_TRACK_PATTERN_OFFSETS
8620     if (RExC_offsets) {         /* MJD */
8621         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8622               "reg_node", __LINE__,
8623               PL_reg_name[op],
8624               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8625                 ? "Overwriting end of array!\n" : "OK",
8626               (UV)(RExC_emit - RExC_emit_start),
8627               (UV)(RExC_parse - RExC_start),
8628               (UV)RExC_offsets[0]));
8629         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8630     }
8631 #endif
8632     RExC_emit = ptr;
8633     return(ret);
8634 }
8635
8636 /*
8637 - reganode - emit a node with an argument
8638 */
8639 STATIC regnode *                        /* Location. */
8640 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8641 {
8642     dVAR;
8643     register regnode *ptr;
8644     regnode * const ret = RExC_emit;
8645     GET_RE_DEBUG_FLAGS_DECL;
8646
8647     PERL_ARGS_ASSERT_REGANODE;
8648
8649     if (SIZE_ONLY) {
8650         SIZE_ALIGN(RExC_size);
8651         RExC_size += 2;
8652         /*
8653            We can't do this:
8654
8655            assert(2==regarglen[op]+1);
8656
8657            Anything larger than this has to allocate the extra amount.
8658            If we changed this to be:
8659
8660            RExC_size += (1 + regarglen[op]);
8661
8662            then it wouldn't matter. Its not clear what side effect
8663            might come from that so its not done so far.
8664            -- dmq
8665         */
8666         return(ret);
8667     }
8668     if (RExC_emit >= RExC_emit_bound)
8669         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8670
8671     NODE_ALIGN_FILL(ret);
8672     ptr = ret;
8673     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8674     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
8675 #ifdef RE_TRACK_PATTERN_OFFSETS
8676     if (RExC_offsets) {         /* MJD */
8677         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8678               "reganode",
8679               __LINE__,
8680               PL_reg_name[op],
8681               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8682               "Overwriting end of array!\n" : "OK",
8683               (UV)(RExC_emit - RExC_emit_start),
8684               (UV)(RExC_parse - RExC_start),
8685               (UV)RExC_offsets[0]));
8686         Set_Cur_Node_Offset;
8687     }
8688 #endif
8689     RExC_emit = ptr;
8690     return(ret);
8691 }
8692
8693 /*
8694 - reguni - emit (if appropriate) a Unicode character
8695 */
8696 STATIC STRLEN
8697 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8698 {
8699     dVAR;
8700
8701     PERL_ARGS_ASSERT_REGUNI;
8702
8703     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8704 }
8705
8706 /*
8707 - reginsert - insert an operator in front of already-emitted operand
8708 *
8709 * Means relocating the operand.
8710 */
8711 STATIC void
8712 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8713 {
8714     dVAR;
8715     register regnode *src;
8716     register regnode *dst;
8717     register regnode *place;
8718     const int offset = regarglen[(U8)op];
8719     const int size = NODE_STEP_REGNODE + offset;
8720     GET_RE_DEBUG_FLAGS_DECL;
8721
8722     PERL_ARGS_ASSERT_REGINSERT;
8723     PERL_UNUSED_ARG(depth);
8724 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8725     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8726     if (SIZE_ONLY) {
8727         RExC_size += size;
8728         return;
8729     }
8730
8731     src = RExC_emit;
8732     RExC_emit += size;
8733     dst = RExC_emit;
8734     if (RExC_open_parens) {
8735         int paren;
8736         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8737         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8738             if ( RExC_open_parens[paren] >= opnd ) {
8739                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8740                 RExC_open_parens[paren] += size;
8741             } else {
8742                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8743             }
8744             if ( RExC_close_parens[paren] >= opnd ) {
8745                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8746                 RExC_close_parens[paren] += size;
8747             } else {
8748                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8749             }
8750         }
8751     }
8752
8753     while (src > opnd) {
8754         StructCopy(--src, --dst, regnode);
8755 #ifdef RE_TRACK_PATTERN_OFFSETS
8756         if (RExC_offsets) {     /* MJD 20010112 */
8757             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8758                   "reg_insert",
8759                   __LINE__,
8760                   PL_reg_name[op],
8761                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8762                     ? "Overwriting end of array!\n" : "OK",
8763                   (UV)(src - RExC_emit_start),
8764                   (UV)(dst - RExC_emit_start),
8765                   (UV)RExC_offsets[0]));
8766             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8767             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8768         }
8769 #endif
8770     }
8771
8772
8773     place = opnd;               /* Op node, where operand used to be. */
8774 #ifdef RE_TRACK_PATTERN_OFFSETS
8775     if (RExC_offsets) {         /* MJD */
8776         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8777               "reginsert",
8778               __LINE__,
8779               PL_reg_name[op],
8780               (UV)(place - RExC_emit_start) > RExC_offsets[0]
8781               ? "Overwriting end of array!\n" : "OK",
8782               (UV)(place - RExC_emit_start),
8783               (UV)(RExC_parse - RExC_start),
8784               (UV)RExC_offsets[0]));
8785         Set_Node_Offset(place, RExC_parse);
8786         Set_Node_Length(place, 1);
8787     }
8788 #endif
8789     src = NEXTOPER(place);
8790     FILL_ADVANCE_NODE(place, op);
8791     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
8792     Zero(src, offset, regnode);
8793 }
8794
8795 /*
8796 - regtail - set the next-pointer at the end of a node chain of p to val.
8797 - SEE ALSO: regtail_study
8798 */
8799 /* TODO: All three parms should be const */
8800 STATIC void
8801 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8802 {
8803     dVAR;
8804     register regnode *scan;
8805     GET_RE_DEBUG_FLAGS_DECL;
8806
8807     PERL_ARGS_ASSERT_REGTAIL;
8808 #ifndef DEBUGGING
8809     PERL_UNUSED_ARG(depth);
8810 #endif
8811
8812     if (SIZE_ONLY)
8813         return;
8814
8815     /* Find last node. */
8816     scan = p;
8817     for (;;) {
8818         regnode * const temp = regnext(scan);
8819         DEBUG_PARSE_r({
8820             SV * const mysv=sv_newmortal();
8821             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8822             regprop(RExC_rx, mysv, scan);
8823             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8824                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8825                     (temp == NULL ? "->" : ""),
8826                     (temp == NULL ? PL_reg_name[OP(val)] : "")
8827             );
8828         });
8829         if (temp == NULL)
8830             break;
8831         scan = temp;
8832     }
8833
8834     if (reg_off_by_arg[OP(scan)]) {
8835         ARG_SET(scan, val - scan);
8836     }
8837     else {
8838         NEXT_OFF(scan) = val - scan;
8839     }
8840 }
8841
8842 #ifdef DEBUGGING
8843 /*
8844 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8845 - Look for optimizable sequences at the same time.
8846 - currently only looks for EXACT chains.
8847
8848 This is expermental code. The idea is to use this routine to perform
8849 in place optimizations on branches and groups as they are constructed,
8850 with the long term intention of removing optimization from study_chunk so
8851 that it is purely analytical.
8852
8853 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8854 to control which is which.
8855
8856 */
8857 /* TODO: All four parms should be const */
8858
8859 STATIC U8
8860 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8861 {
8862     dVAR;
8863     register regnode *scan;
8864     U8 exact = PSEUDO;
8865 #ifdef EXPERIMENTAL_INPLACESCAN
8866     I32 min = 0;
8867 #endif
8868     GET_RE_DEBUG_FLAGS_DECL;
8869
8870     PERL_ARGS_ASSERT_REGTAIL_STUDY;
8871
8872
8873     if (SIZE_ONLY)
8874         return exact;
8875
8876     /* Find last node. */
8877
8878     scan = p;
8879     for (;;) {
8880         regnode * const temp = regnext(scan);
8881 #ifdef EXPERIMENTAL_INPLACESCAN
8882         if (PL_regkind[OP(scan)] == EXACT)
8883             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8884                 return EXACT;
8885 #endif
8886         if ( exact ) {
8887             switch (OP(scan)) {
8888                 case EXACT:
8889                 case EXACTF:
8890                 case EXACTFL:
8891                         if( exact == PSEUDO )
8892                             exact= OP(scan);
8893                         else if ( exact != OP(scan) )
8894                             exact= 0;
8895                 case NOTHING:
8896                     break;
8897                 default:
8898                     exact= 0;
8899             }
8900         }
8901         DEBUG_PARSE_r({
8902             SV * const mysv=sv_newmortal();
8903             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8904             regprop(RExC_rx, mysv, scan);
8905             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8906                 SvPV_nolen_const(mysv),
8907                 REG_NODE_NUM(scan),
8908                 PL_reg_name[exact]);
8909         });
8910         if (temp == NULL)
8911             break;
8912         scan = temp;
8913     }
8914     DEBUG_PARSE_r({
8915         SV * const mysv_val=sv_newmortal();
8916         DEBUG_PARSE_MSG("");
8917         regprop(RExC_rx, mysv_val, val);
8918         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8919                       SvPV_nolen_const(mysv_val),
8920                       (IV)REG_NODE_NUM(val),
8921                       (IV)(val - scan)
8922         );
8923     });
8924     if (reg_off_by_arg[OP(scan)]) {
8925         ARG_SET(scan, val - scan);
8926     }
8927     else {
8928         NEXT_OFF(scan) = val - scan;
8929     }
8930
8931     return exact;
8932 }
8933 #endif
8934
8935 /*
8936  - regcurly - a little FSA that accepts {\d+,?\d*}
8937  */
8938 #ifndef PERL_IN_XSUB_RE
8939 I32
8940 Perl_regcurly(register const char *s)
8941 {
8942     PERL_ARGS_ASSERT_REGCURLY;
8943
8944     if (*s++ != '{')
8945         return FALSE;
8946     if (!isDIGIT(*s))
8947         return FALSE;
8948     while (isDIGIT(*s))
8949         s++;
8950     if (*s == ',')
8951         s++;
8952     while (isDIGIT(*s))
8953         s++;
8954     if (*s != '}')
8955         return FALSE;
8956     return TRUE;
8957 }
8958 #endif
8959
8960 /*
8961  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8962  */
8963 #ifdef DEBUGGING
8964 static void
8965 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8966 {
8967     int bit;
8968     int set=0;
8969
8970     for (bit=0; bit<32; bit++) {
8971         if (flags & (1<<bit)) {
8972             if (!set++ && lead)
8973                 PerlIO_printf(Perl_debug_log, "%s",lead);
8974             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8975         }
8976     }
8977     if (lead)  {
8978         if (set)
8979             PerlIO_printf(Perl_debug_log, "\n");
8980         else
8981             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8982     }
8983 }
8984 #endif
8985
8986 void
8987 Perl_regdump(pTHX_ const regexp *r)
8988 {
8989 #ifdef DEBUGGING
8990     dVAR;
8991     SV * const sv = sv_newmortal();
8992     SV *dsv= sv_newmortal();
8993     RXi_GET_DECL(r,ri);
8994     GET_RE_DEBUG_FLAGS_DECL;
8995
8996     PERL_ARGS_ASSERT_REGDUMP;
8997
8998     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8999
9000     /* Header fields of interest. */
9001     if (r->anchored_substr) {
9002         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9003             RE_SV_DUMPLEN(r->anchored_substr), 30);
9004         PerlIO_printf(Perl_debug_log,
9005                       "anchored %s%s at %"IVdf" ",
9006                       s, RE_SV_TAIL(r->anchored_substr),
9007                       (IV)r->anchored_offset);
9008     } else if (r->anchored_utf8) {
9009         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9010             RE_SV_DUMPLEN(r->anchored_utf8), 30);
9011         PerlIO_printf(Perl_debug_log,
9012                       "anchored utf8 %s%s at %"IVdf" ",
9013                       s, RE_SV_TAIL(r->anchored_utf8),
9014                       (IV)r->anchored_offset);
9015     }
9016     if (r->float_substr) {
9017         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9018             RE_SV_DUMPLEN(r->float_substr), 30);
9019         PerlIO_printf(Perl_debug_log,
9020                       "floating %s%s at %"IVdf"..%"UVuf" ",
9021                       s, RE_SV_TAIL(r->float_substr),
9022                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9023     } else if (r->float_utf8) {
9024         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9025             RE_SV_DUMPLEN(r->float_utf8), 30);
9026         PerlIO_printf(Perl_debug_log,
9027                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9028                       s, RE_SV_TAIL(r->float_utf8),
9029                       (IV)r->float_min_offset, (UV)r->float_max_offset);
9030     }
9031     if (r->check_substr || r->check_utf8)
9032         PerlIO_printf(Perl_debug_log,
9033                       (const char *)
9034                       (r->check_substr == r->float_substr
9035                        && r->check_utf8 == r->float_utf8
9036                        ? "(checking floating" : "(checking anchored"));
9037     if (r->extflags & RXf_NOSCAN)
9038         PerlIO_printf(Perl_debug_log, " noscan");
9039     if (r->extflags & RXf_CHECK_ALL)
9040         PerlIO_printf(Perl_debug_log, " isall");
9041     if (r->check_substr || r->check_utf8)
9042         PerlIO_printf(Perl_debug_log, ") ");
9043
9044     if (ri->regstclass) {
9045         regprop(r, sv, ri->regstclass);
9046         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9047     }
9048     if (r->extflags & RXf_ANCH) {
9049         PerlIO_printf(Perl_debug_log, "anchored");
9050         if (r->extflags & RXf_ANCH_BOL)
9051             PerlIO_printf(Perl_debug_log, "(BOL)");
9052         if (r->extflags & RXf_ANCH_MBOL)
9053             PerlIO_printf(Perl_debug_log, "(MBOL)");
9054         if (r->extflags & RXf_ANCH_SBOL)
9055             PerlIO_printf(Perl_debug_log, "(SBOL)");
9056         if (r->extflags & RXf_ANCH_GPOS)
9057             PerlIO_printf(Perl_debug_log, "(GPOS)");
9058         PerlIO_putc(Perl_debug_log, ' ');
9059     }
9060     if (r->extflags & RXf_GPOS_SEEN)
9061         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9062     if (r->intflags & PREGf_SKIP)
9063         PerlIO_printf(Perl_debug_log, "plus ");
9064     if (r->intflags & PREGf_IMPLICIT)
9065         PerlIO_printf(Perl_debug_log, "implicit ");
9066     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9067     if (r->extflags & RXf_EVAL_SEEN)
9068         PerlIO_printf(Perl_debug_log, "with eval ");
9069     PerlIO_printf(Perl_debug_log, "\n");
9070     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9071 #else
9072     PERL_ARGS_ASSERT_REGDUMP;
9073     PERL_UNUSED_CONTEXT;
9074     PERL_UNUSED_ARG(r);
9075 #endif  /* DEBUGGING */
9076 }
9077
9078 /*
9079 - regprop - printable representation of opcode
9080 */
9081 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9082 STMT_START { \
9083         if (do_sep) {                           \
9084             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9085             if (flags & ANYOF_INVERT)           \
9086                 /*make sure the invert info is in each */ \
9087                 sv_catpvs(sv, "^");             \
9088             do_sep = 0;                         \
9089         }                                       \
9090 } STMT_END
9091
9092 void
9093 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9094 {
9095 #ifdef DEBUGGING
9096     dVAR;
9097     register int k;
9098     RXi_GET_DECL(prog,progi);
9099     GET_RE_DEBUG_FLAGS_DECL;
9100
9101     PERL_ARGS_ASSERT_REGPROP;
9102
9103     sv_setpvs(sv, "");
9104
9105     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
9106         /* It would be nice to FAIL() here, but this may be called from
9107            regexec.c, and it would be hard to supply pRExC_state. */
9108         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9109     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9110
9111     k = PL_regkind[OP(o)];
9112
9113     if (k == EXACT) {
9114         sv_catpvs(sv, " ");
9115         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9116          * is a crude hack but it may be the best for now since
9117          * we have no flag "this EXACTish node was UTF-8"
9118          * --jhi */
9119         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9120                   PERL_PV_ESCAPE_UNI_DETECT |
9121                   PERL_PV_PRETTY_ELLIPSES   |
9122                   PERL_PV_PRETTY_LTGT       |
9123                   PERL_PV_PRETTY_NOCLEAR
9124                   );
9125     } else if (k == TRIE) {
9126         /* print the details of the trie in dumpuntil instead, as
9127          * progi->data isn't available here */
9128         const char op = OP(o);
9129         const U32 n = ARG(o);
9130         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9131                (reg_ac_data *)progi->data->data[n] :
9132                NULL;
9133         const reg_trie_data * const trie
9134             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9135
9136         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9137         DEBUG_TRIE_COMPILE_r(
9138             Perl_sv_catpvf(aTHX_ sv,
9139                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9140                 (UV)trie->startstate,
9141                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9142                 (UV)trie->wordcount,
9143                 (UV)trie->minlen,
9144                 (UV)trie->maxlen,
9145                 (UV)TRIE_CHARCOUNT(trie),
9146                 (UV)trie->uniquecharcount
9147             )
9148         );
9149         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9150             int i;
9151             int rangestart = -1;
9152             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9153             sv_catpvs(sv, "[");
9154             for (i = 0; i <= 256; i++) {
9155                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9156                     if (rangestart == -1)
9157                         rangestart = i;
9158                 } else if (rangestart != -1) {
9159                     if (i <= rangestart + 3)
9160                         for (; rangestart < i; rangestart++)
9161                             put_byte(sv, rangestart);
9162                     else {
9163                         put_byte(sv, rangestart);
9164                         sv_catpvs(sv, "-");
9165                         put_byte(sv, i - 1);
9166                     }
9167                     rangestart = -1;
9168                 }
9169             }
9170             sv_catpvs(sv, "]");
9171         }
9172
9173     } else if (k == CURLY) {
9174         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9175             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9176         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9177     }
9178     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
9179         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9180     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9181         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
9182         if ( RXp_PAREN_NAMES(prog) ) {
9183             if ( k != REF || OP(o) < NREF) {
9184                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9185                 SV **name= av_fetch(list, ARG(o), 0 );
9186                 if (name)
9187                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9188             }
9189             else {
9190                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9191                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9192                 I32 *nums=(I32*)SvPVX(sv_dat);
9193                 SV **name= av_fetch(list, nums[0], 0 );
9194                 I32 n;
9195                 if (name) {
9196                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9197                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9198                                     (n ? "," : ""), (IV)nums[n]);
9199                     }
9200                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9201                 }
9202             }
9203         }
9204     } else if (k == GOSUB)
9205         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9206     else if (k == VERB) {
9207         if (!o->flags)
9208             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9209                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9210     } else if (k == LOGICAL)
9211         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
9212     else if (k == FOLDCHAR)
9213         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9214     else if (k == ANYOF) {
9215         int i, rangestart = -1;
9216         const U8 flags = ANYOF_FLAGS(o);
9217         int do_sep = 0;
9218
9219         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9220         static const char * const anyofs[] = {
9221             "\\w",
9222             "\\W",
9223             "\\s",
9224             "\\S",
9225             "\\d",
9226             "\\D",
9227             "[:alnum:]",
9228             "[:^alnum:]",
9229             "[:alpha:]",
9230             "[:^alpha:]",
9231             "[:ascii:]",
9232             "[:^ascii:]",
9233             "[:cntrl:]",
9234             "[:^cntrl:]",
9235             "[:graph:]",
9236             "[:^graph:]",
9237             "[:lower:]",
9238             "[:^lower:]",
9239             "[:print:]",
9240             "[:^print:]",
9241             "[:punct:]",
9242             "[:^punct:]",
9243             "[:upper:]",
9244             "[:^upper:]",
9245             "[:xdigit:]",
9246             "[:^xdigit:]",
9247             "[:space:]",
9248             "[:^space:]",
9249             "[:blank:]",
9250             "[:^blank:]"
9251         };
9252
9253         if (flags & ANYOF_LOCALE)
9254             sv_catpvs(sv, "{loc}");
9255         if (flags & ANYOF_FOLD)
9256             sv_catpvs(sv, "{i}");
9257         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9258         if (flags & ANYOF_INVERT)
9259             sv_catpvs(sv, "^");
9260
9261         /* output what the standard cp 0-255 bitmap matches */
9262         for (i = 0; i <= 256; i++) {
9263             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9264                 if (rangestart == -1)
9265                     rangestart = i;
9266             } else if (rangestart != -1) {
9267                 if (i <= rangestart + 3)
9268                     for (; rangestart < i; rangestart++)
9269                         put_byte(sv, rangestart);
9270                 else {
9271                     put_byte(sv, rangestart);
9272                     sv_catpvs(sv, "-");
9273                     put_byte(sv, i - 1);
9274                 }
9275                 do_sep = 1;
9276                 rangestart = -1;
9277             }
9278         }
9279
9280         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9281         /* output any special charclass tests (used mostly under use locale) */
9282         if (o->flags & ANYOF_CLASS)
9283             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9284                 if (ANYOF_CLASS_TEST(o,i)) {
9285                     sv_catpv(sv, anyofs[i]);
9286                     do_sep = 1;
9287                 }
9288
9289         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9290
9291         /* output information about the unicode matching */
9292         if (flags & ANYOF_UNICODE)
9293             sv_catpvs(sv, "{unicode}");
9294         else if (flags & ANYOF_UNICODE_ALL)
9295             sv_catpvs(sv, "{unicode_all}");
9296
9297         {
9298             SV *lv;
9299             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9300
9301             if (lv) {
9302                 if (sw) {
9303                     U8 s[UTF8_MAXBYTES_CASE+1];
9304
9305                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9306                         uvchr_to_utf8(s, i);
9307
9308                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9309                             if (rangestart == -1)
9310                                 rangestart = i;
9311                         } else if (rangestart != -1) {
9312                             if (i <= rangestart + 3)
9313                                 for (; rangestart < i; rangestart++) {
9314                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9315                                     U8 *p;
9316                                     for(p = s; p < e; p++)
9317                                         put_byte(sv, *p);
9318                                 }
9319                             else {
9320                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9321                                 U8 *p;
9322                                 for (p = s; p < e; p++)
9323                                     put_byte(sv, *p);
9324                                 sv_catpvs(sv, "-");
9325                                 e = uvchr_to_utf8(s, i-1);
9326                                 for (p = s; p < e; p++)
9327                                     put_byte(sv, *p);
9328                                 }
9329                                 rangestart = -1;
9330                             }
9331                         }
9332
9333                     sv_catpvs(sv, "..."); /* et cetera */
9334                 }
9335
9336                 {
9337                     char *s = savesvpv(lv);
9338                     char * const origs = s;
9339
9340                     while (*s && *s != '\n')
9341                         s++;
9342
9343                     if (*s == '\n') {
9344                         const char * const t = ++s;
9345
9346                         while (*s) {
9347                             if (*s == '\n')
9348                                 *s = ' ';
9349                             s++;
9350                         }
9351                         if (s[-1] == ' ')
9352                             s[-1] = 0;
9353
9354                         sv_catpv(sv, t);
9355                     }
9356
9357                     Safefree(origs);
9358                 }
9359             }
9360         }
9361
9362         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9363     }
9364     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9365         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9366 #else
9367     PERL_UNUSED_CONTEXT;
9368     PERL_UNUSED_ARG(sv);
9369     PERL_UNUSED_ARG(o);
9370     PERL_UNUSED_ARG(prog);
9371 #endif  /* DEBUGGING */
9372 }
9373
9374 SV *
9375 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9376 {                               /* Assume that RE_INTUIT is set */
9377     dVAR;
9378     struct regexp *const prog = (struct regexp *)SvANY(r);
9379     GET_RE_DEBUG_FLAGS_DECL;
9380
9381     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9382     PERL_UNUSED_CONTEXT;
9383
9384     DEBUG_COMPILE_r(
9385         {
9386             const char * const s = SvPV_nolen_const(prog->check_substr
9387                       ? prog->check_substr : prog->check_utf8);
9388
9389             if (!PL_colorset) reginitcolors();
9390             PerlIO_printf(Perl_debug_log,
9391                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9392                       PL_colors[4],
9393                       prog->check_substr ? "" : "utf8 ",
9394                       PL_colors[5],PL_colors[0],
9395                       s,
9396                       PL_colors[1],
9397                       (strlen(s) > 60 ? "..." : ""));
9398         } );
9399
9400     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9401 }
9402
9403 /*
9404    pregfree()
9405
9406    handles refcounting and freeing the perl core regexp structure. When
9407    it is necessary to actually free the structure the first thing it
9408    does is call the 'free' method of the regexp_engine associated to to
9409    the regexp, allowing the handling of the void *pprivate; member
9410    first. (This routine is not overridable by extensions, which is why
9411    the extensions free is called first.)
9412
9413    See regdupe and regdupe_internal if you change anything here.
9414 */
9415 #ifndef PERL_IN_XSUB_RE
9416 void
9417 Perl_pregfree(pTHX_ REGEXP *r)
9418 {
9419     SvREFCNT_dec(r);
9420 }
9421
9422 void
9423 Perl_pregfree2(pTHX_ REGEXP *rx)
9424 {
9425     dVAR;
9426     struct regexp *const r = (struct regexp *)SvANY(rx);
9427     GET_RE_DEBUG_FLAGS_DECL;
9428
9429     PERL_ARGS_ASSERT_PREGFREE2;
9430
9431     if (r->mother_re) {
9432         ReREFCNT_dec(r->mother_re);
9433     } else {
9434         CALLREGFREE_PVT(rx); /* free the private data */
9435         SvREFCNT_dec(RXp_PAREN_NAMES(r));
9436     }
9437     if (r->substrs) {
9438         SvREFCNT_dec(r->anchored_substr);
9439         SvREFCNT_dec(r->anchored_utf8);
9440         SvREFCNT_dec(r->float_substr);
9441         SvREFCNT_dec(r->float_utf8);
9442         Safefree(r->substrs);
9443     }
9444     RX_MATCH_COPY_FREE(rx);
9445 #ifdef PERL_OLD_COPY_ON_WRITE
9446     SvREFCNT_dec(r->saved_copy);
9447 #endif
9448     Safefree(r->offs);
9449 }
9450
9451 /*  reg_temp_copy()
9452
9453     This is a hacky workaround to the structural issue of match results
9454     being stored in the regexp structure which is in turn stored in
9455     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9456     could be PL_curpm in multiple contexts, and could require multiple
9457     result sets being associated with the pattern simultaneously, such
9458     as when doing a recursive match with (??{$qr})
9459
9460     The solution is to make a lightweight copy of the regexp structure
9461     when a qr// is returned from the code executed by (??{$qr}) this
9462     lightweight copy doesnt actually own any of its data except for
9463     the starp/end and the actual regexp structure itself.
9464
9465 */
9466
9467
9468 REGEXP *
9469 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9470 {
9471     struct regexp *ret;
9472     struct regexp *const r = (struct regexp *)SvANY(rx);
9473     register const I32 npar = r->nparens+1;
9474
9475     PERL_ARGS_ASSERT_REG_TEMP_COPY;
9476
9477     if (!ret_x)
9478         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9479     ret = (struct regexp *)SvANY(ret_x);
9480
9481     (void)ReREFCNT_inc(rx);
9482     /* We can take advantage of the existing "copied buffer" mechanism in SVs
9483        by pointing directly at the buffer, but flagging that the allocated
9484        space in the copy is zero. As we've just done a struct copy, it's now
9485        a case of zero-ing that, rather than copying the current length.  */
9486     SvPV_set(ret_x, RX_WRAPPED(rx));
9487     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9488     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9489            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9490     SvLEN_set(ret_x, 0);
9491     SvSTASH_set(ret_x, NULL);
9492     SvMAGIC_set(ret_x, NULL);
9493     Newx(ret->offs, npar, regexp_paren_pair);
9494     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9495     if (r->substrs) {
9496         Newx(ret->substrs, 1, struct reg_substr_data);
9497         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9498
9499         SvREFCNT_inc_void(ret->anchored_substr);
9500         SvREFCNT_inc_void(ret->anchored_utf8);
9501         SvREFCNT_inc_void(ret->float_substr);
9502         SvREFCNT_inc_void(ret->float_utf8);
9503
9504         /* check_substr and check_utf8, if non-NULL, point to either their
9505            anchored or float namesakes, and don't hold a second reference.  */
9506     }
9507     RX_MATCH_COPIED_off(ret_x);
9508 #ifdef PERL_OLD_COPY_ON_WRITE
9509     ret->saved_copy = NULL;
9510 #endif
9511     ret->mother_re = rx;
9512
9513     return ret_x;
9514 }
9515 #endif
9516
9517 /* regfree_internal()
9518
9519    Free the private data in a regexp. This is overloadable by
9520    extensions. Perl takes care of the regexp structure in pregfree(),
9521    this covers the *pprivate pointer which technically perldoesnt
9522    know about, however of course we have to handle the
9523    regexp_internal structure when no extension is in use.
9524
9525    Note this is called before freeing anything in the regexp
9526    structure.
9527  */
9528
9529 void
9530 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9531 {
9532     dVAR;
9533     struct regexp *const r = (struct regexp *)SvANY(rx);
9534     RXi_GET_DECL(r,ri);
9535     GET_RE_DEBUG_FLAGS_DECL;
9536
9537     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9538
9539     DEBUG_COMPILE_r({
9540         if (!PL_colorset)
9541             reginitcolors();
9542         {
9543             SV *dsv= sv_newmortal();
9544             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9545                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9546             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9547                 PL_colors[4],PL_colors[5],s);
9548         }
9549     });
9550 #ifdef RE_TRACK_PATTERN_OFFSETS
9551     if (ri->u.offsets)
9552         Safefree(ri->u.offsets);             /* 20010421 MJD */
9553 #endif
9554     if (ri->data) {
9555         int n = ri->data->count;
9556         PAD* new_comppad = NULL;
9557         PAD* old_comppad;
9558         PADOFFSET refcnt;
9559
9560         while (--n >= 0) {
9561           /* If you add a ->what type here, update the comment in regcomp.h */
9562             switch (ri->data->what[n]) {
9563             case 's':
9564             case 'S':
9565             case 'u':
9566                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9567                 break;
9568             case 'f':
9569                 Safefree(ri->data->data[n]);
9570                 break;
9571             case 'p':
9572                 new_comppad = MUTABLE_AV(ri->data->data[n]);
9573                 break;
9574             case 'o':
9575                 if (new_comppad == NULL)
9576                     Perl_croak(aTHX_ "panic: pregfree comppad");
9577                 PAD_SAVE_LOCAL(old_comppad,
9578                     /* Watch out for global destruction's random ordering. */
9579                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9580                 );
9581                 OP_REFCNT_LOCK;
9582                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9583                 OP_REFCNT_UNLOCK;
9584                 if (!refcnt)
9585                     op_free((OP_4tree*)ri->data->data[n]);
9586
9587                 PAD_RESTORE_LOCAL(old_comppad);
9588                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
9589                 new_comppad = NULL;
9590                 break;
9591             case 'n':
9592                 break;
9593             case 'T':
9594                 { /* Aho Corasick add-on structure for a trie node.
9595                      Used in stclass optimization only */
9596                     U32 refcount;
9597                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9598                     OP_REFCNT_LOCK;
9599                     refcount = --aho->refcount;
9600                     OP_REFCNT_UNLOCK;
9601                     if ( !refcount ) {
9602                         PerlMemShared_free(aho->states);
9603                         PerlMemShared_free(aho->fail);
9604                          /* do this last!!!! */
9605                         PerlMemShared_free(ri->data->data[n]);
9606                         PerlMemShared_free(ri->regstclass);
9607                     }
9608                 }
9609                 break;
9610             case 't':
9611                 {
9612                     /* trie structure. */
9613                     U32 refcount;
9614                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9615                     OP_REFCNT_LOCK;
9616                     refcount = --trie->refcount;
9617                     OP_REFCNT_UNLOCK;
9618                     if ( !refcount ) {
9619                         PerlMemShared_free(trie->charmap);
9620                         PerlMemShared_free(trie->states);
9621                         PerlMemShared_free(trie->trans);
9622                         if (trie->bitmap)
9623                             PerlMemShared_free(trie->bitmap);
9624                         if (trie->jump)
9625                             PerlMemShared_free(trie->jump);
9626                         PerlMemShared_free(trie->wordinfo);
9627                         /* do this last!!!! */
9628                         PerlMemShared_free(ri->data->data[n]);
9629                     }
9630                 }
9631                 break;
9632             default:
9633                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9634             }
9635         }
9636         Safefree(ri->data->what);
9637         Safefree(ri->data);
9638     }
9639
9640     Safefree(ri);
9641 }
9642
9643 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
9644 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9645 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9646 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
9647
9648 /*
9649    re_dup - duplicate a regexp.
9650
9651    This routine is expected to clone a given regexp structure. It is only
9652    compiled under USE_ITHREADS.
9653
9654    After all of the core data stored in struct regexp is duplicated
9655    the regexp_engine.dupe method is used to copy any private data
9656    stored in the *pprivate pointer. This allows extensions to handle
9657    any duplication it needs to do.
9658
9659    See pregfree() and regfree_internal() if you change anything here.
9660 */
9661 #if defined(USE_ITHREADS)
9662 #ifndef PERL_IN_XSUB_RE
9663 void
9664 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9665 {
9666     dVAR;
9667     I32 npar;
9668     const struct regexp *r = (const struct regexp *)SvANY(sstr);
9669     struct regexp *ret = (struct regexp *)SvANY(dstr);
9670
9671     PERL_ARGS_ASSERT_RE_DUP_GUTS;
9672
9673     npar = r->nparens+1;
9674     Newx(ret->offs, npar, regexp_paren_pair);
9675     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9676     if(ret->swap) {
9677         /* no need to copy these */
9678         Newx(ret->swap, npar, regexp_paren_pair);
9679     }
9680
9681     if (ret->substrs) {
9682         /* Do it this way to avoid reading from *r after the StructCopy().
9683            That way, if any of the sv_dup_inc()s dislodge *r from the L1
9684            cache, it doesn't matter.  */
9685         const bool anchored = r->check_substr
9686             ? r->check_substr == r->anchored_substr
9687             : r->check_utf8 == r->anchored_utf8;
9688         Newx(ret->substrs, 1, struct reg_substr_data);
9689         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9690
9691         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9692         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9693         ret->float_substr = sv_dup_inc(ret->float_substr, param);
9694         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9695
9696         /* check_substr and check_utf8, if non-NULL, point to either their
9697            anchored or float namesakes, and don't hold a second reference.  */
9698
9699         if (ret->check_substr) {
9700             if (anchored) {
9701                 assert(r->check_utf8 == r->anchored_utf8);
9702                 ret->check_substr = ret->anchored_substr;
9703                 ret->check_utf8 = ret->anchored_utf8;
9704             } else {
9705                 assert(r->check_substr == r->float_substr);
9706                 assert(r->check_utf8 == r->float_utf8);
9707                 ret->check_substr = ret->float_substr;
9708                 ret->check_utf8 = ret->float_utf8;
9709             }
9710         } else if (ret->check_utf8) {
9711             if (anchored) {
9712                 ret->check_utf8 = ret->anchored_utf8;
9713             } else {
9714                 ret->check_utf8 = ret->float_utf8;
9715             }
9716         }
9717     }
9718
9719     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9720
9721     if (ret->pprivate)
9722         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9723
9724     if (RX_MATCH_COPIED(dstr))
9725         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
9726     else
9727         ret->subbeg = NULL;
9728 #ifdef PERL_OLD_COPY_ON_WRITE
9729     ret->saved_copy = NULL;
9730 #endif
9731
9732     if (ret->mother_re) {
9733         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9734             /* Our storage points directly to our mother regexp, but that's
9735                1: a buffer in a different thread
9736                2: something we no longer hold a reference on
9737                so we need to copy it locally.  */
9738             /* Note we need to sue SvCUR() on our mother_re, because it, in
9739                turn, may well be pointing to its own mother_re.  */
9740             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9741                                    SvCUR(ret->mother_re)+1));
9742             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9743         }
9744         ret->mother_re      = NULL;
9745     }
9746     ret->gofs = 0;
9747 }
9748 #endif /* PERL_IN_XSUB_RE */
9749
9750 /*
9751    regdupe_internal()
9752
9753    This is the internal complement to regdupe() which is used to copy
9754    the structure pointed to by the *pprivate pointer in the regexp.
9755    This is the core version of the extension overridable cloning hook.
9756    The regexp structure being duplicated will be copied by perl prior
9757    to this and will be provided as the regexp *r argument, however
9758    with the /old/ structures pprivate pointer value. Thus this routine
9759    may override any copying normally done by perl.
9760
9761    It returns a pointer to the new regexp_internal structure.
9762 */
9763
9764 void *
9765 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9766 {
9767     dVAR;
9768     struct regexp *const r = (struct regexp *)SvANY(rx);
9769     regexp_internal *reti;
9770     int len, npar;
9771     RXi_GET_DECL(r,ri);
9772
9773     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9774
9775     npar = r->nparens+1;
9776     len = ProgLen(ri);
9777
9778     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9779     Copy(ri->program, reti->program, len+1, regnode);
9780
9781
9782     reti->regstclass = NULL;
9783
9784     if (ri->data) {
9785         struct reg_data *d;
9786         const int count = ri->data->count;
9787         int i;
9788
9789         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9790                 char, struct reg_data);
9791         Newx(d->what, count, U8);
9792
9793         d->count = count;
9794         for (i = 0; i < count; i++) {
9795             d->what[i] = ri->data->what[i];
9796             switch (d->what[i]) {
9797                 /* legal options are one of: sSfpontTu
9798                    see also regcomp.h and pregfree() */
9799             case 's':
9800             case 'S':
9801             case 'p': /* actually an AV, but the dup function is identical.  */
9802             case 'u': /* actually an HV, but the dup function is identical.  */
9803                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9804                 break;
9805             case 'f':
9806                 /* This is cheating. */
9807                 Newx(d->data[i], 1, struct regnode_charclass_class);
9808                 StructCopy(ri->data->data[i], d->data[i],
9809                             struct regnode_charclass_class);
9810                 reti->regstclass = (regnode*)d->data[i];
9811                 break;
9812             case 'o':
9813                 /* Compiled op trees are readonly and in shared memory,
9814                    and can thus be shared without duplication. */
9815                 OP_REFCNT_LOCK;
9816                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9817                 OP_REFCNT_UNLOCK;
9818                 break;
9819             case 'T':
9820                 /* Trie stclasses are readonly and can thus be shared
9821                  * without duplication. We free the stclass in pregfree
9822                  * when the corresponding reg_ac_data struct is freed.
9823                  */
9824                 reti->regstclass= ri->regstclass;
9825                 /* Fall through */
9826             case 't':
9827                 OP_REFCNT_LOCK;
9828                 ((reg_trie_data*)ri->data->data[i])->refcount++;
9829                 OP_REFCNT_UNLOCK;
9830                 /* Fall through */
9831             case 'n':
9832                 d->data[i] = ri->data->data[i];
9833                 break;
9834             default:
9835                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9836             }
9837         }
9838
9839         reti->data = d;
9840     }
9841     else
9842         reti->data = NULL;
9843
9844     reti->name_list_idx = ri->name_list_idx;
9845
9846 #ifdef RE_TRACK_PATTERN_OFFSETS
9847     if (ri->u.offsets) {
9848         Newx(reti->u.offsets, 2*len+1, U32);
9849         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9850     }
9851 #else
9852     SetProgLen(reti,len);
9853 #endif
9854
9855     return (void*)reti;
9856 }
9857
9858 #endif    /* USE_ITHREADS */
9859
9860 #ifndef PERL_IN_XSUB_RE
9861
9862 /*
9863  - regnext - dig the "next" pointer out of a node
9864  */
9865 regnode *
9866 Perl_regnext(pTHX_ register regnode *p)
9867 {
9868     dVAR;
9869     register I32 offset;
9870
9871     if (!p)
9872         return(NULL);
9873
9874     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9875     if (offset == 0)
9876         return(NULL);
9877
9878     return(p+offset);
9879 }
9880 #endif
9881
9882 STATIC void
9883 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9884 {
9885     va_list args;
9886     STRLEN l1 = strlen(pat1);
9887     STRLEN l2 = strlen(pat2);
9888     char buf[512];
9889     SV *msv;
9890     const char *message;
9891
9892     PERL_ARGS_ASSERT_RE_CROAK2;
9893
9894     if (l1 > 510)
9895         l1 = 510;
9896     if (l1 + l2 > 510)
9897         l2 = 510 - l1;
9898     Copy(pat1, buf, l1 , char);
9899     Copy(pat2, buf + l1, l2 , char);
9900     buf[l1 + l2] = '\n';
9901     buf[l1 + l2 + 1] = '\0';
9902 #ifdef I_STDARG
9903     /* ANSI variant takes additional second argument */
9904     va_start(args, pat2);
9905 #else
9906     va_start(args);
9907 #endif
9908     msv = vmess(buf, &args);
9909     va_end(args);
9910     message = SvPV_const(msv,l1);
9911     if (l1 > 512)
9912         l1 = 512;
9913     Copy(message, buf, l1 , char);
9914     buf[l1-1] = '\0';                   /* Overwrite \n */
9915     Perl_croak(aTHX_ "%s", buf);
9916 }
9917
9918 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
9919
9920 #ifndef PERL_IN_XSUB_RE
9921 void
9922 Perl_save_re_context(pTHX)
9923 {
9924     dVAR;
9925
9926     struct re_save_state *state;
9927
9928     SAVEVPTR(PL_curcop);
9929     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9930
9931     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9932     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9933     SSPUSHUV(SAVEt_RE_STATE);
9934
9935     Copy(&PL_reg_state, state, 1, struct re_save_state);
9936
9937     PL_reg_start_tmp = 0;
9938     PL_reg_start_tmpl = 0;
9939     PL_reg_oldsaved = NULL;
9940     PL_reg_oldsavedlen = 0;
9941     PL_reg_maxiter = 0;
9942     PL_reg_leftiter = 0;
9943     PL_reg_poscache = NULL;
9944     PL_reg_poscache_size = 0;
9945 #ifdef PERL_OLD_COPY_ON_WRITE
9946     PL_nrs = NULL;
9947 #endif
9948
9949     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9950     if (PL_curpm) {
9951         const REGEXP * const rx = PM_GETRE(PL_curpm);
9952         if (rx) {
9953             U32 i;
9954             for (i = 1; i <= RX_NPARENS(rx); i++) {
9955                 char digits[TYPE_CHARS(long)];
9956                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9957                 GV *const *const gvp
9958                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9959
9960                 if (gvp) {
9961                     GV * const gv = *gvp;
9962                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9963                         save_scalar(gv);
9964                 }
9965             }
9966         }
9967     }
9968 }
9969 #endif
9970
9971 static void
9972 clear_re(pTHX_ void *r)
9973 {
9974     dVAR;
9975     ReREFCNT_dec((REGEXP *)r);
9976 }
9977
9978 #ifdef DEBUGGING
9979
9980 STATIC void
9981 S_put_byte(pTHX_ SV *sv, int c)
9982 {
9983     PERL_ARGS_ASSERT_PUT_BYTE;
9984
9985     /* Our definition of isPRINT() ignores locales, so only bytes that are
9986        not part of UTF-8 are considered printable. I assume that the same
9987        holds for UTF-EBCDIC.
9988        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9989        which Wikipedia says:
9990
9991        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9992        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9993        identical, to the ASCII delete (DEL) or rubout control character.
9994        ) So the old condition can be simplified to !isPRINT(c)  */
9995     if (!isPRINT(c))
9996         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9997     else {
9998         const char string = c;
9999         if (c == '-' || c == ']' || c == '\\' || c == '^')
10000             sv_catpvs(sv, "\\");
10001         sv_catpvn(sv, &string, 1);
10002     }
10003 }
10004
10005
10006 #define CLEAR_OPTSTART \
10007     if (optstart) STMT_START { \
10008             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10009             optstart=NULL; \
10010     } STMT_END
10011
10012 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10013
10014 STATIC const regnode *
10015 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10016             const regnode *last, const regnode *plast,
10017             SV* sv, I32 indent, U32 depth)
10018 {
10019     dVAR;
10020     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
10021     register const regnode *next;
10022     const regnode *optstart= NULL;
10023
10024     RXi_GET_DECL(r,ri);
10025     GET_RE_DEBUG_FLAGS_DECL;
10026
10027     PERL_ARGS_ASSERT_DUMPUNTIL;
10028
10029 #ifdef DEBUG_DUMPUNTIL
10030     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10031         last ? last-start : 0,plast ? plast-start : 0);
10032 #endif
10033
10034     if (plast && plast < last)
10035         last= plast;
10036
10037     while (PL_regkind[op] != END && (!last || node < last)) {
10038         /* While that wasn't END last time... */
10039         NODE_ALIGN(node);
10040         op = OP(node);
10041         if (op == CLOSE || op == WHILEM)
10042             indent--;
10043         next = regnext((regnode *)node);
10044
10045         /* Where, what. */
10046         if (OP(node) == OPTIMIZED) {
10047             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10048                 optstart = node;
10049             else
10050                 goto after_print;
10051         } else
10052             CLEAR_OPTSTART;
10053
10054         regprop(r, sv, node);
10055         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10056                       (int)(2*indent + 1), "", SvPVX_const(sv));
10057
10058         if (OP(node) != OPTIMIZED) {
10059             if (next == NULL)           /* Next ptr. */
10060                 PerlIO_printf(Perl_debug_log, " (0)");
10061             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10062                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10063             else
10064                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10065             (void)PerlIO_putc(Perl_debug_log, '\n');
10066         }
10067
10068       after_print:
10069         if (PL_regkind[(U8)op] == BRANCHJ) {
10070             assert(next);
10071             {
10072                 register const regnode *nnode = (OP(next) == LONGJMP
10073                                              ? regnext((regnode *)next)
10074                                              : next);
10075                 if (last && nnode > last)
10076                     nnode = last;
10077                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10078             }
10079         }
10080         else if (PL_regkind[(U8)op] == BRANCH) {
10081             assert(next);
10082             DUMPUNTIL(NEXTOPER(node), next);
10083         }
10084         else if ( PL_regkind[(U8)op]  == TRIE ) {
10085             const regnode *this_trie = node;
10086             const char op = OP(node);
10087             const U32 n = ARG(node);
10088             const reg_ac_data * const ac = op>=AHOCORASICK ?
10089                (reg_ac_data *)ri->data->data[n] :
10090                NULL;
10091             const reg_trie_data * const trie =
10092                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10093 #ifdef DEBUGGING
10094             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10095 #endif
10096             const regnode *nextbranch= NULL;
10097             I32 word_idx;
10098             sv_setpvs(sv, "");
10099             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10100                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10101
10102                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10103                    (int)(2*(indent+3)), "",
10104                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10105                             PL_colors[0], PL_colors[1],
10106                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10107                             PERL_PV_PRETTY_ELLIPSES    |
10108                             PERL_PV_PRETTY_LTGT
10109                             )
10110                             : "???"
10111                 );
10112                 if (trie->jump) {
10113                     U16 dist= trie->jump[word_idx+1];
10114                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10115                                   (UV)((dist ? this_trie + dist : next) - start));
10116                     if (dist) {
10117                         if (!nextbranch)
10118                             nextbranch= this_trie + trie->jump[0];
10119                         DUMPUNTIL(this_trie + dist, nextbranch);
10120                     }
10121                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10122                         nextbranch= regnext((regnode *)nextbranch);
10123                 } else {
10124                     PerlIO_printf(Perl_debug_log, "\n");
10125                 }
10126             }
10127             if (last && next > last)
10128                 node= last;
10129             else
10130                 node= next;
10131         }
10132         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10133             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10134                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10135         }
10136         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10137             assert(next);
10138             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10139         }
10140         else if ( op == PLUS || op == STAR) {
10141             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10142         }
10143         else if (op == ANYOF) {
10144             /* arglen 1 + class block */
10145             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10146                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10147             node = NEXTOPER(node);
10148         }
10149         else if (PL_regkind[(U8)op] == EXACT) {
10150             /* Literal string, where present. */
10151             node += NODE_SZ_STR(node) - 1;
10152             node = NEXTOPER(node);
10153         }
10154         else {
10155             node = NEXTOPER(node);
10156             node += regarglen[(U8)op];
10157         }
10158         if (op == CURLYX || op == OPEN)
10159             indent++;
10160     }
10161     CLEAR_OPTSTART;
10162 #ifdef DEBUG_DUMPUNTIL
10163     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10164 #endif
10165     return node;
10166 }
10167
10168 #endif  /* DEBUGGING */
10169
10170 /*
10171  * Local variables:
10172  * c-indentation-style: bsd
10173  * c-basic-offset: 4
10174  * indent-tabs-mode: t
10175  * End:
10176  *
10177  * ex: set ts=8 sts=4 sw=4 noet:
10178  */