1288 lines
45 KiB
C
1288 lines
45 KiB
C
|
/*
|
||
|
* Regexp compilation.
|
||
|
*
|
||
|
* See doc/regexp.rst for a discussion of the compilation approach and
|
||
|
* current limitations.
|
||
|
*
|
||
|
* Regexp bytecode assumes jumps can be expressed with signed 32-bit
|
||
|
* integers. Consequently the bytecode size must not exceed 0x7fffffffL.
|
||
|
* The implementation casts duk_size_t (buffer size) to duk_(u)int32_t
|
||
|
* in many places. Although this could be changed, the bytecode format
|
||
|
* limit would still prevent regexps exceeding the signed 32-bit limit
|
||
|
* from working.
|
||
|
*
|
||
|
* XXX: The implementation does not prevent bytecode from exceeding the
|
||
|
* maximum supported size. This could be done by limiting the maximum
|
||
|
* input string size (assuming an upper bound can be computed for number
|
||
|
* of bytecode bytes emitted per input byte) or checking buffer maximum
|
||
|
* size when emitting bytecode (slower).
|
||
|
*/
|
||
|
|
||
|
#include "duk_internal.h"
|
||
|
|
||
|
#if defined(DUK_USE_REGEXP_SUPPORT)
|
||
|
|
||
|
/*
|
||
|
* Helper macros
|
||
|
*/
|
||
|
|
||
|
/* Size constant (in bytes, judging by the name); presumably the initial
 * allocation of the regexp bytecode buffer -- its use is outside this part
 * of the file.
 */
#define DUK__RE_INITIAL_BUFSIZE 64

/* Size reported by DUK_BW_GET_SIZE() for the compiler context's buffer
 * writer; the helpers in this file use it as the current end-of-bytecode
 * offset.  The argument is parenthesized so that any pointer-valued
 * expression (not just a plain identifier) expands correctly.
 */
#define DUK__RE_BUFLEN(re_ctx) \
	DUK_BW_GET_SIZE((re_ctx)->thr, &(re_ctx)->bw)
|
||
|
|
||
|
/*
|
||
|
* Disjunction struct: result of parsing a disjunction
|
||
|
*/
|
||
|
|
||
|
typedef struct {
|
||
|
/* Number of characters that the atom matches (e.g. 3 for 'abc'),
|
||
|
* -1 if atom is complex and number of matched characters either
|
||
|
* varies or is not known.
|
||
|
*/
|
||
|
duk_int32_t charlen;
|
||
|
|
||
|
#if 0
|
||
|
/* These are not needed to implement quantifier capture handling,
|
||
|
* but might be needed at some point.
|
||
|
*/
|
||
|
|
||
|
/* re_ctx->captures at start and end of atom parsing.
|
||
|
* Since 'captures' indicates highest capture number emitted
|
||
|
* so far in a DUK_REOP_SAVE, the captures numbers saved by
|
||
|
* the atom are: ]start_captures,end_captures].
|
||
|
*/
|
||
|
duk_uint32_t start_captures;
|
||
|
duk_uint32_t end_captures;
|
||
|
#endif
|
||
|
} duk__re_disjunction_info;
|
||
|
|
||
|
/*
|
||
|
* Encoding helpers
|
||
|
*
|
||
|
* Some of the typing is bytecode based, e.g. slice sizes are unsigned 32-bit
|
||
|
* even though the buffer operations will use duk_size_t.
|
||
|
*/
|
||
|
|
||
|
/* XXX: the insert helpers should ensure that the bytecode result is not
|
||
|
* larger than expected (or at least assert for it). Many things in the
|
||
|
* bytecode, like skip offsets, won't work correctly if the bytecode is
|
||
|
* larger than say 2G.
|
||
|
*/
|
||
|
|
||
|
/* Fold a signed 32-bit value into an unsigned one with the sign stored in
 * the lowest bit: x >= 0 maps to 2*x, x < 0 maps to 2*(-x) + 1.
 *
 * The magnitude of a negative input is computed with unsigned arithmetic
 * (0 - (unsigned) x).  Evaluating '-x' as a signed expression would be
 * undefined behavior for the most negative 32-bit value; the unsigned form
 * is well defined for every input and yields the same result for all
 * inputs where the signed negation was defined.
 */
DUK_LOCAL duk_uint32_t duk__encode_i32(duk_int32_t x) {
	if (x < 0) {
		duk_uint32_t magnitude = (duk_uint32_t) (0U - (duk_uint32_t) x);

		return magnitude * 2U + 1U;
	}
	return ((duk_uint32_t) x) * 2U;
}
|
||
|
|
||
|
/* XXX: return type should probably be duk_size_t, or explicit checks are needed for
|
||
|
* maximum size.
|
||
|
*/
|
||
|
DUK_LOCAL duk_uint32_t duk__insert_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t x) {
|
||
|
duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH];
|
||
|
duk_small_int_t len;
|
||
|
|
||
|
len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf);
|
||
|
DUK_ASSERT(len >= 0);
|
||
|
DUK_BW_INSERT_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, offset, buf, (duk_size_t) len);
|
||
|
return (duk_uint32_t) len;
|
||
|
}
|
||
|
|
||
|
/* Write 'x' through the context's buffer writer using
 * DUK_BW_WRITE_ENSURE_XUTF8(), i.e. append it to the bytecode in the same
 * variable length encoding used by the insert helpers.
 */
DUK_LOCAL void duk__append_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {
	DUK_BW_WRITE_ENSURE_XUTF8(re_ctx->thr, &re_ctx->bw, x);
}
|
||
|
|
||
|
/* Append a value that the caller guarantees fits in 7 bits.
 *
 * Default build: the value is asserted to be <= 0x7f and written directly
 * as one byte.  With DUK_USE_PREFER_SIZE the generic duk__append_u32() path
 * is reused instead of having a dedicated single byte write.
 */
DUK_LOCAL void duk__append_7bit(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {
#if !defined(DUK_USE_PREFER_SIZE)
	DUK_ASSERT(x <= 0x7fU);
	DUK_BW_WRITE_ENSURE_U8(re_ctx->thr, &re_ctx->bw, (duk_uint8_t) x);
#else
	duk__append_u32(re_ctx, x);
#endif
}
|
||
|
|
||
|
#if 0  /* unused */
/* Append the two bytes 'x' and 'y' with a single buffer writer call. */
DUK_LOCAL void duk__append_2bytes(duk_re_compiler_ctx *re_ctx, duk_uint8_t x, duk_uint8_t y) {
	DUK_BW_WRITE_ENSURE_U8_2(re_ctx->thr, &re_ctx->bw, x, y);
}
#endif
|
||
|
|
||
|
/* Insert a signed value at 'offset': the value is first folded into its
 * unsigned form with duk__encode_i32() and then inserted with
 * duk__insert_u32().  Returns the number of bytes inserted.
 */
DUK_LOCAL duk_uint32_t duk__insert_i32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t x) {
	duk_uint32_t folded = duk__encode_i32(x);

	return duk__insert_u32(re_ctx, offset, folded);
}
|
||
|
|
||
|
/* Append a regexp opcode.  Opcodes are asserted to fit in 7 bits and are
 * emitted through duk__append_7bit().
 */
DUK_LOCAL void duk__append_reop(duk_re_compiler_ctx *re_ctx, duk_uint32_t reop) {
	DUK_ASSERT(reop <= 0x7fU);
	duk__append_7bit(re_ctx, reop);
}
|
||
|
|
||
|
#if 0  /* unused */
/* Append a signed value: fold it with duk__encode_i32() and emit the
 * result with duk__append_u32().
 */
DUK_LOCAL void duk__append_i32(duk_re_compiler_ctx *re_ctx, duk_int32_t x) {
	duk_uint32_t folded = duk__encode_i32(x);

	duk__append_u32(re_ctx, folded);
}
#endif
|
||
|
|
||
|
/* special helper for emitting u16 lists (used for character ranges for built-in char classes) */
|
||
|
DUK_LOCAL void duk__append_u16_list(duk_re_compiler_ctx *re_ctx, const duk_uint16_t *values, duk_uint32_t count) {
|
||
|
/* Call sites don't need the result length so it's not accumulated. */
|
||
|
while (count-- > 0) {
|
||
|
duk__append_u32(re_ctx, (duk_uint32_t) (*values++));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Thin wrapper over DUK_BW_INSERT_ENSURE_SLICE(): inserts, at 'offset', the
 * slice of the bytecode buffer described by 'data_offset'/'data_length'.
 * The quantifier code uses this to insert copies of an already emitted atom.
 */
DUK_LOCAL void duk__insert_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t data_offset, duk_uint32_t data_length) {
	DUK_BW_INSERT_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, offset, data_offset, data_length);
}
|
||
|
|
||
|
/* Thin wrapper over DUK_BW_WRITE_ENSURE_SLICE(): writes the slice of the
 * bytecode buffer described by 'data_offset'/'data_length' through the
 * buffer writer.  The quantifier code uses this to append copies of an
 * already emitted atom.
 */
DUK_LOCAL void duk__append_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) {
	DUK_BW_WRITE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length);
}
|
||
|
|
||
|
/* Thin wrapper over DUK_BW_REMOVE_ENSURE_SLICE(): removes the slice of the
 * bytecode buffer described by 'data_offset'/'data_length'.  The quantifier
 * code uses this to drop the original 'template' atom once its copies have
 * been emitted.
 */
DUK_LOCAL void duk__remove_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) {
	DUK_BW_REMOVE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length);
}
|
||
|
|
||
|
/*
|
||
|
* Insert a jump offset at 'offset' to complete an instruction
|
||
|
* (the jump offset is always the last component of an instruction).
|
||
|
* The 'skip' argument must be computed relative to 'offset',
|
||
|
* -without- taking into account the skip field being inserted.
|
||
|
*
|
||
|
* ... A B C ins X Y Z ... (ins may be a JUMP, SPLIT1/SPLIT2, etc)
|
||
|
* => ... A B C ins SKIP X Y Z
|
||
|
*
|
||
|
* Computing the final (adjusted) skip value, which is relative to the
|
||
|
* first byte of the next instruction, is a bit tricky because of the
|
||
|
* variable length UTF-8 encoding. See doc/regexp.rst for discussion.
|
||
|
*/
|
||
|
DUK_LOCAL duk_uint32_t duk__insert_jump_offset(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t skip) {
	/* Only negative skips are adjusted: a non-negative skip is inserted
	 * exactly as given.
	 */
	if (skip >= 0) {
		return duk__insert_i32(re_ctx, offset, skip);
	}

	/* A negative skip is decreased by the byte length of the skip field
	 * being inserted.  That length depends on the adjusted value itself,
	 * so the adjustment could be found iteratively (encode, measure the
	 * length, subtract, measure again; two rounds suffice).  The closed
	 * forms below give the same result without calling the encoder; see
	 * doc/regexp.rst for the derivation.
	 */
#if defined(DUK_USE_PREFER_SIZE)
	/* Closed form solution, this produces smallest code.
	 * See re_neg_jump_offset (closed2).
	 *
	 * One byte is always needed; every threshold crossed by the running
	 * value adds one more byte.
	 */
	skip--;
	if (skip < -0x3fL) {
		skip--;
	}
	if (skip < -0x3ffL) {
		skip--;
	}
	if (skip < -0x7fffL) {
		skip--;
	}
	if (skip < -0xfffffL) {
		skip--;
	}
	if (skip < -0x1ffffffL) {
		skip--;
	}
	if (skip < -0x3fffffffL) {
		skip--;
	}
#else  /* DUK_USE_PREFER_SIZE */
	/* Closed form solution, this produces fastest code.
	 * See re_neg_jump_offset (closed1).
	 *
	 * Pick the field length (1..7 bytes) directly from the unadjusted
	 * skip and subtract it once.
	 */
	{
		duk_int32_t field_len;

		if (skip >= -0x3eL) {
			field_len = 1;
		} else if (skip >= -0x3fdL) {
			field_len = 2;
		} else if (skip >= -0x7ffcL) {
			field_len = 3;
		} else if (skip >= -0xffffbL) {
			field_len = 4;
		} else if (skip >= -0x1fffffaL) {
			field_len = 5;
		} else if (skip >= -0x3ffffff9L) {
			field_len = 6;
		} else {
			field_len = 7;
		}
		skip -= field_len;
	}
#endif  /* DUK_USE_PREFER_SIZE */

	return duk__insert_i32(re_ctx, offset, skip);
}
|
||
|
|
||
|
/* Same as duk__insert_jump_offset() but the skip field is placed at the
 * current end of the bytecode (DUK__RE_BUFLEN).  Returns the number of
 * bytes added.
 */
DUK_LOCAL duk_uint32_t duk__append_jump_offset(duk_re_compiler_ctx *re_ctx, duk_int32_t skip) {
	duk_uint32_t end_offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);

	return duk__insert_jump_offset(re_ctx, end_offset, skip);
}
|
||
|
|
||
|
/*
|
||
|
* duk_re_range_callback for generating character class ranges.
|
||
|
*
|
||
|
* When ignoreCase is false, the range is simply emitted as is. We don't,
|
||
|
* for instance, eliminate duplicates or overlapping ranges in a character
|
||
|
* class.
|
||
|
*
|
||
|
* When ignoreCase is true but the 'direct' flag is set, the caller knows
|
||
|
* that the range canonicalizes to itself for case insensitive matching,
|
||
|
* so the range is emitted as is. This is mainly useful for built-in ranges
|
||
|
* like \W.
|
||
|
*
|
||
|
* Otherwise, when ignoreCase is true, the range needs to be normalized
|
||
|
* through canonicalization. Unfortunately a canonicalized version of a
|
||
|
* continuous range is not necessarily continuous (e.g. [x-{] is continuous
|
||
|
* but [X-{] is not). As a result, a single input range may expand to a lot
|
||
|
* of output ranges. The current algorithm creates the canonicalized ranges
|
||
|
* footprint efficiently at the cost of compile time execution time; see
|
||
|
* doc/regexp.rst for discussion, and some more details below.
|
||
|
*
|
||
|
* Note that the ctx->nranges is a context-wide temporary value. This is OK
|
||
|
* because there cannot be multiple character classes being parsed
|
||
|
* simultaneously.
|
||
|
*
|
||
|
* More detail on canonicalization:
|
||
|
*
|
||
|
* Conceptually, a range is canonicalized by scanning the entire range,
|
||
|
* normalizing each codepoint by converting it to uppercase, and generating
|
||
|
* a set of result ranges.
|
||
|
*
|
||
|
* Ideally a minimal set of output ranges would be emitted by merging all
|
||
|
* possible ranges even if they're emitted out of sequence. Because the
|
||
|
* input string is also case normalized during matching, some codepoints
|
||
|
* never occur at runtime; these "don't care" codepoints can be included or
|
||
|
* excluded from ranges when merging/optimizing ranges.
|
||
|
*
|
||
|
* The current algorithm does not do optimal range merging. Rather, output
|
||
|
* codepoints are generated in sequence, and when the output codepoints are
|
||
|
* continuous (CP, CP+1, CP+2, ...), they are merged locally into as large a
|
||
|
* range as possible. A small canonicalization bitmap is used to reduce
|
||
|
* actual codepoint canonicalizations which are quite slow at present. The
|
||
|
* bitmap provides a "codepoint block is continuous with respect to
|
||
|
* canonicalization" for N-codepoint blocks. This allows blocks to be
|
||
|
* skipped quickly.
|
||
|
*
|
||
|
* There are a number of shortcomings and future work here:
|
||
|
*
|
||
|
* - Individual codepoint normalizations are slow because they involve
|
||
|
* walking bit-packed rules without a lookup index.
|
||
|
*
|
||
|
* - The conceptual algorithm needs to canonicalize every codepoint in the
|
||
|
* input range to figure out the output range(s). Even with the small
|
||
|
* canonicalization bitmap the algorithm runs quite slowly for worst case
|
||
|
* inputs. There are many data structure alternatives to improve this.
|
||
|
*
|
||
|
* - While the current algorithm generates maximal output ranges when the
|
||
|
* output codepoints are emitted linearly, output ranges are not sorted or
|
||
|
* merged otherwise. In the worst case a lot of ranges are emitted when
|
||
|
* most of the ranges could be merged. In this process one could take
|
||
|
* advantage of "don't care" codepoints, which are never matched against at
|
||
|
* runtime due to canonicalization of input codepoints before comparison,
|
||
|
* to merge otherwise discontinuous output ranges.
|
||
|
*
|
||
|
* - The runtime data structure is just a linear list of ranges to match
|
||
|
* against. This can be quite slow if there are a lot of output ranges.
|
||
|
* There are various ways to make matching against the ranges faster,
|
||
|
* e.g. sorting the ranges and using a binary search; skip lists; tree
|
||
|
* based representations; full or approximate codepoint bitmaps, etc.
|
||
|
*
|
||
|
* - Only BMP is supported, codepoints above BMP are assumed to canonicalize
|
||
|
* to themselves. For now this is one place where we don't want to
|
||
|
* support chars outside the BMP, because the exhaustive search would be
|
||
|
* massively larger. It would be possible to support non-BMP with a
|
||
|
* different algorithm, or perhaps doing case normalization only at match
|
||
|
* time.
|
||
|
*/
|
||
|
|
||
|
/* Emit one inclusive codepoint range [r1,r2] into the bytecode as two
 * consecutive u32 values and count it in re_ctx->nranges.  The caller must
 * pass the bounds in order (r2 >= r1).
 */
DUK_LOCAL void duk__regexp_emit_range(duk_re_compiler_ctx *re_ctx, duk_codepoint_t r1, duk_codepoint_t r2) {
	duk_uint32_t lo = (duk_uint32_t) r1;
	duk_uint32_t hi = (duk_uint32_t) r2;

	DUK_ASSERT(r2 >= r1);

	duk__append_u32(re_ctx, lo);
	duk__append_u32(re_ctx, hi);
	re_ctx->nranges += 1;
}
|
||
|
|
||
|
#if defined(DUK_USE_REGEXP_CANON_BITMAP)
|
||
|
/* Find next canonicalization discontinuity (conservative estimate) starting
|
||
|
* from 'start', not exceeding 'end'. If continuity is fine up to 'end'
|
||
|
* inclusive, returns end. Minimum possible return value is start.
|
||
|
*/
|
||
|
DUK_LOCAL duk_codepoint_t duk__re_canon_next_discontinuity(duk_codepoint_t start, duk_codepoint_t end) {
|
||
|
duk_uint_t start_blk;
|
||
|
duk_uint_t end_blk;
|
||
|
duk_uint_t blk;
|
||
|
duk_uint_t offset;
|
||
|
duk_uint8_t mask;
|
||
|
|
||
|
/* Inclusive block range. */
|
||
|
DUK_ASSERT(start >= 0);
|
||
|
DUK_ASSERT(end >= 0);
|
||
|
DUK_ASSERT(end >= start);
|
||
|
start_blk = (duk_uint_t) (start >> DUK_CANON_BITMAP_BLKSHIFT);
|
||
|
end_blk = (duk_uint_t) (end >> DUK_CANON_BITMAP_BLKSHIFT);
|
||
|
|
||
|
for (blk = start_blk; blk <= end_blk; blk++) {
|
||
|
offset = blk >> 3;
|
||
|
mask = 1U << (blk & 0x07);
|
||
|
if (offset >= sizeof(duk_unicode_re_canon_bitmap)) {
|
||
|
/* Reached non-BMP range which is assumed continuous. */
|
||
|
return end;
|
||
|
}
|
||
|
DUK_ASSERT(offset < sizeof(duk_unicode_re_canon_bitmap));
|
||
|
if ((duk_unicode_re_canon_bitmap[offset] & mask) == 0) {
|
||
|
/* Block is discontinuous, continuity is guaranteed
|
||
|
* only up to end of previous block (+1 for exclusive
|
||
|
* return value => start of current block). Start
|
||
|
* block requires special handling.
|
||
|
*/
|
||
|
if (blk > start_blk) {
|
||
|
return (duk_codepoint_t) (blk << DUK_CANON_BITMAP_BLKSHIFT);
|
||
|
} else {
|
||
|
return start;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
DUK_ASSERT(blk == end_blk + 1); /* Reached end block which is continuous. */
|
||
|
return end;
|
||
|
}
|
||
|
#else /* DUK_USE_REGEXP_CANON_BITMAP */
|
||
|
DUK_LOCAL duk_codepoint_t duk__re_canon_next_discontinuity(duk_codepoint_t start, duk_codepoint_t end) {
|
||
|
DUK_ASSERT(start >= 0);
|
||
|
DUK_ASSERT(end >= 0);
|
||
|
DUK_ASSERT(end >= start);
|
||
|
if (start >= 0x10000) {
|
||
|
/* Even without the bitmap, treat non-BMP as continuous. */
|
||
|
return end;
|
||
|
}
|
||
|
return start;
|
||
|
}
|
||
|
#endif /* DUK_USE_REGEXP_CANON_BITMAP */
|
||
|
|
||
|
/* Range callback used while parsing a character class: emits the bytecode
 * ranges corresponding to the input range [r1,r2] (inclusive, r2 >= r1).
 *
 *   userdata  the duk_re_compiler_ctx of the ongoing compilation
 *   r1, r2    input range bounds
 *   direct    non-zero if the range can be emitted as is even when
 *             ignoreCase is set
 *
 * If 'direct' is set or DUK_RE_FLAG_IGNORE_CASE is not set (case sensitive
 * matching), the range is emitted unchanged.  Otherwise the range is
 * canonicalized: output codepoints are generated in input order and merged
 * into a range for as long as they stay consecutive; every break in the
 * sequence flushes the range collected so far.  Every emitted range
 * increments re_ctx->nranges (through duk__regexp_emit_range()).
 */
DUK_LOCAL void duk__regexp_generate_ranges(void *userdata, duk_codepoint_t r1, duk_codepoint_t r2, duk_bool_t direct) {
	duk_re_compiler_ctx *re_ctx = (duk_re_compiler_ctx *) userdata;
	duk_codepoint_t r_start;
	duk_codepoint_t r_end;
	duk_codepoint_t i;
	duk_codepoint_t t;
	duk_codepoint_t r_disc;

	DUK_DD(DUK_DDPRINT("duk__regexp_generate_ranges(): re_ctx=%p, range=[%ld,%ld] direct=%ld",
	                   (void *) re_ctx, (long) r1, (long) r2, (long) direct));

	DUK_ASSERT(r2 >= r1);  /* SyntaxError for out of order range. */

	if (direct || (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) == 0) {
		/* No ignoreCase flag means matching is case sensitive; the
		 * debug text now says so (it previously claimed the opposite).
		 */
		DUK_DD(DUK_DDPRINT("direct or case sensitive, emit range: [%ld,%ld]", (long) r1, (long) r2));
		duk__regexp_emit_range(re_ctx, r1, r2);
		return;
	}

	DUK_DD(DUK_DDPRINT("case insensitive, process range: [%ld,%ld]", (long) r1, (long) r2));

	/* Reference behavior: canonicalize every codepoint of [r1,r2] one by
	 * one, extending the current output range while the canonicalized
	 * values are consecutive and emitting it when they are not.  That
	 * exhaustive scan is very slow, so the loop below asks
	 * duk__re_canon_next_discontinuity() how far it can advance without
	 * canonicalizing individual codepoints.
	 */
	r_start = duk_unicode_re_canonicalize_char(re_ctx->thr, r1);
	r_end = r_start;

	for (i = r1 + 1; i <= r2;) {
		/* Input codepoint space processed up to i-1, and
		 * current range in r_{start,end} is up-to-date
		 * (inclusive) and may either break or continue.
		 */
		r_disc = duk__re_canon_next_discontinuity(i, r2);
		DUK_ASSERT(r_disc >= i);
		DUK_ASSERT(r_disc <= r2);

		/* Codepoints i .. r_disc-1 were reported continuous: extend the
		 * output range over them without canonicalizing each one.
		 */
		r_end += r_disc - i;  /* May be zero. */

		t = duk_unicode_re_canonicalize_char(re_ctx->thr, r_disc);
		if (t == r_end + 1) {
			/* Not actually a discontinuity, continue range
			 * to r_disc and recheck.
			 */
			r_end = t;
		} else {
			/* Sequence broke: flush the collected range and start a
			 * new one at the canonicalized value.
			 */
			duk__regexp_emit_range(re_ctx, r_start, r_end);
			r_start = t;
			r_end = t;
		}
		i = r_disc + 1;  /* Guarantees progress. */
	}

	/* Flush the final pending range. */
	duk__regexp_emit_range(re_ctx, r_start, r_end);
}
|
||
|
|
||
|
/*
|
||
|
* Parse regexp Disjunction. Most of regexp compilation happens here.
|
||
|
*
|
||
|
* Handles Disjunction, Alternative, and Term productions directly without
|
||
|
* recursion. The only constructs requiring recursion are positive/negative
|
||
|
* lookaheads, capturing parentheses, and non-capturing parentheses.
|
||
|
*
|
||
|
* The function determines whether the entire disjunction is a 'simple atom'
|
||
|
* (see doc/regexp.rst discussion on 'simple quantifiers') and if so,
|
||
|
* returns the atom character length which is needed by the caller to keep
|
||
|
* track of its own atom character length. A disjunction with more than one
|
||
|
* alternative is never considered a simple atom (although in some cases
|
||
|
* that might be the case).
|
||
|
*
|
||
|
* Return value: simple atom character length or < 0 if not a simple atom.
|
||
|
* Appends the bytecode for the disjunction matcher to the end of the temp
|
||
|
* buffer.
|
||
|
*
|
||
|
* Regexp top level structure is:
|
||
|
*
|
||
|
* Disjunction = Term*
|
||
|
* | Term* | Disjunction
|
||
|
*
|
||
|
* Term = Assertion
|
||
|
* | Atom
|
||
|
* | Atom Quantifier
|
||
|
*
|
||
|
* An empty Term sequence is a valid disjunction alternative (e.g. /|||c||/).
|
||
|
*
|
||
|
* Notes:
|
||
|
*
|
||
|
* * Tracking of the 'simple-ness' of the current atom vs. the entire
|
||
|
* disjunction are separate matters. For instance, the disjunction
|
||
|
* may be complex, but individual atoms may be simple. Furthermore,
|
||
|
* simple quantifiers are used whenever possible, even if the
|
||
|
* disjunction as a whole is complex.
|
||
|
*
|
||
|
* * The estimate of whether an atom is simple is conservative now,
|
||
|
* and it would be possible to expand it. For instance, captures
|
||
|
* cause the disjunction to be marked complex, even though captures
|
||
|
* -can- be handled by simple quantifiers with some minor modifications.
|
||
|
*
|
||
|
* * Disjunction 'tainting' as 'complex' is handled at the end of the
|
||
|
* main for loop collectively for atoms. Assertions, quantifiers,
|
||
|
* and '|' tokens need to taint the result manually if necessary.
|
||
|
* Assertions cannot add to result char length, only atoms (and
|
||
|
* quantifiers) can; currently quantifiers will taint the result
|
||
|
* as complex though.
|
||
|
*/
|
||
|
|
||
|
DUK_LOCAL const duk_uint16_t * const duk__re_range_lookup1[3] = {
|
||
|
duk_unicode_re_ranges_digit,
|
||
|
duk_unicode_re_ranges_white,
|
||
|
duk_unicode_re_ranges_wordchar
|
||
|
};
|
||
|
DUK_LOCAL const duk_uint8_t duk__re_range_lookup2[3] = {
|
||
|
sizeof(duk_unicode_re_ranges_digit) / (2 * sizeof(duk_uint16_t)),
|
||
|
sizeof(duk_unicode_re_ranges_white) / (2 * sizeof(duk_uint16_t)),
|
||
|
sizeof(duk_unicode_re_ranges_wordchar) / (2 * sizeof(duk_uint16_t))
|
||
|
};
|
||
|
|
||
|
/* Emit a range matching instruction for a built-in character class: the
 * opcode 're_op', then the range count, then the range data itself.
 *
 *   ranges  u16 table holding 'count' ranges, two entries per range
 *   count   number of ranges; emitted with duk__append_7bit()
 */
DUK_LOCAL void duk__append_range_atom_matcher(duk_re_compiler_ctx *re_ctx, duk_small_uint_t re_op, const duk_uint16_t *ranges, duk_small_uint_t count) {
	duk_small_uint_t num_values = count * 2;  /* two u16 entries per range */

	duk__append_reop(re_ctx, re_op);
	duk__append_7bit(re_ctx, count);
	duk__append_u16_list(re_ctx, ranges, num_values);
}
|
||
|
|
||
|
DUK_LOCAL void duk__parse_disjunction(duk_re_compiler_ctx *re_ctx, duk_bool_t expect_eof, duk__re_disjunction_info *out_atom_info) {
|
||
|
duk_int32_t atom_start_offset = -1; /* negative -> no atom matched on previous round */
|
||
|
duk_int32_t atom_char_length = 0; /* negative -> complex atom */
|
||
|
duk_uint32_t atom_start_captures = re_ctx->captures; /* value of re_ctx->captures at start of atom */
|
||
|
duk_int32_t unpatched_disjunction_split = -1;
|
||
|
duk_int32_t unpatched_disjunction_jump = -1;
|
||
|
duk_uint32_t entry_offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
|
||
|
duk_int32_t res_charlen = 0; /* -1 if disjunction is complex, char length if simple */
|
||
|
duk__re_disjunction_info tmp_disj;
|
||
|
|
||
|
DUK_ASSERT(out_atom_info != NULL);
|
||
|
|
||
|
duk_native_stack_check(re_ctx->thr);
|
||
|
if (re_ctx->recursion_depth >= re_ctx->recursion_limit) {
|
||
|
DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_REGEXP_COMPILER_RECURSION_LIMIT);
|
||
|
DUK_WO_NORETURN(return;);
|
||
|
}
|
||
|
re_ctx->recursion_depth++;
|
||
|
|
||
|
#if 0
|
||
|
out_atom_info->start_captures = re_ctx->captures;
|
||
|
#endif
|
||
|
|
||
|
for (;;) {
|
||
|
/* atom_char_length, atom_start_offset, atom_start_captures reflect the
|
||
|
* atom matched on the previous loop. If a quantifier is encountered
|
||
|
* on this loop, these are needed to handle the quantifier correctly.
|
||
|
* new_atom_char_length etc are for the atom parsed on this round;
|
||
|
* they're written to atom_char_length etc at the end of the round.
|
||
|
*/
|
||
|
duk_int32_t new_atom_char_length; /* char length of the atom parsed in this loop */
|
||
|
duk_int32_t new_atom_start_offset; /* bytecode start offset of the atom parsed in this loop
|
||
|
* (allows quantifiers to copy the atom bytecode)
|
||
|
*/
|
||
|
duk_uint32_t new_atom_start_captures; /* re_ctx->captures at the start of the atom parsed in this loop */
|
||
|
|
||
|
duk_lexer_parse_re_token(&re_ctx->lex, &re_ctx->curr_token);
|
||
|
|
||
|
DUK_DD(DUK_DDPRINT("re token: %ld (num=%ld, char=%c)",
|
||
|
(long) re_ctx->curr_token.t,
|
||
|
(long) re_ctx->curr_token.num,
|
||
|
(re_ctx->curr_token.num >= 0x20 && re_ctx->curr_token.num <= 0x7e) ?
|
||
|
(int) re_ctx->curr_token.num : (int) '?'));
|
||
|
|
||
|
/* set by atom case clauses */
|
||
|
new_atom_start_offset = -1;
|
||
|
new_atom_char_length = -1;
|
||
|
new_atom_start_captures = re_ctx->captures;
|
||
|
|
||
|
switch (re_ctx->curr_token.t) {
|
||
|
case DUK_RETOK_DISJUNCTION: {
|
||
|
/*
|
||
|
* The handling here is a bit tricky. If a previous '|' has been processed,
|
||
|
* we have a pending split1 and a pending jump (for a previous match). These
|
||
|
* need to be back-patched carefully. See docs for a detailed example.
|
||
|
*/
|
||
|
|
||
|
/* patch pending jump and split */
|
||
|
if (unpatched_disjunction_jump >= 0) {
|
||
|
duk_uint32_t offset;
|
||
|
|
||
|
DUK_ASSERT(unpatched_disjunction_split >= 0);
|
||
|
offset = (duk_uint32_t) unpatched_disjunction_jump;
|
||
|
offset += duk__insert_jump_offset(re_ctx,
|
||
|
offset,
|
||
|
(duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));
|
||
|
/* offset is now target of the pending split (right after jump) */
|
||
|
duk__insert_jump_offset(re_ctx,
|
||
|
(duk_uint32_t) unpatched_disjunction_split,
|
||
|
(duk_int32_t) offset - unpatched_disjunction_split);
|
||
|
}
|
||
|
|
||
|
/* add a new pending split to the beginning of the entire disjunction */
|
||
|
(void) duk__insert_u32(re_ctx,
|
||
|
entry_offset,
|
||
|
DUK_REOP_SPLIT1); /* prefer direct execution */
|
||
|
unpatched_disjunction_split = (duk_int32_t) (entry_offset + 1); /* +1 for opcode */
|
||
|
|
||
|
/* add a new pending match jump for latest finished alternative */
|
||
|
duk__append_reop(re_ctx, DUK_REOP_JUMP);
|
||
|
unpatched_disjunction_jump = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
|
||
|
|
||
|
/* 'taint' result as complex */
|
||
|
res_charlen = -1;
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_QUANTIFIER: {
|
||
|
if (atom_start_offset < 0) {
|
||
|
DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_NO_ATOM);
|
||
|
DUK_WO_NORETURN(return;);
|
||
|
}
|
||
|
if (re_ctx->curr_token.qmin > re_ctx->curr_token.qmax) {
|
||
|
DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_VALUES);
|
||
|
DUK_WO_NORETURN(return;);
|
||
|
}
|
||
|
if (atom_char_length >= 0) {
|
||
|
/*
|
||
|
* Simple atom
|
||
|
*
|
||
|
* If atom_char_length is zero, we'll have unbounded execution time for e.g.
|
||
|
* /()*x/.exec('x'). We can't just skip the match because it might have some
|
||
|
* side effects (for instance, if we allowed captures in simple atoms, the
|
||
|
* capture needs to happen). The simple solution below is to force the
|
||
|
* quantifier to match at most once, since the additional matches have no effect.
|
||
|
*
|
||
|
* With a simple atom there can be no capture groups, so no captures need
|
||
|
* to be reset.
|
||
|
*/
|
||
|
duk_int32_t atom_code_length;
|
||
|
duk_uint32_t offset;
|
||
|
duk_uint32_t qmin, qmax;
|
||
|
|
||
|
qmin = re_ctx->curr_token.qmin;
|
||
|
qmax = re_ctx->curr_token.qmax;
|
||
|
if (atom_char_length == 0) {
|
||
|
/* qmin and qmax will be 0 or 1 */
|
||
|
if (qmin > 1) {
|
||
|
qmin = 1;
|
||
|
}
|
||
|
if (qmax > 1) {
|
||
|
qmax = 1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
duk__append_reop(re_ctx, DUK_REOP_MATCH); /* complete 'sub atom' */
|
||
|
atom_code_length = (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (duk_size_t) atom_start_offset);
|
||
|
|
||
|
offset = (duk_uint32_t) atom_start_offset;
|
||
|
if (re_ctx->curr_token.greedy) {
|
||
|
offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQGREEDY);
|
||
|
offset += duk__insert_u32(re_ctx, offset, qmin);
|
||
|
offset += duk__insert_u32(re_ctx, offset, qmax);
|
||
|
offset += duk__insert_u32(re_ctx, offset, (duk_uint32_t) atom_char_length);
|
||
|
offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);
|
||
|
} else {
|
||
|
offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQMINIMAL);
|
||
|
offset += duk__insert_u32(re_ctx, offset, qmin);
|
||
|
offset += duk__insert_u32(re_ctx, offset, qmax);
|
||
|
offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);
|
||
|
}
|
||
|
DUK_UNREF(offset); /* silence scan-build warning */
|
||
|
} else {
|
||
|
/*
|
||
|
* Complex atom
|
||
|
*
|
||
|
* The original code is used as a template, and removed at the end
|
||
|
* (this differs from the handling of simple quantifiers).
|
||
|
*
|
||
|
* NOTE: there is no current solution for empty atoms in complex
|
||
|
* quantifiers. This would need some sort of a 'progress' instruction.
|
||
|
*
|
||
|
* XXX: impose limit on maximum result size, i.e. atom_code_len * atom_copies?
|
||
|
*/
|
||
|
duk_int32_t atom_code_length;
|
||
|
duk_uint32_t atom_copies;
|
||
|
duk_uint32_t tmp_qmin, tmp_qmax;
|
||
|
|
||
|
/* pre-check how many atom copies we're willing to make (atom_copies not needed below) */
|
||
|
atom_copies = (re_ctx->curr_token.qmax == DUK_RE_QUANTIFIER_INFINITE) ?
|
||
|
re_ctx->curr_token.qmin : re_ctx->curr_token.qmax;
|
||
|
if (atom_copies > DUK_RE_MAX_ATOM_COPIES) {
|
||
|
DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_QUANTIFIER_TOO_MANY_COPIES);
|
||
|
DUK_WO_NORETURN(return;);
|
||
|
}
|
||
|
|
||
|
/* wipe the capture range made by the atom (if any) */
|
||
|
DUK_ASSERT(atom_start_captures <= re_ctx->captures);
|
||
|
if (atom_start_captures != re_ctx->captures) {
|
||
|
DUK_ASSERT(atom_start_captures < re_ctx->captures);
|
||
|
DUK_DDD(DUK_DDDPRINT("must wipe ]atom_start_captures,re_ctx->captures]: ]%ld,%ld]",
|
||
|
(long) atom_start_captures, (long) re_ctx->captures));
|
||
|
|
||
|
/* insert (DUK_REOP_WIPERANGE, start, count) in reverse order so the order ends up right */
|
||
|
duk__insert_u32(re_ctx, (duk_uint32_t) atom_start_offset, (re_ctx->captures - atom_start_captures) * 2U);
|
||
|
duk__insert_u32(re_ctx, (duk_uint32_t) atom_start_offset, (atom_start_captures + 1) * 2);
|
||
|
duk__insert_u32(re_ctx, (duk_uint32_t) atom_start_offset, DUK_REOP_WIPERANGE);
|
||
|
} else {
|
||
|
DUK_DDD(DUK_DDDPRINT("no need to wipe captures: atom_start_captures == re_ctx->captures == %ld",
|
||
|
(long) atom_start_captures));
|
||
|
}
|
||
|
|
||
|
atom_code_length = (duk_int32_t) DUK__RE_BUFLEN(re_ctx) - atom_start_offset;
|
||
|
|
||
|
/* insert the required matches (qmin) by copying the atom */
|
||
|
tmp_qmin = re_ctx->curr_token.qmin;
|
||
|
tmp_qmax = re_ctx->curr_token.qmax;
|
||
|
while (tmp_qmin > 0) {
|
||
|
duk__append_slice(re_ctx, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);
|
||
|
tmp_qmin--;
|
||
|
if (tmp_qmax != DUK_RE_QUANTIFIER_INFINITE) {
|
||
|
tmp_qmax--;
|
||
|
}
|
||
|
}
|
||
|
DUK_ASSERT(tmp_qmin == 0);
|
||
|
|
||
|
/* insert code for matching the remainder - infinite or finite */
|
||
|
if (tmp_qmax == DUK_RE_QUANTIFIER_INFINITE) {
|
||
|
/* reuse last emitted atom for remaining 'infinite' quantifier */
|
||
|
|
||
|
if (re_ctx->curr_token.qmin == 0) {
|
||
|
/* Special case: original qmin was zero so there is nothing
|
||
|
* to repeat. Emit an atom copy but jump over it here.
|
||
|
*/
|
||
|
duk__append_reop(re_ctx, DUK_REOP_JUMP);
|
||
|
duk__append_jump_offset(re_ctx, atom_code_length);
|
||
|
duk__append_slice(re_ctx, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);
|
||
|
}
|
||
|
if (re_ctx->curr_token.greedy) {
|
||
|
duk__append_reop(re_ctx, DUK_REOP_SPLIT2); /* prefer jump */
|
||
|
} else {
|
||
|
duk__append_reop(re_ctx, DUK_REOP_SPLIT1); /* prefer direct */
|
||
|
}
|
||
|
duk__append_jump_offset(re_ctx, -atom_code_length - 1); /* -1 for opcode */
|
||
|
} else {
|
||
|
/*
|
||
|
* The remaining matches are emitted as sequence of SPLITs and atom
|
||
|
* copies; the SPLITs skip the remaining copies and match the sequel.
|
||
|
* This sequence needs to be emitted starting from the last copy
|
||
|
* because the SPLITs are variable length due to the variable length
|
||
|
* skip offset. This causes a lot of memory copying now.
|
||
|
*
|
||
|
* Example structure (greedy, match maximum # atoms):
|
||
|
*
|
||
|
* SPLIT1 LSEQ
|
||
|
* (atom)
|
||
|
* SPLIT1 LSEQ ; <- the byte length of this instruction is needed
|
||
|
* (atom) ; to encode the above SPLIT1 correctly
|
||
|
* ...
|
||
|
* LSEQ:
|
||
|
*/
|
||
|
duk_uint32_t offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
|
||
|
while (tmp_qmax > 0) {
|
||
|
duk__insert_slice(re_ctx, offset, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);
|
||
|
if (re_ctx->curr_token.greedy) {
|
||
|
duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT1); /* prefer direct */
|
||
|
} else {
|
||
|
duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT2); /* prefer jump */
|
||
|
}
|
||
|
duk__insert_jump_offset(re_ctx,
|
||
|
offset + 1, /* +1 for opcode */
|
||
|
(duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));
|
||
|
tmp_qmax--;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* remove the original 'template' atom */
|
||
|
duk__remove_slice(re_ctx, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);
|
||
|
}
|
||
|
|
||
|
/* 'taint' result as complex */
|
||
|
res_charlen = -1;
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ASSERT_START: {
|
||
|
duk__append_reop(re_ctx, DUK_REOP_ASSERT_START);
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ASSERT_END: {
|
||
|
duk__append_reop(re_ctx, DUK_REOP_ASSERT_END);
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ASSERT_WORD_BOUNDARY: {
|
||
|
duk__append_reop(re_ctx, DUK_REOP_ASSERT_WORD_BOUNDARY);
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY: {
|
||
|
duk__append_reop(re_ctx, DUK_REOP_ASSERT_NOT_WORD_BOUNDARY);
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ASSERT_START_POS_LOOKAHEAD:
|
||
|
case DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD: {
|
||
|
duk_uint32_t offset;
|
||
|
duk_uint32_t opcode = (re_ctx->curr_token.t == DUK_RETOK_ASSERT_START_POS_LOOKAHEAD) ?
|
||
|
DUK_REOP_LOOKPOS : DUK_REOP_LOOKNEG;
|
||
|
|
||
|
offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
|
||
|
duk__parse_disjunction(re_ctx, 0, &tmp_disj);
|
||
|
duk__append_reop(re_ctx, DUK_REOP_MATCH);
|
||
|
|
||
|
(void) duk__insert_u32(re_ctx, offset, opcode);
|
||
|
(void) duk__insert_jump_offset(re_ctx,
|
||
|
offset + 1, /* +1 for opcode */
|
||
|
(duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));
|
||
|
|
||
|
/* 'taint' result as complex -- this is conservative,
|
||
|
* as lookaheads do not backtrack.
|
||
|
*/
|
||
|
res_charlen = -1;
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ATOM_PERIOD: {
|
||
|
new_atom_char_length = 1;
|
||
|
new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
|
||
|
duk__append_reop(re_ctx, DUK_REOP_PERIOD);
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ATOM_CHAR: {
|
||
|
/* Note: successive characters could be joined into string matches
|
||
|
* but this is not trivial (consider e.g. '/xyz+/); see docs for
|
||
|
* more discussion.
|
||
|
*
|
||
|
* No support for \u{H+} yet. While only BMP Unicode escapes are
|
||
|
* supported for RegExps at present, 'ch' may still be a non-BMP
|
||
|
* codepoint if it is decoded straight from source text UTF-8.
|
||
|
* There's no non-BMP support yet so this is handled simply by
|
||
|
* matching the non-BMP character (which is custom behavior).
|
||
|
*/
|
||
|
duk_uint32_t ch;
|
||
|
|
||
|
new_atom_char_length = 1;
|
||
|
new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
|
||
|
duk__append_reop(re_ctx, DUK_REOP_CHAR);
|
||
|
ch = re_ctx->curr_token.num;
|
||
|
if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {
|
||
|
ch = (duk_uint32_t) duk_unicode_re_canonicalize_char(re_ctx->thr, (duk_codepoint_t) ch);
|
||
|
}
|
||
|
duk__append_u32(re_ctx, ch);
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ATOM_DIGIT:
|
||
|
case DUK_RETOK_ATOM_NOT_DIGIT:
|
||
|
case DUK_RETOK_ATOM_WHITE:
|
||
|
case DUK_RETOK_ATOM_NOT_WHITE:
|
||
|
case DUK_RETOK_ATOM_WORD_CHAR:
|
||
|
case DUK_RETOK_ATOM_NOT_WORD_CHAR: {
|
||
|
duk_small_uint_t re_op;
|
||
|
duk_small_uint_t idx;
|
||
|
|
||
|
new_atom_char_length = 1;
|
||
|
new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
|
||
|
|
||
|
DUK_ASSERT((DUK_RETOK_ATOM_DIGIT & 0x01) != 0);
|
||
|
DUK_ASSERT((DUK_RETOK_ATOM_WHITE & 0x01) != 0);
|
||
|
DUK_ASSERT((DUK_RETOK_ATOM_WORD_CHAR & 0x01) != 0);
|
||
|
DUK_ASSERT((DUK_RETOK_ATOM_NOT_DIGIT & 0x01) == 0);
|
||
|
DUK_ASSERT((DUK_RETOK_ATOM_NOT_WHITE & 0x01) == 0);
|
||
|
DUK_ASSERT((DUK_RETOK_ATOM_NOT_WORD_CHAR & 0x01) == 0);
|
||
|
re_op = (re_ctx->curr_token.t & 0x01) ? DUK_REOP_RANGES : DUK_REOP_INVRANGES;
|
||
|
|
||
|
DUK_ASSERT(DUK_RETOK_ATOM_WHITE == DUK_RETOK_ATOM_DIGIT + 2);
|
||
|
DUK_ASSERT(DUK_RETOK_ATOM_WORD_CHAR == DUK_RETOK_ATOM_DIGIT + 4);
|
||
|
idx = (duk_small_uint_t) ((re_ctx->curr_token.t - DUK_RETOK_ATOM_DIGIT) >> 1U);
|
||
|
DUK_ASSERT(idx <= 2U); /* Assume continuous token numbers; also checks negative underflow. */
|
||
|
|
||
|
duk__append_range_atom_matcher(re_ctx, re_op, duk__re_range_lookup1[idx], duk__re_range_lookup2[idx]);
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ATOM_BACKREFERENCE: {
|
||
|
duk_uint32_t backref = (duk_uint32_t) re_ctx->curr_token.num;
|
||
|
if (backref > re_ctx->highest_backref) {
|
||
|
re_ctx->highest_backref = backref;
|
||
|
}
|
||
|
new_atom_char_length = -1; /* mark as complex */
|
||
|
new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
|
||
|
duk__append_reop(re_ctx, DUK_REOP_BACKREFERENCE);
|
||
|
duk__append_u32(re_ctx, backref);
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ATOM_START_CAPTURE_GROUP: {
|
||
|
duk_uint32_t cap;
|
||
|
|
||
|
new_atom_char_length = -1; /* mark as complex (capture handling) */
|
||
|
new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
|
||
|
cap = ++re_ctx->captures;
|
||
|
duk__append_reop(re_ctx, DUK_REOP_SAVE);
|
||
|
duk__append_u32(re_ctx, cap * 2);
|
||
|
duk__parse_disjunction(re_ctx, 0, &tmp_disj); /* retval (sub-atom char length) unused, tainted as complex above */
|
||
|
duk__append_reop(re_ctx, DUK_REOP_SAVE);
|
||
|
duk__append_u32(re_ctx, cap * 2 + 1);
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ATOM_START_NONCAPTURE_GROUP: {
|
||
|
new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
|
||
|
duk__parse_disjunction(re_ctx, 0, &tmp_disj);
|
||
|
new_atom_char_length = tmp_disj.charlen;
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ATOM_START_CHARCLASS:
|
||
|
case DUK_RETOK_ATOM_START_CHARCLASS_INVERTED: {
|
||
|
/*
|
||
|
* Range parsing is done with a special lexer function which calls
|
||
|
* us for every range parsed. This is different from how rest of
|
||
|
* the parsing works, but avoids a heavy, arbitrary size intermediate
|
||
|
* value type to hold the ranges.
|
||
|
*
|
||
|
* Another complication is the handling of character ranges when
|
||
|
* case insensitive matching is used (see docs for discussion).
|
||
|
* The range handler callback given to the lexer takes care of this
|
||
|
* as well.
|
||
|
*
|
||
|
* Note that duplicate ranges are not eliminated when parsing character
|
||
|
* classes, so that canonicalization of
|
||
|
*
|
||
|
* [0-9a-fA-Fx-{]
|
||
|
*
|
||
|
* creates the result (note the duplicate ranges):
|
||
|
*
|
||
|
* [0-9A-FA-FX-Z{-{]
|
||
|
*
|
||
|
* where [x-{] is split as a result of canonicalization. The duplicate
|
||
|
* ranges are not a semantics issue: they work correctly.
|
||
|
*/
|
||
|
|
||
|
duk_uint32_t offset;
|
||
|
|
||
|
DUK_DD(DUK_DDPRINT("character class"));
|
||
|
|
||
|
/* insert ranges instruction, range count patched in later */
|
||
|
new_atom_char_length = 1;
|
||
|
new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
|
||
|
duk__append_reop(re_ctx,
|
||
|
(re_ctx->curr_token.t == DUK_RETOK_ATOM_START_CHARCLASS) ?
|
||
|
DUK_REOP_RANGES : DUK_REOP_INVRANGES);
|
||
|
offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx); /* patch in range count later */
|
||
|
|
||
|
/* parse ranges until character class ends */
|
||
|
re_ctx->nranges = 0; /* note: ctx-wide temporary */
|
||
|
duk_lexer_parse_re_ranges(&re_ctx->lex, duk__regexp_generate_ranges, (void *) re_ctx);
|
||
|
|
||
|
/* insert range count */
|
||
|
duk__insert_u32(re_ctx, offset, re_ctx->nranges);
|
||
|
break;
|
||
|
}
|
||
|
case DUK_RETOK_ATOM_END_GROUP: {
|
||
|
if (expect_eof) {
|
||
|
DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_CLOSING_PAREN);
|
||
|
DUK_WO_NORETURN(return;);
|
||
|
}
|
||
|
goto done;
|
||
|
}
|
||
|
case DUK_RETOK_EOF: {
|
||
|
if (!expect_eof) {
|
||
|
DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_END_OF_PATTERN);
|
||
|
DUK_WO_NORETURN(return;);
|
||
|
}
|
||
|
goto done;
|
||
|
}
|
||
|
default: {
|
||
|
DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_REGEXP_TOKEN);
|
||
|
DUK_WO_NORETURN(return;);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* a complex (new) atom taints the result */
|
||
|
if (new_atom_start_offset >= 0) {
|
||
|
if (new_atom_char_length < 0) {
|
||
|
res_charlen = -1;
|
||
|
} else if (res_charlen >= 0) {
|
||
|
/* only advance if not tainted */
|
||
|
res_charlen += new_atom_char_length;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* record previous atom info in case next token is a quantifier */
|
||
|
atom_start_offset = new_atom_start_offset;
|
||
|
atom_char_length = new_atom_char_length;
|
||
|
atom_start_captures = new_atom_start_captures;
|
||
|
}
|
||
|
|
||
|
done:
|
||
|
|
||
|
/* finish up pending jump and split for last alternative */
|
||
|
if (unpatched_disjunction_jump >= 0) {
|
||
|
duk_uint32_t offset;
|
||
|
|
||
|
DUK_ASSERT(unpatched_disjunction_split >= 0);
|
||
|
offset = (duk_uint32_t) unpatched_disjunction_jump;
|
||
|
offset += duk__insert_jump_offset(re_ctx,
|
||
|
offset,
|
||
|
(duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));
|
||
|
/* offset is now target of the pending split (right after jump) */
|
||
|
duk__insert_jump_offset(re_ctx,
|
||
|
(duk_uint32_t) unpatched_disjunction_split,
|
||
|
(duk_int32_t) offset - unpatched_disjunction_split);
|
||
|
}
|
||
|
|
||
|
#if 0
|
||
|
out_atom_info->end_captures = re_ctx->captures;
|
||
|
#endif
|
||
|
out_atom_info->charlen = res_charlen;
|
||
|
DUK_DDD(DUK_DDDPRINT("parse disjunction finished: charlen=%ld",
|
||
|
(long) out_atom_info->charlen));
|
||
|
|
||
|
re_ctx->recursion_depth--;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Flags parsing (see E5 Section 15.10.4.1).
|
||
|
*/
|
||
|
|
||
|
DUK_LOCAL duk_uint32_t duk__parse_regexp_flags(duk_hthread *thr, duk_hstring *h) {
	/* Convert a RegExp flags string into a DUK_RE_FLAG_xxx bitmask.
	 *
	 * Accepted flag characters are 'g', 'i' and 'm'.  Any other byte, or
	 * a flag that appears more than once, results in a SyntaxError.
	 */
	const duk_uint8_t *cursor;
	const duk_uint8_t *limit;
	duk_uint32_t seen = 0;

	cursor = DUK_HSTRING_GET_DATA(h);
	limit = cursor + DUK_HSTRING_GET_BYTELEN(h);

	/* The string data is walked byte by byte without decoding; only the
	 * three ASCII flag characters are accepted, everything else is
	 * rejected.
	 */
	for (; cursor < limit; cursor++) {
		duk_uint32_t bit;

		if (*cursor == (duk_uint8_t) 'g') {
			bit = DUK_RE_FLAG_GLOBAL;
		} else if (*cursor == (duk_uint8_t) 'i') {
			bit = DUK_RE_FLAG_IGNORE_CASE;
		} else if (*cursor == (duk_uint8_t) 'm') {
			bit = DUK_RE_FLAG_MULTILINE;
		} else {
			goto flags_error;
		}

		/* Each flag may be given at most once. */
		if (seen & bit) {
			goto flags_error;
		}
		seen |= bit;
	}

	return seen;

 flags_error:
	DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_REGEXP_FLAGS);
	DUK_WO_NORETURN(return 0U;);
}
|
||
|
|
||
|
/*
|
||
|
* Create escaped RegExp source (E5 Section 15.10.3).
|
||
|
*
|
||
|
* The current approach is to special case the empty RegExp
|
||
|
* ('' -> '(?:)') and otherwise replace unescaped '/' characters
|
||
|
* with '\/' regardless of where they occur in the regexp.
|
||
|
*
|
||
|
* Note that normalization does not seem to be necessary for
|
||
|
* RegExp literals (e.g. '/foo/') because to be acceptable as
|
||
|
* a RegExp literal, the text between forward slashes must
|
||
|
* already match the escaping requirements (e.g. must not contain
|
||
|
* unescaped forward slashes or be empty). Escaping IS needed
|
||
|
* for expressions like 'new RegExp("...", "")' however.
|
||
|
* Currently, we re-escape in either case.
|
||
|
*
|
||
|
* Also note that we process the source here in UTF-8 encoded
|
||
|
* form. This is correct, because any non-ASCII characters are
|
||
|
* passed through without change.
|
||
|
*/
|
||
|
|
||
|
DUK_LOCAL void duk__create_escaped_source(duk_hthread *thr, int idx_pattern) {
	/* Push an escaped copy of the pattern string at 'idx_pattern' on the
	 * value stack, suitable for use as the RegExp 'source' value.
	 *
	 *   - An empty pattern becomes '(?:)'.
	 *   - Every '/' which is not already escaped gets a '\' prepended.
	 *   - All other bytes are copied through unchanged.
	 *
	 * Escape state is tracked explicitly rather than by looking only at the
	 * previous byte: in a pattern such as '\\/' the first backslash escapes
	 * the second one, so the '/' is NOT escaped and must become '\/'.
	 * Looking only at the previous byte would leave that '/' unescaped.
	 */
	duk_hstring *h;
	const duk_uint8_t *p;
	duk_bufwriter_ctx bw_alloc;
	duk_bufwriter_ctx *bw;
	duk_uint8_t *q;
	duk_size_t i, n;
	duk_uint_fast8_t c;
	duk_uint_fast8_t escaped;  /* non-zero: current byte is escaped by the preceding backslash */

	h = duk_known_hstring(thr, idx_pattern);
	p = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h);
	n = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h);

	if (n == 0) {
		duk_push_literal(thr, "(?:)");
		return;
	}

	bw = &bw_alloc;
	DUK_BW_INIT_PUSHBUF(thr, bw, n);
	q = DUK_BW_GET_PTR(thr, bw);

	escaped = (duk_uint_fast8_t) 0;

	for (i = 0; i < n; i++) {
		c = p[i];

		/* At most two bytes are written per input byte. */
		q = DUK_BW_ENSURE_RAW(thr, bw, 2, q);

		if (escaped) {
			/* This byte is consumed by the preceding backslash; even if
			 * it is a backslash itself it does not escape what follows.
			 */
			escaped = (duk_uint_fast8_t) 0;
		} else if (c == (duk_uint_fast8_t) '\\') {
			escaped = (duk_uint_fast8_t) 1;
		} else if (c == (duk_uint_fast8_t) '/') {
			/* Unescaped '/' ANYWHERE in the regexp (in disjunction,
			 * inside a character class, ...) => same escape works.
			 */
			*q++ = DUK_ASC_BACKSLASH;
		}
		*q++ = (duk_uint8_t) c;
	}

	DUK_BW_SETPTR_AND_COMPACT(thr, bw, q);
	(void) duk_buffer_to_string(thr, -1);  /* Safe if input is safe. */

	/* [ ... escaped_source ] */
}
|
||
|
|
||
|
/*
|
||
|
* Exposed regexp compilation primitive.
|
||
|
*
|
||
|
* Sets up a regexp compilation context, and calls duk__parse_disjunction() to do the
|
||
|
* actual parsing. Handles generation of the compiled regexp header and the
|
||
|
* "boilerplate" capture of the matching substring (save 0 and 1). Also does some
|
||
|
* global level regexp checks after recursive compilation has finished.
|
||
|
*
|
||
|
* An escaped version of the regexp source, suitable for use as a RegExp instance
|
||
|
* 'source' property (see E5 Section 15.10.3), is also left on the stack.
|
||
|
*
|
||
|
* Input stack: [ pattern flags ]
|
||
|
* Output stack: [ bytecode escaped_source ] (both as strings)
|
||
|
*/
|
||
|
|
||
|
DUK_INTERNAL void duk_regexp_compile(duk_hthread *thr) {
	duk_re_compiler_ctx re_ctx;
	duk_lexer_point start_point;
	duk_hstring *h_pattern;
	duk_hstring *h_flags;
	duk__re_disjunction_info top_info;  /* result of top level disjunction, ignored */

	DUK_ASSERT(thr != NULL);

	/*
	 *  Validate arguments: both must be plain strings (TypeError if not).
	 */

	h_pattern = duk_require_hstring_notsymbol(thr, -2);
	h_flags = duk_require_hstring_notsymbol(thr, -1);

	/*
	 *  Normalized 'source' value (E5 Section 15.10.3).
	 */

	/* [ ... pattern flags ] */

	duk__create_escaped_source(thr, -2);

	/* [ ... pattern flags escaped_source ] */

	/*
	 *  Set up the compiler context and its embedded lexer state.
	 */

	duk_memzero(&re_ctx, sizeof(re_ctx));
	DUK_LEXER_INITCTX(&re_ctx.lex);  /* duplicate zeroing, except for (possible) NULL inits */

	re_ctx.thr = thr;
	re_ctx.recursion_limit = DUK_USE_REGEXP_COMPILER_RECLIMIT;

	re_ctx.lex.thr = thr;
	re_ctx.lex.input = DUK_HSTRING_GET_DATA(h_pattern);
	re_ctx.lex.input_length = DUK_HSTRING_GET_BYTELEN(h_pattern);
	re_ctx.lex.token_limit = DUK_RE_COMPILE_TOKEN_LIMIT;

	/* Flags are parsed before the bytecode buffer is pushed. */
	re_ctx.re_flags = duk__parse_regexp_flags(thr, h_flags);

	DUK_BW_INIT_PUSHBUF(thr, &re_ctx.bw, DUK__RE_INITIAL_BUFSIZE);

	/* [ ... pattern flags escaped_source buffer ] */

	DUK_DD(DUK_DDPRINT("regexp compiler ctx initialized, flags=0x%08lx, recursion_limit=%ld",
	                   (unsigned long) re_ctx.re_flags, (long) re_ctx.recursion_limit));

	/*
	 *  Position the lexer at the start of the pattern.
	 */

	start_point.line = 1;
	start_point.offset = 0;  /* expensive init, just want to fill window */
	DUK_LEXER_SETPOINT(&re_ctx.lex, &start_point);

	/*
	 *  Compile: the whole pattern is wrapped in the implicit capture 0,
	 *  i.e. SAVE 0, <pattern>, SAVE 1, MATCH.
	 */

	DUK_DD(DUK_DDPRINT("starting regexp compilation"));

	duk__append_reop(&re_ctx, DUK_REOP_SAVE);
	duk__append_7bit(&re_ctx, 0);

	duk__parse_disjunction(&re_ctx, 1 /*expect_eof*/, &top_info);

	duk__append_reop(&re_ctx, DUK_REOP_SAVE);
	duk__append_7bit(&re_ctx, 1);
	duk__append_reop(&re_ctx, DUK_REOP_MATCH);

	/*
	 *  Backreference validation.  Referring to a capture group that is
	 *  introduced only later in the pattern (as in /\1(foo)/) is NOT an
	 *  error; in fact such a backreference always matches.  Referring to
	 *  a group that is never introduced IS an error, which can only be
	 *  detected once the entire pattern has been parsed.
	 */

	if (re_ctx.captures < re_ctx.highest_backref) {
		DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_BACKREFS);
		DUK_WO_NORETURN(return;);
	}

	/*
	 *  Compiled regexp header: flags, ncaptures.  Both are inserted at
	 *  offset 0, so the insertion order is intentionally the reverse of
	 *  the final layout.
	 */

	duk__insert_u32(&re_ctx, 0, (re_ctx.captures + 1) * 2);
	duk__insert_u32(&re_ctx, 0, re_ctx.re_flags);

	/* [ ... pattern flags escaped_source buffer ] */

	DUK_BW_COMPACT(thr, &re_ctx.bw);
	(void) duk_buffer_to_string(thr, -1);  /* Safe because flags is at most 7 bit. */

	/* [ ... pattern flags escaped_source bytecode ] */

	/*
	 *  Drop the original arguments, leaving only the two results.
	 */

	duk_remove(thr, -4);  /* -> [ ... flags escaped_source bytecode ] */
	duk_remove(thr, -3);  /* -> [ ... escaped_source bytecode ] */

	DUK_DD(DUK_DDPRINT("regexp compilation successful, bytecode: %!T, escaped source: %!T",
	                   (duk_tval *) duk_get_tval(thr, -1), (duk_tval *) duk_get_tval(thr, -2)));
}
|
||
|
|
||
|
/*
|
||
|
* Create a RegExp instance (E5 Section 15.10.7).
|
||
|
*
|
||
|
* Note: the output stack left by duk_regexp_compile() is directly compatible
|
||
|
* with the input here.
|
||
|
*
|
||
|
* Input stack: [ escaped_source bytecode ] (both as strings)
|
||
|
* Output stack: [ RegExp ]
|
||
|
*/
|
||
|
|
||
|
DUK_INTERNAL void duk_regexp_create_instance(duk_hthread *thr) {
	duk_hobject *h_regexp;

	/* [ ... escaped_source bytecode ] */

	/* Create the object and configure its class and prototype while it is
	 * still on the stack top, then move it below the two input values.
	 */
	duk_push_object(thr);
	h_regexp = duk_known_hobject(thr, -1);
	DUK_HOBJECT_SET_CLASS_NUMBER(h_regexp, DUK_HOBJECT_CLASS_REGEXP);
	DUK_HOBJECT_SET_PROTOTYPE_UPDREF(thr, h_regexp, thr->builtins[DUK_BIDX_REGEXP_PROTOTYPE]);
	duk_insert(thr, -3);

	/* [ ... regexp_object escaped_source bytecode ] */

	/* Compiled bytecode is stored as an internal property. */
	duk_xdef_prop_stridx_short(thr, -3, DUK_STRIDX_INT_BYTECODE, DUK_PROPDESC_FLAGS_NONE);

	/* [ ... regexp_object escaped_source ] */

	/* In ES2015 .source and the flag properties (.global, .multiline,
	 * etc) are inherited getters, so the escaped source is stored as an
	 * internal property for the getter to read.
	 */
	duk_xdef_prop_stridx_short(thr, -2, DUK_STRIDX_INT_SOURCE, DUK_PROPDESC_FLAGS_NONE);

	/* [ ... regexp_object ] */

	/* .lastIndex starts at zero and is defined with the writable flag. */
	duk_push_int(thr, 0);
	duk_xdef_prop_stridx_short(thr, -2, DUK_STRIDX_LAST_INDEX, DUK_PROPDESC_FLAGS_W);

	/* [ ... regexp_object ] */
}
|
||
|
|
||
|
#else /* DUK_USE_REGEXP_SUPPORT */
|
||
|
|
||
|
/* regexp support disabled */
|
||
|
|
||
|
#endif /* DUK_USE_REGEXP_SUPPORT */
|