2439 lines
78 KiB
C
2439 lines
78 KiB
C
/*
 *  Lexer for source files, ToNumber() string conversions, RegExp expressions,
 *  and JSON.
 *
 *  Provides a stream of ECMAScript tokens from a UTF-8/CESU-8 buffer. The
 *  caller can also rewind the token stream to a certain position, which is
 *  needed by the compiler part for multi-pass scanning. Tokens are
 *  represented as duk_token structures, and contain line number information.
 *  Token types are identified with DUK_TOK_* defines.
 *
 *  Characters are decoded into a fixed size lookup window consisting of
 *  decoded Unicode code points, with window positions past the end of the
 *  input filled with an invalid codepoint (-1). The tokenizer can thus
 *  perform multiple character lookups efficiently and with few sanity
 *  checks (such as access outside the end of the input), which keeps the
 *  tokenization code small at the cost of performance.
 *
 *  Character data in tokens, such as identifier names and string literals,
 *  is encoded into CESU-8 format on-the-fly while parsing the token in
 *  question. The string data is made reachable to garbage collection by
 *  placing the token-related values in value stack entries allocated for
 *  this purpose by the caller. The characters exist in Unicode code point
 *  form only in the fixed size lookup window, which keeps character data
 *  expansion (of especially ASCII data) low.
 *
 *  Token parsing supports the full range of Unicode characters as described
 *  in the E5 specification. Parsing has been optimized for ASCII characters
 *  because ordinary ECMAScript code consists almost entirely of ASCII
 *  characters. Matching of complex Unicode codepoint sets (such as in the
 *  IdentifierStart and IdentifierPart productions) is optimized for size,
 *  and is done using a linear scan of a bit-packed list of ranges. This is
 *  very slow, but should never be entered unless the source code actually
 *  contains Unicode characters.
 *
 *  ECMAScript tokenization is partially context sensitive. First,
 *  additional future reserved words are recognized in strict mode (see E5
 *  Section 7.6.1.2). Second, a forward slash character ('/') can be
 *  recognized either as starting a RegExp literal or as a division operator,
 *  depending on context. The caller must provide necessary context flags
 *  when requesting a new token.
 *
 *  Future work:
 *
 *    * Make line number tracking optional, as it consumes space.
 *
 *    * Add a feature flag for disabling UTF-8 decoding of input, as most
 *      source code is ASCII. Because of Unicode escapes written in ASCII,
 *      this does not allow Unicode support to be removed from e.g.
 *      duk_unicode_is_identifier_start() nor does it allow removal of CESU-8
 *      encoding of e.g. string literals.
 *
 *    * Add a feature flag for disabling Unicode compliance of e.g. identifier
 *      names. This allows for a build more than a kilobyte smaller, because
 *      Unicode ranges needed by duk_unicode_is_identifier_start() and
 *      duk_unicode_is_identifier_part() can be dropped. String literals
 *      should still be allowed to contain escaped Unicode, so this still does
 *      not allow removal of CESU-8 encoding of e.g. string literals.
 *
 *    * Character lookup tables for codepoints above BMP could be stripped.
 *
 *    * Strictly speaking, E5 specification requires that source code consists
 *      of 16-bit code units, and if not, must be conceptually converted to
 *      that format first. The current lexer processes Unicode code points
 *      and allows characters outside the BMP. These should be converted to
 *      surrogate pairs while reading the source characters into the window,
 *      not after tokens have been formed (as is done now). However, the fix
 *      is not trivial because two characters are decoded from one codepoint.
 *
 *    * Optimize for speed as well as size. Large if-else ladders are (at
 *      least potentially) slow.
 */
|
|
|
|
#include "duk_internal.h"
|
|
|
|
/*
 *  File-local constants and helper macros
 */

/* Digit count limits; the names suggest RegExp decimal escapes and
 * quantifiers, but the users are outside this part of the file.
 */
#define DUK__MAX_RE_DECESC_DIGITS     9
#define DUK__MAX_RE_QUANT_DIGITS      9   /* Does not allow e.g. 2**31-1, but one more would allow overflows of u32. */

/* Character class tests.  Whether a test is a macro or a helper function
 * call depends on call count.  The range based macros evaluate their
 * argument twice, so the argument must be free of side effects.
 */
#define DUK__ISDIGIT(cp)         ((cp) >= DUK_ASC_0 && (cp) <= DUK_ASC_9)
#define DUK__ISHEXDIGIT(cp)      duk__is_hex_digit((cp))
#define DUK__ISOCTDIGIT(cp)      ((cp) >= DUK_ASC_0 && (cp) <= DUK_ASC_7)
#define DUK__ISDIGIT03(cp)       ((cp) >= DUK_ASC_0 && (cp) <= DUK_ASC_3)
#define DUK__ISDIGIT47(cp)       ((cp) >= DUK_ASC_4 && (cp) <= DUK_ASC_7)

/* Lookup window access, window advancing, and temporary buffer helpers. */
#define DUK__LOOKUP(ctx,n)                  ((ctx)->window[(n)].codepoint)
#define DUK__ADVANCECHARS(ctx,n)            duk__advance_chars((ctx), (n))
#define DUK__ADVANCEBYTES(ctx,n)            duk__advance_bytes((ctx), (n))
#define DUK__INITBUFFER(ctx)                duk__initbuffer((ctx))
#define DUK__APPENDBUFFER(ctx,cp)           duk__appendbuffer((ctx), (duk_codepoint_t) (cp))
#define DUK__APPENDBUFFER_ASCII(ctx,cp)     duk__appendbuffer_ascii((ctx), (duk_codepoint_t) (cp))

/* Lookup shorthands; these assume the context variable in scope is named
 * 'lex_ctx'.
 */
#define DUK__L0()  DUK__LOOKUP(lex_ctx, 0)
#define DUK__L1()  DUK__LOOKUP(lex_ctx, 1)
#define DUK__L2()  DUK__LOOKUP(lex_ctx, 2)
#define DUK__L3()  DUK__LOOKUP(lex_ctx, 3)
#define DUK__L4()  DUK__LOOKUP(lex_ctx, 4)
#define DUK__L5()  DUK__LOOKUP(lex_ctx, 5)

/* Packed advance/token number value used by multiple functions: the advance
 * count is scaled by sizeof(duk_lexer_codepoint) and shifted above the low
 * 8 bits, and the token number is added on top.
 */
#define DUK__ADVTOK(adv,tok)  ((((adv) * sizeof(duk_lexer_codepoint)) << 8) + (tok))
|
|
|
|
/*
 *  Advance lookup window by N characters, filling in new characters as
 *  necessary. After returning, the caller is guaranteed a character window
 *  of at least DUK_LEXER_WINDOW_SIZE characters.
 *
 *  The main function duk__advance_bytes() is called at least once per every
 *  token so it has a major lexer/compiler performance impact. There are two
 *  variants for the main duk__advance_bytes() algorithm: a sliding window
 *  approach which is slightly faster at the cost of larger code footprint,
 *  and a simple copying one.
 *
 *  Decoding directly from the source string would be another lexing option.
 *  But the lookup window based approach has the advantage of hiding the
 *  source string and its encoding effectively which gives more flexibility
 *  going forward to e.g. support chunked streaming of source from flash.
 *
 *  Decodes UTF-8/CESU-8 leniently with support for code points from U+0000 to
 *  U+10FFFF, causing an error if the input is unparseable. Leniency means:
 *
 *    * Unicode code point validation is intentionally not performed,
 *      except to check that the codepoint does not exceed 0x10ffff.
 *
 *    * In particular, surrogate pairs are allowed and not combined, which
 *      allows source files to represent all SourceCharacters with CESU-8.
 *      Broken surrogate pairs are allowed, as ECMAScript does not mandate
 *      their validation.
 *
 *    * Allow non-shortest UTF-8 encodings.
 *
 *  Leniency here causes few security concerns because all character data is
 *  decoded into Unicode codepoints before lexer processing, and is then
 *  re-encoded into CESU-8. The source can be parsed as strict UTF-8 with
 *  a compiler option. However, ECMAScript source characters include -all-
 *  16-bit unsigned integer codepoints, so leniency seems to be appropriate.
 *
 *  Note that codepoints above the BMP are not strictly SourceCharacters,
 *  but the lexer still accepts them as such. Before ending up in a string
 *  or an identifier name, codepoints above BMP are converted into surrogate
 *  pairs and then CESU-8 encoded, resulting in 16-bit Unicode data as
 *  expected by ECMAScript.
 *
 *  An alternative approach to dealing with invalid or partial sequences
 *  would be to skip them and replace them with e.g. the Unicode replacement
 *  character U+FFFD. This has limited utility because a replacement character
 *  will most likely cause a parse error, unless it occurs inside a string.
 *  Further, ECMAScript source is typically pure ASCII.
 *
 *  See:
 *
 *     http://en.wikipedia.org/wiki/UTF-8
 *     http://en.wikipedia.org/wiki/CESU-8
 *     http://tools.ietf.org/html/rfc3629
 *     http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
 *
 *  Future work:
 *
 *    * Reject other invalid Unicode sequences (see Wikipedia entry for examples)
 *      in strict UTF-8 mode.
 *
 *    * Size optimize. An attempt to use a 16-byte lookup table for the first
 *      byte resulted in a code increase though.
 *
 *    * Is checking against maximum 0x10ffff really useful? 4-byte encoding
 *      imposes a certain limit anyway.
 *
 *    * Support chunked streaming of source code. Can be implemented either
 *      by streaming chunks of bytes or chunks of codepoints.
 */
|
|
|
|
#if defined(DUK_USE_LEXER_SLIDING_WINDOW)
|
|
/* Decode input bytes into lexer buffer slots (sliding window variant).
 *
 * Slots starting at byte offset 'start_offset_bytes' (relative to
 * lex_ctx->buffer) up to the end of the buffer are filled.  Each slot
 * records the input byte offset and line number in effect before its
 * codepoint was decoded; slots past the end of input get codepoint -1.
 * lex_ctx->input_offset and lex_ctx->input_line are written back when the
 * fill completes, and also just before a SyntaxError is thrown for input
 * that cannot be decoded.
 */
DUK_LOCAL void duk__fill_lexer_buffer(duk_lexer_ctx *lex_ctx, duk_small_uint_t start_offset_bytes) {
	const duk_uint8_t *src;
	const duk_uint8_t *src_end;
	duk_lexer_codepoint *slot;
	duk_lexer_codepoint *slot_end;
	duk_ucodepoint_t ch;
	duk_small_uint_t ncont;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
	duk_ucodepoint_t min_cp;
#endif
	duk_int_t line;

	/* Operate on local copies and write lex_ctx back only on exit. */
	src = lex_ctx->input + lex_ctx->input_offset;
	src_end = lex_ctx->input + lex_ctx->input_length;
	line = lex_ctx->input_line;

	slot = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->buffer + start_offset_bytes);
	slot_end = lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE;

	for (; slot != slot_end; slot++) {
		slot->offset = (duk_size_t) (src - lex_ctx->input);
		slot->line = line;

		/* XXX: potential issue with signed pointers, src_end < src. */
		if (DUK_UNLIKELY(src >= src_end)) {
			/* A negative value assigned to input_offset would show
			 * up as a large positive value, most likely larger than
			 * input_length, and would be caught here.  In any case
			 * no memory unsafe behavior would happen.
			 */
			slot->codepoint = -1;
			continue;
		}

		ch = (duk_ucodepoint_t) (*src++);

		/* Fast path: single byte (ASCII). */
		if (DUK_LIKELY(ch < 0x80UL)) {
			DUK_ASSERT(ch != 0x2028UL && ch != 0x2029UL);  /* not LS/PS */
			if (DUK_UNLIKELY(ch <= 0x000dUL)) {
				/* E5 Section 7.3 newlines: LF, CR [not followed
				 * by LF], LS, PS.  For CR LF the CR is ignored
				 * and the LF bumps the line number.  The lookup
				 * for 0x000a assumes shortest encoding.
				 */
				if (ch == 0x000aUL) {
					line++;
				} else if (ch == 0x000dUL && (src >= src_end || *src != 0x000aUL)) {
					line++;
				}
			}
			slot->codepoint = (duk_codepoint_t) ch;
			continue;
		}

		/* Slow path: multibyte sequence.  A continuation byte
		 * (10xx xxxx) cannot start a sequence and encodings of 5 or
		 * more bytes are not supported.
		 */
		if (ch < 0xc0UL || ch >= 0xf8UL) {
			goto error_encoding;
		}
		if (ch < 0xe0UL) {
			/* 110x xxxx  10xx xxxx */
			ncont = 1;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
			min_cp = 0x80UL;
#endif
			ch &= 0x1fUL;
		} else if (ch < 0xf0UL) {
			/* 1110 xxxx  10xx xxxx  10xx xxxx */
			ncont = 2;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
			min_cp = 0x800UL;
#endif
			ch &= 0x0fUL;
		} else {
			/* 1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx */
			ncont = 3;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
			min_cp = 0x10000UL;
#endif
			ch &= 0x07UL;
		}

		DUK_ASSERT(src_end >= src);
		if ((duk_size_t) ncont > (duk_size_t) (src_end - src)) {
			goto error_clipped;
		}

		for (; ncont > 0; ncont--) {
			duk_small_uint_t y;

			y = *src++;
			if ((y & 0xc0U) != 0x80U) {
				/* continuation byte must have the form 10xx xxxx */
				goto error_encoding;
			}
			ch = (ch << 6) + (y & 0x3fUL);
		}

		/* Validate the decoded codepoint. */
		if (ch > 0x10ffffUL) {
			goto error_encoding;
		}
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
		if (ch < min_cp || (ch >= 0xd800UL && ch <= 0xdfffUL) || ch == 0xfffeUL) {
			goto error_encoding;
		}
#endif

		DUK_ASSERT(ch != 0x000aUL && ch != 0x000dUL);
		if (ch == 0x2028UL || ch == 0x2029UL) {
			line++;
		}

		slot->codepoint = (duk_codepoint_t) ch;
	}

	lex_ctx->input_offset = (duk_size_t) (src - lex_ctx->input);
	lex_ctx->input_line = line;
	return;

 error_clipped:   /* clipped codepoint */
 error_encoding:  /* invalid codepoint encoding or codepoint */
	lex_ctx->input_offset = (duk_size_t) (src - lex_ctx->input);
	lex_ctx->input_line = line;

	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_SOURCE_DECODE_FAILED);
	DUK_WO_NORETURN(return;);
}
|
|
|
|
/* Sliding window variant: move the window forward by 'count_bytes' bytes
 * (a multiple of sizeof(duk_lexer_codepoint); zero is allowed to make call
 * sites easier).  When fewer than DUK_LEXER_WINDOW_SIZE decoded entries
 * remain after the new window start, the remaining entries are moved to the
 * start of the buffer and the rest of the buffer is refilled.
 */
DUK_LOCAL void duk__advance_bytes(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_bytes) {
	duk_uint8_t *base;
	duk_uint8_t *win;
	duk_small_uint_t consumed;
	duk_small_uint_t remaining;

	DUK_ASSERT_DISABLE(count_bytes >= 0);  /* unsigned */
	DUK_ASSERT(count_bytes <= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint)));
	DUK_ASSERT(lex_ctx->window >= lex_ctx->buffer);
	DUK_ASSERT(lex_ctx->window < lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE);
	DUK_ASSERT((duk_uint8_t *) lex_ctx->window + count_bytes <= (duk_uint8_t *) lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE * sizeof(duk_lexer_codepoint));

	/* Arithmetic in bytes avoids a multiply and generates better code
	 * in GCC.
	 */
	base = (duk_uint8_t *) lex_ctx->buffer;
	win = (duk_uint8_t *) lex_ctx->window + count_bytes;
	lex_ctx->window = (duk_lexer_codepoint *) (void *) win;

	consumed = (duk_small_uint_t) (win - base);
	remaining = DUK_LEXER_BUFFER_SIZE * sizeof(duk_lexer_codepoint) - consumed;
	if (remaining >= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint))) {
		/* A full window is still available, nothing more to do. */
		return;
	}

	/* Not enough data for a full window: "scroll" the window to the
	 * start of the buffer and fill up the rest.
	 */
	duk_memmove((void *) lex_ctx->buffer,
	            (const void *) lex_ctx->window,
	            (size_t) remaining);
	lex_ctx->window = lex_ctx->buffer;
	duk__fill_lexer_buffer(lex_ctx, remaining);
}
|
|
|
|
/* Sliding window variant: place the window at the start of the buffer and
 * decode the whole buffer starting from the current input position.
 */
DUK_LOCAL void duk__init_lexer_window(duk_lexer_ctx *lex_ctx) {
	lex_ctx->window = lex_ctx->buffer;
	duk__fill_lexer_buffer(lex_ctx, 0);
}
|
|
#else /* DUK_USE_LEXER_SLIDING_WINDOW */
|
|
/* Decode one codepoint at lex_ctx->input_offset (copying window variant).
 *
 * Returns the decoded codepoint, or -1 when the input offset is at or past
 * the end of input.  On success lex_ctx->input_offset is moved past the
 * decoded bytes and lex_ctx->input_line is bumped for LF, CR not followed
 * by LF, LS, and PS.  Input that cannot be decoded causes a SyntaxError; in
 * that case the input offset and line are left untouched.
 */
DUK_LOCAL duk_codepoint_t duk__read_char(duk_lexer_ctx *lex_ctx) {
	const duk_uint8_t *src;
	const duk_uint8_t *seq_end;
	duk_ucodepoint_t ch;
	duk_small_uint_t seq_len;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
	duk_ucodepoint_t min_cp;
#endif
	duk_size_t start_offset;

	start_offset = lex_ctx->input_offset;
	if (DUK_UNLIKELY(start_offset >= lex_ctx->input_length)) {
		/* A negative value assigned to input_offset would show up as
		 * a large positive value, most likely larger than
		 * input_length, and would be caught here.  In any case no
		 * memory unsafe behavior would happen.
		 */
		return -1;
	}

	src = lex_ctx->input + start_offset;
	ch = (duk_ucodepoint_t) (*src);

	/* Fast path: 0xxx xxxx, single byte. */
	if (DUK_LIKELY(ch < 0x80UL)) {
		duk_size_t next_offset;

		next_offset = start_offset + 1;
		lex_ctx->input_offset = next_offset;

		DUK_ASSERT(ch != 0x2028UL && ch != 0x2029UL);  /* not LS/PS */
		if (DUK_UNLIKELY(ch <= 0x000dUL)) {
			/* E5 Section 7.3 newlines: LF, CR [not followed by LF],
			 * LS, PS.  For CR LF the CR is ignored and the LF bumps
			 * the line number.  The lookup for 0x000a assumes
			 * shortest encoding.
			 */
			if (ch == 0x000aUL) {
				lex_ctx->input_line++;
			} else if (ch == 0x000dUL &&
			           (next_offset >= lex_ctx->input_length ||
			            lex_ctx->input[next_offset] != 0x000aUL)) {
				lex_ctx->input_line++;
			}
		}

		return (duk_codepoint_t) ch;
	}

	/* Slow path: multibyte sequence.  A continuation byte (10xx xxxx)
	 * cannot start a sequence and encodings of 5 or more bytes are not
	 * supported.
	 */
	if (ch < 0xc0UL || ch >= 0xf8UL) {
		goto error_encoding;
	}
	if (ch < 0xe0UL) {
		/* 110x xxxx  10xx xxxx */
		seq_len = 2;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
		min_cp = 0x80UL;
#endif
		ch &= 0x1fUL;
	} else if (ch < 0xf0UL) {
		/* 1110 xxxx  10xx xxxx  10xx xxxx */
		seq_len = 3;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
		min_cp = 0x800UL;
#endif
		ch &= 0x0fUL;
	} else {
		/* 1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx */
		seq_len = 4;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
		min_cp = 0x10000UL;
#endif
		ch &= 0x07UL;
	}

	DUK_ASSERT(lex_ctx->input_length >= start_offset);
	if ((duk_size_t) seq_len > (duk_size_t) (lex_ctx->input_length - start_offset)) {
		goto error_clipped;
	}

	/* Fold in the continuation bytes following the initial byte. */
	seq_end = src + seq_len;
	for (src++; src != seq_end;) {
		duk_small_uint_t y;

		y = *src++;
		if ((y & 0xc0U) != 0x80U) {
			/* continuation byte must have the form 10xx xxxx */
			goto error_encoding;
		}
		ch = (ch << 6) + (y & 0x3fUL);
	}

	/* Validate the decoded codepoint. */
	if (ch > 0x10ffffUL) {
		goto error_encoding;
	}
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
	if (ch < min_cp || (ch >= 0xd800UL && ch <= 0xdfffUL) || ch == 0xfffeUL) {
		goto error_encoding;
	}
#endif

	/* Input offset tracking. */
	lex_ctx->input_offset = start_offset + seq_len;

	/* Line tracking. */
	DUK_ASSERT(ch != 0x000aUL && ch != 0x000dUL);
	if (ch == 0x2028UL || ch == 0x2029UL) {
		lex_ctx->input_line++;
	}

	return (duk_codepoint_t) ch;

 error_clipped:   /* clipped codepoint */
 error_encoding:  /* invalid codepoint encoding or codepoint */
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_SOURCE_DECODE_FAILED);
	DUK_WO_NORETURN(return 0;);
}
|
|
|
|
/* Copying variant: drop 'count_bytes' bytes worth of entries from the front
 * of the window (a multiple of sizeof(duk_lexer_codepoint); zero is allowed
 * to make call sites easier), move the surviving entries to the front, and
 * decode new characters into the freed entries at the end.
 */
DUK_LOCAL void duk__advance_bytes(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_bytes) {
	duk_small_uint_t kept;
	duk_lexer_codepoint *slot;
	duk_lexer_codepoint *slot_end;

	DUK_ASSERT_DISABLE(count_bytes >= 0);  /* unsigned */
	DUK_ASSERT(count_bytes <= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint)));

	kept = DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint) - count_bytes;
	duk_memmove((void *) lex_ctx->window,
	            (const void *) ((duk_uint8_t *) lex_ctx->window + count_bytes),
	            (size_t) kept);

	slot = (duk_lexer_codepoint *) ((duk_uint8_t *) lex_ctx->window + kept);
	slot_end = lex_ctx->window + DUK_LEXER_WINDOW_SIZE;
	while (slot != slot_end) {
		/* Offset and line must be captured before decoding because
		 * duk__read_char() updates both in lex_ctx.
		 */
		slot->offset = lex_ctx->input_offset;
		slot->line = lex_ctx->input_line;
		slot->codepoint = duk__read_char(lex_ctx);
		slot++;
	}
}
|
|
|
|
/* Copying variant: fill the window initially by advancing over a full
 * window's worth of bytes, which causes every entry to be decoded anew.
 */
DUK_LOCAL void duk__init_lexer_window(duk_lexer_ctx *lex_ctx) {
	duk__advance_bytes(lex_ctx, DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint));
}
|
|
#endif /* DUK_USE_LEXER_SLIDING_WINDOW */
|
|
|
|
/* Advance the lookup window by 'count_chars' window entries; the count is
 * converted into a byte count for duk__advance_bytes().
 */
DUK_LOCAL void duk__advance_chars(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_chars) {
	duk_small_uint_t count_bytes;

	count_bytes = count_chars * sizeof(duk_lexer_codepoint);
	duk__advance_bytes(lex_ctx, count_bytes);
}
|
|
|
|
/*
|
|
* (Re)initialize the temporary byte buffer. May be called extra times
|
|
* with little impact.
|
|
*/
|
|
|
|
DUK_LOCAL void duk__initbuffer(duk_lexer_ctx *lex_ctx) {
|
|
/* Reuse buffer as is unless buffer has grown large. */
|
|
if (DUK_HBUFFER_DYNAMIC_GET_SIZE(lex_ctx->buf) < DUK_LEXER_TEMP_BUF_LIMIT) {
|
|
/* Keep current size */
|
|
} else {
|
|
duk_hbuffer_resize(lex_ctx->thr, lex_ctx->buf, DUK_LEXER_TEMP_BUF_LIMIT);
|
|
}
|
|
|
|
DUK_BW_INIT_WITHBUF(lex_ctx->thr, &lex_ctx->bw, lex_ctx->buf);
|
|
}
|
|
|
|
/*
|
|
* Append a Unicode codepoint to the temporary byte buffer. Performs
|
|
* CESU-8 surrogate pair encoding for codepoints above the BMP.
|
|
* Existing surrogate pairs are allowed and also encoded into CESU-8.
|
|
*/
|
|
|
|
DUK_LOCAL void duk__appendbuffer(duk_lexer_ctx *lex_ctx, duk_codepoint_t x) {
|
|
/*
|
|
* Since character data is only generated by decoding the source or by
|
|
* the compiler itself, we rely on the input codepoints being correct
|
|
* and avoid a check here.
|
|
*
|
|
* Character data can also come here through decoding of Unicode
|
|
* escapes ("\udead\ubeef") so all 16-but unsigned values can be
|
|
* present, even when the source file itself is strict UTF-8.
|
|
*/
|
|
DUK_ASSERT(x >= 0 && x <= 0x10ffffL);
|
|
|
|
DUK_BW_WRITE_ENSURE_CESU8(lex_ctx->thr, &lex_ctx->bw, (duk_ucodepoint_t) x);
|
|
}
|
|
|
|
/* Append an ASCII character (0x00-0x7f) to the temporary byte buffer.  Such
 * characters are written as a single byte with no encoding step, which
 * matters for some fast paths.
 */
DUK_LOCAL void duk__appendbuffer_ascii(duk_lexer_ctx *lex_ctx, duk_codepoint_t x) {
	DUK_ASSERT(x >= 0 && x <= 0x7f);

	DUK_BW_WRITE_ENSURE_U8(lex_ctx->thr, &lex_ctx->bw, (duk_uint8_t) x);
}
|
|
|
|
/*
|
|
* Intern the temporary byte buffer into a valstack slot
|
|
* (in practice, slot1 or slot2).
|
|
*/
|
|
|
|
DUK_LOCAL duk_hstring *duk__internbuffer(duk_lexer_ctx *lex_ctx, duk_idx_t valstack_idx) {
|
|
DUK_ASSERT(valstack_idx == lex_ctx->slot1_idx || valstack_idx == lex_ctx->slot2_idx);
|
|
|
|
DUK_BW_PUSH_AS_STRING(lex_ctx->thr, &lex_ctx->bw);
|
|
duk_replace(lex_ctx->thr, valstack_idx);
|
|
return duk_known_hstring(lex_ctx->thr, valstack_idx);
|
|
}
|
|
|
|
/*
|
|
* Init lexer context
|
|
*/
|
|
|
|
DUK_INTERNAL void duk_lexer_initctx(duk_lexer_ctx *lex_ctx) {
|
|
DUK_ASSERT(lex_ctx != NULL);
|
|
|
|
duk_memzero(lex_ctx, sizeof(*lex_ctx));
|
|
#if defined(DUK_USE_EXPLICIT_NULL_INIT)
|
|
#if defined(DUK_USE_LEXER_SLIDING_WINDOW)
|
|
lex_ctx->window = NULL;
|
|
#endif
|
|
lex_ctx->thr = NULL;
|
|
lex_ctx->input = NULL;
|
|
lex_ctx->buf = NULL;
|
|
#endif
|
|
}
|
|
|
|
/*
 *  Get/set lexer input position.  Setting the position also reinitializes
 *  the lookup window.
 */
|
|
|
|
/* Store the input byte offset and line number of the first lookup window
 * entry into 'pt'.
 */
DUK_INTERNAL void duk_lexer_getpoint(duk_lexer_ctx *lex_ctx, duk_lexer_point *pt) {
	const duk_lexer_codepoint *head;

	head = lex_ctx->window;
	pt->line = head->line;
	pt->offset = head->offset;
}
|
|
|
|
/* Set the input byte offset and line number from 'pt' and reinitialize the
 * lookup window so that decoding restarts from that position.
 */
DUK_INTERNAL void duk_lexer_setpoint(duk_lexer_ctx *lex_ctx, duk_lexer_point *pt) {
	DUK_ASSERT_DISABLE(pt->offset >= 0);  /* unsigned */
	DUK_ASSERT(pt->line >= 1);

	lex_ctx->input_line = pt->line;
	lex_ctx->input_offset = pt->offset;
	duk__init_lexer_window(lex_ctx);
}
|
|
|
|
/*
 *  Lexing helpers
 */
|
|
|
|
/* Numeric value of a hex digit (also covers octal and decimal digits) or
|
|
* -1 if not a valid hex digit.
|
|
*/
|
|
DUK_LOCAL duk_codepoint_t duk__hexval_validate(duk_codepoint_t x) {
|
|
duk_small_int_t t;
|
|
|
|
/* Here 'x' is a Unicode codepoint */
|
|
if (DUK_LIKELY(x >= 0 && x <= 0xff)) {
|
|
t = duk_hex_dectab[x];
|
|
if (DUK_LIKELY(t >= 0)) {
|
|
return t;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Just a wrapper for call sites where 'x' is known to be valid so
|
|
* we assert for it before decoding.
|
|
*/
|
|
DUK_LOCAL duk_codepoint_t duk__hexval(duk_codepoint_t x) {
|
|
duk_codepoint_t ret;
|
|
|
|
DUK_ASSERT((x >= DUK_ASC_0 && x <= DUK_ASC_9) ||
|
|
(x >= DUK_ASC_LC_A && x <= DUK_ASC_LC_F) ||
|
|
(x >= DUK_ASC_UC_A && x <= DUK_ASC_UC_F));
|
|
ret = duk__hexval_validate(x);
|
|
DUK_ASSERT(ret >= 0 && ret <= 15);
|
|
return ret;
|
|
}
|
|
|
|
/* having this as a separate function provided a size benefit */
|
|
DUK_LOCAL duk_bool_t duk__is_hex_digit(duk_codepoint_t x) {
|
|
if (DUK_LIKELY(x >= 0 && x <= 0xff)) {
|
|
return (duk_hex_dectab[x] >= 0);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Parse a Unicode escape of the form \xHH, \uHHHH, or \u{H+}. Shared by
|
|
* source and RegExp parsing.
|
|
*/
|
|
DUK_LOCAL duk_codepoint_t duk__lexer_parse_escape(duk_lexer_ctx *lex_ctx, duk_bool_t allow_es6) {
|
|
duk_small_int_t digits; /* Initial value 2 or 4 for fixed length escapes, 0 for ES2015 \u{H+}. */
|
|
duk_codepoint_t escval;
|
|
duk_codepoint_t x;
|
|
duk_small_uint_t adv;
|
|
|
|
DUK_ASSERT(DUK__L0() == DUK_ASC_BACKSLASH); /* caller responsibilities */
|
|
DUK_ASSERT(DUK__L1() == DUK_ASC_LC_X || DUK__L1() == DUK_ASC_LC_U);
|
|
DUK_UNREF(allow_es6);
|
|
|
|
adv = 2;
|
|
digits = 2;
|
|
if (DUK__L1() == DUK_ASC_LC_U) {
|
|
digits = 4;
|
|
#if defined(DUK_USE_ES6_UNICODE_ESCAPE)
|
|
if (DUK__L2() == DUK_ASC_LCURLY && allow_es6) {
|
|
digits = 0;
|
|
adv = 3;
|
|
}
|
|
#endif
|
|
}
|
|
DUK__ADVANCECHARS(lex_ctx, adv);
|
|
|
|
escval = 0;
|
|
for (;;) {
|
|
/* One of the escape forms: \xHH, \uHHHH, \u{H+}.
|
|
* The 'digits' variable tracks parsing state and is
|
|
* initialized to:
|
|
*
|
|
* \xHH 2
|
|
* \uHH 4
|
|
* \u{H+} 0 first time, updated to -1 to indicate
|
|
* at least one digit has been parsed
|
|
*
|
|
* Octal parsing is handled separately because it can be
|
|
* done with fixed lookahead and also has validation
|
|
* rules which depend on the escape length (which is
|
|
* variable).
|
|
*
|
|
* We don't need a specific check for x < 0 (end of
|
|
* input) or duk_unicode_is_line_terminator(x)
|
|
* because the 'dig' decode will fail and lead to a
|
|
* SyntaxError.
|
|
*/
|
|
duk_codepoint_t dig;
|
|
|
|
x = DUK__L0();
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
dig = duk__hexval_validate(x);
|
|
if (digits > 0) {
|
|
digits--;
|
|
if (dig < 0) {
|
|
goto fail_escape;
|
|
}
|
|
DUK_ASSERT(dig >= 0x00 && dig <= 0x0f);
|
|
escval = (escval << 4) + dig;
|
|
if (digits == 0) {
|
|
DUK_ASSERT(escval >= 0 && escval <= 0xffffL);
|
|
break;
|
|
}
|
|
} else {
|
|
#if defined(DUK_USE_ES6_UNICODE_ESCAPE)
|
|
DUK_ASSERT(digits == 0 /* first time */ || digits == -1 /* others */);
|
|
if (dig >= 0) {
|
|
DUK_ASSERT(dig >= 0x00 && dig <= 0x0f);
|
|
escval = (escval << 4) + dig;
|
|
if (escval > 0x10ffffL) {
|
|
goto fail_escape;
|
|
}
|
|
} else if (x == DUK_ASC_RCURLY) {
|
|
if (digits == 0) {
|
|
/* Empty escape, \u{}. */
|
|
goto fail_escape;
|
|
}
|
|
DUK_ASSERT(escval >= 0 && escval <= 0x10ffffL);
|
|
break;
|
|
} else {
|
|
goto fail_escape;
|
|
}
|
|
digits = -1; /* Indicate we have at least one digit. */
|
|
#else /* DUK_USE_ES6_UNICODE_ESCAPE */
|
|
DUK_ASSERT(0); /* Never happens if \u{H+} support disabled. */
|
|
#endif /* DUK_USE_ES6_UNICODE_ESCAPE */
|
|
}
|
|
}
|
|
|
|
return escval;
|
|
|
|
fail_escape:
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_ESCAPE);
|
|
DUK_WO_NORETURN(return 0;);
|
|
}
|
|
|
|
/* Parse legacy octal escape of the form \N{1,3}, e.g. \0, \5, \0377. Maximum
|
|
* allowed value is \0377 (U+00FF), longest match is used. Used for both string
|
|
* RegExp octal escape parsing. Window[0] must be the slash '\' and the first
|
|
* digit must already be validated to be in [0-9] by the caller.
|
|
*/
|
|
DUK_LOCAL duk_codepoint_t duk__lexer_parse_legacy_octal(duk_lexer_ctx *lex_ctx, duk_small_uint_t *out_adv, duk_bool_t reject_annex_b) {
|
|
duk_codepoint_t cp;
|
|
duk_small_uint_t lookup_idx;
|
|
duk_small_uint_t adv;
|
|
duk_codepoint_t tmp;
|
|
|
|
DUK_ASSERT(out_adv != NULL);
|
|
DUK_ASSERT(DUK__LOOKUP(lex_ctx, 0) == DUK_ASC_BACKSLASH);
|
|
DUK_ASSERT(DUK__LOOKUP(lex_ctx, 1) >= DUK_ASC_0 && DUK__LOOKUP(lex_ctx, 1) <= DUK_ASC_9);
|
|
|
|
cp = 0;
|
|
tmp = 0;
|
|
for (lookup_idx = 1; lookup_idx <= 3; lookup_idx++) {
|
|
DUK_DDD(DUK_DDDPRINT("lookup_idx=%ld, cp=%ld", (long) lookup_idx, (long) cp));
|
|
tmp = DUK__LOOKUP(lex_ctx, lookup_idx);
|
|
if (tmp < DUK_ASC_0 || tmp > DUK_ASC_7) {
|
|
/* No more valid digits. */
|
|
break;
|
|
}
|
|
tmp = (cp << 3) + (tmp - DUK_ASC_0);
|
|
if (tmp > 0xff) {
|
|
/* Three digit octal escapes above \377 (= 0xff)
|
|
* are not allowed.
|
|
*/
|
|
break;
|
|
}
|
|
cp = tmp;
|
|
}
|
|
DUK_DDD(DUK_DDDPRINT("final lookup_idx=%ld, cp=%ld", (long) lookup_idx, (long) cp));
|
|
|
|
adv = lookup_idx;
|
|
if (lookup_idx == 1) {
|
|
DUK_DDD(DUK_DDDPRINT("\\8 or \\9 -> treat as literal, accept in strict mode too"));
|
|
DUK_ASSERT(tmp == DUK_ASC_8 || tmp == DUK_ASC_9);
|
|
cp = tmp;
|
|
adv++; /* correction to above, eat offending character */
|
|
} else if (lookup_idx == 2 && cp == 0) {
|
|
/* Note: 'foo\0bar' is OK in strict mode, but 'foo\00bar' is not.
|
|
* It won't be interpreted as 'foo\u{0}0bar' but as a SyntaxError.
|
|
*/
|
|
DUK_DDD(DUK_DDDPRINT("\\0 -> accept in strict mode too"));
|
|
} else {
|
|
/* This clause also handles non-shortest zero, e.g. \00. */
|
|
if (reject_annex_b) {
|
|
DUK_DDD(DUK_DDDPRINT("non-zero octal literal %ld -> reject in strict-mode", (long) cp));
|
|
cp = -1;
|
|
} else {
|
|
DUK_DDD(DUK_DDDPRINT("non-zero octal literal %ld -> accepted", (long) cp));
|
|
DUK_ASSERT(cp >= 0 && cp <= 0xff);
|
|
}
|
|
}
|
|
|
|
*out_adv = adv;
|
|
|
|
DUK_ASSERT((cp >= 0 && cp <= 0xff) || (cp == -1 && reject_annex_b));
|
|
return cp;
|
|
}
|
|
|
|
/* XXX: move strict mode to lex_ctx? */
|
|
DUK_LOCAL void duk__lexer_parse_string_literal(duk_lexer_ctx *lex_ctx, duk_token *out_token, duk_small_int_t quote, duk_bool_t strict_mode) {
|
|
duk_small_uint_t adv;
|
|
|
|
for (adv = 1 /* initial quote */ ;;) {
|
|
duk_codepoint_t x;
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, adv); /* eat opening quote on first loop */
|
|
x = DUK__L0();
|
|
|
|
adv = 1;
|
|
if (x == quote) {
|
|
DUK__ADVANCECHARS(lex_ctx, 1); /* eat closing quote */
|
|
break;
|
|
} else if (x == '\\') {
|
|
/* DUK__L0 -> '\' char
|
|
* DUK__L1 ... DUK__L5 -> more lookup
|
|
*/
|
|
duk_small_int_t emitcp = -1;
|
|
|
|
x = DUK__L1();
|
|
|
|
/* How much to advance before next loop. */
|
|
adv = 2; /* note: long live range */
|
|
|
|
switch (x) {
|
|
case '\'':
|
|
emitcp = 0x0027;
|
|
break;
|
|
case '"':
|
|
emitcp = 0x0022;
|
|
break;
|
|
case '\\':
|
|
emitcp = 0x005c;
|
|
break;
|
|
case 'b':
|
|
emitcp = 0x0008;
|
|
break;
|
|
case 'f':
|
|
emitcp = 0x000c;
|
|
break;
|
|
case 'n':
|
|
emitcp = 0x000a;
|
|
break;
|
|
case 'r':
|
|
emitcp = 0x000d;
|
|
break;
|
|
case 't':
|
|
emitcp = 0x0009;
|
|
break;
|
|
case 'v':
|
|
emitcp = 0x000b;
|
|
break;
|
|
case 'x':
|
|
case 'u': {
|
|
duk_codepoint_t esc_cp;
|
|
esc_cp = duk__lexer_parse_escape(lex_ctx, 1 /*allow_es6*/);
|
|
DUK__APPENDBUFFER(lex_ctx, esc_cp);
|
|
adv = 0;
|
|
break;
|
|
}
|
|
default: {
|
|
if (duk_unicode_is_line_terminator(x)) {
|
|
/* line continuation */
|
|
if (x == 0x000d && DUK__L2() == 0x000a) {
|
|
/* CR LF again a special case */
|
|
adv = 3; /* line terminator, CR, LF */
|
|
}
|
|
} else if (DUK__ISDIGIT(x)) {
|
|
/*
|
|
* Octal escape or zero escape:
|
|
* \0 (lookahead not OctalDigit)
|
|
* \1 ... \7 (lookahead not OctalDigit)
|
|
* \ZeroToThree OctalDigit (lookahead not OctalDigit)
|
|
* \FourToSeven OctalDigit (no lookahead restrictions)
|
|
* \ZeroToThree OctalDigit OctalDigit (no lookahead restrictions)
|
|
*
|
|
* Zero escape is part of the standard syntax. Octal escapes are
|
|
* defined in E5 Section B.1.2, and are only allowed in non-strict mode.
|
|
* Any other productions starting with a decimal digit are invalid
|
|
* but are in practice treated like identity escapes.
|
|
*
|
|
* Parse octal (up to 3 digits) from the lookup window.
|
|
*/
|
|
|
|
emitcp = duk__lexer_parse_legacy_octal(lex_ctx, &adv, strict_mode /*reject_annex_b*/);
|
|
if (emitcp < 0) {
|
|
goto fail_escape;
|
|
}
|
|
} else if (x < 0) {
|
|
goto fail_unterminated;
|
|
} else {
|
|
/* escaped NonEscapeCharacter */
|
|
DUK__APPENDBUFFER(lex_ctx, x);
|
|
}
|
|
} /* end default clause */
|
|
} /* end switch */
|
|
|
|
/* Shared handling for single codepoint escapes. */
|
|
if (emitcp >= 0) {
|
|
DUK__APPENDBUFFER(lex_ctx, emitcp);
|
|
}
|
|
|
|
/* Track number of escapes; count not really needed but directive
|
|
* prologues need to detect whether there were any escapes or line
|
|
* continuations or not.
|
|
*/
|
|
out_token->num_escapes++;
|
|
} else if (x >= 0x20 && x <= 0x7f) {
|
|
/* Fast path for ASCII case, avoids line terminator
|
|
* check and CESU-8 encoding.
|
|
*/
|
|
DUK_ASSERT(x >= 0);
|
|
DUK_ASSERT(!duk_unicode_is_line_terminator(x));
|
|
DUK_ASSERT(x != quote);
|
|
DUK_ASSERT(x != DUK_ASC_BACKSLASH);
|
|
DUK__APPENDBUFFER_ASCII(lex_ctx, x);
|
|
} else if (x < 0 || duk_unicode_is_line_terminator(x)) {
|
|
goto fail_unterminated;
|
|
} else {
|
|
/* Character which is part of the string but wasn't handled
|
|
* by the fast path.
|
|
*/
|
|
DUK__APPENDBUFFER(lex_ctx, x);
|
|
}
|
|
} /* string parse loop */
|
|
|
|
return;
|
|
|
|
fail_escape:
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_ESCAPE);
|
|
DUK_WO_NORETURN(return;);
|
|
|
|
fail_unterminated:
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_UNTERMINATED_STRING);
|
|
DUK_WO_NORETURN(return;);
|
|
}
|
|
|
|
/* Skip to end-of-line (or end-of-file), used for single line comments. */
|
|
DUK_LOCAL void duk__lexer_skip_to_endofline(duk_lexer_ctx *lex_ctx) {
|
|
for (;;) {
|
|
duk_codepoint_t x;
|
|
|
|
x = DUK__L0();
|
|
if (x < 0 || duk_unicode_is_line_terminator(x)) {
|
|
break;
|
|
}
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Parse ECMAScript source InputElementDiv or InputElementRegExp
|
|
* (E5 Section 7), skipping whitespace, comments, and line terminators.
|
|
*
|
|
* Possible results are:
|
|
* (1) a token
|
|
* (2) a line terminator (skipped)
|
|
* (3) a comment (skipped)
|
|
* (4) EOF
|
|
*
|
|
* White space is automatically skipped from the current position (but
|
|
* not after the input element). If input has already ended, returns
|
|
* DUK_TOK_EOF indefinitely. If a parse error occurs, uses a DUK_ERROR()
|
|
* macro call (and hence a longjmp through current heap longjmp context).
|
|
* Comments and line terminator tokens are automatically skipped.
|
|
*
|
|
* The input element being matched is determined by regexp_mode; if set,
|
|
* parses an InputElementRegExp, otherwise an InputElementDiv. The
|
|
* difference between these is the handling of productions starting with a
|
|
* forward slash.
|
|
*
|
|
* If strict_mode is set, recognizes additional future reserved words
|
|
* specific to strict mode, and refuses to parse octal literals.
|
|
*
|
|
* The matching strategy below is to (currently) use a six character
|
|
* lookup window to quickly determine which production is the -longest-
|
|
* matching one, and then parse that. The top-level if-else clauses
|
|
* match the first character, and the code blocks for each clause
|
|
* handle -all- alternatives for that first character. ECMAScript
|
|
* specification uses the "longest match wins" semantics, so the order
|
|
* of the if-clauses matters.
|
|
*
|
|
* Misc notes:
|
|
*
|
|
* * ECMAScript numeric literals do not accept a sign character.
|
|
* Consequently e.g. "-1.0" is parsed as two tokens: a negative
|
|
* sign and a positive numeric literal. The compiler performs
|
|
* the negation during compilation, so this has no adverse impact.
|
|
*
|
|
* * There is no token for "undefined": it is just a value available
|
|
* from the global object (or simply established by doing a reference
|
|
* to an undefined value).
|
|
*
|
|
* * Some contexts want Identifier tokens, which are IdentifierNames
|
|
* excluding reserved words, while some contexts want IdentifierNames
|
|
* directly. In the latter case e.g. "while" is interpreted as an
|
|
* identifier name, not a DUK_TOK_WHILE token. The solution here is
|
|
* to provide both token types: DUK_TOK_WHILE goes to 't' while
|
|
* DUK_TOK_IDENTIFIER goes to 't_nores', and 'slot1' always contains
|
|
* the identifier / keyword name.
|
|
*
|
|
* * Directive prologue needs to identify string literals such as
|
|
* "use strict" and 'use strict', which are sensitive to line
|
|
* continuations and escape sequences. For instance, "use\u0020strict"
|
|
* is a valid directive but is distinct from "use strict". The solution
|
|
* here is to decode escapes while tokenizing, but to keep track of the
|
|
* number of escapes. Directive detection can then check that the
|
|
* number of escapes is zero.
|
|
*
|
|
* * Multi-line comments with one or more internal LineTerminator are
|
|
* treated like a line terminator to comply with automatic semicolon
|
|
* insertion.
|
|
*/
|
|
|
|
DUK_INTERNAL
|
|
void duk_lexer_parse_js_input_element(duk_lexer_ctx *lex_ctx,
|
|
duk_token *out_token,
|
|
duk_bool_t strict_mode,
|
|
duk_bool_t regexp_mode) {
|
|
duk_codepoint_t x; /* temporary, must be signed and 32-bit to hold Unicode code points */
|
|
duk_small_uint_t advtok = 0; /* (advance << 8) + token_type, updated at function end,
|
|
* init is unnecessary but suppresses "may be used uninitialized" warnings.
|
|
*/
|
|
duk_bool_t got_lineterm = 0; /* got lineterm preceding non-whitespace, non-lineterm token */
|
|
|
|
if (++lex_ctx->token_count >= lex_ctx->token_limit) {
|
|
goto fail_token_limit;
|
|
}
|
|
|
|
out_token->t = DUK_TOK_EOF;
|
|
out_token->t_nores = DUK_TOK_INVALID; /* marker: copy t if not changed */
|
|
#if 0 /* not necessary to init, disabled for faster parsing */
|
|
out_token->num = DUK_DOUBLE_NAN;
|
|
out_token->str1 = NULL;
|
|
out_token->str2 = NULL;
|
|
#endif
|
|
out_token->num_escapes = 0;
|
|
/* out_token->lineterm set by caller */
|
|
|
|
/* This would be nice, but parsing is faster without resetting the
|
|
* value slots. The only side effect is that references to temporary
|
|
* string values may linger until lexing is finished; they're then
|
|
* freed normally.
|
|
*/
|
|
#if 0
|
|
duk_to_undefined(lex_ctx->thr, lex_ctx->slot1_idx);
|
|
duk_to_undefined(lex_ctx->thr, lex_ctx->slot2_idx);
|
|
#endif
|
|
|
|
/* 'advtok' indicates how much to advance and which token id to assign
|
|
* at the end. This shared functionality minimizes code size. All
|
|
* code paths are required to set 'advtok' to some value, so no default
|
|
* init value is used. Code paths calling DUK_ERROR() never return so
|
|
* they don't need to set advtok.
|
|
*/
|
|
|
|
/*
|
|
* Matching order:
|
|
*
|
|
* Punctuator first chars, also covers comments, regexps
|
|
* LineTerminator
|
|
* Identifier or reserved word, also covers null/true/false literals
|
|
* NumericLiteral
|
|
* StringLiteral
|
|
* EOF
|
|
*
|
|
* The order does not matter as long as the longest match is
|
|
* always correctly identified. There are order dependencies
|
|
* in the clauses, so it's not trivial to convert to a switch.
|
|
*/
|
|
|
|
restart_lineupdate:
|
|
out_token->start_line = lex_ctx->window[0].line;
|
|
|
|
restart:
|
|
out_token->start_offset = lex_ctx->window[0].offset;
|
|
|
|
x = DUK__L0();
|
|
|
|
switch (x) {
|
|
case DUK_ASC_SPACE:
|
|
case DUK_ASC_HT: /* fast paths for space and tab */
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
goto restart;
|
|
case DUK_ASC_LF: /* LF line terminator; CR LF and Unicode lineterms are handled in slow path */
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
got_lineterm = 1;
|
|
goto restart_lineupdate;
|
|
#if defined(DUK_USE_SHEBANG_COMMENTS)
|
|
case DUK_ASC_HASH: /* '#' */
|
|
if (DUK__L1() == DUK_ASC_EXCLAMATION && lex_ctx->window[0].offset == 0 &&
|
|
(lex_ctx->flags & DUK_COMPILE_SHEBANG)) {
|
|
/* "Shebang" comment ('#! ...') on first line. */
|
|
/* DUK__ADVANCECHARS(lex_ctx, 2) would be correct here, but not necessary */
|
|
duk__lexer_skip_to_endofline(lex_ctx);
|
|
goto restart; /* line terminator will be handled on next round */
|
|
}
|
|
goto fail_token;
|
|
#endif /* DUK_USE_SHEBANG_COMMENTS */
|
|
case DUK_ASC_SLASH: /* '/' */
|
|
if (DUK__L1() == DUK_ASC_SLASH) {
|
|
/*
|
|
* E5 Section 7.4, allow SourceCharacter (which is any 16-bit
|
|
* code point).
|
|
*/
|
|
|
|
/* DUK__ADVANCECHARS(lex_ctx, 2) would be correct here, but not necessary */
|
|
duk__lexer_skip_to_endofline(lex_ctx);
|
|
goto restart; /* line terminator will be handled on next round */
|
|
} else if (DUK__L1() == DUK_ASC_STAR) {
|
|
/*
|
|
* E5 Section 7.4. If the multi-line comment contains a newline,
|
|
* it is treated like a single line terminator for automatic
|
|
* semicolon insertion.
|
|
*/
|
|
|
|
duk_bool_t last_asterisk = 0;
|
|
DUK__ADVANCECHARS(lex_ctx, 2);
|
|
for (;;) {
|
|
x = DUK__L0();
|
|
if (x < 0) {
|
|
goto fail_unterm_comment;
|
|
}
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
if (last_asterisk && x == DUK_ASC_SLASH) {
|
|
break;
|
|
}
|
|
if (duk_unicode_is_line_terminator(x)) {
|
|
got_lineterm = 1;
|
|
}
|
|
last_asterisk = (x == DUK_ASC_STAR);
|
|
}
|
|
goto restart_lineupdate;
|
|
} else if (regexp_mode) {
|
|
#if defined(DUK_USE_REGEXP_SUPPORT)
|
|
/*
|
|
* "/" followed by something in regexp mode. See E5 Section 7.8.5.
|
|
*
|
|
* RegExp parsing is a bit complex. First, the regexp body is delimited
|
|
* by forward slashes, but the body may also contain forward slashes as
|
|
* part of an escape sequence or inside a character class (delimited by
|
|
* square brackets). A mini state machine is used to implement these.
|
|
*
|
|
* Further, an early (parse time) error must be thrown if the regexp
|
|
* would cause a run-time error when used in the expression new RegExp(...).
|
|
* Parsing here simply extracts the (candidate) regexp, and also accepts
|
|
* invalid regular expressions (which are delimited properly). The caller
|
|
* (compiler) must perform final validation and regexp compilation.
|
|
*
|
|
* RegExp first char may not be '/' (single line comment) or '*' (multi-
|
|
* line comment). These have already been checked above, so there is no
|
|
* need below for special handling of the first regexp character as in
|
|
* the E5 productions.
|
|
*
|
|
* About unicode escapes within regexp literals:
|
|
*
|
|
* E5 Section 7.8.5 grammar does NOT accept \uHHHH escapes.
|
|
* However, Section 6 states that regexps accept the escapes,
|
|
* see paragraph starting with "In string literals...".
|
|
* The regexp grammar, which sees the decoded regexp literal
|
|
* (after lexical parsing) DOES have a \uHHHH unicode escape.
|
|
* So, for instance:
|
|
*
|
|
* /\u1234/
|
|
*
|
|
* should first be parsed by the lexical grammar as:
|
|
*
|
|
* '\' 'u' RegularExpressionBackslashSequence
|
|
* '1' RegularExpressionNonTerminator
|
|
* '2' RegularExpressionNonTerminator
|
|
* '3' RegularExpressionNonTerminator
|
|
* '4' RegularExpressionNonTerminator
|
|
*
|
|
* and the escape itself is then parsed by the regexp engine.
|
|
* This is the current implementation.
|
|
*
|
|
* Minor spec inconsistency:
|
|
*
|
|
* E5 Section 7.8.5 RegularExpressionBackslashSequence is:
|
|
*
|
|
* \ RegularExpressionNonTerminator
|
|
*
|
|
* while Section A.1 RegularExpressionBackslashSequence is:
|
|
*
|
|
* \ NonTerminator
|
|
*
|
|
* The latter is not normative and a typo.
|
|
*
|
|
*/
|
|
|
|
/* first, parse regexp body roughly */
|
|
|
|
duk_small_int_t state = 0; /* 0=base, 1=esc, 2=class, 3=class+esc */
|
|
|
|
DUK__INITBUFFER(lex_ctx);
|
|
for (;;) {
|
|
DUK__ADVANCECHARS(lex_ctx, 1); /* skip opening slash on first loop */
|
|
x = DUK__L0();
|
|
if (x < 0 || duk_unicode_is_line_terminator(x)) {
|
|
goto fail_unterm_regexp;
|
|
}
|
|
x = DUK__L0(); /* re-read to avoid spill / fetch */
|
|
if (state == 0) {
|
|
if (x == DUK_ASC_SLASH) {
|
|
DUK__ADVANCECHARS(lex_ctx, 1); /* eat closing slash */
|
|
break;
|
|
} else if (x == DUK_ASC_BACKSLASH) {
|
|
state = 1;
|
|
} else if (x == DUK_ASC_LBRACKET) {
|
|
state = 2;
|
|
}
|
|
} else if (state == 1) {
|
|
state = 0;
|
|
} else if (state == 2) {
|
|
if (x == DUK_ASC_RBRACKET) {
|
|
state = 0;
|
|
} else if (x == DUK_ASC_BACKSLASH) {
|
|
state = 3;
|
|
}
|
|
} else { /* state == 3 */
|
|
state = 2;
|
|
}
|
|
DUK__APPENDBUFFER(lex_ctx, x);
|
|
}
|
|
out_token->str1 = duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
|
|
|
|
/* second, parse flags */
|
|
|
|
DUK__INITBUFFER(lex_ctx);
|
|
for (;;) {
|
|
x = DUK__L0();
|
|
if (!duk_unicode_is_identifier_part(x)) {
|
|
break;
|
|
}
|
|
x = DUK__L0(); /* re-read to avoid spill / fetch */
|
|
DUK__APPENDBUFFER(lex_ctx, x);
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
}
|
|
out_token->str2 = duk__internbuffer(lex_ctx, lex_ctx->slot2_idx);
|
|
|
|
DUK__INITBUFFER(lex_ctx); /* free some memory */
|
|
|
|
/* validation of the regexp is caller's responsibility */
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_TOK_REGEXP);
|
|
#else /* DUK_USE_REGEXP_SUPPORT */
|
|
goto fail_regexp_support;
|
|
#endif /* DUK_USE_REGEXP_SUPPORT */
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
/* "/=" and not in regexp mode */
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_DIV_EQ);
|
|
} else {
|
|
/* "/" and not in regexp mode */
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_DIV);
|
|
}
|
|
break;
|
|
case DUK_ASC_LCURLY: /* '{' */
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_LCURLY);
|
|
break;
|
|
case DUK_ASC_RCURLY: /* '}' */
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_RCURLY);
|
|
break;
|
|
case DUK_ASC_LPAREN: /* '(' */
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_LPAREN);
|
|
break;
|
|
case DUK_ASC_RPAREN: /* ')' */
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_RPAREN);
|
|
break;
|
|
case DUK_ASC_LBRACKET: /* '[' */
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_LBRACKET);
|
|
break;
|
|
case DUK_ASC_RBRACKET: /* ']' */
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_RBRACKET);
|
|
break;
|
|
case DUK_ASC_PERIOD: /* '.' */
|
|
if (DUK__ISDIGIT(DUK__L1())) {
|
|
/* Period followed by a digit can only start DecimalLiteral
|
|
* (handled in slow path). We could jump straight into the
|
|
* DecimalLiteral handling but should avoid goto to inside
|
|
* a block.
|
|
*/
|
|
goto slow_path;
|
|
}
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_PERIOD);
|
|
break;
|
|
case DUK_ASC_SEMICOLON: /* ';' */
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_SEMICOLON);
|
|
break;
|
|
case DUK_ASC_COMMA: /* ',' */
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_COMMA);
|
|
break;
|
|
case DUK_ASC_LANGLE: /* '<' */
|
|
#if defined(DUK_USE_HTML_COMMENTS)
|
|
if (DUK__L1() == DUK_ASC_EXCLAMATION && DUK__L2() == DUK_ASC_MINUS && DUK__L3() == DUK_ASC_MINUS) {
|
|
/*
|
|
* ES2015: B.1.3, handle "<!--" SingleLineHTMLOpenComment
|
|
*/
|
|
|
|
/* DUK__ADVANCECHARS(lex_ctx, 4) would be correct here, but not necessary */
|
|
duk__lexer_skip_to_endofline(lex_ctx);
|
|
goto restart; /* line terminator will be handled on next round */
|
|
}
|
|
else
|
|
#endif /* DUK_USE_HTML_COMMENTS */
|
|
if (DUK__L1() == DUK_ASC_LANGLE && DUK__L2() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(3, DUK_TOK_ALSHIFT_EQ);
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_LE);
|
|
} else if (DUK__L1() == DUK_ASC_LANGLE) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_ALSHIFT);
|
|
} else {
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_LT);
|
|
}
|
|
break;
|
|
case DUK_ASC_RANGLE: /* '>' */
|
|
if (DUK__L1() == DUK_ASC_RANGLE && DUK__L2() == DUK_ASC_RANGLE && DUK__L3() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(4, DUK_TOK_RSHIFT_EQ);
|
|
} else if (DUK__L1() == DUK_ASC_RANGLE && DUK__L2() == DUK_ASC_RANGLE) {
|
|
advtok = DUK__ADVTOK(3, DUK_TOK_RSHIFT);
|
|
} else if (DUK__L1() == DUK_ASC_RANGLE && DUK__L2() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(3, DUK_TOK_ARSHIFT_EQ);
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_GE);
|
|
} else if (DUK__L1() == DUK_ASC_RANGLE) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_ARSHIFT);
|
|
} else {
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_GT);
|
|
}
|
|
break;
|
|
case DUK_ASC_EQUALS: /* '=' */
|
|
if (DUK__L1() == DUK_ASC_EQUALS && DUK__L2() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(3, DUK_TOK_SEQ);
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_EQ);
|
|
} else {
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_EQUALSIGN);
|
|
}
|
|
break;
|
|
case DUK_ASC_EXCLAMATION: /* '!' */
|
|
if (DUK__L1() == DUK_ASC_EQUALS && DUK__L2() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(3, DUK_TOK_SNEQ);
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_NEQ);
|
|
} else {
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_LNOT);
|
|
}
|
|
break;
|
|
case DUK_ASC_PLUS: /* '+' */
|
|
if (DUK__L1() == DUK_ASC_PLUS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_INCREMENT);
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_ADD_EQ);
|
|
} else {
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_ADD);
|
|
}
|
|
break;
|
|
case DUK_ASC_MINUS: /* '-' */
|
|
#if defined(DUK_USE_HTML_COMMENTS)
|
|
if (got_lineterm && DUK__L1() == DUK_ASC_MINUS && DUK__L2() == DUK_ASC_RANGLE) {
|
|
/*
|
|
* ES2015: B.1.3, handle "-->" SingleLineHTMLCloseComment
|
|
* Only allowed:
|
|
* - on new line
|
|
* - preceded only by whitespace
|
|
* - preceded by end of multiline comment and optional whitespace
|
|
*
|
|
* Since whitespace generates no tokens, and multiline comments
|
|
* are treated as a line ending, consulting `got_lineterm` is
|
|
* sufficient to test for these three options.
|
|
*/
|
|
|
|
/* DUK__ADVANCECHARS(lex_ctx, 3) would be correct here, but not necessary */
|
|
duk__lexer_skip_to_endofline(lex_ctx);
|
|
goto restart; /* line terminator will be handled on next round */
|
|
} else
|
|
#endif /* DUK_USE_HTML_COMMENTS */
|
|
if (DUK__L1() == DUK_ASC_MINUS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_DECREMENT);
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_SUB_EQ);
|
|
} else {
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_SUB);
|
|
}
|
|
break;
|
|
case DUK_ASC_STAR: /* '*' */
|
|
#if defined(DUK_USE_ES7_EXP_OPERATOR)
|
|
if (DUK__L1() == DUK_ASC_STAR && DUK__L2() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(3, DUK_TOK_EXP_EQ);
|
|
} else if (DUK__L1() == DUK_ASC_STAR) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_EXP);
|
|
} else
|
|
#endif
|
|
if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_MUL_EQ);
|
|
} else {
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_MUL);
|
|
}
|
|
break;
|
|
case DUK_ASC_PERCENT: /* '%' */
|
|
if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_MOD_EQ);
|
|
} else {
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_MOD);
|
|
}
|
|
break;
|
|
case DUK_ASC_AMP: /* '&' */
|
|
if (DUK__L1() == DUK_ASC_AMP) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_LAND);
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_BAND_EQ);
|
|
} else {
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_BAND);
|
|
}
|
|
break;
|
|
case DUK_ASC_PIPE: /* '|' */
|
|
if (DUK__L1() == DUK_ASC_PIPE) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_LOR);
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_BOR_EQ);
|
|
} else {
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_BOR);
|
|
}
|
|
break;
|
|
case DUK_ASC_CARET: /* '^' */
|
|
if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_BXOR_EQ);
|
|
} else {
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_BXOR);
|
|
}
|
|
break;
|
|
case DUK_ASC_TILDE: /* '~' */
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_BNOT);
|
|
break;
|
|
case DUK_ASC_QUESTION: /* '?' */
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_QUESTION);
|
|
break;
|
|
case DUK_ASC_COLON: /* ':' */
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_COLON);
|
|
break;
|
|
case DUK_ASC_DOUBLEQUOTE: /* '"' */
|
|
case DUK_ASC_SINGLEQUOTE: { /* '\'' */
|
|
DUK__INITBUFFER(lex_ctx);
|
|
duk__lexer_parse_string_literal(lex_ctx, out_token, x /*quote*/, strict_mode);
|
|
duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
|
|
out_token->str1 = duk_known_hstring(lex_ctx->thr, lex_ctx->slot1_idx);
|
|
|
|
DUK__INITBUFFER(lex_ctx); /* free some memory */
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_TOK_STRING);
|
|
break;
|
|
}
|
|
default:
|
|
goto slow_path;
|
|
} /* switch */
|
|
|
|
goto skip_slow_path;
|
|
|
|
slow_path:
|
|
if (duk_unicode_is_line_terminator(x)) {
|
|
if (x == 0x000d && DUK__L1() == 0x000a) {
|
|
/*
|
|
* E5 Section 7.3: CR LF is detected as a single line terminator for
|
|
* line numbers. Here we also detect it as a single line terminator
|
|
* token.
|
|
*/
|
|
DUK__ADVANCECHARS(lex_ctx, 2);
|
|
} else {
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
}
|
|
got_lineterm = 1;
|
|
goto restart_lineupdate;
|
|
} else if (duk_unicode_is_identifier_start(x) || x == DUK_ASC_BACKSLASH) {
|
|
/*
|
|
* Parse an identifier and then check whether it is:
|
|
* - reserved word (keyword or other reserved word)
|
|
* - "null" (NullLiteral)
|
|
* - "true" (BooleanLiteral)
|
|
* - "false" (BooleanLiteral)
|
|
* - anything else => identifier
|
|
*
|
|
* This does not follow the E5 productions cleanly, but is
|
|
* useful and compact.
|
|
*
|
|
* Note that identifiers may contain Unicode escapes,
|
|
* see E5 Sections 6 and 7.6. They must be decoded first,
|
|
* and the result checked against allowed characters.
|
|
* The above if-clause accepts an identifier start and an
|
|
* '\' character -- no other token can begin with a '\'.
|
|
*
|
|
* Note that "get" and "set" are not reserved words in E5
|
|
* specification so they are recognized as plain identifiers
|
|
* (the tokens DUK_TOK_GET and DUK_TOK_SET are actually not
|
|
* used now). The compiler needs to work around this.
|
|
*
|
|
* Strictly speaking, following ECMAScript longest match
|
|
* specification, an invalid escape for the first character
|
|
* should cause a syntax error. However, an invalid escape
|
|
* for IdentifierParts should just terminate the identifier
|
|
* early (longest match), and let the next tokenization
|
|
* fail. For instance Rhino croaks with 'foo\z' when
|
|
* parsing the identifier. This has little practical impact.
|
|
*/
|
|
|
|
duk_small_uint_t i, i_end;
|
|
duk_bool_t first = 1;
|
|
duk_hstring *str;
|
|
|
|
DUK__INITBUFFER(lex_ctx);
|
|
for (;;) {
|
|
/* re-lookup first char on first loop */
|
|
if (DUK__L0() == DUK_ASC_BACKSLASH) {
|
|
duk_codepoint_t esc_cp;
|
|
if (DUK__L1() != DUK_ASC_LC_U) {
|
|
goto fail_escape;
|
|
}
|
|
esc_cp = duk__lexer_parse_escape(lex_ctx, 1 /*allow_es6*/);
|
|
DUK__APPENDBUFFER(lex_ctx, esc_cp);
|
|
|
|
/* IdentifierStart is stricter than IdentifierPart, so if the first
|
|
* character is escaped, must have a stricter check here.
|
|
*/
|
|
if (!(first ? duk_unicode_is_identifier_start(esc_cp) : duk_unicode_is_identifier_part(esc_cp))) {
|
|
goto fail_escape;
|
|
}
|
|
|
|
/* Track number of escapes: necessary for proper keyword
|
|
* detection.
|
|
*/
|
|
out_token->num_escapes++;
|
|
} else {
|
|
/* Note: first character is checked against this. But because
|
|
* IdentifierPart includes all IdentifierStart characters, and
|
|
* the first character (if unescaped) has already been checked
|
|
* in the if condition, this is OK.
|
|
*/
|
|
if (!duk_unicode_is_identifier_part(DUK__L0())) {
|
|
break;
|
|
}
|
|
DUK__APPENDBUFFER(lex_ctx, DUK__L0());
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
}
|
|
first = 0;
|
|
}
|
|
|
|
out_token->str1 = duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
|
|
str = out_token->str1;
|
|
out_token->t_nores = DUK_TOK_IDENTIFIER;
|
|
|
|
DUK__INITBUFFER(lex_ctx); /* free some memory */
|
|
|
|
/*
|
|
* Interned identifier is compared against reserved words, which are
|
|
* currently interned into the heap context. See genbuiltins.py.
|
|
*
|
|
* Note that an escape in the identifier disables recognition of
|
|
* keywords; e.g. "\u0069f = 1;" is a valid statement (assigns to
|
|
* identifier named "if"). This is not necessarily compliant,
|
|
* see test-dec-escaped-char-in-keyword.js.
|
|
*
|
|
* Note: "get" and "set" are awkward. They are not officially
|
|
* ReservedWords (and indeed e.g. "var set = 1;" is valid), and
|
|
* must come out as DUK_TOK_IDENTIFIER. The compiler needs to
|
|
* work around this a bit.
|
|
*/
|
|
|
|
/* XXX: optimize by adding the token numbers directly into the
|
|
* always interned duk_hstring objects (there should be enough
|
|
* flag bits free for that)?
|
|
*/
|
|
|
|
i_end = (strict_mode ? DUK_STRIDX_END_RESERVED : DUK_STRIDX_START_STRICT_RESERVED);
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_TOK_IDENTIFIER);
|
|
if (out_token->num_escapes == 0) {
|
|
for (i = DUK_STRIDX_START_RESERVED; i < i_end; i++) {
|
|
DUK_ASSERT_DISABLE(i >= 0); /* unsigned */
|
|
DUK_ASSERT(i < DUK_HEAP_NUM_STRINGS);
|
|
if (DUK_HTHREAD_GET_STRING(lex_ctx->thr, i) == str) {
|
|
advtok = DUK__ADVTOK(0, DUK_STRIDX_TO_TOK(i));
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
} else if (DUK__ISDIGIT(x) || (x == DUK_ASC_PERIOD)) {
|
|
/* Note: decimal number may start with a period, but must be followed by a digit */
|
|
|
|
/*
|
|
* Pre-parsing for decimal, hex, octal (both legacy and ES2015),
|
|
* and binary literals, followed by an actual parser step
|
|
* provided by numconv.
|
|
*
|
|
* Note: the leading sign character ('+' or '-') is -not- part of
|
|
* the production in E5 grammar, and that the a DecimalLiteral
|
|
* starting with a '0' must be followed by a non-digit.
|
|
*
|
|
* XXX: the two step parsing process is quite awkward, it would
|
|
* be more straightforward to allow numconv to parse the longest
|
|
* valid prefix (it already does that, it only needs to indicate
|
|
* where the input ended). However, the lexer decodes characters
|
|
* using a limited lookup window, so this is not a trivial change.
|
|
*/
|
|
|
|
/* XXX: because of the final check below (that the literal is not
|
|
* followed by a digit), this could maybe be simplified, if we bail
|
|
* out early from a leading zero (and if there are no periods etc).
|
|
* Maybe too complex.
|
|
*/
|
|
|
|
duk_double_t val;
|
|
duk_bool_t legacy_oct = 0;
|
|
duk_small_int_t state; /* 0=before period/exp,
|
|
* 1=after period, before exp
|
|
* 2=after exp, allow '+' or '-'
|
|
* 3=after exp and exp sign
|
|
*/
|
|
duk_small_uint_t s2n_flags;
|
|
duk_codepoint_t y, z;
|
|
duk_small_int_t s2n_radix = 10;
|
|
duk_small_uint_t pre_adv = 0;
|
|
|
|
DUK__INITBUFFER(lex_ctx);
|
|
y = DUK__L1();
|
|
|
|
if (x == DUK_ASC_0) {
|
|
z = DUK_LOWERCASE_CHAR_ASCII(y);
|
|
|
|
pre_adv = 2; /* default for 0xNNN, 0oNNN, 0bNNN. */
|
|
if (z == DUK_ASC_LC_X) {
|
|
s2n_radix = 16;
|
|
} else if (z == DUK_ASC_LC_O) {
|
|
s2n_radix = 8;
|
|
} else if (z == DUK_ASC_LC_B) {
|
|
s2n_radix = 2;
|
|
} else {
|
|
pre_adv = 0;
|
|
if (DUK__ISDIGIT(y)) {
|
|
if (strict_mode) {
|
|
/* Reject octal like \07 but also octal-lookalike
|
|
* decimal like \08 in strict mode.
|
|
*/
|
|
goto fail_number_literal;
|
|
} else {
|
|
/* Legacy OctalIntegerLiteral or octal-lookalice
|
|
* decimal. Deciding between the two happens below
|
|
* in digit scanning.
|
|
*/
|
|
DUK__APPENDBUFFER(lex_ctx, x);
|
|
pre_adv = 1;
|
|
legacy_oct = 1;
|
|
s2n_radix = 8; /* tentative unless conflicting digits found */
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, pre_adv);
|
|
|
|
/* XXX: we could parse integers here directly, and fall back
|
|
* to numconv only when encountering a fractional expression
|
|
* or when an octal literal turned out to be decimal (0778 etc).
|
|
*/
|
|
state = 0;
|
|
for (;;) {
|
|
x = DUK__L0(); /* re-lookup curr char on first round */
|
|
if (DUK__ISDIGIT(x)) {
|
|
/* Note: intentionally allow leading zeroes here, as the
|
|
* actual parser will check for them.
|
|
*/
|
|
if (state == 0 && legacy_oct && (x == DUK_ASC_8 || x == DUK_ASC_9)) {
|
|
/* Started out as an octal-lookalike
|
|
* but interpreted as decimal, e.g.
|
|
* '0779' -> 779. This also means
|
|
* that fractions are allowed, e.g.
|
|
* '0779.123' is allowed but '0777.123'
|
|
* is not!
|
|
*/
|
|
s2n_radix = 10;
|
|
}
|
|
if (state == 2) {
|
|
state = 3;
|
|
}
|
|
} else if (s2n_radix == 16 && DUK__ISHEXDIGIT(x)) {
|
|
/* Note: 'e' and 'E' are also accepted here. */
|
|
;
|
|
} else if (x == DUK_ASC_PERIOD) {
|
|
if (state >= 1 || s2n_radix != 10) {
|
|
break;
|
|
} else {
|
|
state = 1;
|
|
}
|
|
} else if (x == DUK_ASC_LC_E || x == DUK_ASC_UC_E) {
|
|
if (state >= 2 || s2n_radix != 10) {
|
|
break;
|
|
} else {
|
|
state = 2;
|
|
}
|
|
} else if (x == DUK_ASC_MINUS || x == DUK_ASC_PLUS) {
|
|
if (state != 2) {
|
|
break;
|
|
} else {
|
|
state = 3;
|
|
}
|
|
} else {
|
|
break;
|
|
}
|
|
DUK__APPENDBUFFER(lex_ctx, x);
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
}
|
|
|
|
/* XXX: better coercion */
|
|
(void) duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
|
|
|
|
if (s2n_radix != 10) {
|
|
/* For bases other than 10, integer only. */
|
|
s2n_flags = DUK_S2N_FLAG_ALLOW_LEADING_ZERO;
|
|
} else {
|
|
s2n_flags = DUK_S2N_FLAG_ALLOW_EXP |
|
|
DUK_S2N_FLAG_ALLOW_FRAC |
|
|
DUK_S2N_FLAG_ALLOW_NAKED_FRAC |
|
|
DUK_S2N_FLAG_ALLOW_EMPTY_FRAC |
|
|
DUK_S2N_FLAG_ALLOW_LEADING_ZERO;
|
|
}
|
|
|
|
duk_dup(lex_ctx->thr, lex_ctx->slot1_idx);
|
|
duk_numconv_parse(lex_ctx->thr, s2n_radix, s2n_flags);
|
|
val = duk_to_number_m1(lex_ctx->thr);
|
|
if (DUK_ISNAN(val)) {
|
|
goto fail_number_literal;
|
|
}
|
|
duk_replace(lex_ctx->thr, lex_ctx->slot1_idx); /* could also just pop? */
|
|
|
|
DUK__INITBUFFER(lex_ctx); /* free some memory */
|
|
|
|
/* Section 7.8.3 (note): NumericLiteral must be followed by something other than
|
|
* IdentifierStart or DecimalDigit.
|
|
*/
|
|
|
|
if (DUK__ISDIGIT(DUK__L0()) || duk_unicode_is_identifier_start(DUK__L0())) {
|
|
goto fail_number_literal;
|
|
}
|
|
|
|
out_token->num = val;
|
|
advtok = DUK__ADVTOK(0, DUK_TOK_NUMBER);
|
|
} else if (duk_unicode_is_whitespace(DUK__LOOKUP(lex_ctx, 0))) {
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
goto restart;
|
|
} else if (x < 0) {
|
|
advtok = DUK__ADVTOK(0, DUK_TOK_EOF);
|
|
} else {
|
|
goto fail_token;
|
|
}
|
|
skip_slow_path:
|
|
|
|
/*
|
|
* Shared exit path
|
|
*/
|
|
|
|
DUK__ADVANCEBYTES(lex_ctx, advtok >> 8);
|
|
out_token->t = advtok & 0xff;
|
|
if (out_token->t_nores == DUK_TOK_INVALID) {
|
|
out_token->t_nores = out_token->t;
|
|
}
|
|
out_token->lineterm = got_lineterm;
|
|
|
|
/* Automatic semicolon insertion is allowed if a token is preceded
|
|
* by line terminator(s), or terminates a statement list (right curly
|
|
* or EOF).
|
|
*/
|
|
if (got_lineterm || out_token->t == DUK_TOK_RCURLY || out_token->t == DUK_TOK_EOF) {
|
|
out_token->allow_auto_semi = 1;
|
|
} else {
|
|
out_token->allow_auto_semi = 0;
|
|
}
|
|
|
|
return;
|
|
|
|
fail_token_limit:
|
|
DUK_ERROR_RANGE(lex_ctx->thr, DUK_STR_TOKEN_LIMIT);
|
|
DUK_WO_NORETURN(return;);
|
|
|
|
fail_token:
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_TOKEN);
|
|
DUK_WO_NORETURN(return;);
|
|
|
|
fail_number_literal:
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_NUMBER_LITERAL);
|
|
DUK_WO_NORETURN(return;);
|
|
|
|
fail_escape:
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_ESCAPE);
|
|
DUK_WO_NORETURN(return;);
|
|
|
|
fail_unterm_regexp:
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_UNTERMINATED_REGEXP);
|
|
DUK_WO_NORETURN(return;);
|
|
|
|
fail_unterm_comment:
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_UNTERMINATED_COMMENT);
|
|
DUK_WO_NORETURN(return;);
|
|
|
|
#if !defined(DUK_USE_REGEXP_SUPPORT)
|
|
fail_regexp_support:
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_REGEXP_SUPPORT_DISABLED);
|
|
DUK_WO_NORETURN(return;);
|
|
#endif
|
|
}
|
|
|
|
#if defined(DUK_USE_REGEXP_SUPPORT)
|
|
|
|
/*
|
|
* Parse a RegExp token. The grammar is described in E5 Section 15.10.
|
|
* Terminal constructions (such as quantifiers) are parsed directly here.
|
|
*
|
|
* 0xffffffffU is used as a marker for "infinity" in quantifiers. Further,
|
|
* DUK__MAX_RE_QUANT_DIGITS limits the maximum number of digits that
|
|
* will be accepted for a quantifier.
|
|
*/
|
|
|
|
/*
 *  Parse one regexp token at the current lexer position into 'out_token'.
 *
 *  'out_token' is zeroed first.  The token type is stored in out_token->t.
 *  Quantifier tokens also set qmin, qmax and greedy; character atoms and
 *  backreferences set num.  For a character class only the start token
 *  ('[' or '[^') is produced here.
 *
 *  Each case computes 'advtok' with DUK__ADVTOK(); the shared exit path
 *  advances the lexer by (advtok >> 8) and stores (advtok & 0xff) as the
 *  token type.  Cases that consume input themselves (braced quantifier,
 *  hex/unicode escape, backreference) use an advance count of zero.
 *
 *  Errors: DUK_ERROR_RANGE when the token limit is reached, DUK_ERROR_SYNTAX
 *  for an invalid escape or group, and (without DUK_USE_ES6_REGEXP_SYNTAX)
 *  for an invalid quantifier or a bare '}' / ']'.
 */
DUK_INTERNAL void duk_lexer_parse_re_token(duk_lexer_ctx *lex_ctx, duk_re_token *out_token) {
	duk_small_uint_t advtok = 0; /* init is unnecessary but suppresses "may be used uninitialized" warnings */
	duk_codepoint_t x, y;

	if (++lex_ctx->token_count >= lex_ctx->token_limit) {
		goto fail_token_limit;
	}

	duk_memzero(out_token, sizeof(*out_token));

	x = DUK__L0();
	y = DUK__L1();

	DUK_DDD(DUK_DDDPRINT("parsing regexp token, L0=%ld, L1=%ld", (long) x, (long) y));

	switch (x) {
	case DUK_ASC_PIPE: {
		advtok = DUK__ADVTOK(1, DUK_RETOK_DISJUNCTION);
		break;
	}
	case DUK_ASC_CARET: {
		advtok = DUK__ADVTOK(1, DUK_RETOK_ASSERT_START);
		break;
	}
	case DUK_ASC_DOLLAR: {
		advtok = DUK__ADVTOK(1, DUK_RETOK_ASSERT_END);
		break;
	}
	case DUK_ASC_QUESTION: {
		/* '?' or non-greedy '??' */
		out_token->qmin = 0;
		out_token->qmax = 1;
		if (y == DUK_ASC_QUESTION) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
			out_token->greedy = 0;
		} else {
			advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
			out_token->greedy = 1;
		}
		break;
	}
	case DUK_ASC_STAR: {
		/* '*' or non-greedy '*?' */
		out_token->qmin = 0;
		out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
		if (y == DUK_ASC_QUESTION) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
			out_token->greedy = 0;
		} else {
			advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
			out_token->greedy = 1;
		}
		break;
	}
	case DUK_ASC_PLUS: {
		/* '+' or non-greedy '+?' */
		out_token->qmin = 1;
		out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
		if (y == DUK_ASC_QUESTION) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
			out_token->greedy = 0;
		} else {
			advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
			out_token->greedy = 1;
		}
		break;
	}
	case DUK_ASC_LCURLY: {
		/* Production allows 'DecimalDigits', including leading zeroes */
		duk_uint32_t val1 = 0;
		duk_uint32_t val2 = DUK_RE_QUANTIFIER_INFINITE; /* doubles as "no comma seen yet" marker */
		duk_small_int_t digits = 0;
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
		duk_lexer_point lex_pt;
#endif

#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
		/* Store lexer position, restoring if quantifier is invalid. */
		DUK_LEXER_GETPOINT(lex_ctx, &lex_pt);
#endif

		for (;;) {
			DUK__ADVANCECHARS(lex_ctx, 1); /* eat '{' on entry */
			x = DUK__L0();
			if (DUK__ISDIGIT(x)) {
				/* val1 may wrap for very long digit runs; the digit
				 * count check below rejects those before val1 is used.
				 */
				digits++;
				val1 = val1 * 10 + (duk_uint32_t) duk__hexval(x);
			} else if (x == DUK_ASC_COMMA) {
				if (digits > DUK__MAX_RE_QUANT_DIGITS) {
					goto invalid_quantifier;
				}
				if (val2 != DUK_RE_QUANTIFIER_INFINITE) {
					/* second comma */
					goto invalid_quantifier;
				}
				if (digits == 0) {
					/* The minimum count is mandatory in both
					 * '{ DecimalDigits , }' and
					 * '{ DecimalDigits , DecimalDigits }', so
					 * neither '{,}' nor '{,n}' is a quantifier.
					 */
					goto invalid_quantifier;
				}
				if (DUK__L1() == DUK_ASC_RCURLY) {
					/* form: { DecimalDigits , }, val1 = min count */
					out_token->qmin = val1;
					out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
					DUK__ADVANCECHARS(lex_ctx, 2);
					break;
				}
				val2 = val1;
				val1 = 0;
				digits = 0; /* restart digit count for the max count */
			} else if (x == DUK_ASC_RCURLY) {
				if (digits > DUK__MAX_RE_QUANT_DIGITS) {
					goto invalid_quantifier;
				}
				if (digits == 0) {
					goto invalid_quantifier;
				}
				if (val2 != DUK_RE_QUANTIFIER_INFINITE) {
					/* val2 = min count, val1 = max count */
					out_token->qmin = val2;
					out_token->qmax = val1;
				} else {
					/* val1 = count */
					out_token->qmin = val1;
					out_token->qmax = val1;
				}
				DUK__ADVANCECHARS(lex_ctx, 1);
				break;
			} else {
				goto invalid_quantifier;
			}
		}
		if (DUK__L0() == DUK_ASC_QUESTION) {
			out_token->greedy = 0;
			DUK__ADVANCECHARS(lex_ctx, 1);
		} else {
			out_token->greedy = 1;
		}
		/* Input was consumed above, so no further advance on exit. */
		advtok = DUK__ADVTOK(0, DUK_RETOK_QUANTIFIER);
		break;
	invalid_quantifier:
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
		/* Failed to match the quantifier, restore lexer and parse
		 * opening brace as a literal.
		 */
		DUK_LEXER_SETPOINT(lex_ctx, &lex_pt);
		advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_CHAR);
		out_token->num = DUK_ASC_LCURLY;
#else
		goto fail_quantifier;
#endif
		break;
	}
	case DUK_ASC_PERIOD: {
		advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_PERIOD);
		break;
	}
	case DUK_ASC_BACKSLASH: {
		/* The E5.1 specification does not seem to allow IdentifierPart characters
		 * to be used as identity escapes.  Unfortunately this includes '$', which
		 * cannot be escaped as '\$'; it needs to be escaped e.g. as '\u0024'.
		 * Many other implementations (including V8 and Rhino, for instance) do
		 * accept '\$' as a valid identity escape, which is quite pragmatic, and
		 * ES2015 Annex B relaxes the rules to allow these (and other) real world forms.
		 */

		advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_CHAR); /* default: char escape (two chars) */
		if (y == DUK_ASC_LC_B) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ASSERT_WORD_BOUNDARY);
		} else if (y == DUK_ASC_UC_B) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY);
		} else if (y == DUK_ASC_LC_F) {
			out_token->num = 0x000c;
		} else if (y == DUK_ASC_LC_N) {
			out_token->num = 0x000a;
		} else if (y == DUK_ASC_LC_T) {
			out_token->num = 0x0009;
		} else if (y == DUK_ASC_LC_R) {
			out_token->num = 0x000d;
		} else if (y == DUK_ASC_LC_V) {
			out_token->num = 0x000b;
		} else if (y == DUK_ASC_LC_C) {
			/* Control escape '\cX': X must be an ASCII letter. */
			x = DUK__L2();
			if ((x >= DUK_ASC_LC_A && x <= DUK_ASC_LC_Z) ||
			    (x >= DUK_ASC_UC_A && x <= DUK_ASC_UC_Z)) {
				out_token->num = (duk_uint32_t) (x % 32);
				advtok = DUK__ADVTOK(3, DUK_RETOK_ATOM_CHAR);
			} else {
				goto fail_escape;
			}
		} else if (y == DUK_ASC_LC_X || y == DUK_ASC_LC_U) {
			/* The token value is the Unicode codepoint without
			 * it being decoded into surrogate pair characters
			 * here.  The \u{H+} is only allowed in Unicode mode
			 * which we don't support yet.
			 */
			out_token->num = (duk_uint32_t) duk__lexer_parse_escape(lex_ctx, 0 /*allow_es6*/);
			advtok = DUK__ADVTOK(0, DUK_RETOK_ATOM_CHAR);
		} else if (y == DUK_ASC_LC_D) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_DIGIT);
		} else if (y == DUK_ASC_UC_D) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_DIGIT);
		} else if (y == DUK_ASC_LC_S) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_WHITE);
		} else if (y == DUK_ASC_UC_S) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_WHITE);
		} else if (y == DUK_ASC_LC_W) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_WORD_CHAR);
		} else if (y == DUK_ASC_UC_W) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_WORD_CHAR);
		} else if (DUK__ISDIGIT(y)) {
			/* E5 Section 15.10.2.11 */
			if (y == DUK_ASC_0) {
				/* '\0' is a NUL character, but only when not
				 * followed by another digit.
				 */
				if (DUK__ISDIGIT(DUK__L2())) {
					goto fail_escape;
				}
				out_token->num = 0x0000;
				advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_CHAR);
			} else {
				/* XXX: shared parsing? */
				duk_uint32_t val = 0;
				duk_small_int_t i;
				for (i = 0; ; i++) {
					if (i >= DUK__MAX_RE_DECESC_DIGITS) {
						goto fail_escape;
					}
					DUK__ADVANCECHARS(lex_ctx, 1); /* eat backslash on entry */
					x = DUK__L0();
					if (!DUK__ISDIGIT(x)) {
						break;
					}
					val = val * 10 + (duk_uint32_t) duk__hexval(x);
				}
				/* DUK__L0() cannot be a digit, because the loop doesn't terminate if it is */
				advtok = DUK__ADVTOK(0, DUK_RETOK_ATOM_BACKREFERENCE);
				out_token->num = val;
			}
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
		} else if (y >= 0) {
			/* For ES2015 Annex B, accept any source character as identity
			 * escape except 'c' which is used for control characters.
			 * http://www.ecma-international.org/ecma-262/6.0/#sec-regular-expressions-patterns
			 * Careful not to match end-of-buffer (<0) here.
			 * This is not yet full ES2015 Annex B because cases above
			 * (like hex escape) won't backtrack.
			 */
			DUK_ASSERT(y != DUK_ASC_LC_C); /* covered above */
#else /* DUK_USE_ES6_REGEXP_SYNTAX */
		} else if ((y >= 0 && !duk_unicode_is_identifier_part(y)) ||
		           y == DUK_UNICODE_CP_ZWNJ ||
		           y == DUK_UNICODE_CP_ZWJ) {
			/* For ES5.1 identity escapes are not allowed for identifier
			 * parts.  This conflicts with a lot of real world code as this
			 * doesn't e.g. allow escaping a dollar sign as /\$/, see
			 * test-regexp-identity-escape-dollar.js.
			 */
#endif /* DUK_USE_ES6_REGEXP_SYNTAX */
			out_token->num = (duk_uint32_t) y;
		} else {
			goto fail_escape;
		}
		break;
	}
	case DUK_ASC_LPAREN: {
		/* XXX: naming is inconsistent: ATOM_END_GROUP ends an ASSERT_START_LOOKAHEAD */

		if (y == DUK_ASC_QUESTION) {
			if (DUK__L2() == DUK_ASC_EQUALS) {
				/* (?= */
				advtok = DUK__ADVTOK(3, DUK_RETOK_ASSERT_START_POS_LOOKAHEAD);
			} else if (DUK__L2() == DUK_ASC_EXCLAMATION) {
				/* (?! */
				advtok = DUK__ADVTOK(3, DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD);
			} else if (DUK__L2() == DUK_ASC_COLON) {
				/* (?: */
				advtok = DUK__ADVTOK(3, DUK_RETOK_ATOM_START_NONCAPTURE_GROUP);
			} else {
				goto fail_group;
			}
		} else {
			/* ( */
			advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_START_CAPTURE_GROUP);
		}
		break;
	}
	case DUK_ASC_RPAREN: {
		advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_END_GROUP);
		break;
	}
	case DUK_ASC_LBRACKET: {
		/*
		 *  To avoid creating a heavy intermediate value for the list of ranges,
		 *  only the start token ('[' or '[^') is parsed here.  The regexp
		 *  compiler parses the ranges itself.
		 */

		/* XXX: with DUK_USE_ES6_REGEXP_SYNTAX we should allow left bracket
		 * literal too, but it's not easy to parse without backtracking.
		 */

		advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_START_CHARCLASS);
		if (y == DUK_ASC_CARET) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_START_CHARCLASS_INVERTED);
		}
		break;
	}
#if !defined(DUK_USE_ES6_REGEXP_SYNTAX)
	case DUK_ASC_RCURLY:
	case DUK_ASC_RBRACKET: {
		/* Although these could be parsed as PatternCharacters unambiguously (here),
		 * E5 Section 15.10.1 grammar explicitly forbids these as PatternCharacters.
		 */
		goto fail_invalid_char;
		break;
	}
#endif
	case -1: {
		/* EOF */
		advtok = DUK__ADVTOK(0, DUK_TOK_EOF);
		break;
	}
	default: {
		/* PatternCharacter, all excluded characters are matched by cases above */
		advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_CHAR);
		out_token->num = (duk_uint32_t) x;
		break;
	}
	}

	/*
	 *  Shared exit path
	 */

	DUK__ADVANCEBYTES(lex_ctx, advtok >> 8);
	out_token->t = advtok & 0xff;
	return;

 fail_token_limit:
	DUK_ERROR_RANGE(lex_ctx->thr, DUK_STR_TOKEN_LIMIT);
	DUK_WO_NORETURN(return;);

 fail_escape:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_REGEXP_ESCAPE);
	DUK_WO_NORETURN(return;);

 fail_group:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_REGEXP_GROUP);
	DUK_WO_NORETURN(return;);

#if !defined(DUK_USE_ES6_REGEXP_SYNTAX)
 fail_invalid_char:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_REGEXP_CHARACTER);
	DUK_WO_NORETURN(return;);

 fail_quantifier:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_QUANTIFIER);
	DUK_WO_NORETURN(return;);
#endif
}
|
|
|
|
/*
 *  Special parser for character classes; calls the range callback for
 *  every range parsed.  The parser itself returns no value.
 */
|
|
|
|
/* XXX: this duplicates functionality in duk_regexp.c where a similar loop is
|
|
* required anyway. We could use that BUT we need to update the regexp compiler
|
|
* 'nranges' too. Work this out a bit more cleanly to save space.
|
|
*/
|
|
|
|
/* XXX: the handling of character range detection is a bit convoluted.
|
|
* Try to simplify and make smaller.
|
|
*/
|
|
|
|
/* XXX: logic for handling character ranges is now incorrect, it will accept
|
|
* e.g. [\d-z] whereas it should croak from it? SMJS accepts this too, though.
|
|
*
|
|
* Needs a read through and a lot of additional tests.
|
|
*/
|
|
|
|
DUK_LOCAL
|
|
void duk__emit_u16_direct_ranges(duk_lexer_ctx *lex_ctx,
|
|
duk_re_range_callback gen_range,
|
|
void *userdata,
|
|
const duk_uint16_t *ranges,
|
|
duk_small_int_t num) {
|
|
const duk_uint16_t *ranges_end;
|
|
|
|
DUK_UNREF(lex_ctx);
|
|
|
|
ranges_end = ranges + num;
|
|
while (ranges < ranges_end) {
|
|
/* mark range 'direct', bypass canonicalization (see Wiki) */
|
|
gen_range(userdata, (duk_codepoint_t) ranges[0], (duk_codepoint_t) ranges[1], 1);
|
|
ranges += 2;
|
|
}
|
|
}
|
|
|
|
/*
 *  Parse the contents of a character class, up to and including the
 *  closing ']', reporting every range through 'gen_range'.
 *
 *  Parsing starts at the current lexer position, which is expected to be
 *  just past the '[' or '[^' start token (NOTE(review): the start token is
 *  produced by duk_lexer_parse_re_token(); confirm against the caller).
 *
 *  Single characters are reported as (c, c) ranges, 'a-b' forms as (a, b)
 *  ranges, both with the last callback argument 0.  Class escapes (\d, \D,
 *  \s, \S, \w, \W) are reported from predefined tables using
 *  duk__emit_u16_direct_ranges().
 *
 *  Errors (DUK_ERROR_SYNTAX): invalid escape, invalid range (start greater
 *  than end, or a class escape used as a range endpoint), and end of input
 *  before ']'.
 */
DUK_INTERNAL void duk_lexer_parse_re_ranges(duk_lexer_ctx *lex_ctx, duk_re_range_callback gen_range, void *userdata) {
	duk_codepoint_t start = -1; /* pending single char / range start, -1 if none */
	duk_codepoint_t ch;
	duk_codepoint_t x;
	duk_bool_t dash = 0; /* 1 if 'start' was followed by a range-forming '-' */
	duk_small_uint_t adv = 0; /* chars to skip at the top of the next iteration */

	DUK_DD(DUK_DDPRINT("parsing regexp ranges"));

	for (;;) {
		DUK__ADVANCECHARS(lex_ctx, adv);
		adv = 1;

		x = DUK__L0();

		ch = -1; /* not strictly necessary, but avoids "uninitialized variable" warnings */
		DUK_UNREF(ch);

		if (x < 0) {
			goto fail_unterm_charclass;
		} else if (x == DUK_ASC_RBRACKET) {
			if (start >= 0) {
				gen_range(userdata, start, start, 0);
			}
			DUK__ADVANCECHARS(lex_ctx, 1); /* eat ']' before finishing */
			break;
		} else if (x == DUK_ASC_MINUS) {
			if (start >= 0 && !dash && DUK__L1() != DUK_ASC_RBRACKET) {
				/* '-' as a range indicator */
				dash = 1;
				continue;
			} else {
				/* '-' verbatim */
				ch = x;
			}
		} else if (x == DUK_ASC_BACKSLASH) {
			/*
			 *  The escapes are same as outside a character class, except that \b has a
			 *  different meaning, and \B and backreferences are prohibited (see E5
			 *  Section 15.10.2.19).  However, it's difficult to share code because we
			 *  handle e.g. "\n" very differently: here we generate a single character
			 *  range for it.
			 */

			/* XXX: ES2015 surrogate pair handling. */

			x = DUK__L1();

			adv = 2;

			if (x == DUK_ASC_LC_B) {
				/* Note: '\b' in char class is different than outside (assertion).
				 * Without DUK_USE_ES6_REGEXP_SYNTAX '\B' is rejected by the
				 * duk_unicode_is_identifier_part() check below; with it, '\B'
				 * reaches the identity escape branch below.
				 */
				ch = 0x0008;
			} else if (x == DUK_ASC_LC_F) {
				ch = 0x000c;
			} else if (x == DUK_ASC_LC_N) {
				ch = 0x000a;
			} else if (x == DUK_ASC_LC_T) {
				ch = 0x0009;
			} else if (x == DUK_ASC_LC_R) {
				ch = 0x000d;
			} else if (x == DUK_ASC_LC_V) {
				ch = 0x000b;
			} else if (x == DUK_ASC_LC_C) {
				/* Control escape '\cX': X must be an ASCII letter. */
				x = DUK__L2();
				adv = 3;
				if ((x >= DUK_ASC_LC_A && x <= DUK_ASC_LC_Z) ||
				    (x >= DUK_ASC_UC_A && x <= DUK_ASC_UC_Z)) {
					ch = (x % 32);
				} else {
					goto fail_escape;
				}
			} else if (x == DUK_ASC_LC_X || x == DUK_ASC_LC_U) {
				/* The \u{H+} form is only allowed in Unicode mode which
				 * we don't support yet.
				 */
				ch = duk__lexer_parse_escape(lex_ctx, 0 /*allow_es6*/);
				adv = 0; /* NOTE(review): assumes the escape parser consumes the escape itself */
			} else if (x == DUK_ASC_LC_D) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_digit,
				                            sizeof(duk_unicode_re_ranges_digit) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_UC_D) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_not_digit,
				                            sizeof(duk_unicode_re_ranges_not_digit) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_LC_S) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_white,
				                            sizeof(duk_unicode_re_ranges_white) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_UC_S) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_not_white,
				                            sizeof(duk_unicode_re_ranges_not_white) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_LC_W) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_wordchar,
				                            sizeof(duk_unicode_re_ranges_wordchar) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_UC_W) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_not_wordchar,
				                            sizeof(duk_unicode_re_ranges_not_wordchar) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (DUK__ISDIGIT(x)) {
				/* DecimalEscape, only \0 is allowed, no leading
				 * zeroes are allowed.
				 *
				 * ES2015 Annex B also allows (maximal match) legacy
				 * octal escapes up to \377 and \8 and \9 are
				 * accepted as literal '8' and '9', also in strict mode.
				 */

#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
				ch = duk__lexer_parse_legacy_octal(lex_ctx, &adv, 0 /*reject_annex_b*/);
				DUK_ASSERT(ch >= 0); /* no rejections */
#else
				if (x == DUK_ASC_0 && !DUK__ISDIGIT(DUK__L2())) {
					ch = 0x0000;
				} else {
					goto fail_escape;
				}
#endif
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
			} else if (x >= 0) {
				/* IdentityEscape: ES2015 Annex B allows almost all
				 * source characters here.  Match anything except
				 * EOF here.
				 */
				ch = x;
#else /* DUK_USE_ES6_REGEXP_SYNTAX */
			} else if ((x >= 0 && !duk_unicode_is_identifier_part(x)) ||
			           x == DUK_UNICODE_CP_ZWNJ ||
			           x == DUK_UNICODE_CP_ZWJ) {
				/* IdentityEscape: ES5.1 doesn't allow identity escape
				 * for identifier part characters, which conflicts with
				 * some real world code.  For example, it doesn't allow
				 * /[\$]/ which is awkward.
				 *
				 * ZWNJ and ZWJ are explicitly allowed by the ES5.1
				 * IdentityEscape production, and EOF (<0) must not be
				 * treated as a character; same condition as used by
				 * duk_lexer_parse_re_token() outside a character class.
				 */
				ch = x;
#endif /* DUK_USE_ES6_REGEXP_SYNTAX */
			} else {
				goto fail_escape;
			}
		} else {
			/* character represents itself */
			ch = x;
		}

		/* ch is a literal character here or -1 if parsed entity was
		 * an escape such as "\s".
		 */

		if (ch < 0) {
			/* multi-character sets not allowed as part of ranges, see
			 * E5 Section 15.10.2.15, abstract operation CharacterRange.
			 */
			if (start >= 0) {
				if (dash) {
					goto fail_range;
				} else {
					/* flush the pending single character */
					gen_range(userdata, start, start, 0);
					start = -1;
					/* dash is already 0 */
				}
			}
		} else {
			if (start >= 0) {
				if (dash) {
					/* 'start-ch' range complete */
					if (start > ch) {
						goto fail_range;
					}
					gen_range(userdata, start, ch, 0);
					start = -1;
					dash = 0;
				} else {
					/* flush previous char, current one becomes pending */
					gen_range(userdata, start, start, 0);
					start = ch;
					/* dash is already 0 */
				}
			} else {
				start = ch;
			}
		}
	}

	return;

 fail_escape:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_REGEXP_ESCAPE);
	DUK_WO_NORETURN(return;);

 fail_range:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_RANGE);
	DUK_WO_NORETURN(return;);

 fail_unterm_charclass:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_UNTERMINATED_CHARCLASS);
	DUK_WO_NORETURN(return;);
}
|
|
|
|
#endif /* DUK_USE_REGEXP_SUPPORT */
|