OVMS3/OVMS.V3/components/duktape/src-input/duk_lexer.c

/*
 *  Lexer for source files, ToNumber() string conversions, RegExp expressions,
 *  and JSON.
 *
 *  Provides a stream of ECMAScript tokens from an UTF-8/CESU-8 buffer.  The
 *  caller can also rewind the token stream into a certain position which is
 *  needed by the compiler part for multi-pass scanning.  Tokens are
 *  represented as duk_token structures, and contain line number information.
 *  Token types are identified with DUK_TOK_* defines.
 *
 *  Characters are decoded into a fixed size lookup window consisting of
 *  decoded Unicode code points, with window positions past the end of the
 *  input filled with an invalid codepoint (-1).  The tokenizer can thus
 *  perform multiple character lookups efficiently and with few sanity
 *  checks (such as access outside the end of the input), which keeps the
 *  tokenization code small at the cost of performance.
 *
 *  Character data in tokens, such as identifier names and string literals,
 *  is encoded into CESU-8 format on-the-fly while parsing the token in
 *  question.  The string data is made reachable to garbage collection by
 *  placing the token-related values in value stack entries allocated for
 *  this purpose by the caller.  The characters exist in Unicode code point
 *  form only in the fixed size lookup window, which keeps character data
 *  expansion (of especially ASCII data) low.
 *
 *  Token parsing supports the full range of Unicode characters as described
 *  in the E5 specification.  Parsing has been optimized for ASCII characters
 *  because ordinary ECMAScript code consists almost entirely of ASCII
 *  characters.  Matching of complex Unicode codepoint sets (such as in the
 *  IdentifierStart and IdentifierPart productions) is optimized for size,
 *  and is done using a linear scan of a bit-packed list of ranges.  This is
 *  very slow, but should never be entered unless the source code actually
 *  contains Unicode characters.
 *
 *  ECMAScript tokenization is partially context sensitive.  First,
 *  additional future reserved words are recognized in strict mode (see E5
 *  Section 7.6.1.2).  Second, a forward slash character ('/') can be
 *  recognized either as starting a RegExp literal or as a division operator,
 *  depending on context.  The caller must provide necessary context flags
 *  when requesting a new token.
 *
 *  Future work:
 *
 *    * Make line number tracking optional, as it consumes space.
 *
 *    * Add a feature flag for disabling UTF-8 decoding of input, as most
 *      source code is ASCII.  Because of Unicode escapes written in ASCII,
 *      this does not allow Unicode support to be removed from e.g.
 *      duk_unicode_is_identifier_start() nor does it allow removal of CESU-8
 *      encoding of e.g. string literals.
 *
 *    * Add a feature flag for disabling Unicode compliance of e.g. identifier
 *      names.  This allows for a build more than a kilobyte smaller, because
 *      Unicode ranges needed by duk_unicode_is_identifier_start() and
 *      duk_unicode_is_identifier_part() can be dropped.  String literals
 *      should still be allowed to contain escaped Unicode, so this still does
 *      not allow removal of CESU-8 encoding of e.g. string literals.
 *
 *    * Character lookup tables for codepoints above BMP could be stripped.
 *
 *    * Strictly speaking, E5 specification requires that source code consists
 *      of 16-bit code units, and if not, must be conceptually converted to
 *      that format first.  The current lexer processes Unicode code points
 *      and allows characters outside the BMP.  These should be converted to
 *      surrogate pairs while reading the source characters into the window,
 *      not after tokens have been formed (as is done now).  However, the fix
 *      is not trivial because two characters are decoded from one codepoint.
 *
 *    * Optimize for speed as well as size.  Large if-else ladders are (at
 *      least potentially) slow.
 */

#include "duk_internal.h"

/*
 *  Various defines and file specific helper macros
 */

#define DUK__MAX_RE_DECESC_DIGITS     9
#define DUK__MAX_RE_QUANT_DIGITS      9   /* Does not allow e.g. 2**31-1, but one more would allow overflows of u32. */

/* whether to use macros or helper function depends on call count */
#define DUK__ISDIGIT(x)          ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_9)
#define DUK__ISHEXDIGIT(x)       duk__is_hex_digit((x))
#define DUK__ISOCTDIGIT(x)       ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_7)
#define DUK__ISDIGIT03(x)        ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_3)
#define DUK__ISDIGIT47(x)        ((x) >= DUK_ASC_4 && (x) <= DUK_ASC_7)

/* lexer character window helpers */
#define DUK__LOOKUP(lex_ctx,idx)            ((lex_ctx)->window[(idx)].codepoint)
#define DUK__ADVANCECHARS(lex_ctx,count)    duk__advance_chars((lex_ctx), (count))
#define DUK__ADVANCEBYTES(lex_ctx,count)    duk__advance_bytes((lex_ctx), (count))
#define DUK__INITBUFFER(lex_ctx)            duk__initbuffer((lex_ctx))
#define DUK__APPENDBUFFER(lex_ctx,x)        duk__appendbuffer((lex_ctx), (duk_codepoint_t) (x))
#define DUK__APPENDBUFFER_ASCII(lex_ctx,x)  duk__appendbuffer_ascii((lex_ctx), (duk_codepoint_t) (x))

/* lookup shorthands (note: assume context variable is named 'lex_ctx') */
#define DUK__L0()  DUK__LOOKUP(lex_ctx, 0)
#define DUK__L1()  DUK__LOOKUP(lex_ctx, 1)
#define DUK__L2()  DUK__LOOKUP(lex_ctx, 2)
#define DUK__L3()  DUK__LOOKUP(lex_ctx, 3)
#define DUK__L4()  DUK__LOOKUP(lex_ctx, 4)
#define DUK__L5()  DUK__LOOKUP(lex_ctx, 5)

/* packed advance/token number macro used by multiple functions */
#define DUK__ADVTOK(advbytes,tok)  ((((advbytes) * sizeof(duk_lexer_codepoint)) << 8) + (tok))

/*
 *  Advance lookup window by N characters, filling in new characters as
 *  necessary.  After returning caller is guaranteed a character window of
 *  at least DUK_LEXER_WINDOW_SIZE characters.
 *
 *  The main function duk__advance_bytes() is called at least once per every
 *  token so it has a major lexer/compiler performance impact.  There are two
 *  variants for the main duk__advance_bytes() algorithm: a sliding window
 *  approach which is slightly faster at the cost of larger code footprint,
 *  and a simple copying one.
 *
 *  Decoding directly from the source string would be another lexing option.
 *  But the lookup window based approach has the advantage of hiding the
 *  source string and its encoding effectively which gives more flexibility
 *  going forward to e.g. support chunked streaming of source from flash.
 *
 *  Decodes UTF-8/CESU-8 leniently with support for code points from U+0000 to
 *  U+10FFFF, causing an error if the input is unparseable.  Leniency means:
 *
 *    * Unicode code point validation is intentionally not performed,
 *      except to check that the codepoint does not exceed 0x10ffff.
 *
 *    * In particular, surrogate pairs are allowed and not combined, which
 *      allows source files to represent all SourceCharacters with CESU-8.
 *      Broken surrogate pairs are allowed, as ECMAScript does not mandate
 *      their validation.
 *
 *    * Allow non-shortest UTF-8 encodings.
 *
 *  Leniency here causes few security concerns because all character data is
 *  decoded into Unicode codepoints before lexer processing, and is then
 *  re-encoded into CESU-8.  The source can be parsed as strict UTF-8 with
 *  a compiler option.  However, ECMAScript source characters include -all-
 *  16-bit unsigned integer codepoints, so leniency seems to be appropriate.
 *
 *  Note that codepoints above the BMP are not strictly SourceCharacters,
 *  but the lexer still accepts them as such.  Before ending up in a string
 *  or an identifier name, codepoints above BMP are converted into surrogate
 *  pairs and then CESU-8 encoded, resulting in 16-bit Unicode data as
 *  expected by ECMAScript.
 *
 *  An alternative approach to dealing with invalid or partial sequences
 *  would be to skip them and replace them with e.g. the Unicode replacement
 *  character U+FFFD.  This has limited utility because a replacement character
 *  will most likely cause a parse error, unless it occurs inside a string.
 *  Further, ECMAScript source is typically pure ASCII.
 *
 *  See:
 *
 *     http://en.wikipedia.org/wiki/UTF-8
 *     http://en.wikipedia.org/wiki/CESU-8
 *     http://tools.ietf.org/html/rfc3629
 *     http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
 *
 *  Future work:
 *
 *    * Reject other invalid Unicode sequences (see Wikipedia entry for examples)
 *      in strict UTF-8 mode.
 *
 *    * Size optimize.  An attempt to use a 16-byte lookup table for the first
 *      byte resulted in a code increase though.
 *
 *    * Is checking against maximum 0x10ffff really useful?  4-byte encoding
 *      imposes a certain limit anyway.
 *
 *    * Support chunked streaming of source code.  Can be implemented either
 *      by streaming chunks of bytes or chunks of codepoints.
 */

#if defined(DUK_USE_LEXER_SLIDING_WINDOW)
DUK_LOCAL void duk__fill_lexer_buffer(duk_lexer_ctx *lex_ctx, duk_small_uint_t start_offset_bytes) {
	duk_lexer_codepoint *cp, *cp_end;
	duk_ucodepoint_t x;
	duk_small_uint_t contlen;
	const duk_uint8_t *p, *p_end;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
	duk_ucodepoint_t mincp;
#endif
	duk_int_t input_line;

	/* Use temporaries and update lex_ctx only when finished. */
	input_line = lex_ctx->input_line;
	p = lex_ctx->input + lex_ctx->input_offset;
	p_end = lex_ctx->input + lex_ctx->input_length;

	cp = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->buffer + start_offset_bytes);
	cp_end = lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE;

	for (; cp != cp_end; cp++) {
		cp->offset = (duk_size_t) (p - lex_ctx->input);
		cp->line = input_line;

		/* XXX: potential issue with signed pointers, p_end < p. */
		if (DUK_UNLIKELY(p >= p_end)) {
			/* If input_offset were assigned a negative value, it would
			 * result in a large positive value.  Most likely it would be
			 * larger than input_length and be caught here.  In any case
			 * no memory unsafe behavior would happen.
			 */
			cp->codepoint = -1;
			continue;
		}

		x = (duk_ucodepoint_t) (*p++);

		/* Fast path. */

		if (DUK_LIKELY(x < 0x80UL)) {
			DUK_ASSERT(x != 0x2028UL && x != 0x2029UL);  /* not LS/PS */
			if (DUK_UNLIKELY(x <= 0x000dUL)) {
				if ((x == 0x000aUL) ||
				    ((x == 0x000dUL) && (p >= p_end || *p != 0x000aUL))) {
					/* lookup for 0x000a above assumes shortest encoding now */

					/* E5 Section 7.3, treat the following as newlines:
					 *   LF
					 *   CR [not followed by LF]
					 *   LS
					 *   PS
					 *
					 * For CR LF, CR is ignored if it is followed by LF, and the LF will bump
					 * the line number.
					 */
					input_line++;
				}
			}

			cp->codepoint = (duk_codepoint_t) x;
			continue;
		}

		/* Slow path. */

		if (x < 0xc0UL) {
			/* 10xx xxxx -> invalid */
			goto error_encoding;
		} else if (x < 0xe0UL) {
			/* 110x xxxx   10xx xxxx  */
			contlen = 1;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
			mincp = 0x80UL;
#endif
			x = x & 0x1fUL;
		} else if (x < 0xf0UL) {
			/* 1110 xxxx   10xx xxxx   10xx xxxx */
			contlen = 2;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
			mincp = 0x800UL;
#endif
			x = x & 0x0fUL;
		} else if (x < 0xf8UL) {
			/* 1111 0xxx   10xx xxxx   10xx xxxx   10xx xxxx */
			contlen = 3;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
			mincp = 0x10000UL;
#endif
			x = x & 0x07UL;
		} else {
			/* no point in supporting encodings of 5 or more bytes */
			goto error_encoding;
		}

		DUK_ASSERT(p_end >= p);
		if ((duk_size_t) contlen > (duk_size_t) (p_end - p)) {
			goto error_clipped;
		}

		while (contlen > 0) {
			duk_small_uint_t y;
			y = *p++;
			if ((y & 0xc0U) != 0x80U) {
				/* check that byte has the form 10xx xxxx */
				goto error_encoding;
			}
			x = x << 6;
			x += y & 0x3fUL;
			contlen--;
		}

		/* check final character validity */

		if (x > 0x10ffffUL) {
			goto error_encoding;
		}
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
		if (x < mincp || (x >= 0xd800UL && x <= 0xdfffUL) || x == 0xfffeUL) {
			goto error_encoding;
		}
#endif

		DUK_ASSERT(x != 0x000aUL && x != 0x000dUL);
		if ((x == 0x2028UL) || (x == 0x2029UL)) {
			input_line++;
		}

		cp->codepoint = (duk_codepoint_t) x;
	}

	lex_ctx->input_offset = (duk_size_t) (p - lex_ctx->input);
	lex_ctx->input_line = input_line;
	return;

 error_clipped:   /* clipped codepoint */
 error_encoding:  /* invalid codepoint encoding or codepoint */
	lex_ctx->input_offset = (duk_size_t) (p - lex_ctx->input);
	lex_ctx->input_line = input_line;

	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_SOURCE_DECODE_FAILED);
	DUK_WO_NORETURN(return;);
}

DUK_LOCAL void duk__advance_bytes(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_bytes) {
	duk_small_uint_t used_bytes, avail_bytes;

	DUK_ASSERT_DISABLE(count_bytes >= 0);  /* unsigned */
	DUK_ASSERT(count_bytes <= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint)));
	DUK_ASSERT(lex_ctx->window >= lex_ctx->buffer);
	DUK_ASSERT(lex_ctx->window < lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE);
	DUK_ASSERT((duk_uint8_t *) lex_ctx->window + count_bytes <= (duk_uint8_t *) lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE * sizeof(duk_lexer_codepoint));

	/* Zero 'count' is also allowed to make call sites easier.
	 * Arithmetic in bytes generates better code in GCC.
	 */

	lex_ctx->window = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->window + count_bytes);  /* avoid multiply */
	used_bytes = (duk_small_uint_t) ((duk_uint8_t *) lex_ctx->window - (duk_uint8_t *) lex_ctx->buffer);
	avail_bytes = DUK_LEXER_BUFFER_SIZE * sizeof(duk_lexer_codepoint) - used_bytes;
	if (avail_bytes < (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint))) {
		/* Not enough data to provide a full window, so "scroll" window to
		 * start of buffer and fill up the rest.
		 */
		duk_memmove((void *) lex_ctx->buffer,
		            (const void *) lex_ctx->window,
		            (size_t) avail_bytes);
		lex_ctx->window = lex_ctx->buffer;
		duk__fill_lexer_buffer(lex_ctx, avail_bytes);
	}
}

DUK_LOCAL void duk__init_lexer_window(duk_lexer_ctx *lex_ctx) {
	lex_ctx->window = lex_ctx->buffer;
	duk__fill_lexer_buffer(lex_ctx, 0);
}
#else  /* DUK_USE_LEXER_SLIDING_WINDOW */
DUK_LOCAL duk_codepoint_t duk__read_char(duk_lexer_ctx *lex_ctx) {
	duk_ucodepoint_t x;
	duk_small_uint_t len;
	duk_small_uint_t i;
	const duk_uint8_t *p;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
	duk_ucodepoint_t mincp;
#endif
	duk_size_t input_offset;

	input_offset = lex_ctx->input_offset;
	if (DUK_UNLIKELY(input_offset >= lex_ctx->input_length)) {
		/* If input_offset were assigned a negative value, it would
		 * result in a large positive value.  Most likely it would be
		 * larger than input_length and be caught here.  In any case
		 * no memory unsafe behavior would happen.
		 */
		return -1;
	}

	p = lex_ctx->input + input_offset;
	x = (duk_ucodepoint_t) (*p);

	if (DUK_LIKELY(x < 0x80UL)) {
		/* 0xxx xxxx -> fast path */

		/* input offset tracking */
		lex_ctx->input_offset++;

		DUK_ASSERT(x != 0x2028UL && x != 0x2029UL);  /* not LS/PS */
		if (DUK_UNLIKELY(x <= 0x000dUL)) {
			if ((x == 0x000aUL) ||
			    ((x == 0x000dUL) && (lex_ctx->input_offset >= lex_ctx->input_length ||
			                         lex_ctx->input[lex_ctx->input_offset] != 0x000aUL))) {
				/* lookup for 0x000a above assumes shortest encoding now */

				/* E5 Section 7.3, treat the following as newlines:
				 *   LF
				 *   CR [not followed by LF]
				 *   LS
				 *   PS
				 *
				 * For CR LF, CR is ignored if it is followed by LF, and the LF will bump
				 * the line number.
				 */
				lex_ctx->input_line++;
			}
		}

		return (duk_codepoint_t) x;
	}

	/* Slow path. */

	if (x < 0xc0UL) {
		/* 10xx xxxx -> invalid */
		goto error_encoding;
	} else if (x < 0xe0UL) {
		/* 110x xxxx   10xx xxxx  */
		len = 2;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
		mincp = 0x80UL;
#endif
		x = x & 0x1fUL;
	} else if (x < 0xf0UL) {
		/* 1110 xxxx   10xx xxxx   10xx xxxx */
		len = 3;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
		mincp = 0x800UL;
#endif
		x = x & 0x0fUL;
	} else if (x < 0xf8UL) {
		/* 1111 0xxx   10xx xxxx   10xx xxxx   10xx xxxx */
		len = 4;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
		mincp = 0x10000UL;
#endif
		x = x & 0x07UL;
	} else {
		/* no point in supporting encodings of 5 or more bytes */
		goto error_encoding;
	}

	DUK_ASSERT(lex_ctx->input_length >= lex_ctx->input_offset);
	if ((duk_size_t) len > (duk_size_t) (lex_ctx->input_length - lex_ctx->input_offset)) {
		goto error_clipped;
	}

	p++;
	for (i = 1; i < len; i++) {
		duk_small_uint_t y;
		y = *p++;
		if ((y & 0xc0U) != 0x80U) {
			/* check that byte has the form 10xx xxxx */
			goto error_encoding;
		}
		x = x << 6;
		x += y & 0x3fUL;
	}

	/* check final character validity */

	if (x > 0x10ffffUL) {
		goto error_encoding;
	}
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
	if (x < mincp || (x >= 0xd800UL && x <= 0xdfffUL) || x == 0xfffeUL) {
		goto error_encoding;
	}
#endif

	/* input offset tracking */
	lex_ctx->input_offset += len;

	/* line tracking */
	DUK_ASSERT(x != 0x000aUL && x != 0x000dUL);
	if ((x == 0x2028UL) || (x == 0x2029UL)) {
		lex_ctx->input_line++;
	}

	return (duk_codepoint_t) x;

 error_clipped:   /* clipped codepoint */
 error_encoding:  /* invalid codepoint encoding or codepoint */
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_SOURCE_DECODE_FAILED);
	DUK_WO_NORETURN(return 0;);
}

DUK_LOCAL void duk__advance_bytes(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_bytes) {
	duk_small_uint_t keep_bytes;
	duk_lexer_codepoint *cp, *cp_end;

	DUK_ASSERT_DISABLE(count_bytes >= 0);  /* unsigned */
	DUK_ASSERT(count_bytes <= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint)));

	/* Zero 'count' is also allowed to make call sites easier. */

	keep_bytes = DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint) - count_bytes;
	duk_memmove((void *) lex_ctx->window,
	            (const void *) ((duk_uint8_t *) lex_ctx->window + count_bytes),
	            (size_t) keep_bytes);

	cp = (duk_lexer_codepoint *) ((duk_uint8_t *) lex_ctx->window + keep_bytes);
	cp_end = lex_ctx->window + DUK_LEXER_WINDOW_SIZE;
	for (; cp != cp_end; cp++) {
		cp->offset = lex_ctx->input_offset;
		cp->line = lex_ctx->input_line;
		cp->codepoint = duk__read_char(lex_ctx);
	}
}

DUK_LOCAL void duk__init_lexer_window(duk_lexer_ctx *lex_ctx) {
	/* Call with count == DUK_LEXER_WINDOW_SIZE to fill buffer initially. */
	duk__advance_bytes(lex_ctx, DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint));  /* fill window */
}
#endif  /* DUK_USE_LEXER_SLIDING_WINDOW */

DUK_LOCAL void duk__advance_chars(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_chars) {
	duk__advance_bytes(lex_ctx, count_chars * sizeof(duk_lexer_codepoint));
}

/*
 *  (Re)initialize the temporary byte buffer.  May be called extra times
 *  with little impact.
 */

DUK_LOCAL void duk__initbuffer(duk_lexer_ctx *lex_ctx) {
	/* Reuse buffer as is unless buffer has grown large. */
	if (DUK_HBUFFER_DYNAMIC_GET_SIZE(lex_ctx->buf) < DUK_LEXER_TEMP_BUF_LIMIT) {
		/* Keep current size */
	} else {
		duk_hbuffer_resize(lex_ctx->thr, lex_ctx->buf, DUK_LEXER_TEMP_BUF_LIMIT);
	}

	DUK_BW_INIT_WITHBUF(lex_ctx->thr, &lex_ctx->bw, lex_ctx->buf);
}

/*
 *  Append a Unicode codepoint to the temporary byte buffer.  Performs
 *  CESU-8 surrogate pair encoding for codepoints above the BMP.
 *  Existing surrogate pairs are allowed and also encoded into CESU-8.
 */

DUK_LOCAL void duk__appendbuffer(duk_lexer_ctx *lex_ctx, duk_codepoint_t x) {
	/*
	 *  Since character data is only generated by decoding the source or by
	 *  the compiler itself, we rely on the input codepoints being correct
	 *  and avoid a check here.
	 *
	 *  Character data can also come here through decoding of Unicode
	 *  escapes ("\udead\ubeef") so all 16-but unsigned values can be
	 *  present, even when the source file itself is strict UTF-8.
	 */
	DUK_ASSERT(x >= 0 && x <= 0x10ffffL);

	DUK_BW_WRITE_ENSURE_CESU8(lex_ctx->thr, &lex_ctx->bw, (duk_ucodepoint_t) x);
}

DUK_LOCAL void duk__appendbuffer_ascii(duk_lexer_ctx *lex_ctx, duk_codepoint_t x) {
	/* ASCII characters can be emitted as a single byte without encoding
	 * which matters for some fast paths.
	 */
	DUK_ASSERT(x >= 0 && x <= 0x7f);

	DUK_BW_WRITE_ENSURE_U8(lex_ctx->thr, &lex_ctx->bw, (duk_uint8_t) x);
}

/*
 *  Intern the temporary byte buffer into a valstack slot
 *  (in practice, slot1 or slot2).
 */

DUK_LOCAL duk_hstring *duk__internbuffer(duk_lexer_ctx *lex_ctx, duk_idx_t valstack_idx) {
	DUK_ASSERT(valstack_idx == lex_ctx->slot1_idx || valstack_idx == lex_ctx->slot2_idx);

	DUK_BW_PUSH_AS_STRING(lex_ctx->thr, &lex_ctx->bw);
	duk_replace(lex_ctx->thr, valstack_idx);
	return duk_known_hstring(lex_ctx->thr, valstack_idx);
}

/*
 *  Init lexer context
 */

DUK_INTERNAL void duk_lexer_initctx(duk_lexer_ctx *lex_ctx) {
	DUK_ASSERT(lex_ctx != NULL);

	duk_memzero(lex_ctx, sizeof(*lex_ctx));
#if defined(DUK_USE_EXPLICIT_NULL_INIT)
#if defined(DUK_USE_LEXER_SLIDING_WINDOW)
	lex_ctx->window = NULL;
#endif
	lex_ctx->thr = NULL;
	lex_ctx->input = NULL;
	lex_ctx->buf = NULL;
#endif
}

/*
 *  Set lexer input position and reinitialize lookup window.
 */

DUK_INTERNAL void duk_lexer_getpoint(duk_lexer_ctx *lex_ctx, duk_lexer_point *pt) {
	pt->offset = lex_ctx->window[0].offset;
	pt->line = lex_ctx->window[0].line;
}

DUK_INTERNAL void duk_lexer_setpoint(duk_lexer_ctx *lex_ctx, duk_lexer_point *pt) {
	DUK_ASSERT_DISABLE(pt->offset >= 0);  /* unsigned */
	DUK_ASSERT(pt->line >= 1);
	lex_ctx->input_offset = pt->offset;
	lex_ctx->input_line = pt->line;
	duk__init_lexer_window(lex_ctx);
}

/*
 *  Lexing helpers
 */

/* Numeric value of a hex digit (also covers octal and decimal digits) or
 * -1 if not a valid hex digit.
 */
DUK_LOCAL duk_codepoint_t duk__hexval_validate(duk_codepoint_t x) {
	duk_small_int_t t;

	/* Here 'x' is a Unicode codepoint */
	if (DUK_LIKELY(x >= 0 && x <= 0xff)) {
		t = duk_hex_dectab[x];
		if (DUK_LIKELY(t >= 0)) {
			return t;
		}
	}

	return -1;
}

/* Just a wrapper for call sites where 'x' is known to be valid so
 * we assert for it before decoding.
 */
DUK_LOCAL duk_codepoint_t duk__hexval(duk_codepoint_t x) {
	duk_codepoint_t ret;

	DUK_ASSERT((x >= DUK_ASC_0 && x <= DUK_ASC_9) ||
	           (x >= DUK_ASC_LC_A && x <= DUK_ASC_LC_F) ||
	           (x >= DUK_ASC_UC_A && x <= DUK_ASC_UC_F));
	ret = duk__hexval_validate(x);
	DUK_ASSERT(ret >= 0 && ret <= 15);
	return ret;
}

/* having this as a separate function provided a size benefit */
DUK_LOCAL duk_bool_t duk__is_hex_digit(duk_codepoint_t x) {
	if (DUK_LIKELY(x >= 0 && x <= 0xff)) {
		return (duk_hex_dectab[x] >= 0);
	}
	return 0;
}

/* Parse a Unicode escape of the form \xHH, \uHHHH, or \u{H+}.  Shared by
 * source and RegExp parsing.
 */
DUK_LOCAL duk_codepoint_t duk__lexer_parse_escape(duk_lexer_ctx *lex_ctx, duk_bool_t allow_es6) {
	duk_small_int_t digits;  /* Initial value 2 or 4 for fixed length escapes, 0 for ES2015 \u{H+}. */
	duk_codepoint_t escval;
	duk_codepoint_t x;
	duk_small_uint_t adv;

	DUK_ASSERT(DUK__L0() == DUK_ASC_BACKSLASH);  /* caller responsibilities */
	DUK_ASSERT(DUK__L1() == DUK_ASC_LC_X || DUK__L1() == DUK_ASC_LC_U);
	DUK_UNREF(allow_es6);

	adv = 2;
	digits = 2;
	if (DUK__L1() == DUK_ASC_LC_U) {
		digits = 4;
#if defined(DUK_USE_ES6_UNICODE_ESCAPE)
		if (DUK__L2() == DUK_ASC_LCURLY && allow_es6) {
			digits = 0;
			adv = 3;
		}
#endif
	}
	DUK__ADVANCECHARS(lex_ctx, adv);

	escval = 0;
	for (;;) {
		/* One of the escape forms: \xHH, \uHHHH, \u{H+}.
		 * The 'digits' variable tracks parsing state and is
		 * initialized to:
		 *
		 *   \xHH     2
		 *   \uHH     4
		 *   \u{H+}   0 first time, updated to -1 to indicate
		 *            at least one digit has been parsed
		 *
		 * Octal parsing is handled separately because it can be
		 * done with fixed lookahead and also has validation
		 * rules which depend on the escape length (which is
		 * variable).
		 *
		 * We don't need a specific check for x < 0 (end of
		 * input) or duk_unicode_is_line_terminator(x)
		 * because the 'dig' decode will fail and lead to a
		 * SyntaxError.
		 */
		duk_codepoint_t dig;

		x = DUK__L0();
		DUK__ADVANCECHARS(lex_ctx, 1);

		dig = duk__hexval_validate(x);
		if (digits > 0) {
			digits--;
			if (dig < 0) {
				goto fail_escape;
			}
			DUK_ASSERT(dig >= 0x00 && dig <= 0x0f);
			escval = (escval << 4) + dig;
			if (digits == 0) {
				DUK_ASSERT(escval >= 0 && escval <= 0xffffL);
				break;
			}
		} else {
#if defined(DUK_USE_ES6_UNICODE_ESCAPE)
			DUK_ASSERT(digits == 0 /* first time */ || digits == -1 /* others */);
			if (dig >= 0) {
				DUK_ASSERT(dig >= 0x00 && dig <= 0x0f);
				escval = (escval << 4) + dig;
				if (escval > 0x10ffffL) {
					goto fail_escape;
				}
			} else if (x == DUK_ASC_RCURLY) {
				if (digits == 0) {
					/* Empty escape, \u{}. */
					goto fail_escape;
				}
				DUK_ASSERT(escval >= 0 && escval <= 0x10ffffL);
				break;
			} else {
				goto fail_escape;
			}
			digits = -1;  /* Indicate we have at least one digit. */
#else  /* DUK_USE_ES6_UNICODE_ESCAPE */
			DUK_ASSERT(0);  /* Never happens if \u{H+} support disabled. */
#endif  /* DUK_USE_ES6_UNICODE_ESCAPE */
		}
	}

	return escval;

 fail_escape:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_ESCAPE);
	DUK_WO_NORETURN(return 0;);
}

/* Parse legacy octal escape of the form \N{1,3}, e.g. \0, \5, \0377.  Maximum
 * allowed value is \0377 (U+00FF), longest match is used.  Used for both string
 * RegExp octal escape parsing.  Window[0] must be the slash '\' and the first
 * digit must already be validated to be in [0-9] by the caller.
 */
DUK_LOCAL duk_codepoint_t duk__lexer_parse_legacy_octal(duk_lexer_ctx *lex_ctx, duk_small_uint_t *out_adv, duk_bool_t reject_annex_b) {
	duk_codepoint_t cp;
	duk_small_uint_t lookup_idx;
	duk_small_uint_t adv;
	duk_codepoint_t tmp;

	DUK_ASSERT(out_adv != NULL);
	DUK_ASSERT(DUK__LOOKUP(lex_ctx, 0) == DUK_ASC_BACKSLASH);
	DUK_ASSERT(DUK__LOOKUP(lex_ctx, 1) >= DUK_ASC_0 && DUK__LOOKUP(lex_ctx, 1) <= DUK_ASC_9);

	cp = 0;
	tmp = 0;
	for (lookup_idx = 1; lookup_idx <= 3; lookup_idx++) {
		DUK_DDD(DUK_DDDPRINT("lookup_idx=%ld, cp=%ld", (long) lookup_idx, (long) cp));
		tmp = DUK__LOOKUP(lex_ctx, lookup_idx);
		if (tmp < DUK_ASC_0 || tmp > DUK_ASC_7) {
			/* No more valid digits. */
			break;
		}
		tmp = (cp << 3) + (tmp - DUK_ASC_0);
		if (tmp > 0xff) {
			/* Three digit octal escapes above \377 (= 0xff)
			 * are not allowed.
			 */
			break;
		}
		cp = tmp;
	}
	DUK_DDD(DUK_DDDPRINT("final lookup_idx=%ld, cp=%ld", (long) lookup_idx, (long) cp));

	adv = lookup_idx;
	if (lookup_idx == 1) {
		DUK_DDD(DUK_DDDPRINT("\\8 or \\9 -> treat as literal, accept in strict mode too"));
		DUK_ASSERT(tmp == DUK_ASC_8 || tmp == DUK_ASC_9);
		cp = tmp;
		adv++;  /* correction to above, eat offending character */
	} else if (lookup_idx == 2 && cp == 0) {
		/* Note: 'foo\0bar' is OK in strict mode, but 'foo\00bar' is not.
		 * It won't be interpreted as 'foo\u{0}0bar' but as a SyntaxError.
		 */
		DUK_DDD(DUK_DDDPRINT("\\0 -> accept in strict mode too"));
	} else {
		/* This clause also handles non-shortest zero, e.g. \00. */
		if (reject_annex_b) {
			DUK_DDD(DUK_DDDPRINT("non-zero octal literal %ld -> reject in strict-mode", (long) cp));
			cp = -1;
		} else {
			DUK_DDD(DUK_DDDPRINT("non-zero octal literal %ld -> accepted", (long) cp));
			DUK_ASSERT(cp >= 0 && cp <= 0xff);
		}
	}

	*out_adv = adv;

	DUK_ASSERT((cp >= 0 && cp <= 0xff) || (cp == -1 && reject_annex_b));
	return cp;
}

/* XXX: move strict mode to lex_ctx? */
DUK_LOCAL void duk__lexer_parse_string_literal(duk_lexer_ctx *lex_ctx, duk_token *out_token, duk_small_int_t quote, duk_bool_t strict_mode) {
	duk_small_uint_t adv;

	for (adv = 1 /* initial quote */ ;;) {
		duk_codepoint_t x;

		DUK__ADVANCECHARS(lex_ctx, adv);  /* eat opening quote on first loop */
		x = DUK__L0();

		adv = 1;
		if (x == quote) {
			DUK__ADVANCECHARS(lex_ctx, 1);  /* eat closing quote */
			break;
		} else if (x == '\\') {
			/* DUK__L0        -> '\' char
			 * DUK__L1 ... DUK__L5 -> more lookup
			 */
			duk_small_int_t emitcp = -1;

			x = DUK__L1();

			/* How much to advance before next loop. */
			adv = 2;  /* note: long live range */

			switch (x) {
			case '\'':
				emitcp = 0x0027;
				break;
			case '"':
				emitcp = 0x0022;
				break;
			case '\\':
				emitcp = 0x005c;
				break;
			case 'b':
				emitcp = 0x0008;
				break;
			case 'f':
				emitcp = 0x000c;
				break;
			case 'n':
				emitcp = 0x000a;
				break;
			case 'r':
				emitcp = 0x000d;
				break;
			case 't':
				emitcp = 0x0009;
				break;
			case 'v':
				emitcp = 0x000b;
				break;
			case 'x':
			case 'u': {
				duk_codepoint_t esc_cp;
				esc_cp = duk__lexer_parse_escape(lex_ctx, 1 /*allow_es6*/);
				DUK__APPENDBUFFER(lex_ctx, esc_cp);
				adv = 0;
				break;
			}
			default: {
				if (duk_unicode_is_line_terminator(x)) {
					/* line continuation */
					if (x == 0x000d && DUK__L2() == 0x000a) {
						/* CR LF again a special case */
						adv = 3;  /* line terminator, CR, LF */
					}
				} else if (DUK__ISDIGIT(x)) {
					/*
					 *  Octal escape or zero escape:
					 *    \0                                     (lookahead not OctalDigit)
					 *    \1 ... \7                              (lookahead not OctalDigit)
					 *    \ZeroToThree OctalDigit                (lookahead not OctalDigit)
					 *    \FourToSeven OctalDigit                (no lookahead restrictions)
					 *    \ZeroToThree OctalDigit OctalDigit     (no lookahead restrictions)
					 *
					 *  Zero escape is part of the standard syntax.  Octal escapes are
					 *  defined in E5 Section B.1.2, and are only allowed in non-strict mode.
					 *  Any other productions starting with a decimal digit are invalid
					 *  but are in practice treated like identity escapes.
					 *
					 *  Parse octal (up to 3 digits) from the lookup window.
					 */

					emitcp = duk__lexer_parse_legacy_octal(lex_ctx, &adv, strict_mode /*reject_annex_b*/);
					if (emitcp < 0) {
						goto fail_escape;
					}
				} else if (x < 0) {
					goto fail_unterminated;
				} else {
					/* escaped NonEscapeCharacter */
					DUK__APPENDBUFFER(lex_ctx, x);
				}
			}  /* end default clause */
			}  /* end switch */

			/* Shared handling for single codepoint escapes. */
			if (emitcp >= 0) {
				DUK__APPENDBUFFER(lex_ctx, emitcp);
			}

			/* Track number of escapes; count not really needed but directive
			 * prologues need to detect whether there were any escapes or line
			 * continuations or not.
			 */
			out_token->num_escapes++;
		} else if (x >= 0x20 && x <= 0x7f) {
			/* Fast path for ASCII case, avoids line terminator
			 * check and CESU-8 encoding.
			 */
			DUK_ASSERT(x >= 0);
			DUK_ASSERT(!duk_unicode_is_line_terminator(x));
			DUK_ASSERT(x != quote);
			DUK_ASSERT(x != DUK_ASC_BACKSLASH);
			DUK__APPENDBUFFER_ASCII(lex_ctx, x);
		} else if (x < 0 || duk_unicode_is_line_terminator(x)) {
			goto fail_unterminated;
		} else {
			/* Character which is part of the string but wasn't handled
			 * by the fast path.
			 */
			DUK__APPENDBUFFER(lex_ctx, x);
		}
	} /* string parse loop */

	return;

 fail_escape:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_ESCAPE);
	DUK_WO_NORETURN(return;);

 fail_unterminated:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_UNTERMINATED_STRING);
	DUK_WO_NORETURN(return;);
}

/* Skip to end-of-line (or end-of-file), used for single line comments. */
DUK_LOCAL void duk__lexer_skip_to_endofline(duk_lexer_ctx *lex_ctx) {
	for (;;) {
		duk_codepoint_t x;

		x = DUK__L0();
		if (x < 0 || duk_unicode_is_line_terminator(x)) {
			break;
		}
		DUK__ADVANCECHARS(lex_ctx, 1);
	}
}

/*
 *  Parse ECMAScript source InputElementDiv or InputElementRegExp
 *  (E5 Section 7), skipping whitespace, comments, and line terminators.
 *
 *  Possible results are:
 *    (1) a token
 *    (2) a line terminator (skipped)
 *    (3) a comment (skipped)
 *    (4) EOF
 *
 *  White space is automatically skipped from the current position (but
 *  not after the input element).  If input has already ended, returns
 *  DUK_TOK_EOF indefinitely.  If a parse error occurs, uses an DUK_ERROR()
 *  macro call (and hence a longjmp through current heap longjmp context).
 *  Comments and line terminator tokens are automatically skipped.
 *
 *  The input element being matched is determined by regexp_mode; if set,
 *  parses a InputElementRegExp, otherwise a InputElementDiv.  The
 *  difference between these are handling of productions starting with a
 *  forward slash.
 *
 *  If strict_mode is set, recognizes additional future reserved words
 *  specific to strict mode, and refuses to parse octal literals.
 *
 *  The matching strategy below is to (currently) use a six character
 *  lookup window to quickly determine which production is the -longest-
 *  matching one, and then parse that.  The top-level if-else clauses
 *  match the first character, and the code blocks for each clause
 *  handle -all- alternatives for that first character.  ECMAScript
 *  specification uses the "longest match wins" semantics, so the order
 *  of the if-clauses matters.
 *
 *  Misc notes:
 *
 *    * ECMAScript numeric literals do not accept a sign character.
 *      Consequently e.g. "-1.0" is parsed as two tokens: a negative
 *      sign and a positive numeric literal.  The compiler performs
 *      the negation during compilation, so this has no adverse impact.
 *
 *    * There is no token for "undefined": it is just a value available
 *      from the global object (or simply established by doing a reference
 *      to an undefined value).
 *
 *    * Some contexts want Identifier tokens, which are IdentifierNames
 *      excluding reserved words, while some contexts want IdentifierNames
 *      directly.  In the latter case e.g. "while" is interpreted as an
 *      identifier name, not a DUK_TOK_WHILE token.  The solution here is
 *      to provide both token types: DUK_TOK_WHILE goes to 't' while
 *      DUK_TOK_IDENTIFIER goes to 't_nores', and 'slot1' always contains
 *      the identifier / keyword name.
 *
 *    * Directive prologue needs to identify string literals such as
 *      "use strict" and 'use strict', which are sensitive to line
 *      continuations and escape sequences.  For instance, "use\u0020strict"
 *      is a valid directive but is distinct from "use strict".  The solution
 *      here is to decode escapes while tokenizing, but to keep track of the
 *      number of escapes.  Directive detection can then check that the
 *      number of escapes is zero.
 *
 *    * Multi-line comments with one or more internal LineTerminator are
 *      treated like a line terminator to comply with automatic semicolon
 *      insertion.
 */

DUK_INTERNAL
void duk_lexer_parse_js_input_element(duk_lexer_ctx *lex_ctx,
                                      duk_token *out_token,
                                      duk_bool_t strict_mode,
                                      duk_bool_t regexp_mode) {
	duk_codepoint_t x;           /* temporary, must be signed and 32-bit to hold Unicode code points */
	duk_small_uint_t advtok = 0; /* (advance << 8) + token_type, updated at function end,
	                              * init is unnecessary but suppresses "may be used uninitialized" warnings.
	                              */
	duk_bool_t got_lineterm = 0;  /* got lineterm preceding non-whitespace, non-lineterm token */

	if (++lex_ctx->token_count >= lex_ctx->token_limit) {
		goto fail_token_limit;
	}

	out_token->t = DUK_TOK_EOF;
	out_token->t_nores = DUK_TOK_INVALID;  /* marker: copy t if not changed */
#if 0  /* not necessary to init, disabled for faster parsing */
	out_token->num = DUK_DOUBLE_NAN;
	out_token->str1 = NULL;
	out_token->str2 = NULL;
#endif
	out_token->num_escapes = 0;
	/* out_token->lineterm set by caller */

	/* This would be nice, but parsing is faster without resetting the
	 * value slots.  The only side effect is that references to temporary
	 * string values may linger until lexing is finished; they're then
	 * freed normally.
	 */
#if 0
	duk_to_undefined(lex_ctx->thr, lex_ctx->slot1_idx);
	duk_to_undefined(lex_ctx->thr, lex_ctx->slot2_idx);
#endif

	/* 'advtok' indicates how much to advance and which token id to assign
	 * at the end.  This shared functionality minimizes code size.  All
	 * code paths are required to set 'advtok' to some value, so no default
	 * init value is used.  Code paths calling DUK_ERROR() never return so
	 * they don't need to set advtok.
	 */

	/*
	 *  Matching order:
	 *
	 *    Punctuator first chars, also covers comments, regexps
	 *    LineTerminator
	 *    Identifier or reserved word, also covers null/true/false literals
	 *    NumericLiteral
	 *    StringLiteral
	 *    EOF
	 *
	 *  The order does not matter as long as the longest match is
	 *  always correctly identified.  There are order dependencies
	 *  in the clauses, so it's not trivial to convert to a switch.
	 */

 restart_lineupdate:
	out_token->start_line = lex_ctx->window[0].line;

 restart:
	out_token->start_offset = lex_ctx->window[0].offset;

	x = DUK__L0();

	switch (x) {
	case DUK_ASC_SPACE:
	case DUK_ASC_HT:  /* fast paths for space and tab */
		DUK__ADVANCECHARS(lex_ctx, 1);
		goto restart;
	case DUK_ASC_LF:  /* LF line terminator; CR LF and Unicode lineterms are handled in slow path */
		DUK__ADVANCECHARS(lex_ctx, 1);
		got_lineterm = 1;
		goto restart_lineupdate;
#if defined(DUK_USE_SHEBANG_COMMENTS)
	case DUK_ASC_HASH:  /* '#' */
		if (DUK__L1() == DUK_ASC_EXCLAMATION && lex_ctx->window[0].offset == 0 &&
		    (lex_ctx->flags & DUK_COMPILE_SHEBANG)) {
			/* "Shebang" comment ('#! ...') on first line. */
			/* DUK__ADVANCECHARS(lex_ctx, 2) would be correct here, but not necessary */
			duk__lexer_skip_to_endofline(lex_ctx);
			goto restart;  /* line terminator will be handled on next round */
		}
		goto fail_token;
#endif  /* DUK_USE_SHEBANG_COMMENTS */
	case DUK_ASC_SLASH:  /* '/' */
		if (DUK__L1() == DUK_ASC_SLASH) {
			/*
			 *  E5 Section 7.4, allow SourceCharacter (which is any 16-bit
			 *  code point).
			 */

			/* DUK__ADVANCECHARS(lex_ctx, 2) would be correct here, but not necessary */
			duk__lexer_skip_to_endofline(lex_ctx);
			goto restart;  /* line terminator will be handled on next round */
		} else if (DUK__L1() == DUK_ASC_STAR) {
			/*
			 *  E5 Section 7.4.  If the multi-line comment contains a newline,
			 *  it is treated like a single line terminator for automatic
			 *  semicolon insertion.
			 */

			duk_bool_t last_asterisk = 0;
			DUK__ADVANCECHARS(lex_ctx, 2);
			for (;;) {
				x = DUK__L0();
				if (x < 0) {
					goto fail_unterm_comment;
				}
				DUK__ADVANCECHARS(lex_ctx, 1);
				if (last_asterisk && x == DUK_ASC_SLASH) {
					break;
				}
				if (duk_unicode_is_line_terminator(x)) {
					got_lineterm = 1;
				}
				last_asterisk = (x == DUK_ASC_STAR);
			}
			goto restart_lineupdate;
		} else if (regexp_mode) {
#if defined(DUK_USE_REGEXP_SUPPORT)
			/*
			 *  "/" followed by something in regexp mode.  See E5 Section 7.8.5.
			 *
			 *  RegExp parsing is a bit complex.  First, the regexp body is delimited
			 *  by forward slashes, but the body may also contain forward slashes as
			 *  part of an escape sequence or inside a character class (delimited by
			 *  square brackets).  A mini state machine is used to implement these.
			 *
			 *  Further, an early (parse time) error must be thrown if the regexp
			 *  would cause a run-time error when used in the expression new RegExp(...).
			 *  Parsing here simply extracts the (candidate) regexp, and also accepts
			 *  invalid regular expressions (which are delimited properly).  The caller
			 *  (compiler) must perform final validation and regexp compilation.
			 *
			 *  RegExp first char may not be '/' (single line comment) or '*' (multi-
			 *  line comment).  These have already been checked above, so there is no
			 *  need below for special handling of the first regexp character as in
			 *  the E5 productions.
			 *
			 *  About unicode escapes within regexp literals:
			 *
			 *      E5 Section 7.8.5 grammar does NOT accept \uHHHH escapes.
			 *      However, Section 6 states that regexps accept the escapes,
			 *      see paragraph starting with "In string literals...".
			 *      The regexp grammar, which sees the decoded regexp literal
			 *      (after lexical parsing) DOES have a \uHHHH unicode escape.
			 *      So, for instance:
			 *
			 *          /\u1234/
			 *
			 *      should first be parsed by the lexical grammar as:
			 *
			 *          '\' 'u'      RegularExpressionBackslashSequence
			 *          '1'          RegularExpressionNonTerminator
			 *          '2'          RegularExpressionNonTerminator
			 *          '3'          RegularExpressionNonTerminator
			 *          '4'          RegularExpressionNonTerminator
			 *
			 *      and the escape itself is then parsed by the regexp engine.
			 *      This is the current implementation.
			 *
			 *  Minor spec inconsistency:
			 *
			 *      E5 Section 7.8.5 RegularExpressionBackslashSequence is:
			 *
			 *         \ RegularExpressionNonTerminator
			 *
			 *      while Section A.1 RegularExpressionBackslashSequence is:
			 *
			 *         \ NonTerminator
			 *
			 *      The latter is not normative and a typo.
			 *
			 */

			/* first, parse regexp body roughly */

			duk_small_int_t state = 0;  /* 0=base, 1=esc, 2=class, 3=class+esc */

			DUK__INITBUFFER(lex_ctx);
			for (;;) {
				DUK__ADVANCECHARS(lex_ctx, 1);  /* skip opening slash on first loop */
				x = DUK__L0();
				if (x < 0 || duk_unicode_is_line_terminator(x)) {
					goto fail_unterm_regexp;
				}
				x = DUK__L0();  /* re-read to avoid spill / fetch */
				if (state == 0) {
					if (x == DUK_ASC_SLASH) {
						DUK__ADVANCECHARS(lex_ctx, 1);  /* eat closing slash */
						break;
					} else if (x == DUK_ASC_BACKSLASH) {
						state = 1;
					} else if (x == DUK_ASC_LBRACKET) {
						state = 2;
					}
				} else if (state == 1) {
					state = 0;
				} else if (state == 2) {
					if (x == DUK_ASC_RBRACKET) {
						state = 0;
					} else if (x == DUK_ASC_BACKSLASH) {
						state = 3;
					}
				} else { /* state == 3 */
					state = 2;
				}
				DUK__APPENDBUFFER(lex_ctx, x);
			}
			out_token->str1 = duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);

			/* second, parse flags */

			DUK__INITBUFFER(lex_ctx);
			for (;;) {
				x = DUK__L0();
				if (!duk_unicode_is_identifier_part(x)) {
					break;
				}
				x = DUK__L0();  /* re-read to avoid spill / fetch */
				DUK__APPENDBUFFER(lex_ctx, x);
				DUK__ADVANCECHARS(lex_ctx, 1);
			}
			out_token->str2 = duk__internbuffer(lex_ctx, lex_ctx->slot2_idx);

			DUK__INITBUFFER(lex_ctx);  /* free some memory */

			/* validation of the regexp is caller's responsibility */

			advtok = DUK__ADVTOK(0, DUK_TOK_REGEXP);
#else  /* DUK_USE_REGEXP_SUPPORT */
			goto fail_regexp_support;
#endif  /* DUK_USE_REGEXP_SUPPORT */
		} else if (DUK__L1() == DUK_ASC_EQUALS) {
			/* "/=" and not in regexp mode */
			advtok = DUK__ADVTOK(2, DUK_TOK_DIV_EQ);
		} else {
			/* "/" and not in regexp mode */
			advtok = DUK__ADVTOK(1, DUK_TOK_DIV);
		}
		break;
	case DUK_ASC_LCURLY:  /* '{' */
		advtok = DUK__ADVTOK(1, DUK_TOK_LCURLY);
		break;
	case DUK_ASC_RCURLY:  /* '}' */
		advtok = DUK__ADVTOK(1, DUK_TOK_RCURLY);
		break;
	case DUK_ASC_LPAREN:  /* '(' */
		advtok = DUK__ADVTOK(1, DUK_TOK_LPAREN);
		break;
	case DUK_ASC_RPAREN:  /* ')' */
		advtok = DUK__ADVTOK(1, DUK_TOK_RPAREN);
		break;
	case DUK_ASC_LBRACKET:  /* '[' */
		advtok = DUK__ADVTOK(1, DUK_TOK_LBRACKET);
		break;
	case DUK_ASC_RBRACKET:  /* ']' */
		advtok = DUK__ADVTOK(1, DUK_TOK_RBRACKET);
		break;
	case DUK_ASC_PERIOD:  /* '.' */
		if (DUK__ISDIGIT(DUK__L1())) {
			/* Period followed by a digit can only start DecimalLiteral
			 * (handled in slow path).  We could jump straight into the
			 * DecimalLiteral handling but should avoid goto to inside
			 * a block.
			 */
			goto slow_path;
		}
		advtok = DUK__ADVTOK(1, DUK_TOK_PERIOD);
		break;
	case DUK_ASC_SEMICOLON:  /* ';' */
		advtok = DUK__ADVTOK(1, DUK_TOK_SEMICOLON);
		break;
	case DUK_ASC_COMMA:  /* ',' */
		advtok = DUK__ADVTOK(1, DUK_TOK_COMMA);
		break;
	case DUK_ASC_LANGLE:  /* '<' */
#if defined(DUK_USE_HTML_COMMENTS)
		if (DUK__L1() == DUK_ASC_EXCLAMATION && DUK__L2() == DUK_ASC_MINUS && DUK__L3() == DUK_ASC_MINUS) {
			/*
			 *  ES2015: B.1.3, handle "<!--" SingleLineHTMLOpenComment
			 */

			/* DUK__ADVANCECHARS(lex_ctx, 4) would be correct here, but not necessary */
			duk__lexer_skip_to_endofline(lex_ctx);
			goto restart;  /* line terminator will be handled on next round */
		}
		else
#endif  /* DUK_USE_HTML_COMMENTS */
		if (DUK__L1() == DUK_ASC_LANGLE && DUK__L2() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(3, DUK_TOK_ALSHIFT_EQ);
		} else if (DUK__L1() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_LE);
		} else if (DUK__L1() == DUK_ASC_LANGLE) {
			advtok = DUK__ADVTOK(2, DUK_TOK_ALSHIFT);
		} else {
			advtok = DUK__ADVTOK(1, DUK_TOK_LT);
		}
		break;
	case DUK_ASC_RANGLE:  /* '>' */
		if (DUK__L1() == DUK_ASC_RANGLE && DUK__L2() == DUK_ASC_RANGLE && DUK__L3() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(4, DUK_TOK_RSHIFT_EQ);
		} else if (DUK__L1() == DUK_ASC_RANGLE && DUK__L2() == DUK_ASC_RANGLE) {
			advtok = DUK__ADVTOK(3, DUK_TOK_RSHIFT);
		} else if (DUK__L1() == DUK_ASC_RANGLE && DUK__L2() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(3, DUK_TOK_ARSHIFT_EQ);
		} else if (DUK__L1() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_GE);
		} else if (DUK__L1() == DUK_ASC_RANGLE) {
			advtok = DUK__ADVTOK(2, DUK_TOK_ARSHIFT);
		} else {
			advtok = DUK__ADVTOK(1, DUK_TOK_GT);
		}
		break;
	case DUK_ASC_EQUALS:  /* '=' */
		if (DUK__L1() == DUK_ASC_EQUALS && DUK__L2() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(3, DUK_TOK_SEQ);
		} else if (DUK__L1() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_EQ);
		} else {
			advtok = DUK__ADVTOK(1, DUK_TOK_EQUALSIGN);
		}
		break;
	case DUK_ASC_EXCLAMATION:  /* '!' */
		if (DUK__L1() == DUK_ASC_EQUALS && DUK__L2() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(3, DUK_TOK_SNEQ);
		} else if (DUK__L1() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_NEQ);
		} else {
			advtok = DUK__ADVTOK(1, DUK_TOK_LNOT);
		}
		break;
	case DUK_ASC_PLUS:  /* '+' */
		if (DUK__L1() == DUK_ASC_PLUS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_INCREMENT);
		} else if (DUK__L1() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_ADD_EQ);
		} else {
			advtok = DUK__ADVTOK(1, DUK_TOK_ADD);
		}
		break;
	case DUK_ASC_MINUS:  /* '-' */
#if defined(DUK_USE_HTML_COMMENTS)
		if (got_lineterm && DUK__L1() == DUK_ASC_MINUS && DUK__L2() == DUK_ASC_RANGLE) {
			/*
			 *  ES2015: B.1.3, handle "-->" SingleLineHTMLCloseComment
			 *  Only allowed:
			 *  - on new line
			 *  - preceded only by whitespace
			 *  - preceded by end of multiline comment and optional whitespace
			 *
			 * Since whitespace generates no tokens, and multiline comments
			 * are treated as a line ending, consulting `got_lineterm` is
			 * sufficient to test for these three options.
			 */

			/* DUK__ADVANCECHARS(lex_ctx, 3) would be correct here, but not necessary */
			duk__lexer_skip_to_endofline(lex_ctx);
			goto restart;  /* line terminator will be handled on next round */
		} else
#endif  /* DUK_USE_HTML_COMMENTS */
		if (DUK__L1() == DUK_ASC_MINUS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_DECREMENT);
		} else if (DUK__L1() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_SUB_EQ);
		} else {
			advtok = DUK__ADVTOK(1, DUK_TOK_SUB);
		}
		break;
	case DUK_ASC_STAR:  /* '*' */
#if defined(DUK_USE_ES7_EXP_OPERATOR)
		if (DUK__L1() == DUK_ASC_STAR && DUK__L2() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(3, DUK_TOK_EXP_EQ);
		} else if (DUK__L1() == DUK_ASC_STAR) {
			advtok = DUK__ADVTOK(2, DUK_TOK_EXP);
		} else
#endif
		if (DUK__L1() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_MUL_EQ);
		} else {
			advtok = DUK__ADVTOK(1, DUK_TOK_MUL);
		}
		break;
	case DUK_ASC_PERCENT:  /* '%' */
		if (DUK__L1() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_MOD_EQ);
		} else {
			advtok = DUK__ADVTOK(1, DUK_TOK_MOD);
		}
		break;
	case DUK_ASC_AMP:  /* '&' */
		if (DUK__L1() == DUK_ASC_AMP) {
			advtok = DUK__ADVTOK(2, DUK_TOK_LAND);
		} else if (DUK__L1() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_BAND_EQ);
		} else {
			advtok = DUK__ADVTOK(1, DUK_TOK_BAND);
		}
		break;
	case DUK_ASC_PIPE:  /* '|' */
		if (DUK__L1() == DUK_ASC_PIPE) {
			advtok = DUK__ADVTOK(2, DUK_TOK_LOR);
		} else if (DUK__L1() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_BOR_EQ);
		} else {
			advtok = DUK__ADVTOK(1, DUK_TOK_BOR);
		}
		break;
	case DUK_ASC_CARET:  /* '^' */
		if (DUK__L1() == DUK_ASC_EQUALS) {
			advtok = DUK__ADVTOK(2, DUK_TOK_BXOR_EQ);
		} else {
			advtok = DUK__ADVTOK(1, DUK_TOK_BXOR);
		}
		break;
	case DUK_ASC_TILDE:  /* '~' */
		advtok = DUK__ADVTOK(1, DUK_TOK_BNOT);
		break;
	case DUK_ASC_QUESTION:  /* '?' */
		advtok = DUK__ADVTOK(1, DUK_TOK_QUESTION);
		break;
	case DUK_ASC_COLON:  /* ':' */
		advtok = DUK__ADVTOK(1, DUK_TOK_COLON);
		break;
	case DUK_ASC_DOUBLEQUOTE:    /* '"' */
	case DUK_ASC_SINGLEQUOTE: {  /* '\'' */
		DUK__INITBUFFER(lex_ctx);
		duk__lexer_parse_string_literal(lex_ctx, out_token, x /*quote*/, strict_mode);
		duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
		out_token->str1 = duk_known_hstring(lex_ctx->thr, lex_ctx->slot1_idx);

		DUK__INITBUFFER(lex_ctx);  /* free some memory */

		advtok = DUK__ADVTOK(0, DUK_TOK_STRING);
		break;
	}
	default:
		goto slow_path;
	}  /* switch */

	goto skip_slow_path;

 slow_path:
	if (duk_unicode_is_line_terminator(x)) {
		if (x == 0x000d && DUK__L1() == 0x000a) {
			/*
			 *  E5 Section 7.3: CR LF is detected as a single line terminator for
			 *  line numbers.  Here we also detect it as a single line terminator
			 *  token.
			 */
			DUK__ADVANCECHARS(lex_ctx, 2);
		} else {
			DUK__ADVANCECHARS(lex_ctx, 1);
		}
		got_lineterm = 1;
		goto restart_lineupdate;
	} else if (duk_unicode_is_identifier_start(x) || x == DUK_ASC_BACKSLASH) {
		/*
		 *  Parse an identifier and then check whether it is:
		 *    - reserved word (keyword or other reserved word)
		 *    - "null"  (NullLiteral)
		 *    - "true"  (BooleanLiteral)
		 *    - "false" (BooleanLiteral)
		 *    - anything else => identifier
		 *
		 *  This does not follow the E5 productions cleanly, but is
		 *  useful and compact.
		 *
		 *  Note that identifiers may contain Unicode escapes,
		 *  see E5 Sections 6 and 7.6.  They must be decoded first,
		 *  and the result checked against allowed characters.
		 *  The above if-clause accepts an identifier start and an
		 *  '\' character -- no other token can begin with a '\'.
		 *
		 *  Note that "get" and "set" are not reserved words in E5
		 *  specification so they are recognized as plain identifiers
		 *  (the tokens DUK_TOK_GET and DUK_TOK_SET are actually not
		 *  used now).  The compiler needs to work around this.
		 *
		 *  Strictly speaking, following ECMAScript longest match
		 *  specification, an invalid escape for the first character
		 *  should cause a syntax error.  However, an invalid escape
		 *  for IdentifierParts should just terminate the identifier
		 *  early (longest match), and let the next tokenization
		 *  fail.  For instance Rhino croaks with 'foo\z' when
		 *  parsing the identifier.  This has little practical impact.
		 */

		duk_small_uint_t i, i_end;
		duk_bool_t first = 1;
		duk_hstring *str;

		DUK__INITBUFFER(lex_ctx);
		for (;;) {
			/* re-lookup first char on first loop */
			if (DUK__L0() == DUK_ASC_BACKSLASH) {
				duk_codepoint_t esc_cp;
				if (DUK__L1() != DUK_ASC_LC_U) {
					goto fail_escape;
				}
				esc_cp = duk__lexer_parse_escape(lex_ctx, 1 /*allow_es6*/);
				DUK__APPENDBUFFER(lex_ctx, esc_cp);

				/* IdentifierStart is stricter than IdentifierPart, so if the first
				 * character is escaped, must have a stricter check here.
				 */
				if (!(first ? duk_unicode_is_identifier_start(esc_cp) : duk_unicode_is_identifier_part(esc_cp))) {
					goto fail_escape;
				}

				/* Track number of escapes: necessary for proper keyword
				 * detection.
				 */
				out_token->num_escapes++;
			} else {
				/* Note: first character is checked against this.  But because
				 * IdentifierPart includes all IdentifierStart characters, and
				 * the first character (if unescaped) has already been checked
				 * in the if condition, this is OK.
				 */
				if (!duk_unicode_is_identifier_part(DUK__L0())) {
					break;
				}
				DUK__APPENDBUFFER(lex_ctx, DUK__L0());
				DUK__ADVANCECHARS(lex_ctx, 1);
			}
			first = 0;
		}

		out_token->str1 = duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
		str = out_token->str1;
		out_token->t_nores = DUK_TOK_IDENTIFIER;

		DUK__INITBUFFER(lex_ctx);  /* free some memory */

		/*
		 *  Interned identifier is compared against reserved words, which are
		 *  currently interned into the heap context.  See genbuiltins.py.
		 *
		 *  Note that an escape in the identifier disables recognition of
		 *  keywords; e.g. "\u0069f = 1;" is a valid statement (assigns to
		 *  identifier named "if").  This is not necessarily compliant,
		 *  see test-dec-escaped-char-in-keyword.js.
		 *
		 *  Note: "get" and "set" are awkward.  They are not officially
		 *  ReservedWords (and indeed e.g. "var set = 1;" is valid), and
		 *  must come out as DUK_TOK_IDENTIFIER.  The compiler needs to
		 *  work around this a bit.
		 */

		/* XXX: optimize by adding the token numbers directly into the
		 * always interned duk_hstring objects (there should be enough
		 * flag bits free for that)?
		 */

		i_end = (strict_mode ? DUK_STRIDX_END_RESERVED : DUK_STRIDX_START_STRICT_RESERVED);

		advtok = DUK__ADVTOK(0, DUK_TOK_IDENTIFIER);
		if (out_token->num_escapes == 0) {
			for (i = DUK_STRIDX_START_RESERVED; i < i_end; i++) {
				DUK_ASSERT_DISABLE(i >= 0);  /* unsigned */
				DUK_ASSERT(i < DUK_HEAP_NUM_STRINGS);
				if (DUK_HTHREAD_GET_STRING(lex_ctx->thr, i) == str) {
					advtok = DUK__ADVTOK(0, DUK_STRIDX_TO_TOK(i));
					break;
				}
			}
		}
	} else if (DUK__ISDIGIT(x) || (x == DUK_ASC_PERIOD)) {
		/* Note: decimal number may start with a period, but must be followed by a digit */

		/*
		 *  Pre-parsing for decimal, hex, octal (both legacy and ES2015),
		 *  and binary literals, followed by an actual parser step
		 *  provided by numconv.
		 *
		 *  Note: the leading sign character ('+' or '-') is -not- part of
		 *  the production in E5 grammar, and that the a DecimalLiteral
		 *  starting with a '0' must be followed by a non-digit.
		 *
		 *  XXX: the two step parsing process is quite awkward, it would
		 *  be more straightforward to allow numconv to parse the longest
		 *  valid prefix (it already does that, it only needs to indicate
		 *  where the input ended).  However, the lexer decodes characters
		 *  using a limited lookup window, so this is not a trivial change.
		 */

		/* XXX: because of the final check below (that the literal is not
		 * followed by a digit), this could maybe be simplified, if we bail
		 * out early from a leading zero (and if there are no periods etc).
		 * Maybe too complex.
		 */

		duk_double_t val;
		duk_bool_t legacy_oct = 0;
		duk_small_int_t state;  /* 0=before period/exp,
		                         * 1=after period, before exp
		                         * 2=after exp, allow '+' or '-'
		                         * 3=after exp and exp sign
		                         */
		duk_small_uint_t s2n_flags;
		duk_codepoint_t y, z;
		duk_small_int_t s2n_radix = 10;
		duk_small_uint_t pre_adv = 0;

		DUK__INITBUFFER(lex_ctx);
		y = DUK__L1();

		if (x == DUK_ASC_0) {
			z = DUK_LOWERCASE_CHAR_ASCII(y);

			pre_adv = 2;  /* default for 0xNNN, 0oNNN, 0bNNN. */
			if (z == DUK_ASC_LC_X) {
				s2n_radix = 16;
			} else if (z == DUK_ASC_LC_O) {
				s2n_radix = 8;
			} else if (z == DUK_ASC_LC_B) {
				s2n_radix = 2;
			} else {
				pre_adv = 0;
				if (DUK__ISDIGIT(y)) {
					if (strict_mode) {
						/* Reject octal like \07 but also octal-lookalike
						 * decimal like \08 in strict mode.
						 */
						goto fail_number_literal;
					} else {
						/* Legacy OctalIntegerLiteral or octal-lookalice
						 * decimal.  Deciding between the two happens below
						 * in digit scanning.
						 */
						DUK__APPENDBUFFER(lex_ctx, x);
						pre_adv = 1;
						legacy_oct = 1;
						s2n_radix = 8;  /* tentative unless conflicting digits found */
					}
				}
			}
		}

		DUK__ADVANCECHARS(lex_ctx, pre_adv);

		/* XXX: we could parse integers here directly, and fall back
		 * to numconv only when encountering a fractional expression
		 * or when an octal literal turned out to be decimal (0778 etc).
		 */
		state = 0;
		for (;;) {
			x = DUK__L0();  /* re-lookup curr char on first round */
			if (DUK__ISDIGIT(x)) {
				/* Note: intentionally allow leading zeroes here, as the
				 * actual parser will check for them.
				 */
				if (state == 0 && legacy_oct && (x == DUK_ASC_8 || x == DUK_ASC_9)) {
					/* Started out as an octal-lookalike
					 * but interpreted as decimal, e.g.
					 * '0779' -> 779.  This also means
					 * that fractions are allowed, e.g.
					 * '0779.123' is allowed but '0777.123'
					 * is not!
					 */
					s2n_radix = 10;
				}
				if (state == 2) {
					state = 3;
				}
			} else if (s2n_radix == 16 && DUK__ISHEXDIGIT(x)) {
				/* Note: 'e' and 'E' are also accepted here. */
				;
			} else if (x == DUK_ASC_PERIOD) {
				if (state >= 1 || s2n_radix != 10) {
					break;
				} else {
					state = 1;
				}
			} else if (x == DUK_ASC_LC_E || x == DUK_ASC_UC_E) {
				if (state >= 2 || s2n_radix != 10) {
					break;
				} else {
					state = 2;
				}
			} else if (x == DUK_ASC_MINUS || x == DUK_ASC_PLUS) {
				if (state != 2) {
					break;
				} else {
					state = 3;
				}
			} else {
				break;
			}
			DUK__APPENDBUFFER(lex_ctx, x);
			DUK__ADVANCECHARS(lex_ctx, 1);
		}

		/* XXX: better coercion */
		(void) duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);

		if (s2n_radix != 10) {
			/* For bases other than 10, integer only. */
			s2n_flags = DUK_S2N_FLAG_ALLOW_LEADING_ZERO;
		} else {
			s2n_flags = DUK_S2N_FLAG_ALLOW_EXP |
			            DUK_S2N_FLAG_ALLOW_FRAC |
			            DUK_S2N_FLAG_ALLOW_NAKED_FRAC |
			            DUK_S2N_FLAG_ALLOW_EMPTY_FRAC |
			            DUK_S2N_FLAG_ALLOW_LEADING_ZERO;
		}

		duk_dup(lex_ctx->thr, lex_ctx->slot1_idx);
		duk_numconv_parse(lex_ctx->thr, s2n_radix, s2n_flags);
		val = duk_to_number_m1(lex_ctx->thr);
		if (DUK_ISNAN(val)) {
			goto fail_number_literal;
		}
		duk_replace(lex_ctx->thr, lex_ctx->slot1_idx);  /* could also just pop? */

		DUK__INITBUFFER(lex_ctx);  /* free some memory */

		/* Section 7.8.3 (note): NumericLiteral must be followed by something other than
		 * IdentifierStart or DecimalDigit.
		 */

		if (DUK__ISDIGIT(DUK__L0()) || duk_unicode_is_identifier_start(DUK__L0())) {
			goto fail_number_literal;
		}

		out_token->num = val;
		advtok = DUK__ADVTOK(0, DUK_TOK_NUMBER);
	} else if (duk_unicode_is_whitespace(DUK__LOOKUP(lex_ctx, 0))) {
		DUK__ADVANCECHARS(lex_ctx, 1);
		goto restart;
	} else if (x < 0) {
		advtok = DUK__ADVTOK(0, DUK_TOK_EOF);
	} else {
		goto fail_token;
	}
 skip_slow_path:

	/*
	 *  Shared exit path
	 */

	DUK__ADVANCEBYTES(lex_ctx, advtok >> 8);
	out_token->t = advtok & 0xff;
	if (out_token->t_nores == DUK_TOK_INVALID) {
		out_token->t_nores = out_token->t;
	}
	out_token->lineterm = got_lineterm;

	/* Automatic semicolon insertion is allowed if a token is preceded
	 * by line terminator(s), or terminates a statement list (right curly
	 * or EOF).
	 */
	if (got_lineterm || out_token->t == DUK_TOK_RCURLY || out_token->t == DUK_TOK_EOF) {
		out_token->allow_auto_semi = 1;
	} else {
		out_token->allow_auto_semi = 0;
	}

	return;

 fail_token_limit:
	DUK_ERROR_RANGE(lex_ctx->thr, DUK_STR_TOKEN_LIMIT);
	DUK_WO_NORETURN(return;);

 fail_token:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_TOKEN);
	DUK_WO_NORETURN(return;);

 fail_number_literal:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_NUMBER_LITERAL);
	DUK_WO_NORETURN(return;);

 fail_escape:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_ESCAPE);
	DUK_WO_NORETURN(return;);

 fail_unterm_regexp:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_UNTERMINATED_REGEXP);
	DUK_WO_NORETURN(return;);

 fail_unterm_comment:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_UNTERMINATED_COMMENT);
	DUK_WO_NORETURN(return;);

#if !defined(DUK_USE_REGEXP_SUPPORT)
 fail_regexp_support:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_REGEXP_SUPPORT_DISABLED);
	DUK_WO_NORETURN(return;);
#endif
}

#if defined(DUK_USE_REGEXP_SUPPORT)

/*
 *  Parse a RegExp token.  The grammar is described in E5 Section 15.10.
 *  Terminal constructions (such as quantifiers) are parsed directly here.
 *
 *  0xffffffffU is used as a marker for "infinity" in quantifiers.  Further,
 *  DUK__MAX_RE_QUANT_DIGITS limits the maximum number of digits that
 *  will be accepted for a quantifier.
 */

DUK_INTERNAL void duk_lexer_parse_re_token(duk_lexer_ctx *lex_ctx, duk_re_token *out_token) {
	duk_small_uint_t advtok = 0;  /* init is unnecessary but suppresses "may be used uninitialized" warnings */
	duk_codepoint_t x, y;

	if (++lex_ctx->token_count >= lex_ctx->token_limit) {
		goto fail_token_limit;
	}

	duk_memzero(out_token, sizeof(*out_token));

	x = DUK__L0();
	y = DUK__L1();

	DUK_DDD(DUK_DDDPRINT("parsing regexp token, L0=%ld, L1=%ld", (long) x, (long) y));

	switch (x) {
	case DUK_ASC_PIPE: {
		advtok = DUK__ADVTOK(1, DUK_RETOK_DISJUNCTION);
		break;
	}
	case DUK_ASC_CARET: {
		advtok = DUK__ADVTOK(1, DUK_RETOK_ASSERT_START);
		break;
	}
	case DUK_ASC_DOLLAR: {
		advtok = DUK__ADVTOK(1, DUK_RETOK_ASSERT_END);
		break;
	}
	case DUK_ASC_QUESTION: {
		out_token->qmin = 0;
		out_token->qmax = 1;
		if (y == DUK_ASC_QUESTION) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
			out_token->greedy = 0;
		} else {
			advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
			out_token->greedy = 1;
		}
		break;
	}
	case DUK_ASC_STAR: {
		out_token->qmin = 0;
		out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
		if (y == DUK_ASC_QUESTION) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
			out_token->greedy = 0;
		} else {
			advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
			out_token->greedy = 1;
		}
		break;
	}
	case DUK_ASC_PLUS: {
		out_token->qmin = 1;
		out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
		if (y == DUK_ASC_QUESTION) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
			out_token->greedy = 0;
		} else {
			advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
			out_token->greedy = 1;
		}
		break;
	}
	case DUK_ASC_LCURLY: {
		/* Production allows 'DecimalDigits', including leading zeroes */
		duk_uint32_t val1 = 0;
		duk_uint32_t val2 = DUK_RE_QUANTIFIER_INFINITE;
		duk_small_int_t digits = 0;
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
		duk_lexer_point lex_pt;
#endif

#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
		/* Store lexer position, restoring if quantifier is invalid. */
		DUK_LEXER_GETPOINT(lex_ctx, &lex_pt);
#endif

		for (;;) {
			DUK__ADVANCECHARS(lex_ctx, 1);  /* eat '{' on entry */
			x = DUK__L0();
			if (DUK__ISDIGIT(x)) {
				digits++;
				val1 = val1 * 10 + (duk_uint32_t) duk__hexval(x);
			} else if (x == DUK_ASC_COMMA) {
				if (digits > DUK__MAX_RE_QUANT_DIGITS) {
					goto invalid_quantifier;
				}
				if (val2 != DUK_RE_QUANTIFIER_INFINITE) {
					goto invalid_quantifier;
				}
				if (DUK__L1() == DUK_ASC_RCURLY) {
					/* form: { DecimalDigits , }, val1 = min count */
					if (digits == 0) {
						goto invalid_quantifier;
					}
					out_token->qmin = val1;
					out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
					DUK__ADVANCECHARS(lex_ctx, 2);
					break;
				}
				val2 = val1;
				val1 = 0;
				digits = 0;  /* not strictly necessary because of lookahead '}' above */
			} else if (x == DUK_ASC_RCURLY) {
				if (digits > DUK__MAX_RE_QUANT_DIGITS) {
					goto invalid_quantifier;
				}
				if (digits == 0) {
					goto invalid_quantifier;
				}
				if (val2 != DUK_RE_QUANTIFIER_INFINITE) {
					/* val2 = min count, val1 = max count */
					out_token->qmin = val2;
					out_token->qmax = val1;
				} else {
					/* val1 = count */
					out_token->qmin = val1;
					out_token->qmax = val1;
				}
				DUK__ADVANCECHARS(lex_ctx, 1);
				break;
			} else {
				goto invalid_quantifier;
			}
		}
		if (DUK__L0() == DUK_ASC_QUESTION) {
			out_token->greedy = 0;
			DUK__ADVANCECHARS(lex_ctx, 1);
		} else {
			out_token->greedy = 1;
		}
		advtok = DUK__ADVTOK(0, DUK_RETOK_QUANTIFIER);
		break;
 invalid_quantifier:
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
		/* Failed to match the quantifier, restore lexer and parse
		 * opening brace as a literal.
		 */
		DUK_LEXER_SETPOINT(lex_ctx, &lex_pt);
		advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_CHAR);
		out_token->num = DUK_ASC_LCURLY;
#else
		goto fail_quantifier;
#endif
		break;
	}
	case DUK_ASC_PERIOD: {
		advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_PERIOD);
		break;
	}
	case DUK_ASC_BACKSLASH: {
		/* The E5.1 specification does not seem to allow IdentifierPart characters
		 * to be used as identity escapes.  Unfortunately this includes '$', which
		 * cannot be escaped as '\$'; it needs to be escaped e.g. as '\u0024'.
		 * Many other implementations (including V8 and Rhino, for instance) do
		 * accept '\$' as a valid identity escape, which is quite pragmatic, and
		 * ES2015 Annex B relaxes the rules to allow these (and other) real world forms.
		 */

		advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_CHAR);  /* default: char escape (two chars) */
		if (y == DUK_ASC_LC_B) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ASSERT_WORD_BOUNDARY);
		} else if (y == DUK_ASC_UC_B) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY);
		} else if (y == DUK_ASC_LC_F) {
			out_token->num = 0x000c;
		} else if (y == DUK_ASC_LC_N) {
			out_token->num = 0x000a;
		} else if (y == DUK_ASC_LC_T) {
			out_token->num = 0x0009;
		} else if (y == DUK_ASC_LC_R) {
			out_token->num = 0x000d;
		} else if (y == DUK_ASC_LC_V) {
			out_token->num = 0x000b;
		} else if (y == DUK_ASC_LC_C) {
			x = DUK__L2();
			if ((x >= DUK_ASC_LC_A && x <= DUK_ASC_LC_Z) ||
			    (x >= DUK_ASC_UC_A && x <= DUK_ASC_UC_Z)) {
				out_token->num = (duk_uint32_t) (x % 32);
				advtok = DUK__ADVTOK(3, DUK_RETOK_ATOM_CHAR);
			} else {
				goto fail_escape;
			}
		} else if (y == DUK_ASC_LC_X || y == DUK_ASC_LC_U) {
			/* The token value is the Unicode codepoint without
			 * it being decode into surrogate pair characters
			 * here.  The \u{H+} is only allowed in Unicode mode
			 * which we don't support yet.
			 */
			out_token->num = (duk_uint32_t) duk__lexer_parse_escape(lex_ctx, 0 /*allow_es6*/);
			advtok = DUK__ADVTOK(0, DUK_RETOK_ATOM_CHAR);
		} else if (y == DUK_ASC_LC_D) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_DIGIT);
		} else if (y == DUK_ASC_UC_D) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_DIGIT);
		} else if (y == DUK_ASC_LC_S) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_WHITE);
		} else if (y == DUK_ASC_UC_S) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_WHITE);
		} else if (y == DUK_ASC_LC_W) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_WORD_CHAR);
		} else if (y == DUK_ASC_UC_W) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_WORD_CHAR);
		} else if (DUK__ISDIGIT(y)) {
			/* E5 Section 15.10.2.11 */
			if (y == DUK_ASC_0) {
				if (DUK__ISDIGIT(DUK__L2())) {
					goto fail_escape;
				}
				out_token->num = 0x0000;
				advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_CHAR);
			} else {
				/* XXX: shared parsing? */
				duk_uint32_t val = 0;
				duk_small_int_t i;
				for (i = 0; ; i++) {
					if (i >= DUK__MAX_RE_DECESC_DIGITS) {
						goto fail_escape;
					}
					DUK__ADVANCECHARS(lex_ctx, 1);  /* eat backslash on entry */
					x = DUK__L0();
					if (!DUK__ISDIGIT(x)) {
						break;
					}
					val = val * 10 + (duk_uint32_t) duk__hexval(x);
				}
				/* DUK__L0() cannot be a digit, because the loop doesn't terminate if it is */
				advtok = DUK__ADVTOK(0, DUK_RETOK_ATOM_BACKREFERENCE);
				out_token->num = val;
			}
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
		} else if (y >= 0) {
			/* For ES2015 Annex B, accept any source character as identity
			 * escape except 'c' which is used for control characters.
			 * http://www.ecma-international.org/ecma-262/6.0/#sec-regular-expressions-patterns
			 * Careful not to match end-of-buffer (<0) here.
			 * This is not yet full ES2015 Annex B because cases above
			 * (like hex escape) won't backtrack.
			 */
			DUK_ASSERT(y != DUK_ASC_LC_C);  /* covered above */
#else  /* DUK_USE_ES6_REGEXP_SYNTAX */
		} else if ((y >= 0 && !duk_unicode_is_identifier_part(y)) ||
		           y == DUK_UNICODE_CP_ZWNJ ||
		           y == DUK_UNICODE_CP_ZWJ) {
			/* For ES5.1 identity escapes are not allowed for identifier
			 * parts.  This conflicts with a lot of real world code as this
			 * doesn't e.g. allow escaping a dollar sign as /\$/, see
			 * test-regexp-identity-escape-dollar.js.
			 */
#endif  /* DUK_USE_ES6_REGEXP_SYNTAX */
			out_token->num = (duk_uint32_t) y;
		} else {
			goto fail_escape;
		}
		break;
	}
	case DUK_ASC_LPAREN: {
		/* XXX: naming is inconsistent: ATOM_END_GROUP ends an ASSERT_START_LOOKAHEAD */

		if (y == DUK_ASC_QUESTION) {
			if (DUK__L2() == DUK_ASC_EQUALS) {
				/* (?= */
				advtok = DUK__ADVTOK(3, DUK_RETOK_ASSERT_START_POS_LOOKAHEAD);
			} else if (DUK__L2() == DUK_ASC_EXCLAMATION) {
				/* (?! */
				advtok = DUK__ADVTOK(3, DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD);
			} else if (DUK__L2() == DUK_ASC_COLON) {
				/* (?: */
				advtok = DUK__ADVTOK(3, DUK_RETOK_ATOM_START_NONCAPTURE_GROUP);
			} else {
				goto fail_group;
			}
		} else {
			/* ( */
			advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_START_CAPTURE_GROUP);
		}
		break;
	}
	case DUK_ASC_RPAREN: {
		advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_END_GROUP);
		break;
	}
	case DUK_ASC_LBRACKET: {
		/*
		 *  To avoid creating a heavy intermediate value for the list of ranges,
		 *  only the start token ('[' or '[^') is parsed here.  The regexp
		 *  compiler parses the ranges itself.
		 */

		/* XXX: with DUK_USE_ES6_REGEXP_SYNTAX we should allow left bracket
		 * literal too, but it's not easy to parse without backtracking.
		 */

		advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_START_CHARCLASS);
		if (y == DUK_ASC_CARET) {
			advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_START_CHARCLASS_INVERTED);
		}
		break;
	}
#if !defined(DUK_USE_ES6_REGEXP_SYNTAX)
	case DUK_ASC_RCURLY:
	case DUK_ASC_RBRACKET: {
		/* Although these could be parsed as PatternCharacters unambiguously (here),
		 * E5 Section 15.10.1 grammar explicitly forbids these as PatternCharacters.
		 */
		goto fail_invalid_char;
		break;
	}
#endif
	case -1: {
		/* EOF */
		advtok = DUK__ADVTOK(0, DUK_TOK_EOF);
		break;
	}
	default: {
		/* PatternCharacter, all excluded characters are matched by cases above */
		advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_CHAR);
		out_token->num = (duk_uint32_t) x;
		break;
	}
	}

	/*
	 *  Shared exit path
	 */

	DUK__ADVANCEBYTES(lex_ctx, advtok >> 8);
	out_token->t = advtok & 0xff;
	return;

 fail_token_limit:
	DUK_ERROR_RANGE(lex_ctx->thr, DUK_STR_TOKEN_LIMIT);
	DUK_WO_NORETURN(return;);

 fail_escape:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_REGEXP_ESCAPE);
	DUK_WO_NORETURN(return;);

 fail_group:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_REGEXP_GROUP);
	DUK_WO_NORETURN(return;);

#if !defined(DUK_USE_ES6_REGEXP_SYNTAX)
 fail_invalid_char:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_REGEXP_CHARACTER);
	DUK_WO_NORETURN(return;);

 fail_quantifier:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_QUANTIFIER);
	DUK_WO_NORETURN(return;);
#endif
}

/*
 *  Special parser for character classes; calls callback for every
 *  range parsed and returns the number of ranges present.
 */

/* XXX: this duplicates functionality in duk_regexp.c where a similar loop is
 * required anyway.  We could use that BUT we need to update the regexp compiler
 * 'nranges' too.  Work this out a bit more cleanly to save space.
 */

/* XXX: the handling of character range detection is a bit convoluted.
 * Try to simplify and make smaller.
 */

/* XXX: logic for handling character ranges is now incorrect, it will accept
 * e.g. [\d-z] whereas it should croak from it?  SMJS accepts this too, though.
 *
 * Needs a read through and a lot of additional tests.
 */

DUK_LOCAL
void duk__emit_u16_direct_ranges(duk_lexer_ctx *lex_ctx,
                                 duk_re_range_callback gen_range,
                                 void *userdata,
                                 const duk_uint16_t *ranges,
                                 duk_small_int_t num) {
	const duk_uint16_t *ranges_end;

	DUK_UNREF(lex_ctx);

	ranges_end = ranges + num;
	while (ranges < ranges_end) {
		/* mark range 'direct', bypass canonicalization (see Wiki) */
		gen_range(userdata, (duk_codepoint_t) ranges[0], (duk_codepoint_t) ranges[1], 1);
		ranges += 2;
	}
}

DUK_INTERNAL void duk_lexer_parse_re_ranges(duk_lexer_ctx *lex_ctx, duk_re_range_callback gen_range, void *userdata) {
	duk_codepoint_t start = -1;
	duk_codepoint_t ch;
	duk_codepoint_t x;
	duk_bool_t dash = 0;
	duk_small_uint_t adv = 0;

	DUK_DD(DUK_DDPRINT("parsing regexp ranges"));

	for (;;) {
		DUK__ADVANCECHARS(lex_ctx, adv);
		adv = 1;

		x = DUK__L0();

		ch = -1;  /* not strictly necessary, but avoids "uninitialized variable" warnings */
		DUK_UNREF(ch);

		if (x < 0) {
			goto fail_unterm_charclass;
		} else if (x == DUK_ASC_RBRACKET) {
			if (start >= 0) {
				gen_range(userdata, start, start, 0);
			}
			DUK__ADVANCECHARS(lex_ctx, 1);  /* eat ']' before finishing */
			break;
		} else if (x == DUK_ASC_MINUS) {
			if (start >= 0 && !dash && DUK__L1() != DUK_ASC_RBRACKET) {
				/* '-' as a range indicator */
				dash = 1;
				continue;
			} else {
				/* '-' verbatim */
				ch = x;
			}
		} else if (x == DUK_ASC_BACKSLASH) {
			/*
			 *  The escapes are same as outside a character class, except that \b has a
			 *  different meaning, and \B and backreferences are prohibited (see E5
			 *  Section 15.10.2.19).  However, it's difficult to share code because we
			 *  handle e.g. "\n" very differently: here we generate a single character
			 *  range for it.
			 */

			/* XXX: ES2015 surrogate pair handling. */

			x = DUK__L1();

			adv = 2;

			if (x == DUK_ASC_LC_B) {
				/* Note: '\b' in char class is different than outside (assertion),
				 * '\B' is not allowed and is caught by the duk_unicode_is_identifier_part()
				 * check below.
				 */
				ch = 0x0008;
			} else if (x == DUK_ASC_LC_F) {
				ch = 0x000c;
			} else if (x == DUK_ASC_LC_N) {
				ch = 0x000a;
			} else if (x == DUK_ASC_LC_T) {
				ch = 0x0009;
			} else if (x == DUK_ASC_LC_R) {
				ch = 0x000d;
			} else if (x == DUK_ASC_LC_V) {
				ch = 0x000b;
			} else if (x == DUK_ASC_LC_C) {
				x = DUK__L2();
				adv = 3;
				if ((x >= DUK_ASC_LC_A && x <= DUK_ASC_LC_Z) ||
				    (x >= DUK_ASC_UC_A && x <= DUK_ASC_UC_Z)) {
					ch = (x % 32);
				} else {
					goto fail_escape;
				}
			} else if (x == DUK_ASC_LC_X || x == DUK_ASC_LC_U) {
				/* The \u{H+} form is only allowed in Unicode mode which
				 * we don't support yet.
				 */
				ch = duk__lexer_parse_escape(lex_ctx, 0 /*allow_es6*/);
				adv = 0;
			} else if (x == DUK_ASC_LC_D) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_digit,
				                            sizeof(duk_unicode_re_ranges_digit) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_UC_D) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_not_digit,
				                            sizeof(duk_unicode_re_ranges_not_digit) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_LC_S) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_white,
				                            sizeof(duk_unicode_re_ranges_white) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_UC_S) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_not_white,
				                            sizeof(duk_unicode_re_ranges_not_white) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_LC_W) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_wordchar,
				                            sizeof(duk_unicode_re_ranges_wordchar) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_UC_W) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_not_wordchar,
				                            sizeof(duk_unicode_re_ranges_not_wordchar) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (DUK__ISDIGIT(x)) {
				/* DecimalEscape, only \0 is allowed, no leading
				 * zeroes are allowed.
				 *
				 * ES2015 Annex B also allows (maximal match) legacy
				 * octal escapes up to \377 and \8 and \9 are
				 * accepted as literal '8' and '9', also in strict mode.
				 */

#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
				ch = duk__lexer_parse_legacy_octal(lex_ctx, &adv, 0 /*reject_annex_b*/);
				DUK_ASSERT(ch >= 0);  /* no rejections */
#else
				if (x == DUK_ASC_0 && !DUK__ISDIGIT(DUK__L2())) {
					ch = 0x0000;
				} else {
					goto fail_escape;
				}
#endif
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
			} else if (x >= 0) {
				/* IdentityEscape: ES2015 Annex B allows almost all
				 * source characters here.  Match anything except
				 * EOF here.
				 */
				ch = x;
#else  /* DUK_USE_ES6_REGEXP_SYNTAX */
			} else if (!duk_unicode_is_identifier_part(x)) {
				/* IdentityEscape: ES5.1 doesn't allow identity escape
				 * for identifier part characters, which conflicts with
				 * some real world code.  For example, it doesn't allow
				 * /[\$]/ which is awkward.
				 */
				ch = x;
#endif  /* DUK_USE_ES6_REGEXP_SYNTAX */
			} else {
				goto fail_escape;
			}
		} else {
			/* character represents itself */
			ch = x;
		}

		/* ch is a literal character here or -1 if parsed entity was
		 * an escape such as "\s".
		 */

		if (ch < 0) {
			/* multi-character sets not allowed as part of ranges, see
			 * E5 Section 15.10.2.15, abstract operation CharacterRange.
			 */
			if (start >= 0) {
				if (dash) {
					goto fail_range;
				} else {
					gen_range(userdata, start, start, 0);
					start = -1;
					/* dash is already 0 */
				}
			}
		} else {
			if (start >= 0) {
				if (dash) {
					if (start > ch) {
						goto fail_range;
					}
					gen_range(userdata, start, ch, 0);
					start = -1;
					dash = 0;
				} else {
					gen_range(userdata, start, start, 0);
					start = ch;
					/* dash is already 0 */
				}
			} else {
				start = ch;
			}
		}
	}

	return;

 fail_escape:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_REGEXP_ESCAPE);
	DUK_WO_NORETURN(return;);

 fail_range:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_RANGE);
	DUK_WO_NORETURN(return;);

 fail_unterm_charclass:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_UNTERMINATED_CHARCLASS);
	DUK_WO_NORETURN(return;);
}

#endif  /* DUK_USE_REGEXP_SUPPORT */