733 lines
25 KiB
Python
733 lines
25 KiB
Python
#!/usr/bin/env python2
|
|
#
#  Extract rules for Unicode case conversion, specifically the behavior
#  required by ECMAScript E5 in Sections 15.5.4.16 to 15.5.4.19.  The
#  bitstream encoded rules are used for the slow path at run time, so
#  compactness is favored over speed.
#
#  There is no support for context or locale sensitive rules, as they
#  are handled directly in C code before consulting tables generated
#  here.  ECMAScript requires case conversion both with and without
#  locale/language specific rules (e.g. String.prototype.toLowerCase()
#  and String.prototype.toLocaleLowerCase()), so they are best handled
#  in C anyway.
#
#  Case conversion rules for ASCII are also excluded as they are handled
#  by the C fast path.  Rules for non-BMP characters (codepoints above
#  U+FFFF) are omitted as they're not required for standard ECMAScript.
#
|
|
|
|
import os
|
|
import sys
|
|
import re
|
|
import math
|
|
import optparse
|
|
|
|
import dukutil
|
|
|
|
class UnicodeData:
    """Read UnicodeData.txt into an internal representation.

    After construction ``self.data`` holds one record per input line,
    sorted by codepoint.  A record is the list of the 15
    semicolon-separated fields of the line, kept as strings (field 0 is
    the hexadecimal codepoint).
    """

    def __init__(self, filename):
        """Parse *filename* and print how many entries were read."""
        self.data = self.read_unicode_data(filename)
        print('read %d unicode data entries' % len(self.data))

    def read_unicode_data(self, filename):
        """Return the records found in *filename*, sorted by codepoint.

        Lines starting with '#' and lines that are blank after stripping
        are skipped.

        Raises:
            Exception: if a line does not split into exactly 15 fields.
        """
        res = []

        # The 'with' block closes the file even when a malformed line
        # raises below.
        with open(filename, 'rb') as f:
            for raw in f:
                # Binary mode yields 'str' on Python 2 but 'bytes' on
                # Python 3; normalise to the native string type so the
                # string comparisons below work on both.
                line = raw if isinstance(raw, str) else raw.decode('utf-8')
                if line.startswith('#'):
                    continue
                line = line.strip()
                if line == '':
                    continue
                parts = line.split(';')
                if len(parts) != 15:
                    raise Exception('invalid unicode data line')
                res.append(parts)

        # Sort based on Unicode codepoint.  A key function is used instead
        # of a cmp function because the latter is Python 2 only.
        res.sort(key=lambda parts: int(parts[0], 16))
        return res
|
|
|
|
class SpecialCasing:
    """Read SpecialCasing.txt into an internal representation.

    After construction ``self.data`` holds one record per non-empty input
    line, in file order.  A record is the list of stripped,
    semicolon-separated fields of the line, padded with empty strings to
    at least 6 entries.
    """

    def __init__(self, filename):
        """Parse *filename* and print how many entries were read."""
        self.data = self.read_special_casing_data(filename)
        print('read %d special casing entries' % len(self.data))

    def read_special_casing_data(self, filename):
        """Return the records found in *filename*, in file order.

        Everything from the first '#' on a line is treated as a comment
        and dropped; lines that are empty afterwards are skipped.
        """
        res = []

        # The 'with' block closes the file even if parsing raises.
        with open(filename, 'rb') as f:
            for raw in f:
                # Binary mode yields 'str' on Python 2 but 'bytes' on
                # Python 3; normalise to the native string type.  Decoding
                # explicitly as UTF-8 keeps non-ASCII comment text from
                # depending on the locale.
                line = raw if isinstance(raw, str) else raw.decode('utf-8')

                # Strip a trailing comment, if any.
                line = line.split('#', 1)[0].strip()
                if line == '':
                    continue

                parts = [field.strip() for field in line.split(';')]
                # Pad so that callers can index fields 0...5 directly.
                while len(parts) < 6:
                    parts.append('')
                res.append(parts)

        return res
|
|
|
|
def parse_unicode_sequence(x):
    """Parse a Unicode sequence like ABCD 1234 into a unicode string.

    *x* is a string of whitespace-separated hexadecimal codepoints.
    Empty tokens are ignored, so an empty or all-blank input gives an
    empty string.
    """
    try:
        to_char = unichr  # Python 2: build unicode characters
    except NameError:
        to_char = chr  # Python 3: unichr() no longer exists

    return ''.join(to_char(int(token, 16)) for token in x.split())
|
|
|
|
def get_base_conversion_maps(unicode_data):
    """Create case conversion tables without handling special casing yet.

    *unicode_data* must expose a ``data`` attribute: an iterable of
    records whose field 0 is a hexadecimal codepoint and whose fields
    12, 13 and 14 are the simple uppercase, lowercase and titlecase
    mappings (empty string when absent).

    Returns a tuple ``(uc, lc, tc)`` of dicts mapping a codepoint
    (number) to the converted string.  Codepoints without a mapping are
    simply not present.
    """
    uc = {}  # uppercase, codepoint (number) -> string
    lc = {}  # lowercase
    tc = {}  # titlecase

    for fields in unicode_data.data:
        # int() handles arbitrarily large values on both Python 2 and 3,
        # so the Python 2 only long() is not needed.
        cp = int(fields[0], 16)

        # just 16-bit support needed
        if cp >= 0x10000:
            continue

        simple_upper = fields[12]
        simple_lower = fields[13]
        simple_title = fields[14]

        if simple_upper != '':
            converted = parse_unicode_sequence(simple_upper)
            uc[cp] = converted
            # titlecase default == uppercase, overridden below if necessary
            tc[cp] = converted
        if simple_lower != '':
            lc[cp] = parse_unicode_sequence(simple_lower)
        if simple_title != '':
            tc[cp] = parse_unicode_sequence(simple_title)

    return uc, lc, tc
|
|
|
|
def update_special_casings(uc, lc, tc, special_casing):
    """Update case conversion tables with special case conversion rules.

    *uc*, *lc* and *tc* are the codepoint -> string dicts for upper-,
    lower- and titlecase; they are modified in place.
    *special_casing* must expose a ``data`` attribute: an iterable of
    records with field 0 the hexadecimal codepoint, fields 1, 2, 3 the
    lower-, title- and uppercase sequences, and field 4 the condition
    list (empty string when unconditional).

    Records with a condition are skipped.  For the others, only
    multi-character results replace the existing table entry.
    """
    for fields in special_casing.data:
        # Parsed before the condition check, so a malformed codepoint is
        # reported even for conditional records.
        cp = int(fields[0], 16)

        if fields[4] != '':
            # conditions
            continue

        lower = parse_unicode_sequence(fields[1])
        title = parse_unicode_sequence(fields[2])
        upper = parse_unicode_sequence(fields[3])

        if len(lower) > 1:
            lc[cp] = lower
        if len(upper) > 1:
            uc[cp] = upper
        if len(title) > 1:
            tc[cp] = title

        print('- special case: %d %d %d' % (len(lower), len(upper), len(title)))
|
|
|
|
def remove_ascii_part(convmap):
    """Remove ASCII case conversion parts (handled by C fast path).

    Deletes keys 0...127 from *convmap* in place; keys that are not
    present are ignored.
    """
    for cp in range(128):
        # pop() with a default replaces the Python 2 only has_key() test.
        convmap.pop(cp, None)
|
|
|
|
def scan_range_with_skip(convmap, start_idx, skip):
    """Scan for a range of continuous case conversion with a certain 'skip'.

    Starting from *start_idx*, follow input codepoints start_idx,
    start_idx + skip, start_idx + 2 * skip, ... for as long as each one
    has a single-character mapping whose codepoint also advances by
    *skip* each step.

    Returns ``(start_i, start_o, count)``: the first input codepoint, the
    first output codepoint and the number of mappings in the range.  The
    mappings of an accepted range are deleted from *convmap*.  If
    *start_idx* has no 1:1 mapping, or the range would cover only one
    codepoint, returns ``(None, None, None)`` and leaves *convmap*
    untouched.
    """
    if start_idx not in convmap:
        return None, None, None
    if len(convmap[start_idx]) > 1:
        return None, None, None

    start_i = start_idx
    start_o = ord(convmap[start_idx])
    last_i = start_i
    last_o = start_o

    while True:
        next_i = last_i + skip
        next_o = last_o + skip

        if next_i not in convmap:
            break
        candidate = convmap[next_i]
        if len(candidate) > 1 or ord(candidate) != next_o:
            break

        last_i = next_i
        last_o = next_o

    # [start_i,last_i] maps to [start_o,last_o], ignore ranges of 1 char.
    # Floor division keeps the count an integer under true division too.
    count = (last_i - start_i) // skip + 1
    if count <= 1:
        return None, None, None

    # We have an acceptable range, remove them from the convmap here.
    for cp in range(start_i, last_i + skip, skip):
        del convmap[cp]

    return start_i, start_o, count
|
|
|
|
def find_first_range_with_skip(convmap, skip):
    """Find first range with a certain 'skip' value.

    Tries BMP codepoints in ascending order as range starts and returns
    the ``(start_i, start_o, count)`` of the first range accepted by
    scan_range_with_skip() (which also removes that range from
    *convmap*).  Returns ``(None, None, None)`` when no range is found.
    """
    # A range can only start at a codepoint that is present in the map,
    # so probing the sorted keys finds the same first range as probing
    # every codepoint 0...65535 while doing far fewer lookups.  The keys
    # are snapshotted because a successful scan mutates convmap (we
    # return right after it).
    candidates = sorted(cp for cp in convmap if 0 <= cp < 65536)

    for cp in candidates:
        start_i, start_o, count = scan_range_with_skip(convmap, cp, skip)
        if start_i is not None:
            return start_i, start_o, count

    return None, None, None
|
|
|
|
def generate_caseconv_tables(convmap):
    """Generate bit-packed case conversion table for a given conversion map.

    *convmap* maps a codepoint (number) to its converted string.  It is
    consumed: every mapping is moved into the encoded output, leaving the
    dict empty.

    Emitted bitstream layout:

    - for each skip 1...6: a 6-bit range count, then per range a 16-bit
      input start, 16-bit output start and 7-bit length;
    - a 6-bit 0x3f end-of-skips marker;
    - a 7-bit count of 1:1 mappings, each as 16-bit input + 16-bit output;
    - a 7-bit count of 1:n mappings, each as 16-bit input, 2-bit output
      length and that many 16-bit output codepoints.

    Returns the tuple ``(be.getBytes(), be.getNumBits())`` of the
    dukutil.BitEncoder used for the encoding.

    Raises:
        Exception: if one skip value has 0x3f or more ranges, because the
            count would then be indistinguishable from the end marker.
    """

    # The bitstream encoding is based on manual inspection for whatever
    # regularity the Unicode case conversion rules have.
    #
    # Start with a full description of case conversions which does not
    # cover all codepoints; unmapped codepoints convert to themselves.
    # Scan for range-to-range mappings with a range of skips starting from 1.
    # Whenever a valid range is found, remove it from the map.  Finally,
    # output the remaining case conversions (1:1 and 1:n) on a per codepoint
    # basis.
    #
    # This is slow because we always scan from scratch, but it's the
    # most reliable and simple way to scan.

    print('generate caseconv tables')

    ranges = []   # range mappings (2 or more consecutive mappings with a certain skip)
    singles = []  # 1:1 character mappings
    multis = []   # 1:n character mappings

    # Ranges with skips

    for skip in range(1, 6 + 1):  # skips 1...6 are useful
        while True:
            start_i, start_o, count = find_first_range_with_skip(convmap, skip)
            if start_i is None:
                break
            print('- skip %d: %d %d %d' % (skip, start_i, start_o, count))
            ranges.append([start_i, start_o, count, skip])

    # 1:1 conversions
    #
    # sorted() takes a snapshot of the keys, so deleting while looping is
    # safe (and works where keys() is not a sortable list).

    for cp in sorted(convmap.keys()):
        if len(convmap[cp]) > 1:
            continue
        singles.append([cp, ord(convmap[cp])])  # codepoint, codepoint
        del convmap[cp]

    # There are many mappings to 2-char sequences with latter char being
    # U+0399, e.g.:
    #
    #   [8064L, u'\u1f08\u0399']
    #   [8065L, u'\u1f09\u0399']
    #   [8066L, u'\u1f0a\u0399']
    #   ...
    #
    # These could be handled as a special case (collect the first chars
    # into a temporary map and range-scan that with skip 1), but we don't
    # do that right now.  XXX: such a scheme would need to put the
    # remaining mappings (12 in an earlier experiment) back to convmap.

    # 1:n conversions

    for cp in sorted(convmap.keys()):
        multis.append([cp, convmap[cp]])  # codepoint, string
        del convmap[cp]

    for t in singles:
        print('- singles: ' + repr(t))

    for t in multis:
        print('- multis: ' + repr(t))

    print('- range mappings: %d' % len(ranges))
    print('- single character mappings: %d' % len(singles))
    print('- complex mappings (1:n): %d' % len(multis))
    print('- remaining (should be zero): %d' % len(convmap.keys()))

    # XXX: opportunities for diff encoding skip=3 ranges?
    # Debug output only: deltas between consecutive skip=3 ranges.
    prev = None
    for t in ranges:
        # range: [start_i, start_o, count, skip]
        if t[3] != 3:
            continue
        if prev is not None:
            print('- %d %d' % (t[0] - prev[0], t[1] - prev[1]))
        else:
            print('- start: %d %d' % (t[0], t[1]))
        prev = t

    # Bit packed encoding.

    be = dukutil.BitEncoder()

    for curr_skip in range(1, 7):  # 1...6
        selected = [r for r in ranges if r[3] == curr_skip]
        count = len(selected)
        if count >= 0x3f:
            # 0x3f in the count field is the end-of-skips marker below.
            raise Exception('too many ranges for skip %d: %d' % (curr_skip, count))
        be.bits(count, 6)
        print('- encode: skip=%d, count=%d' % (curr_skip, count))

        for start_i, start_o, r_count, _ in selected:
            be.bits(start_i, 16)
            be.bits(start_o, 16)
            be.bits(r_count, 7)
    be.bits(0x3f, 6)  # maximum count value = end of skips

    be.bits(len(singles), 7)
    for cp_i, cp_o in singles:
        be.bits(cp_i, 16)
        be.bits(cp_o, 16)

    be.bits(len(multis), 7)
    for cp_i, str_o in multis:
        be.bits(cp_i, 16)
        be.bits(len(str_o), 2)
        for ch in str_o:
            be.bits(ord(ch), 16)

    return be.getBytes(), be.getNumBits()
|
|
|
|
def generate_regexp_canonicalize_tables(convmap):
    """Generate tables for case insensitive RegExp normalization.

    *convmap* maps a codepoint (number) to its converted string; it is
    only read, never modified.

    Returns ``(canontab, canon_bitmap)``:

    - ``canontab`` is a list of 65536 numbers, the canonicalized
      codepoint for every BMP codepoint;
    - ``canon_bitmap`` is a dict with keys ``'data'`` (list of byte
      values, one bit per codepoint block), ``'block_size'``,
      ``'block_shift'`` and ``'block_mask'``.

    Several intermediate statistics (marked "currently unused" below) are
    computed only for the progress output they print.
    """

    # Generate a direct codepoint lookup for canonicalizing BMP range.

    def generate_canontab():
        res = []
        highest_nonid = -1

        for cp in range(65536):
            res_cp = cp  # default to as is
            if cp in convmap:
                tmp = convmap[cp]
                if len(tmp) == 1:
                    # Only 1:1 mappings are used; if the conversion
                    # yields multiple codepoints, ignore it.
                    res_cp = ord(tmp[0])
            if cp >= 0x80 and res_cp < 0x80:
                res_cp = cp  # If non-ASCII mapped to ASCII, ignore.
            if cp != res_cp:
                highest_nonid = cp
            res.append(res_cp)

        # At the moment this is 65370, which means there's very little
        # gain in assuming 1:1 mapping above a certain BMP codepoint
        # (though we do assume 1:1 mapping for above BMP codepoints).
        print('- highest non-identity mapping: %d' % highest_nonid)

        return res

    print('generate canontab')
    canontab = generate_canontab()

    # Figure out which BMP values are never the result of canonicalization.
    # Such codepoints are "don't care" in the sense that they are never
    # matched against at runtime: ranges are canonicalized at compile time,
    # and codepoint being matched is also canonicalized at run time.
    # (Currently unused.)

    def generate_dontcare():
        res = [True] * 65536
        for cp in canontab:
            res[cp] = False
        res_count = sum(1 for flag in res if flag)
        print('- %d dontcare codepoints' % res_count)
        return res

    print('generate canon dontcare')
    generate_dontcare()  # result unused; called for its printed statistic

    # Generate maximal continuous ranges for canonicalization.  A continuous
    # range is a sequence with N codepoints where IN+i canonicalizes to OUT+i
    # for fixed IN, OUT, and i in 0...N-1.  There are unfortunately >1000
    # of these ranges, mostly because there are a lot of individual exceptions.
    # (Currently unused.)
    #
    # Whether two neighbouring ranges can be merged depends only on the
    # two codepoints at their boundary, so a single left-to-right pass
    # yields the same maximal ranges as repeatedly merging adjacent pairs.

    print('generate canon_ranges')
    canon_ranges = []  # entries are [in_start, out_start, length]
    for cp in range(65536):
        out = canontab[cp]
        if canon_ranges and canon_ranges[-1][1] + canon_ranges[-1][2] == out:
            canon_ranges[-1][2] += 1
        else:
            canon_ranges.append([cp, out, 1])
    print('- %d ranges' % len(canon_ranges))

    # Generate true/false ranges for BMP codepoints where:
    #   - A codepoint is flagged true if continuity is broken at that point, so
    #     an explicit codepoint canonicalization is needed at runtime.
    #   - A codepoint is flagged false if case conversion is continuous from the
    #     previous codepoint, i.e. out_curr = out_prev + 1.
    #
    # The result is a lot of small ranges due to a lot of small 'false' ranges.
    # Reduce the range set by checking if adjacent 'true' ranges have at most
    # false_limit 'false' entries between them.  If so, force the 'false'
    # entries to 'true' (safe but results in an unnecessary runtime codepoint
    # lookup) and merge the three ranges into a larger 'true' range.
    #
    # (Currently unused.)

    def generate_needcheck_straight():
        # First create a straight true/false bitmap for BMP.
        res = [True] * 65536
        assert canontab[0] == 0  # can start from in == out == 0
        prev_out = -1
        for cp in range(65536):
            curr_out = canontab[cp]
            # Inputs are visited consecutively, so only the outputs need
            # to be compared for continuity.
            if prev_out + 1 == curr_out:
                res[cp] = False
            prev_out = curr_out
        return res

    def generate_needcheck_ranges(data):
        # Generate maximal accurate ranges: run-length encode 'data' into
        # [value, run_length] pairs.
        ranges = []
        for value in data:
            if ranges and ranges[-1][0] == value:
                ranges[-1][1] += 1
            else:
                ranges.append([value, 1])
        return ranges

    def fillin_needcheck_ranges(data, false_limit):
        # Fill in TRUE-FALSE*N-TRUE gaps into TRUE-TRUE*N-TRUE which is
        # safe (leads to an unnecessary runtime check) but reduces
        # range data size considerably.
        res = [[r[0], r[1]] for r in data]
        while True:
            found = False
            for i in range(len(res) - 2):
                r1 = res[i]
                r2 = res[i + 1]
                r3 = res[i + 2]
                if r1[0] == True and r2[0] == False and r3[0] == True and \
                   r2[1] <= false_limit:
                    res.pop(i + 2)
                    res.pop(i + 1)
                    res[i] = [True, r1[1] + r2[1] + r3[1]]
                    found = True
                    break  # list changed, rescan from the start
            if not found:
                break
        return res

    print('generate needcheck straight')
    needcheck = generate_needcheck_straight()

    print('generate needcheck without false fillins')
    needcheck_ranges1 = generate_needcheck_ranges(needcheck)
    print('- %d ranges' % len(needcheck_ranges1))

    print('generate needcheck with false fillins')
    needcheck_ranges2 = fillin_needcheck_ranges(needcheck_ranges1, 11)
    print('- %d ranges' % len(needcheck_ranges2))

    # Generate a bitmap for BMP, divided into N-codepoint blocks, with each
    # bit indicating: "entire codepoint block canonicalizes continuously, and
    # the block is continuous with the previous and next block".  A 'true'
    # entry allows runtime code to just skip the block, advancing 'in' and
    # 'out' by the block size, with no codepoint conversion.  The block size
    # should be large enough to produce a relatively small lookup table, but
    # small enough to reduce codepoint conversions to a manageable number
    # because the conversions are (currently) quite slow.  This matters
    # especially for case-insensitive RegExps; without any optimization,
    # /[\u0000-\uffff]/i requires 65536 case conversions for runtime
    # normalization.

    block_shift = 5
    block_size = 1 << block_shift
    block_mask = block_size - 1
    # Floor division: the block count is used as a list length and a
    # range() bound, so it must stay an integer.
    num_blocks = 65536 // block_size

    def generate_block_bits(check_continuity):
        res = [True] * num_blocks
        for i in range(num_blocks):
            base_in = i * block_size
            base_out = canontab[base_in]
            if check_continuity:
                lower = -1  # [-1,block_size]
                upper = block_size + 1
            else:
                lower = 0   # [0,block_size-1]
                upper = block_size
            for j in range(lower, upper):
                cp = base_in + j
                # Neighbours outside the BMP are simply not checked.
                if 0x0000 <= cp <= 0xffff and canontab[cp] != base_out + j:
                    res[i] = False
                    break
        return res

    def dump_block_bitmap(bits):
        tmp = ''.join('x' if b else '.' for b in bits)
        # Break the picture into rows of 64 blocks.
        tmp = re.sub(r'.{64}', lambda m: m.group(0) + '\n', tmp)
        blocks_true = tmp.count('x')
        blocks_false = tmp.count('.')
        print('%d codepoint blocks are continuous, %d blocks are not' % (blocks_true, blocks_false))
        sys.stdout.write(tmp)

    def dump_test_lookup(bits):
        sys.stdout.write('duk_uint8_t test = {')
        for b in bits:
            sys.stdout.write('1,' if b else '0,')
        sys.stdout.write('};\n')

    def convert_to_bitmap(bits):
        # C code looks up bits as:
        #   index = codepoint >> N
        #   bitnum = codepoint & mask
        #   bitmask = 1 << bitnum
        # So block 0 is mask 0x01 of first byte, block 1 is mask 0x02 of
        # first byte, etc.
        res = []
        curr = 0
        mask = 0x01
        for b in bits:
            if b:
                curr += mask
            mask = mask * 2
            if mask == 0x100:
                res.append(curr)
                curr = 0
                mask = 0x01
        assert mask == 0x01  # no leftover
        return res

    print('generate canon block bitmap without continuity')
    block_bits1 = generate_block_bits(False)
    dump_block_bitmap(block_bits1)
    dump_test_lookup(block_bits1)

    print('generate canon block bitmap with continuity')
    block_bits2 = generate_block_bits(True)
    dump_block_bitmap(block_bits2)
    dump_test_lookup(block_bits2)

    print('generate final canon bitmap')
    block_bitmap = convert_to_bitmap(block_bits2)
    print('- %d bytes' % len(block_bitmap))
    print('- ' + repr(block_bitmap))
    canon_bitmap = {
        'data': block_bitmap,
        'block_size': block_size,
        'block_shift': block_shift,
        'block_mask': block_mask
    }

    # This is useful to figure out corner case test cases.
    print('canon blocks which are different with and without continuity check')
    for i in range(num_blocks):
        if block_bits1[i] != block_bits2[i]:
            print('- block %d ([%d,%d]) differs' % (i, i * block_size, i * block_size + block_size - 1))

    return canontab, canon_bitmap
|
|
|
|
def clonedict(x):
    """Shallow clone of input dict.

    Returns a new dict with the same keys and the same (not copied)
    values as *x*.
    """
    return dict(x)
|
|
|
|
def _new_generator():
    """Return a dukutil.GenerateC with the standard file header emitted."""
    genc = dukutil.GenerateC()
    genc.emitHeader('extract_caseconv.py')
    return genc


def _write_generated(genc, filename):
    """Write the text accumulated in *genc* to *filename*."""
    # Binary mode is kept from the original code.
    # NOTE(review): this assumes genc.getString() returns something a
    # binary file accepts on the interpreter in use -- confirm in dukutil.
    with open(filename, 'wb') as f:
        f.write(genc.getString())


def main():
    """Command line entry point.

    Reads the Unicode data files named by --unicode-data and
    --special-casing, then writes a C source file (--out-source) and
    header (--out-header) for the table selected with --command:

    - ``caseconv_bitpacked`` (default): bit-packed upper/lowercase tables;
    - ``re_canon_lookup``: direct RegExp canonicalization lookup table;
    - ``re_canon_bitmap``: block bitmap for skipping continuous blocks.

    Raises:
        Exception: if --command is not one of the above.
    """
    parser = optparse.OptionParser()
    parser.add_option('--command', dest='command', default='caseconv_bitpacked')
    parser.add_option('--unicode-data', dest='unicode_data')
    parser.add_option('--special-casing', dest='special_casing')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc')
    parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc')
    parser.add_option('--table-name-re-canon-lookup', dest='table_name_re_canon_lookup', default='caseconv_re_canon_lookup')
    parser.add_option('--table-name-re-canon-bitmap', dest='table_name_re_canon_bitmap', default='caseconv_re_canon_bitmap')
    (opts, args) = parser.parse_args()

    # Every command needs both inputs and both outputs; report a missing
    # one as a usage error instead of failing later inside open().
    required = (
        ('unicode_data', '--unicode-data'),
        ('special_casing', '--special-casing'),
        ('out_source', '--out-source'),
        ('out_header', '--out-header'),
    )
    for dest, flag in required:
        if getattr(opts, dest) is None:
            parser.error('missing required option: ' + flag)

    unicode_data = UnicodeData(opts.unicode_data)
    special_casing = SpecialCasing(opts.special_casing)

    uc, lc, tc = get_base_conversion_maps(unicode_data)
    update_special_casings(uc, lc, tc, special_casing)

    if opts.command == 'caseconv_bitpacked':
        # XXX: ASCII and non-BMP filtering could be an option but is now hardcoded

        # ASCII is handled with 'fast path' so not needed here.
        # The maps are cloned because table generation consumes its input.
        t = clonedict(uc)
        remove_ascii_part(t)
        uc_bytes, _ = generate_caseconv_tables(t)

        t = clonedict(lc)
        remove_ascii_part(t)
        lc_bytes, _ = generate_caseconv_tables(t)

        # Generate C source and header files.
        genc = _new_generator()
        genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
        genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
        _write_generated(genc, opts.out_source)

        genc = _new_generator()
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes)))
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes)))
        _write_generated(genc, opts.out_header)
    elif opts.command == 're_canon_lookup':
        # Direct canonicalization lookup for case insensitive regexps, includes ascii part.
        t = clonedict(uc)
        re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)

        genc = _new_generator()
        genc.emitArray(re_canon_lookup, opts.table_name_re_canon_lookup, size=len(re_canon_lookup), typename='duk_uint16_t', intvalues=True, const=True)
        _write_generated(genc, opts.out_source)

        genc = _new_generator()
        genc.emitLine('extern const duk_uint16_t %s[%d];' % (opts.table_name_re_canon_lookup, len(re_canon_lookup)))
        _write_generated(genc, opts.out_header)
    elif opts.command == 're_canon_bitmap':
        # N-codepoint block bitmap for skipping continuous codepoint blocks
        # quickly.
        t = clonedict(uc)
        re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)
        bitmap_data = re_canon_bitmap['data']

        genc = _new_generator()
        genc.emitArray(bitmap_data, opts.table_name_re_canon_bitmap, size=len(bitmap_data), typename='duk_uint8_t', intvalues=True, const=True)
        _write_generated(genc, opts.out_source)

        genc = _new_generator()
        genc.emitDefine('DUK_CANON_BITMAP_BLKSIZE', re_canon_bitmap['block_size'])
        genc.emitDefine('DUK_CANON_BITMAP_BLKSHIFT', re_canon_bitmap['block_shift'])
        genc.emitDefine('DUK_CANON_BITMAP_BLKMASK', re_canon_bitmap['block_mask'])
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_re_canon_bitmap, len(bitmap_data)))
        _write_generated(genc, opts.out_header)
    else:
        raise Exception('invalid command: %r' % opts.command)
|
|
|
|
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|