#!/usr/bin/env python2
#
# Select a set of Unicode characters (based on included/excluded categories
# etc) and write out a compact bitstream for matching a character against
# the set at runtime. This is for the slow path, where we're especially
# concerned with compactness. A C source file with the table is written,
# together with a matching C header.
#
# Unicode categories (such as 'Z') can be used. Two pseudo-categories
# are also available for exclusion only: ASCII and NONBMP. "ASCII"
# category excludes ASCII codepoints which is useful because C code
# typically contains an ASCII fast path so ASCII characters don't need
# to be considered in the Unicode tables. "NONBMP" excludes codepoints
# above U+FFFF which is useful because such codepoints don't need to be
# supported in standard ECMAScript.
#

import os
import sys
import math
import optparse

import dukutil


def _line_codepoint(line):
    "Return the codepoint (first ';'-separated field, parsed as hex) of a line."
    return int(line.split(';')[0], 16)


def read_unicode_data(unidata, catsinc, catsexc, filterfunc):
    """Read UnicodeData.txt, including lines matching catsinc unless excluded by catsexc or filterfunc.

    unidata    -- path of the UnicodeData.txt style input file
    catsinc    -- list of strings; a line is included when its category
                  (third field) starts with one of them, or when its
                  codepoint field (first field, as written in the file)
                  equals one of them
    catsexc    -- list of strings matched the same way; exclusion wins
                  over inclusion
    filterfunc -- callable taking the integer codepoint and returning a
                  false value to drop the line; None accepts everything

    Returns the accepted raw lines (line terminators retained), sorted by
    codepoint.  Whitespace-only lines are skipped.

    NOTE(review): "<..., First>" / "<..., Last>" range pairs are not
    expanded here; presumably the input has been pre-expanded -- confirm.
    """

    def filter_none(cp):
        return True

    if filterfunc is None:
        filterfunc = filter_none

    # The Unicode parsing is slow enough to warrant some speedups:
    # str.startswith() takes a tuple of prefixes, and exact codepoint
    # matches are set lookups.
    exclude_prefixes = tuple(catsexc)
    include_prefixes = tuple(catsinc)
    exclude_exact = set(catsexc)
    include_exact = set(catsinc)

    res = []
    with open(unidata, 'rb') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. at end of file)
            parts = line.split(';')
            codepoint = parts[0]
            if not filterfunc(int(codepoint, 16)):
                continue
            category = parts[2]
            if category.startswith(exclude_prefixes) or codepoint in exclude_exact:
                continue
            if category.startswith(include_prefixes) or codepoint in include_exact:
                res.append(line)

    # Sort based on Unicode codepoint
    res.sort(key=_line_codepoint)
    return res


def scan_ranges(lines):
    """Scan continuous ranges from (filtered) UnicodeData.txt lines.

    lines must be ordered by codepoint.  Returns a list of inclusive
    (start, end) codepoint tuples; consecutive codepoints are merged into
    a single tuple.  An empty input gives an empty list.
    """
    ranges = []
    range_start = None
    prev = None

    for line in lines:
        n = _line_codepoint(line)
        if range_start is None:
            range_start = n
        elif n != prev + 1:
            # gap: close the current range and start a new one
            ranges.append((range_start, prev))
            range_start = n
        prev = n

    if range_start is not None:
        ranges.append((range_start, prev))

    return ranges


def generate_png(lines, fname):
    """Generate an illustrative PNG of the character set.

    The image is 256 pixels wide with one pixel per codepoint in
    U+0000..U+10FFFF, row-major; codepoints present in lines are black,
    all others white.  Requires PIL (imported lazily so that the rest of
    the tool works without it).
    """
    from PIL import Image

    members = set(_line_codepoint(line) for line in lines)

    codepoints = 0x10ffff + 1
    width = 256
    height = int(math.ceil(float(codepoints) / float(width)))
    im = Image.new('RGB', (width, height))
    black = (0, 0, 0)
    white = (255, 255, 255)

    for y in range(height):
        for x in range(width):
            cp = y * width + x
            if cp >= codepoints:
                break
            im.putpixel((x, y), black if cp in members else white)

    im.save(fname)


def generate_match_table1(ranges):
    """Unused match table format.

    ranges is a non-empty list of inclusive (start, end) tuples in
    ascending order.  Returns a list of integers: a 4-entry header (start
    of first range high/low byte, number of entries high/low byte)
    followed by alternating range/skip lengths.  Lengths above 255 are
    split into 255-sized chunks separated by a zero entry of the other
    kind.
    """
    # This is an earlier match table format which is no longer used.
    # IdentifierStart-UnicodeLetter has 445 ranges and generates a
    # match table of 2289 bytes.

    if not ranges:
        raise Exception('no ranges to encode')

    data = []

    def emit_run(first, last):
        # Shared by ranges and skips: both are emitted as chunked lengths.
        while True:
            now = last - first + 1
            if now > 255:
                data.append(255)  # this kind: 255
                data.append(0)    # other kind: 0
                first += 255
            else:
                data.append(now)
                break

    def genrange(rs, re):
        if rs > re:
            raise Exception('assumption failed: rs=%d re=%d' % (rs, re))
        emit_run(rs, re)

    def genskip(ss, se):
        if ss > se:
            raise Exception('assumption failed: ss=%d se=%s' % (ss, se))
        emit_run(ss, se)

    prev_re = None
    for rs, re in ranges:
        if prev_re is not None:
            genskip(prev_re + 1, rs - 1)
        genrange(rs, re)
        prev_re = re

    num_entries = len(data)

    # header: start of first range
    #         num entries
    first_start = ranges[0][0]
    hdr = []
    hdr.append(first_start >> 8)    # XXX: check that not 0x10000 or over
    hdr.append(first_start & 0xff)  # low byte of the range START (was end)
    hdr.append(num_entries >> 8)
    hdr.append(num_entries & 0xff)

    return hdr + data


def generate_match_table2(ranges):
    """Unused match table format.

    For each (start, end) range two values are emitted: the distance from
    the previous range end (initially 0) to start, and end - start.  Each
    value is written 7 bits at a time, least significant group first,
    with 0x80 set on every byte except the last.  A single 0 value
    terminates the table.  Returns a list of integers.
    """
    # Another attempt at a match table which is also unused.
    # Total tables for all current classes is now 1472 bytes.

    data = []

    def enc(x):
        while True:
            if x < 0x80:
                data.append(x)
                break
            data.append(0x80 + (x & 0x7f))
            x = x >> 7

    prev_re = 0
    for rs, re in ranges:
        r1 = rs - prev_re  # 1 or above (no unjoined ranges)
        r2 = re - rs       # 0 or above
        enc(r1)
        enc(r2)
        prev_re = re

    enc(0)  # end marker

    return data


def generate_match_table3(ranges):
    """Current match table format.

    For each (start, end) range two values are emitted: the distance from
    the previous range end (initially 0) to start, and end - start; a
    single 0 value terminates the stream.  Each value x is packed as:

        x <= 0x0e            4 bits:  x
        x <= 0x10c           12 bits: 0xf, then x - 0x0f in 8 bits
        x <= 0x110c          24 bits: 0xf, 0xfe, then x - 0x10d in 12 bits
        otherwise            36 bits: 0xf, 0xff, then x - 0x110d in 24 bits

    Returns (data, freq) where data is whatever dukutil.BitEncoder's
    getBytes() returns and freq[x] counts how many times value x was
    encoded (informative only).  Raises Exception('cannot encode') for a
    value that is negative or does not fit the 24-bit payload.

    NOTE(review): a first range starting at U+0000 would encode its
    distance as 0, which is also the end marker -- confirm callers never
    include codepoint 0.
    """
    # Yet another attempt, similar to generate_match_table2 except
    # in packing format.
    #
    # Total match size now (at time of writing): 1194 bytes.
    #
    # This is the current encoding format used in duk_lexer.c.

    be = dukutil.BitEncoder()

    freq = [0] * (0x10ffff + 1)  # informative

    def enc(x):
        if x < 0:
            # Would otherwise silently index freq from the end.
            raise Exception('cannot encode')

        freq[x] += 1

        if x <= 0x0e:
            # 4-bit encoding
            be.bits(x, 4)
            return
        x -= 0x0e + 1
        if x <= 0xfd:
            # 12-bit encoding
            be.bits(0x0f, 4)
            be.bits(x, 8)
            return
        x -= 0xfd + 1
        if x <= 0xfff:
            # 24-bit encoding
            be.bits(0x0f, 4)
            be.bits(0xfe, 8)
            be.bits(x, 12)
            return
        x -= 0xfff + 1
        if x <= 0xffffff:
            # 36-bit encoding
            be.bits(0x0f, 4)
            be.bits(0xff, 8)
            be.bits(x, 24)
            return

        raise Exception('cannot encode')

    prev_re = 0
    for rs, re in ranges:
        r1 = rs - prev_re  # 1 or above (no unjoined ranges)
        r2 = re - rs       # 0 or above
        enc(r1)
        enc(r2)
        prev_re = re

    enc(0)  # end marker

    data = be.getBytes()
    return data, freq


def main():
    """Command line entry point.

    Reads the file given by --unicode-data, selects characters using
    --include-categories / --exclude-categories (comma separated; the
    exclude list also understands the ASCII and NONBMP pseudo-categories
    and defaults to 'NONE', meaning no exclusions), and writes the match
    table as a C source (--out-source), a C header (--out-header) and
    optionally an illustrative PNG (--out-png).
    """
    parser = optparse.OptionParser()
    parser.add_option('--unicode-data', dest='unicode_data')      # UnicodeData.txt
    parser.add_option('--special-casing', dest='special_casing')  # SpecialCasing.txt
    parser.add_option('--include-categories', dest='include_categories')
    parser.add_option('--exclude-categories', dest='exclude_categories', default='NONE')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--out-png', dest='out_png')
    parser.add_option('--table-name', dest='table_name', default='match_table')
    (opts, args) = parser.parse_args()

    unidata = opts.unicode_data
    if unidata is None:
        parser.error('--unicode-data is required')

    # An omitted --include-categories (None) is treated like an empty one.
    catsinc = []
    if opts.include_categories:
        catsinc = opts.include_categories.split(',')
    catsexc = []
    if opts.exclude_categories != 'NONE':
        catsexc = opts.exclude_categories.split(',')

    print('CATSEXC: %s' % repr(catsexc))
    print('CATSINC: %s' % repr(catsinc))

    # pseudocategories
    filter_ascii = ('ASCII' in catsexc)
    filter_nonbmp = ('NONBMP' in catsexc)

    # Read raw result
    def filter1(x):
        if filter_ascii and x <= 0x7f:
            # exclude ascii
            return False
        if filter_nonbmp and x >= 0x10000:
            # exclude non-bmp
            return False
        return True

    print('read unicode data')
    uni_filtered = read_unicode_data(unidata, catsinc, catsexc, filter1)
    print('done reading unicode data')

    # Raw output
    #print('RAW OUTPUT:')
    #print('===========')
    #print('\n'.join(uni_filtered))

    # Scan ranges
    #print('')
    #print('RANGES:')
    #print('=======')
    ranges = scan_ranges(uni_filtered)
    #for i in ranges:
    #    if i[0] == i[1]:
    #        print('0x%04x' % i[0])
    #    else:
    #        print('0x%04x ... 0x%04x' % (i[0], i[1]))
    #print('')

    print('%d ranges total' % len(ranges))

    # Generate match table
    #print('')
    #print('MATCH TABLE:')
    #print('============')
    #matchtable1 = generate_match_table1(ranges)
    #matchtable2 = generate_match_table2(ranges)
    matchtable3, freq = generate_match_table3(ranges)
    #print('match table: %s' % repr(matchtable3))
    print('match table length: %d bytes' % len(matchtable3))
    print('encoding freq:')
    for i, count in enumerate(freq):
        if count == 0:
            continue
        print(' %6d: %d' % (i, count))

    print('')
    print('MATCH C TABLE -> file %s' % repr(opts.out_header))

    # Create C source and header files
    genc = dukutil.GenerateC()
    genc.emitHeader('extract_chars.py')
    genc.emitArray(matchtable3, opts.table_name, size=len(matchtable3), typename='duk_uint8_t', intvalues=True, const=True)
    if opts.out_source is not None:
        with open(opts.out_source, 'wb') as f:
            f.write(genc.getString())

    genc = dukutil.GenerateC()
    genc.emitHeader('extract_chars.py')
    genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name, len(matchtable3)))
    if opts.out_header is not None:
        with open(opts.out_header, 'wb') as f:
            f.write(genc.getString())

    # Image (for illustrative purposes only)
    if opts.out_png is not None:
        generate_png(uni_filtered, opts.out_png)


if __name__ == '__main__':
    main()