OVMS3/OVMS.V3/components/duktape/tools/extract_chars.py

#!/usr/bin/env python2
#
#  Select a set of Unicode characters (based on included/excluded categories
#  etc) and write out a compact bitstream for matching a character against
#  the set at runtime.  This is for the slow path, where we're especially
#  concerned with compactness.  A C source file with the table is written,
#  together with a matching C header.
#
#  Unicode categories (such as 'Z') can be used.  Two pseudo-categories
#  are also available for exclusion only: ASCII and NONBMP.  "ASCII"
#  category excludes ASCII codepoints which is useful because C code
#  typically contains an ASCII fast path so ASCII characters don't need
#  to be considered in the Unicode tables.  "NONBMP" excludes codepoints
#  above U+FFFF which is useful because such codepoints don't need to be
#  supported in standard ECMAScript.
#

import os
import sys
import math
import optparse

import dukutil

def read_unicode_data(unidata, catsinc, catsexc, filterfunc):
    """Read UnicodeData.txt and return the selected lines sorted by codepoint.

    unidata    -- path of the UnicodeData.txt file to read
    catsinc    -- selectors to include; a line matches a selector when its
                  general category (third ';'-separated field) starts with
                  the selector, or when its codepoint field (first field,
                  compared as text) equals the selector
    catsexc    -- selectors to exclude, matched the same way; exclusion
                  takes precedence over inclusion
    filterfunc -- optional callable which receives the integer codepoint
                  and returns False to drop the line; None accepts all

    Returns the matching raw lines (line terminators intact), ordered by
    ascending codepoint.  Lines with fewer than three fields (e.g. blank
    lines) are skipped.
    """

    # str.startswith() accepts a tuple of prefixes, so a single call covers
    # both the exact category match and the prefix match (an exact match is
    # also a prefix match).  An empty tuple never matches.
    exc_prefixes = tuple(catsexc)
    inc_prefixes = tuple(catsinc)
    exc_exact = frozenset(exc_prefixes)
    inc_exact = frozenset(inc_prefixes)

    selected = []

    # Text mode so that lines are 'str' on both Python 2 and Python 3; the
    # context manager closes the file even if parsing raises.
    with open(unidata, 'r') as f:
        for line in f:
            parts = line.split(';')
            if len(parts) < 3:
                continue  # blank or malformed line, nothing to classify

            codepoint = parts[0]
            cp = int(codepoint, 16)
            if filterfunc is not None and not filterfunc(cp):
                continue

            category = parts[2]
            if codepoint in exc_exact or category.startswith(exc_prefixes):
                continue  # exclusion wins over inclusion

            if codepoint in inc_exact or category.startswith(inc_prefixes):
                # Keep the parsed codepoint so sorting does not re-parse.
                selected.append((cp, line))

    # Sort based on Unicode codepoint; the sort is stable, so lines with an
    # equal codepoint keep their file order.
    selected.sort(key=lambda item: item[0])

    return [line for _, line in selected]

def scan_ranges(lines):
    """Scan continuous ranges from (filtered) UnicodeData.txt lines.

    lines -- iterable of UnicodeData.txt lines whose first ';'-separated
             field is a hexadecimal codepoint; expected to be sorted by
             ascending codepoint

    Returns a list of inclusive (start, end) integer tuples, one for each
    run of consecutive codepoints.  An empty input gives an empty list.
    """
    ranges = []
    range_start = None
    prev = None

    for line in lines:
        # int() parses arbitrarily large values on both Python 2 and 3
        # (Python 3 has no separate 'long' type).
        n = int(line.split(';', 1)[0], 16)
        if range_start is None:
            range_start = n
        elif n != prev + 1:
            # Gap found: close the current range and start a new one.
            ranges.append((range_start, prev))
            range_start = n
        prev = n

    # Close the final range, if any line was seen at all.
    if range_start is not None:
        ranges.append((range_start, prev))

    return ranges

def generate_png(lines, fname):
    """Generate an illustrative PNG of the character set.

    lines -- UnicodeData.txt lines whose first ';'-separated field is a
             hexadecimal codepoint
    fname -- output image path, passed to PIL's Image.save()

    The image is 256 pixels wide with one pixel per codepoint in
    U+0000..U+10FFFF (row = codepoint // 256, column = codepoint % 256).
    Codepoints present in 'lines' are black, all others white.
    """
    from PIL import Image

    members = set()
    for line in lines:
        members.add(int(line.split(';')[0], 16))

    codepoints = 0x10ffff + 1
    width = 256
    height = (codepoints + width - 1) // width  # round up to whole rows
    black = (0, 0, 0)
    white = (255, 255, 255)

    # Start from an all-white canvas and paint only the member codepoints
    # instead of touching every one of the ~1.1M pixels individually.
    im = Image.new('RGB', (width, height), white)
    for cp in members:
        if 0 <= cp < codepoints:
            # Floor division keeps the row an integer on Python 3 too.
            im.putpixel((cp % width, cp // width), black)

    im.save(fname)

def generate_match_table1(ranges):
    """Unused match table format.

    This is an earlier match table format which is no longer used.
    IdentifierStart-UnicodeLetter has 445 ranges and generates a
    match table of 2289 bytes.

    ranges -- non-empty list of inclusive (start, end) codepoint tuples,
              sorted and separated by at least one codepoint

    Returns a list of byte values: a 4-byte header (start of the first
    range and number of data entries, both 16-bit big endian) followed by
    alternating "range length" / "skip length" entries.  Runs longer than
    255 are split into 255-sized pieces separated by a zero entry of the
    opposite kind.

    Raises Exception when a range or a gap is empty/reversed, or when the
    header fields do not fit into 16 bits; raises IndexError when 'ranges'
    is empty.
    """

    data = []

    def emit_run(first, last, tag):
        # Emit the length of the inclusive run first..last.  'tag' ('r' for
        # a range, 's' for a skip) is only used in the error message.
        if first > last:
            raise Exception('assumption failed: %ss=%d %se=%d' % (tag, first, tag, last))

        remaining = last - first + 1
        while remaining > 255:
            data.append(255)    # maximum-sized piece of this run
            data.append(0)      # zero-length run of the opposite kind
            remaining -= 255
        data.append(remaining)

    prev_re = None
    for rs, re in ranges:
        if prev_re is not None:
            emit_run(prev_re + 1, rs - 1, 's')  # gap before this range
        emit_run(rs, re, 'r')
        prev_re = re

    num_entries = len(data)
    first_start = ranges[0][0]

    # Both header fields are stored in two bytes each.
    if first_start > 0xffff:
        raise Exception('first range start 0x%x does not fit in 16 bits' % first_start)
    if num_entries > 0xffff:
        raise Exception('number of entries %d does not fit in 16 bits' % num_entries)

    # header: start of first range
    #         num entries
    hdr = [
        first_start >> 8,
        first_start & 0xff,     # low byte of the START (not the end) of the range
        num_entries >> 8,
        num_entries & 0xff,
    ]

    return hdr + data

def generate_match_table2(ranges):
    """Unused match table format.

    Another attempt at a match table which is also unused.
    Total tables for all current classes is now 1472 bytes.

    Each (start, end) range contributes two variable-length integers: the
    distance from the end of the previous range (the first range is
    measured from 0) and the range length minus one.  A final zero value
    terminates the table.  Returns the list of byte values.
    """

    out = []

    def put_varint(value):
        # Seven payload bits per byte, least significant group first; the
        # high bit marks that more bytes follow.
        while value >= 0x80:
            out.append(0x80 + (value & 0x7f))
            value >>= 7
        out.append(value)

    last_end = 0
    for first, last in ranges:
        put_varint(first - last_end)    # 1 or above (no unjoined ranges)
        put_varint(last - first)        # 0 or above
        last_end = last

    put_varint(0)    # end marker

    return out

def generate_match_table3(ranges):
    """Current match table format.

    Yet another attempt, similar to generate_match_table2 except
    in packing format.

    Total match size now (at time of writing): 1194 bytes.

    This is the current encoding format used in duk_lexer.c.

    ranges -- list of inclusive (start, end) codepoint tuples, sorted and
              non-overlapping

    Each range contributes two values: the distance from the end of the
    previous range (the first range is measured from 0) and the range
    length minus one.  A final zero value terminates the stream.

    Returns (data, freq): 'data' is whatever dukutil.BitEncoder.getBytes()
    returns for the packed stream, and 'freq' is a list indexed by value
    giving how many times each value was encoded (informative only).

    Raises Exception('cannot encode ...') for a value that has no encoding.
    """

    be = dukutil.BitEncoder()

    freq = [0] * (0x10ffff + 1)  # informative

    def enc(x):
        # NOTE: a first range starting at codepoint 0 would emit a zero
        # distance, which is the same value as the end marker below.
        if x < 0:
            # A negative index would silently update the wrong 'freq' slot;
            # negative deltas mean the ranges are unsorted or overlapping.
            raise Exception('cannot encode negative value %d' % x)
        freq[x] += 1

        if x <= 0x0e:
            # 4-bit encoding
            be.bits(x, 4)
            return
        x -= 0x0e + 1
        if x <= 0xfd:
            # 12-bit encoding: 0x0f marker + 8-bit value (0x00..0xfd)
            be.bits(0x0f, 4)
            be.bits(x, 8)
            return
        x -= 0xfd + 1
        if x <= 0xfff:
            # 24-bit encoding: 0x0f, 0xfe markers + 12-bit value
            be.bits(0x0f, 4)
            be.bits(0xfe, 8)
            be.bits(x, 12)
            return
        x -= 0xfff + 1
        if x <= 0xffffff:
            # 36-bit encoding: 0x0f, 0xff markers + 24-bit value
            be.bits(0x0f, 4)
            be.bits(0xff, 8)
            be.bits(x, 24)
            return

        raise Exception('cannot encode')

    prev_re = 0

    for rs, re in ranges:
        enc(rs - prev_re)   # 1 or above (no unjoined ranges)
        enc(re - rs)        # 0 or above
        prev_re = re

    enc(0)    # end marker

    return be.getBytes(), freq

def _write_output(path, text):
    "Write generated text to 'path', closing the file even on error."
    # Binary mode keeps the output bytes free of newline translation.
    # Encode only when handed a text (non-bytes) string, so this works both
    # on Python 2 (where 'str' is bytes) and on Python 3.
    if not isinstance(text, bytes):
        text = text.encode('utf-8')
    with open(path, 'wb') as f:
        f.write(text)

def main():
    """Command line entry point.

    Reads UnicodeData.txt, selects codepoints according to the include and
    exclude category options, prints some statistics, and writes the match
    table as a C source file, a C header and (optionally) an illustrative
    PNG.  Each output is written only when its option was given.
    """
    parser = optparse.OptionParser()
    parser.add_option('--unicode-data', dest='unicode_data')      # UnicodeData.txt
    parser.add_option('--special-casing', dest='special_casing')  # SpecialCasing.txt
    parser.add_option('--include-categories', dest='include_categories')
    parser.add_option('--exclude-categories', dest='exclude_categories', default='NONE')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--out-png', dest='out_png')
    parser.add_option('--table-name', dest='table_name', default='match_table')
    (opts, args) = parser.parse_args()

    unidata = opts.unicode_data
    if unidata is None:
        # Fail with a usage message rather than a traceback from open(None).
        parser.error('--unicode-data is required')

    # A missing or empty option means "no categories"; splitting an empty
    # string would give [''], which prefix-matches every category.
    catsinc = []
    if opts.include_categories:
        catsinc = opts.include_categories.split(',')
    catsexc = []
    if opts.exclude_categories and opts.exclude_categories != 'NONE':
        catsexc = opts.exclude_categories.split(',')

    print('CATSEXC: %s' % repr(catsexc))
    print('CATSINC: %s' % repr(catsinc))

    # pseudocategories
    filter_ascii = ('ASCII' in catsexc)
    filter_nonbmp = ('NONBMP' in catsexc)

    # Read raw result
    def filter1(x):
        if filter_ascii and x <= 0x7f:
            # exclude ascii
            return False
        if filter_nonbmp and x >= 0x10000:
            # exclude non-bmp
            return False
        return True

    print('read unicode data')
    uni_filtered = read_unicode_data(unidata, catsinc, catsexc, filter1)
    print('done reading unicode data')

    # Scan ranges
    ranges = scan_ranges(uni_filtered)
    print('%d ranges total' % len(ranges))

    # Generate match table
    matchtable3, freq = generate_match_table3(ranges)
    print('match table length: %d bytes' % len(matchtable3))
    print('encoding freq:')
    for i, count in enumerate(freq):
        if count == 0:
            continue
        print('  %6d: %d' % (i, count))

    print('')
    print('MATCH C TABLE -> file %s' % repr(opts.out_header))

    # Create C source and header files
    genc = dukutil.GenerateC()
    genc.emitHeader('extract_chars.py')
    genc.emitArray(matchtable3, opts.table_name, size=len(matchtable3), typename='duk_uint8_t', intvalues=True, const=True)
    if opts.out_source is not None:
        _write_output(opts.out_source, genc.getString())

    genc = dukutil.GenerateC()
    genc.emitHeader('extract_chars.py')
    genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name, len(matchtable3)))
    if opts.out_header is not None:
        _write_output(opts.out_header, genc.getString())

    # Image (for illustrative purposes only)
    if opts.out_png is not None:
        generate_png(uni_filtered, opts.out_png)

# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
Initial commit, fork from original Project 2022-04-05 22:04:46 +00:00			`#!/usr/bin/env python2`
			`#`
			`# Select a set of Unicode characters (based on included/excluded categories`
			`# etc) and write out a compact bitstream for matching a character against`
			`# the set at runtime. This is for the slow path, where we're especially`
			`# concerned with compactness. A C source file with the table is written,`
			`# together with a matching C header.`
			`#`
			`# Unicode categories (such as 'Z') can be used. Two pseudo-categories`
			`# are also available for exclusion only: ASCII and NONBMP. "ASCII"`
			`# category excludes ASCII codepoints which is useful because C code`
			`# typically contains an ASCII fast path so ASCII characters don't need`
			`# to be considered in the Unicode tables. "NONBMP" excludes codepoints`
			`# above U+FFFF which is useful because such codepoints don't need to be`
			`# supported in standard ECMAScript.`
			`#`

			`import os`
			`import sys`
			`import math`
			`import optparse`

			`import dukutil`

			`def read_unicode_data(unidata, catsinc, catsexc, filterfunc):`
			`"Read UnicodeData.txt, including lines matching catsinc unless excluded by catsexc or filterfunc."`
			`res = []`
			`f = open(unidata, 'rb')`

			`def filter_none(cp):`
			`return True`
			`if filterfunc is None:`
			`filterfunc = filter_none`

			`# The Unicode parsing is slow enough to warrant some speedups.`
			`exclude_cat_exact = {}`
			`for cat in catsexc:`
			`exclude_cat_exact[cat] = True`
			`include_cat_exact = {}`
			`for cat in catsinc:`
			`include_cat_exact[cat] = True`

			`for line in f:`
			`#line = line.strip()`
			`parts = line.split(';')`

			`codepoint = parts[0]`
			`if not filterfunc(long(codepoint, 16)):`
			`continue`

			`category = parts[2]`
			`if exclude_cat_exact.has_key(category):`
			`continue # quick reject`

			`rejected = False`
			`for cat in catsexc:`
			`if category.startswith(cat) or codepoint == cat:`
			`rejected = True`
			`break`
			`if rejected:`
			`continue`

			`if include_cat_exact.has_key(category):`
			`res.append(line)`
			`continue`

			`accepted = False`
			`for cat in catsinc:`
			`if category.startswith(cat) or codepoint == cat:`
			`accepted = True`
			`break`
			`if accepted:`
			`res.append(line)`

			`f.close()`

			`# Sort based on Unicode codepoint`
			`def mycmp(a,b):`
			`t1 = a.split(';')`
			`t2 = b.split(';')`
			`n1 = long(t1[0], 16)`
			`n2 = long(t2[0], 16)`
			`return cmp(n1, n2)`

			`res.sort(cmp=mycmp)`

			`return res`

			`def scan_ranges(lines):`
			`"Scan continuous ranges from (filtered) UnicodeData.txt lines."`
			`ranges = []`
			`range_start = None`
			`prev = None`

			`for line in lines:`
			`t = line.split(';')`
			`n = long(t[0], 16)`
			`if range_start is None:`
			`range_start = n`
			`else:`
			`if n == prev + 1:`
			`# continue range`
			`pass`
			`else:`
			`ranges.append((range_start, prev))`
			`range_start = n`
			`prev = n`

			`if range_start is not None:`
			`ranges.append((range_start, prev))`

			`return ranges`

			`def generate_png(lines, fname):`
			`"Generate an illustrative PNG of the character set."`
			`from PIL import Image`

			`m = {}`
			`for line in lines:`
			`t = line.split(';')`
			`n = long(t[0], 16)`
			`m[n] = 1`

			`codepoints = 0x10ffff + 1`
			`width = int(256)`
			`height = int(math.ceil(float(codepoints) / float(width)))`
			`im = Image.new('RGB', (width, height))`
			`black = (0,0,0)`
			`white = (255,255,255)`
			`for cp in xrange(codepoints):`
			`y = cp / width`
			`x = cp % width`

			`if m.has_key(long(cp)):`
			`im.putpixel((x,y), black)`
			`else:`
			`im.putpixel((x,y), white)`

			`im.save(fname)`

			`def generate_match_table1(ranges):`
			`"Unused match table format."`

			`# This is an earlier match table format which is no longer used.`
			`# IdentifierStart-UnicodeLetter has 445 ranges and generates a`
			`# match table of 2289 bytes.`

			`data = []`
			`prev_re = None`

			`def genrange(rs, re):`
			`if (rs > re):`
			`raise Exception('assumption failed: rs=%d re=%d' % (rs, re))`

			`while True:`
			`now = re - rs + 1`
			`if now > 255:`
			`now = 255`
			`data.append(now) # range now`
			`data.append(0) # skip 0`
			`rs = rs + now`
			`else:`
			`data.append(now) # range now`
			`break`

			`def genskip(ss, se):`
			`if (ss > se):`
			`raise Exception('assumption failed: ss=%d se=%s' % (ss, se))`

			`while True:`
			`now = se - ss + 1`
			`if now > 255:`
			`now = 255`
			`data.append(now) # skip now`
			`data.append(0) # range 0`
			`ss = ss + now`
			`else:`
			`data.append(now) # skip now`
			`break`

			`for rs, re in ranges:`
			`if prev_re is not None:`
			`genskip(prev_re + 1, rs - 1)`
			`genrange(rs, re)`
			`prev_re = re`

			`num_entries = len(data)`

			`# header: start of first range`
			`# num entries`
			`hdr = []`
			`hdr.append(ranges[0][0] >> 8) # XXX: check that not 0x10000 or over`
			`hdr.append(ranges[0][1] & 0xff)`
			`hdr.append(num_entries >> 8)`
			`hdr.append(num_entries & 0xff)`

			`return hdr + data`

			`def generate_match_table2(ranges):`
			`"Unused match table format."`

			`# Another attempt at a match table which is also unused.`
			`# Total tables for all current classes is now 1472 bytes.`

			`data = []`

			`def enc(x):`
			`while True:`
			`if x < 0x80:`
			`data.append(x)`
			`break`
			`data.append(0x80 + (x & 0x7f))`
			`x = x >> 7`

			`prev_re = 0`

			`for rs, re in ranges:`
			`r1 = rs - prev_re # 1 or above (no unjoined ranges)`
			`r2 = re - rs # 0 or above`
			`enc(r1)`
			`enc(r2)`
			`prev_re = re`

			`enc(0) # end marker`

			`return data`

			`def generate_match_table3(ranges):`
			`"Current match table format."`

			`# Yet another attempt, similar to generate_match_table2 except`
			`# in packing format.`
			`#`
			`# Total match size now (at time of writing): 1194 bytes.`
			`#`
			`# This is the current encoding format used in duk_lexer.c.`

			`be = dukutil.BitEncoder()`

			`freq = [0] * (0x10ffff + 1) # informative`

			`def enc(x):`
			`freq[x] += 1`

			`if x <= 0x0e:`
			`# 4-bit encoding`
			`be.bits(x, 4)`
			`return`
			`x -= 0x0e + 1`
			`if x <= 0xfd:`
			`# 12-bit encoding`
			`be.bits(0x0f, 4)`
			`be.bits(x, 8)`
			`return`
			`x -= 0xfd + 1`
			`if x <= 0xfff:`
			`# 24-bit encoding`
			`be.bits(0x0f, 4)`
			`be.bits(0xfe, 8)`
			`be.bits(x, 12)`
			`return`
			`x -= 0xfff + 1`
			`if True:`
			`# 36-bit encoding`
			`be.bits(0x0f, 4)`
			`be.bits(0xff, 8)`
			`be.bits(x, 24)`
			`return`

			`raise Exception('cannot encode')`

			`prev_re = 0`

			`for rs, re in ranges:`
			`r1 = rs - prev_re # 1 or above (no unjoined ranges)`
			`r2 = re - rs # 0 or above`
			`enc(r1)`
			`enc(r2)`
			`prev_re = re`

			`enc(0) # end marker`

			`data, nbits = be.getBytes(), be.getNumBits()`
			`return data, freq`

			`def main():`
			`parser = optparse.OptionParser()`
			`parser.add_option('--unicode-data', dest='unicode_data') # UnicodeData.txt`
			`parser.add_option('--special-casing', dest='special_casing') # SpecialCasing.txt`
			`parser.add_option('--include-categories', dest='include_categories')`
			`parser.add_option('--exclude-categories', dest='exclude_categories', default='NONE')`
			`parser.add_option('--out-source', dest='out_source')`
			`parser.add_option('--out-header', dest='out_header')`
			`parser.add_option('--out-png', dest='out_png')`
			`parser.add_option('--table-name', dest='table_name', default='match_table')`
			`(opts, args) = parser.parse_args()`

			`unidata = opts.unicode_data`
			`catsinc = []`
			`if opts.include_categories != '':`
			`catsinc = opts.include_categories.split(',')`
			`catsexc = []`
			`if opts.exclude_categories != 'NONE':`
			`catsexc = opts.exclude_categories.split(',')`

			`print 'CATSEXC: %s' % repr(catsexc)`
			`print 'CATSINC: %s' % repr(catsinc)`

			`# pseudocategories`
			`filter_ascii = ('ASCII' in catsexc)`
			`filter_nonbmp = ('NONBMP' in catsexc)`

			`# Read raw result`
			`def filter1(x):`
			`if filter_ascii and x <= 0x7f:`
			`# exclude ascii`
			`return False`
			`if filter_nonbmp and x >= 0x10000:`
			`# exclude non-bmp`
			`return False`
			`return True`

			`print('read unicode data')`
			`uni_filtered = read_unicode_data(unidata, catsinc, catsexc, filter1)`
			`print('done reading unicode data')`

			`# Raw output`
			`#print('RAW OUTPUT:')`
			`#print('===========')`
			`#print('\n'.join(uni_filtered))`

			`# Scan ranges`
			`#print('')`
			`#print('RANGES:')`
			`#print('=======')`
			`ranges = scan_ranges(uni_filtered)`
			`#for i in ranges:`
			`# if i[0] == i[1]:`
			`# print('0x%04x' % i[0])`
			`# else:`
			`# print('0x%04x ... 0x%04x' % (i[0], i[1]))`
			`#print('')`
			`print('%d ranges total' % len(ranges))`

			`# Generate match table`
			`#print('')`
			`#print('MATCH TABLE:')`
			`#print('============')`
			`#matchtable1 = generate_match_table1(ranges)`
			`#matchtable2 = generate_match_table2(ranges)`
			`matchtable3, freq = generate_match_table3(ranges)`
			`#print 'match table: %s' % repr(matchtable3)`
			`print 'match table length: %d bytes' % len(matchtable3)`
			`print 'encoding freq:'`
			`for i in xrange(len(freq)):`
			`if freq[i] == 0:`
			`continue`
			`print ' %6d: %d' % (i, freq[i])`

			`print('')`
			`print('MATCH C TABLE -> file %s' % repr(opts.out_header))`

			`# Create C source and header files`
			`genc = dukutil.GenerateC()`
			`genc.emitHeader('extract_chars.py')`
			`genc.emitArray(matchtable3, opts.table_name, size=len(matchtable3), typename='duk_uint8_t', intvalues=True, const=True)`
			`if opts.out_source is not None:`
			`f = open(opts.out_source, 'wb')`
			`f.write(genc.getString())`
			`f.close()`

			`genc = dukutil.GenerateC()`
			`genc.emitHeader('extract_chars.py')`
			`genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name, len(matchtable3)))`
			`if opts.out_header is not None:`
			`f = open(opts.out_header, 'wb')`
			`f.write(genc.getString())`
			`f.close()`

			`# Image (for illustrative purposes only)`
			`if opts.out_png is not None:`
			`generate_png(uni_filtered, opts.out_png)`

			`if __name__ == '__main__':`
			`main()`