#!/usr/bin/env python2
#
#  Scan potential external strings from ECMAScript and C files.
#
#  Very simplistic example with a lot of limitations:
#
#    - Doesn't handle multiple variables in a variable declaration
#
#    - For C files, only double-quoted string literals are extracted;
#      these may correspond to Duktape/C bindings (but in many cases don't)
#

import base64
import json
import os
import re
import sys

# Set of scanned strings, kept as a dict: scan() stores each string as
# a key with the value True, and main() emits the sorted keys.
strmap = {}

# ECMAScript function declaration: captures the identifier that follows
# the text 'function' and some whitespace.
re_funcname = re.compile(r'function\s+(\w+)', re.UNICODE)

# ECMAScript variable declaration: captures the first identifier that
# follows 'var' and some whitespace.
# XXX: doesn't handle multiple variables
re_vardecl = re.compile(r'var\s+(\w+)', re.UNICODE)

# ECMAScript variable assignment: captures the identifier immediately
# to the left of a '='.  The pattern does not look past the '=', so the
# left-hand side of a comparison such as 'a == b' matches as well.
re_varassign = re.compile(r'(\w+)\s*=\s*', re.UNICODE)

# ECMAScript dotted property reference (also matches numbers like
# '4.0', which are separately rejected below)
re_propref = re.compile(r'(\w+(?:\.\w+)+)', re.UNICODE)
# All-digits check, applied to the first component of a dotted match to
# recognize numbers.
re_digits = re.compile(r'^\d+$', re.UNICODE)

# ECMAScript or C string literal, captured including the surrounding
# quotes.  An escaped quote and an escaped backslash are each consumed
# as a unit so that they do not terminate the literal early.
re_strlit_dquot = re.compile(r'("(?:\\"|\\\\|[^"])*")', re.UNICODE)
re_strlit_squot = re.compile(r'(\'(?:\\\'|\\\\|[^\'])*\')', re.UNICODE)

def strDecode(x):
    """Decode a quoted string literal into a unicode string.

    x is the text of the literal including its surrounding quotes.  Hex,
    unicode, and other escapes need decoding; Python syntax is close
    enough to C and ECMAScript, so the text is evaluated as a Python
    unicode literal for now.

    Returns the decoded string, or None (after writing a diagnostic to
    stderr) if the literal cannot be evaluated.
    """

    # NOTE(review): eval() runs on text taken from the scanned files.
    # The callers in this file only pass text matched by the string
    # literal regexps, but if this is ever fed less constrained input it
    # should be replaced with something like ast.literal_eval().
    try:
        return eval('u' + x)  # interpret as unicode string
    except Exception:
        # Deliberately not a bare 'except:', which would also swallow
        # KeyboardInterrupt and SystemExit.
        sys.stderr.write('Failed to parse: ' + repr(x) + ', ignoring\n')
        return None

def scan(f, fn):
    """Scan one input file and record candidate strings in strmap.

    f is iterated line by line and each line is decoded as UTF-8, so it
    must yield byte strings.  fn is the file name; it is only used to
    pick the rule set.  Every string found is stored as a key of the
    module-level strmap with the value True.
    """

    # Scan rules depend on file type: a '.c' file only contributes
    # double-quoted string literals, anything else gets the full
    # ECMAScript rule set.
    is_c_file = (fn[-2:] == '.c')

    if is_c_file:
        ident_patterns = []
        literal_patterns = [re_strlit_dquot]
    else:
        ident_patterns = [re_funcname, re_vardecl, re_varassign]
        literal_patterns = [re_strlit_dquot, re_strlit_squot]

    for raw_line in f:
        # Assume input data is UTF-8
        text = raw_line.decode('utf-8')

        # Function names, declared variables, and assignment targets
        for pattern in ident_patterns:
            for match in pattern.finditer(text):
                strmap[match.group(1)] = True

        # Dotted property references: record each component separately
        if not is_c_file:
            for match in re_propref.finditer(text):
                components = match.group(1).split('.')
                if re_digits.match(components[0]) is not None:
                    # Probably a number ('4.0' or such)
                    continue
                for component in components:
                    strmap[component] = True

        # String literals, decoded to their actual value
        for pattern in literal_patterns:
            for match in pattern.finditer(text):
                decoded = strDecode(match.group(1))
                if decoded is not None:
                    strmap[decoded] = True

def main():
    """Scan the files named on the command line and print the result.

    Each command line argument is opened in binary mode and passed to
    scan().  The collected strings are then printed to stdout as a JSON
    document with two parallel, sorted lists: the strings themselves and
    their base64-encoded UTF-8 form.
    """
    for fn in sys.argv[1:]:
        # 'with' closes the file even if scan() raises part-way through.
        with open(fn, 'rb') as f:
            scan(f, fn)

    # sorted() accepts the dict directly and returns a new list of keys.
    strs = sorted(strmap)

    # b64encode() emits one unbroken line.  The 'base64' string codec
    # used previously wraps its output every 76 characters, which left
    # embedded newlines in the encoding of longer strings.
    strs_base64 = [
        base64.b64encode(s.encode('utf-8')).decode('ascii') for s in strs
    ]

    doc = {
        # Strings as Unicode strings
        'scanned_strings': strs,

        # Strings as base64-encoded UTF-8 data, which should be ready
        # to be used in C code (Duktape internal string representation
        # is UTF-8)
        'scanned_strings_base64': strs_base64
    }

    print(json.dumps(doc, indent=4, ensure_ascii=True, sort_keys=True))

# Run the scanner only when this file is executed as a script.
if __name__ == '__main__':
    main()