
135 lines
3.8 KiB

#!/usr/bin/env python2
# Scan potential external strings from ECMAScript and C files.
# Very simplistic example with a lot of limitations:
# - Doesn't handle multiple variables in a variable declaration
# - Only extracts strings from C files, these may correspond to
# Duktape/C bindings (but in many cases don't)
import os
import sys
import re
import json
# Collected strings.  Used as a set: every scanned string is stored as
# a key with the value True.  Filled in by scan(), read by main().
strmap = {}
# ECMAScript function declaration; group 1 is the function name
re_funcname = re.compile(r'function\s+(\w+)', re.UNICODE)
# ECMAScript variable declaration; group 1 is the variable name
# XXX: doesn't handle multiple variables
re_vardecl = re.compile(r'var\s+(\w+)', re.UNICODE)
# ECMAScript variable assignment; group 1 is the identifier directly
# to the left of the '='
re_varassign = re.compile(r'(\w+)\s*=\s*', re.UNICODE)
# ECMAScript dotted property reference (also matches numbers like
# '4.0', which are separately rejected below); group 1 is the whole
# dotted chain, which scan() splits on '.'
re_propref = re.compile(r'(\w+(?:\.\w+)+)', re.UNICODE)
# All-digits string; scan() applies this to the first component of a
# dotted match to detect numbers
re_digits = re.compile(r'^\d+$', re.UNICODE)
# ECMAScript or C string literal; group 1 is the literal including its
# surrounding quotes.  A backslash-escaped quote or backslash inside
# the literal is consumed as a unit.
re_strlit_dquot = re.compile(r'("(?:\\"|\\\\|[^"])*")', re.UNICODE)
re_strlit_squot = re.compile(r'(\'(?:\\\'|\\\\|[^\'])*\')', re.UNICODE)
def strDecode(x):
    """Decode the source text of a string literal into a Unicode string.

    x is a C or ECMAScript string literal including its surrounding
    quotes.  Hex, unicode, and other escapes need to be decoded; Python
    literal syntax is close enough to C and ECMAScript that the text is
    parsed as a Python unicode literal.

    Returns the decoded string, or None if the literal cannot be parsed
    (a warning is written to stderr in that case).
    """
    # Imported here so the function does not rely on the file-level imports.
    import ast

    try:
        # NOTE: x comes from the files being scanned, so it is parsed with
        # ast.literal_eval() rather than eval(): a literal decodes to the
        # same value, but arbitrary expressions are never executed.
        return ast.literal_eval('u' + x)  # interpret as unicode string
    except Exception:
        # Deliberately best-effort: one unparseable literal must not abort
        # the whole scan, so report it and let the caller skip it.
        sys.stderr.write('Failed to parse: ' + repr(x) + ', ignoring\n')
        return None
def scan(f, fn):
    """Scan one input file and record candidate strings in strmap.

    f is an iterable of raw (undecoded) lines, e.g. a file opened in
    binary mode; each line is assumed to be UTF-8.  fn is the file name
    and is only used to choose the scan rules.

    Every string found is stored as a key of the module-level strmap
    dict with the value True.  Nothing is returned.
    """
    # Scan rules depend on file type
    if fn.endswith('.c'):
        # C files: only double-quoted string literals are extracted.
        ident_patterns = ()
        use_propref = False
        strlit_patterns = (re_strlit_dquot,)
    else:
        # Everything else is treated as ECMAScript: all rules apply.
        ident_patterns = (re_funcname, re_vardecl, re_varassign)
        use_propref = True
        strlit_patterns = (re_strlit_dquot, re_strlit_squot)

    for line in f:
        # Assume input data is UTF-8
        line = line.decode('utf-8')

        # Function names, declared variables and assignment targets:
        # the captured identifier is recorded as-is.
        for pattern in ident_patterns:
            for m in pattern.finditer(line):
                strmap[m.group(1)] = True

        # Dotted property references: record each component separately.
        if use_propref:
            for m in re_propref.finditer(line):
                parts = m.group(1).split('.')
                if re_digits.match(parts[0]) is not None:
                    # Probably a number ('4.0' or such)
                    continue
                for part in parts:
                    strmap[part] = True

        # String literals: record the decoded value; literals that fail
        # to decode (strDecode() returns None) are skipped.
        for pattern in strlit_patterns:
            for m in pattern.finditer(line):
                s = strDecode(m.group(1))
                if s is not None:
                    strmap[s] = True
def main():
    """Scan the files named on the command line and print a JSON summary.

    Each argument in sys.argv[1:] is opened in binary mode and passed
    to scan().  The collected strings are then written to stdout as a
    JSON object with two parallel lists, in sorted order:
    'scanned_strings' and 'scanned_strings_base64'.
    """
    # Imported here so the function does not rely on the file-level imports.
    import base64

    for fn in sys.argv[1:]:
        # 'with' closes each file as soon as it has been scanned.
        with open(fn, 'rb') as f:
            scan(f, fn)

    strs = []
    strs_base64 = []
    doc = {
        # Strings as Unicode strings
        'scanned_strings': strs,
        # Strings as base64-encoded UTF-8 data, which should be ready
        # to be used in C code (Duktape internal string representation
        # is UTF-8)
        'scanned_strings_base64': strs_base64
    }

    # Sort so that the output is deterministic from run to run.
    for s in sorted(strmap):
        strs.append(s)
        # base64.b64encode() replaces the Python 2-only 'base64' codec.
        # It emits no line breaks, so there is no trailing newline to
        # strip and long strings are not split across lines.
        t = base64.b64encode(s.encode('utf-8')).decode('ascii')
        strs_base64.append(t)

    print(json.dumps(doc, indent=4, ensure_ascii=True, sort_keys=True))
if __name__ == '__main__':