136 lines
3.8 KiB
Python
136 lines
3.8 KiB
Python
|
#!/usr/bin/env python2
|
||
|
#
|
||
|
# Scan potential external strings from ECMAScript and C files.
|
||
|
#
|
||
|
# Very simplistic example with a lot of limitations:
|
||
|
#
|
||
|
# - Doesn't handle multiple variables in a variable declaration
|
||
|
#
|
||
|
# - Only extracts strings from C files, these may correspond to
|
||
|
# Duktape/C bindings (but in many cases don't)
|
||
|
#
|
||
|
|
||
|
import os
|
||
|
import sys
|
||
|
import re
|
||
|
import json
|
||
|
|
||
|
# Accumulator shared by scan() and main(): every candidate string found is
# stored as a key (the value is always True, so the dict acts as a set).
strmap = {}
|
||
|
|
||
|
# ECMAScript function declaration: group 1 is the identifier that follows
# the 'function' keyword.
re_funcname = re.compile(r'function\s+(\w+)', re.UNICODE)

# ECMAScript variable declaration: group 1 is the identifier that follows
# the 'var' keyword.
# XXX: doesn't handle multiple variables; only the first name of a
# comma-separated declaration is captured.
re_vardecl = re.compile(r'var\s+(\w+)', re.UNICODE)

# ECMAScript variable assignment: group 1 is the identifier directly to the
# left of an '='.
# NOTE(review): the pattern does not look at what follows the '=', so it
# also fires on the first '=' of a '==' comparison.
re_varassign = re.compile(r'(\w+)\s*=\s*', re.UNICODE)

# ECMAScript dotted property reference: group 1 is the whole dotted chain
# ('a.b.c'), which scan() splits on '.'.  The pattern also matches numbers
# like '4.0'; scan() rejects those by testing the first component against
# re_digits.
re_propref = re.compile(r'(\w+(?:\.\w+)+)', re.UNICODE)
re_digits = re.compile(r'^\d+$', re.UNICODE)

# ECMAScript or C string literal, double-quoted and single-quoted variants.
# Group 1 includes the surrounding quotes; an escaped quote or an escaped
# backslash may appear inside the literal.
re_strlit_dquot = re.compile(r'("(?:\\"|\\\\|[^"])*")', re.UNICODE)
re_strlit_squot = re.compile(r'(\'(?:\\\'|\\\\|[^\'])*\')', re.UNICODE)
|
||
|
|
||
|
def strDecode(x):
    """Decode a quoted string literal into its Unicode string value.

    x -- source text of the literal, including the surrounding single or
         double quotes, as matched by re_strlit_dquot / re_strlit_squot.

    Returns the decoded string, or None (after writing a diagnostic to
    stderr) when the text cannot be parsed as a single string literal.
    """
    # Local import keeps this function self-contained.
    import ast

    # Need to decode hex, unicode, and other escapes.  Python syntax is
    # close enough to C and ECMAScript to reuse Python's literal parser.
    #
    # ast.literal_eval() is used instead of eval(): the input comes from
    # scanned source files, and literal_eval() only accepts literals, so
    # it can never execute code embedded in a file being scanned.
    try:
        return ast.literal_eval('u' + x)  # interpret as unicode string
    except (SyntaxError, ValueError, TypeError):
        # Malformed literal (unterminated, bad escape, not a literal at
        # all).  Best effort: report it and let the caller skip it.
        sys.stderr.write('Failed to parse: ' + repr(x) + ', ignoring\n')
        return None
|
||
|
|
||
|
def scan(f, fn):
    """Collect candidate strings from one input file into ``strmap``.

    f  -- iterable yielding the file's lines as UTF-8 encoded byte strings
          (for example a file object opened in binary mode)
    fn -- name of the file; a name ending in '.c' selects the C rule set,
          anything else is scanned with the full ECMAScript rule set

    Every string found is recorded as a key of the module-level ``strmap``.
    """
    global strmap

    # Scan rules depend on file type
    if fn.endswith('.c'):
        # C source: only double-quoted string literals are considered.
        name_patterns = ()
        scan_proprefs = False
        literal_patterns = (re_strlit_dquot,)
    else:
        # ECMAScript: function names, declared and assigned variable
        # names, property references and both string literal flavours.
        name_patterns = (re_funcname, re_vardecl, re_varassign)
        scan_proprefs = True
        literal_patterns = (re_strlit_dquot, re_strlit_squot)

    for raw_line in f:
        # Assume input data is UTF-8
        text = raw_line.decode('utf-8')

        # Plain identifiers: the captured name is recorded as-is.
        for pattern in name_patterns:
            for match in pattern.finditer(text):
                strmap[match.group(1)] = True

        # Dotted property references: each component is recorded.
        if scan_proprefs:
            for match in re_propref.finditer(text):
                components = match.group(1).split('.')
                if re_digits.match(components[0]) is not None:
                    # Probably a number ('4.0' or such)
                    continue
                strmap.update(dict.fromkeys(components, True))

        # String literals: recorded by decoded value; literals that fail
        # to decode are skipped.
        for pattern in literal_patterns:
            for match in pattern.finditer(text):
                decoded = strDecode(match.group(1))
                if decoded is not None:
                    strmap[decoded] = True
|
||
|
|
||
|
def main():
    """Scan the files named on the command line and print a JSON report.

    Each command line argument is opened and passed to scan().  The
    collected strings are then written to stdout as a JSON object with
    two parallel, sorted lists:

      scanned_strings         -- the strings as Unicode strings
      scanned_strings_base64  -- the same strings as base64-encoded UTF-8
                                 data, which should be ready to be used in
                                 C code (Duktape internal string
                                 representation is UTF-8)
    """
    # Local import keeps this function self-contained.
    import base64

    for fn in sys.argv[1:]:
        # Binary mode: scan() decodes each line from UTF-8 itself.  The
        # 'with' block closes the file even if scan() raises.
        with open(fn, 'rb') as f:
            scan(f, fn)

    # sorted() works on both Python 2 and 3, unlike keys() + list.sort().
    strs = sorted(strmap)

    # base64.b64encode() produces a single unbroken line.  The 'base64'
    # string codec used previously wraps its output every 76 characters,
    # which left newlines embedded in the encoding of longer strings.
    strs_base64 = [
        base64.b64encode(s.encode('utf-8')).decode('ascii') for s in strs
    ]

    doc = {
        'scanned_strings': strs,
        'scanned_strings_base64': strs_base64
    }
    print(json.dumps(doc, indent=4, ensure_ascii=True, sort_keys=True))
|
||
|
|
||
|
# Run the scanner only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
|