Here's my code for parsing Merth's tokens file with ElementTree:
Code:
#!/usr/bin/env python
import xml.etree.ElementTree as ET
import sys
def get_byte(attrib):
    """Return the integer value of an element's ``byte`` attribute.

    ``attrib`` is a mapping with a ``'byte'`` entry.  The first character of
    that entry is dropped (presumably a ``$`` hex marker -- confirm against
    the tokens file) and the remainder is parsed as hexadecimal.

    Raises ``KeyError`` if ``'byte'`` is missing and ``ValueError`` if the
    remainder is not valid hexadecimal.
    """
    hex_digits = attrib['byte'][1:]
    return int(hex_digits, 16)
def concatenate_bytes(tokbytes):
    """Combine a sequence of byte values into a single big-endian integer.

    The first element of ``tokbytes`` becomes the most significant byte, so
    ``[0xBB, 0x01]`` yields ``0xBB01``.  An empty sequence yields ``0``.
    """
    value = 0
    for byte in tokbytes:
        # Shift what has been accumulated up by one byte, then add the next.
        value = value * 256 + byte
    return value
def cleanup_chars(string):
    r"""Escape backslashes and non-ASCII characters in ``string``.

    * A backslash becomes two backslashes.
    * A character with code point >= 128 becomes the ``\xNN`` escapes
      (lowercase hex) of each byte of its UTF-8 encoding.
    * Every other character is kept unchanged.

    The escapes are built explicitly rather than sliced out of
    ``repr(c.encode('utf-8'))``: that repr is ``'...'`` on Python 2 but
    ``b'...'`` on Python 3, so slicing ``[1:-1]`` left a stray quote and
    dropped a backslash on Python 3.
    """
    def escape(char):
        if char == "\\":
            return "\\\\"
        if ord(char) >= 128:
            # bytearray yields ints on both Python 2 and Python 3.
            utf8 = bytearray(char.encode('utf-8'))
            return "".join(["\\x%02x" % octet for octet in utf8])
        return char

    return "".join([escape(char) for char in string])
def emit_token(string, tokbytes, raw_mode=False, rootattrs=None):
    r"""Build the output record for one token string.

    Parameters:
        string:    the token's text as read from the XML attribute.
        tokbytes:  list of byte values identifying the token.
        raw_mode:  when true, skip the special cases below and return the
                   raw 4-tuple instead of a formatted rule.
        rootattrs: attributes of the owning ``Token`` element; only passed
                   through in raw mode.

    Special cases (non-raw mode only):
        * the two-character text ``\n`` is replaced by the unquoted pattern
          ``\n|\r\n?`` with length 1.5 (presumably a weight between a one-
          and two-character match -- confirm with the code that uses it);
        * the empty string is replaced by the unquoted ``<<EOF>>`` with
          length 0.
    Everything else is escaped with ``cleanup_chars``, has its double quotes
    backslash-escaped, and is wrapped in double quotes; its length is the
    length of the text *before* escaping.

    Returns ``(tlen, pattern, tokbytes, rootattrs)`` in raw mode, otherwise
    ``(tlen, rule)`` where ``rule`` is the pattern followed by a
    ``{ return 0x...; }`` action (this looks like a lex/flex rule).
    """
    if not raw_mode and string == r'\n':
        pattern = r'\n|\r\n?'
        tlen = 1.5
    elif not raw_mode and string == "":
        pattern = "<<EOF>>"
        tlen = 0
    else:
        tlen = len(string)
        # Only literal text is escaped: running cleanup_chars over the
        # newline regex above would double its backslashes.
        escaped = cleanup_chars(string).replace('"', r'\"')
        pattern = '"' + escaped + '"'

    if raw_mode:
        return (tlen, pattern, tokbytes, rootattrs)
    rule = '%s\t{\treturn 0x%X;\t}' % (pattern, concatenate_bytes(tokbytes))
    return (tlen, rule)
def add_all_tokens(down_from, tokens, byte_prefix, raw_mode=False):
    """Recursively collect token records from an element tree.

    For every direct ``Token`` child of ``down_from`` (in the
    ``http://merthsoft.com/Tokens`` namespace) this:

    1. extends ``byte_prefix`` with the child's own byte value;
    2. appends a record for the child's ``string`` attribute, if it has one;
    3. appends a record for each ``Alt`` child's ``string`` attribute, using
       the same bytes and the owning Token's attributes;
    4. recurses into the child, so nested tokens get a longer byte sequence.

    ``tokens`` is appended to in place and is also returned.
    ``byte_prefix`` is never mutated; each level builds a new list.
    """
    token_tag = "{http://merthsoft.com/Tokens}Token"
    alt_tag = "{http://merthsoft.com/Tokens}Alt"

    for token in down_from.findall(token_tag):
        token_bytes = byte_prefix + [get_byte(token.attrib)]
        if 'string' in token.attrib:
            tokens.append(emit_token(token.attrib['string'], token_bytes,
                                     raw_mode=raw_mode,
                                     rootattrs=token.attrib))
        # NOTE(review): the original paste lost its indentation, so it is
        # unclear whether Alt handling was nested under the 'string' check.
        # Here Alts are processed for every Token.
        for alt in token.findall(alt_tag):
            tokens.append(emit_token(alt.attrib['string'], token_bytes,
                                     raw_mode=raw_mode,
                                     rootattrs=token.attrib))
        # The recursive call appends to the same list, so no rebinding needed.
        add_all_tokens(token, tokens, token_bytes, raw_mode=raw_mode)
    return tokens
def getET(filename):
    """Parse the XML file at ``filename`` and return its root element.

    Also registers the empty prefix for the ``http://merthsoft.com/Tokens``
    namespace with ElementTree.  That registration is process-wide and only
    affects how the tree is serialised; element tags in the parsed tree are
    still namespace-qualified (``{http://merthsoft.com/Tokens}Token``).
    """
    ET.register_namespace("", "http://merthsoft.com/Tokens")
    tree = ET.parse(filename)
    return tree.getroot()
# Script entry: parse the tokens file and collect every token in raw mode,
# i.e. as (length, quoted string, byte list, Token attributes) tuples.
# NOTE(review): `fname` was never defined in the posted code; taking it from
# the first command-line argument is an assumption (it would explain the
# otherwise unused `import sys`) -- confirm against the full script.
if len(sys.argv) < 2:
    sys.exit("usage: %s TOKENS_XML" % sys.argv[0])
fname = sys.argv[1]
root = getET(fname)
tokens = add_all_tokens(root, [], [], raw_mode=True)
I also have some other routines for classifying the tokens, but those are probably less relevant for you.