#! /usr/bin/python3
import re,sys
# Matches a two-digit, lowercase hexadecimal character reference such as
# b'&#xe9;'; group 1 holds the two hex digits.
htmlchar = re.compile(b'&#x([a-f0-9][a-f0-9]);')


def spanGroupVal(text):
    """Compute a replacement string for every ``&#xNN;`` reference in *text*.

    *text* is a bytes object. Each match of ``htmlchar`` yields one tuple
    ``(span, group, val)`` in the returned list, in order of appearance:

    * ``span``  -- ``(start, end)`` offsets of the reference in *text*;
    * ``group`` -- the two hex digits as bytes (e.g. ``b'e9'``);
    * ``val``   -- the ``str`` that should replace the reference.

    A ``&#xc3;`` reference immediately followed by a reference whose value
    is a UTF-8 continuation byte (0x80-0xBF) is treated as one two-byte
    UTF-8 sequence: the ``c3`` entry gets ``""`` and the following entry
    gets the decoded character.  Every other reference is replaced by the
    character with that code point.

    A ``&#xc3;`` that is not directly followed by a continuation reference
    (end of input, intervening text, another ``&#xc3;``, or a value outside
    0x80-0xBF) is kept as the character U+00C3 instead of being dropped or
    causing a ``UnicodeDecodeError``.
    """
    result = []
    # Index into ``result`` of a c3 reference still waiting for its
    # continuation byte, or None when there is no pending lead byte.
    lead = None
    for match in htmlchar.finditer(text):
        span, group = match.span(), match.group(1)
        code = int(group, 16)
        if lead is not None:
            lead_span = result[lead][0]
            adjacent = lead_span[1] == span[0]
            if adjacent and 0x80 <= code <= 0xBF:
                # 0xC3 followed by any continuation byte is always valid
                # UTF-8, so this decode cannot fail.
                decoded = bytes((0xC3, code)).decode("utf8")
                result.append((span, group, decoded))
                lead = None
                continue
            # The pending lead byte has no usable continuation: emit it as
            # a character of its own rather than losing it.
            result[lead] = (lead_span, result[lead][1], chr(0xC3))
            lead = None
        if group == b'c3':  # 195, the UTF-8 lead byte for U+00C0..U+00FF
            lead = len(result)
            result.append((span, group, ""))
        else:
            result.append((span, group, chr(code)))
    if lead is not None:
        # Input ended while a lead byte was still pending.
        result[lead] = (result[lead][0], result[lead][1], chr(0xC3))
    return result
def replace(text, spv):
    """Return *text* with each span in *spv* replaced by its value.

    *text* is a bytes object and *spv* is an iterable of
    ``(span, group, val)`` tuples whose spans are ``(start, end)`` offsets
    into *text*, given in ascending order.  The bytes between consecutive
    spans are copied unchanged; each span itself is replaced by ``val``
    encoded as UTF-8.  ``group`` is not used here.
    """
    pieces = []
    cursor = 0  # offset in text just past the last replaced span
    for span, _group, val in spv:
        start, end = span[0], span[1]
        pieces.append(text[cursor:start])
        pieces.append(val.encode("utf8"))
        cursor = end
    pieces.append(text[cursor:])
    # Join once at the end instead of growing a bytes object with "+=",
    # which copies the whole accumulated result on every step.
    return b''.join(pieces)
if __name__ == "__main__":
    # Command line: INFILE OUTFILE.  Reads INFILE as bytes, rewrites the
    # ``&#xNN;`` references found by spanGroupVal(), and writes the result
    # to OUTFILE.
    if len(sys.argv) < 3:
        sys.exit("usage: %s INFILE OUTFILE" % sys.argv[0])
    infileName = sys.argv[1]
    outfileName = sys.argv[2]
    # Context managers make sure both files are closed (and the output is
    # flushed) even if an error occurs part-way through.
    with open(infileName, "rb") as infile:
        text = infile.read()
    spv = spanGroupVal(text)
    newText = replace(text, spv)
    with open(outfileName, "wb") as outfile:
        outfile.write(newText)
|