#!/usr/bin/python3
import argparse
import xml.etree.ElementTree as ET
# Command-line interface.  The summary is passed as ``description`` so that
# argparse prints it beneath the usage line; the first positional parameter
# of ArgumentParser is ``prog``, which would otherwise make the whole
# sentence show up as the program name in usage and error messages.
parser = argparse.ArgumentParser(
    description='filter an xml file for language variety specifiers',
)
parser.add_argument(
    'var',
    help='language variety to retain (comma-separated for multiple)',
)
parser.add_argument('infile', help='input file')
parser.add_argument('outfile', help='output file')
parser.add_argument(
    '-k', '--keep',
    help='node types to retain unchanged',
    action='append',
    default=[],
)
parser.add_argument(
    '-a', '--attr',
    help='attribute which specifies language variety (default: v)',
    default='v',
)
args = parser.parse_args()

# Set of varieties requested on the command line; read by should_keep().
keep = set(args.var.split(','))
def should_keep(node, attr=None, varieties=None):
    """Decide whether *node* belongs to one of the requested varieties.

    The variety attribute is removed from *node* as a side effect, whatever
    the outcome, so retained elements are written out without it.

    Args:
        node: Element to test; its ``attrib`` mapping is modified in place.
        attr: Name of the attribute holding a comma-separated list of
            varieties.  Defaults to ``args.attr`` from the command line.
        varieties: Iterable of variety names to retain.  Defaults to the
            module-level ``keep`` set.

    Returns:
        True if *node* has no variety attribute, or if at least one of the
        listed varieties is among *varieties*; False otherwise.
    """
    if attr is None:
        attr = args.attr
    if varieties is None:
        varieties = keep

    # Single lookup: fetch and strip the attribute in one step.
    raw = node.attrib.pop(attr, None)
    if raw is None:
        # Elements without the attribute are common to every variety.
        return True

    tokens = raw.split(',')
    # Match each token both verbatim and with surrounding whitespace
    # trimmed, so that a value written as "uk, us" still matches "us".
    candidates = set(tokens)
    candidates.update(token.strip() for token in tokens)
    return not candidates.isdisjoint(varieties)
def filter_node(node, keep_tags=None, predicate=None):
    """Prune, in place, the children of *node* that fail the variety test.

    Children are visited in document order:

    * a child whose tag is in *keep_tags* is left completely untouched
      (it is neither tested nor descended into);
    * a child accepted by *predicate* is kept and filtered recursively;
    * any other child is dropped together with its whole subtree.

    *node* itself is never tested or removed.

    Args:
        node: Element whose subtree is filtered.
        keep_tags: Container of tag names to retain unchanged.  Defaults to
            the ``--keep`` values in the module-level ``args``.
        predicate: Callable taking an element and returning a true value if
            the element should be kept.  Defaults to ``should_keep``.

    Returns:
        None.
    """
    if keep_tags is None:
        keep_tags = args.keep
    if predicate is None:
        predicate = should_keep

    survivors = []
    for child in node:
        if child.tag in keep_tags:
            survivors.append(child)
            continue
        if predicate(child):
            filter_node(child, keep_tags, predicate)
            survivors.append(child)
            continue

        # The child is dropped.  ElementTree keeps the text that follows an
        # element in that element's ``tail``, so discarding the element
        # would also discard text that belongs to the parent.  Re-attach a
        # non-blank tail to whatever now precedes it; whitespace-only tails
        # (indentation) are still dropped along with the element.
        orphan = child.tail
        if orphan and orphan.strip():
            if survivors:
                previous = survivors[-1]
                previous.tail = (previous.tail or '') + orphan
            else:
                node.text = (node.text or '') + orphan

    # Replace the child list once instead of calling remove() per element,
    # which rescans the children for every removal.
    if len(survivors) != len(node):
        node[:] = survivors
# Read the input document, prune it in place starting from its root element,
# then write the filtered tree to the output path.
tree = ET.parse(args.infile)
document_root = tree.getroot()
filter_node(document_root)
tree.write(args.outfile)
|