1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302
|
#!/usr/bin/env python3
"""
This script attempts to deduplicate entries in a dictionary or to detect whether
duplicates are present.
In the normal operation mode, the questionable entries are removed, the result
is written to a file-dedup.py and then it is converted to the C5 format. From
there, a diff with the original format is done. With the diff, the human in
front of this very computer can then reason about the changes.
There is also a mode where the script tries to find a duplicated translation and
exits upon the first match.
Dependencies: xsltproc, diff python >= 3.4.
The script tries to preserve the original XML, but may do the following changes:
- Manual sense enumeration (using the attribute `n`) may be stripped.
Because of the removal of doubled senses, the enumeration might be broken
and it's automatically inserted by the output formatters, anyway.
- XML declarations may get lost, feel free to add support for it. Comments
work fine.
"""
import argparse
import io
import itertools
import os
import shlex
import shutil
import sys
import xml.etree.ElementTree as ET
# TEI name space, Python's parser doesn't handle them nicely
TEI_NS = '{http://www.tei-c.org/ns/1.0}'
NS = {'t': TEI_NS.strip('{}')}
# findall/iter with TEI namespace removed
findall = lambda x,y: x.findall(TEI_NS + y)
tei_iter = lambda x,y: x.iter(TEI_NS + y)
class HelpfulParser(argparse.ArgumentParser):
"""Unlike the super class, this arg parse instance will print the error it
encountered as well as the complete usage of the program. It will also
redirect the usage to a output formatter."""
def __init__(self, name, description=None):
super().__init__(prog=name, description=description)
def error(self, message):
"""Print error message and usage information."""
sys.stderr.write('Error: ' + message + '\n')
self.print_help()
sys.exit(2)
class CommentedTreeBuilder(ET.TreeBuilder):
"""A TreeBuilder subclass that retains XML comments from the source. It can
be treated as a black box saving the contents before and after the root tag,
so that it can be re-added when writing back a XML ElementTree to disk. This
is necessary because of ElementTree's inability to handle declarations
nicely."""
def comment(self, data):
self.start(ET.Comment, {})
self.data(data)
self.end(ET.Comment)
# These helpers allow the identification of similar nodes (comparison of quote
# or usg)
def nodes_eq(node1, node2, tag=None):
"""Check whether both given nodes contain equal text. If tag is given, not
the nodes themselves, but the given child nodes are compared."""
if not tag:
tag = node1.tag
node1 = node1.find('.//' + TEI_NS + tag)
node2 = node2.find('.//' + TEI_NS + tag)
if node1 is None and node2 is None:
return True # means equality, differences are searched
if node1 is None or node2 is None:
return False
return node1.text == node2.text
usages_match = lambda n1, n2: nodes_eq(n1, n2, 'usg')
def translations_of(sense):
trans = sense.findall('.//t:quote', NS)
if not trans:
trans = sense.findall('.//t:def', NS)
return trans
def rm_doubled_senses(entry):
"""Some entries have multiple senses. A few of them are exactly the same,
remove these.
This function returns True if an element has been altered"""
senses = list(findall(entry, 'sense'))
if len(senses) == 1:
return
# obtain a mapping from XML node -> list of words within `<quote>…</quote>`
senses = {sense: translations_of(sense) for sense in senses}
changed = False
# pair each sense with another and compare their content
for s1, s2 in itertools.combinations(senses.items(), 2):
# both sets match
if not len(s1[1]) == len(s2[1]):
continue
if not set(t.text for t in s1[1]) == set(t.text for t in s2[1]):
continue # don't match
# two senses are *exacctly* identical?
try:
entry.remove(s2[0]) # remove sense from entry
changed = True
except ValueError: # already removed?
pass
return changed
def node_is_empty(node):
"""Check that this node and all its children contain no text (apart from
white space)."""
if node.text.strip():
return False
for child in node.getchildren():
if not node_is_empty(child):
return False
return True
def rm_empty_nodes(entry):
"""This function removes nodes which have no text and no children and are
hence without semantic. It also resets the fixed counters on sense, since
they are generated by the output formatter anyway.
This function returns True if an empty node has been removed."""
changed = False
# sometimes parent nodes are empty after their empty children have been
# removed, so do this three times (won't work with deeper nestings…)
for _ in range(0, 2):
changed_here = False
nodes = [(None, entry)]
for parent, node in nodes:
if node_is_empty(node):
if parent:
parent.remove(node)
changed_here = changed = True
else:
nodes.extend((node, c) for c in node.getchildren())
if not changed_here:
break # nothing removed, no further iterations
else:
changed = True
# try to strip enumeration of senses they aren't adjacent anymore; map
# sense: numeration first or discard if no numbering
sense_numbers = {sense: int(sense.attrib.get('n'))
for sense in tei_iter(entry, 'sense') if sense.attrib.get('n')}
if sense_numbers:
# if not all values between min and max sense number, enumeration is
# broken and can be stripped
if not all(n in sense_numbers.values()
for n in range(1, max(iter(sense_numbers.values())))):
for node in sense_numbers:
# strip manual enumeration, handled by output formatters and might
# be wrong after node removal
del node.attrib['n']
changed = True
return changed
def rm_doubled_quotes(entry):
"""Some entries have doubled quotes (translations) within different senses.
Remove the doubled quotes.
This function return True, if the entry has been modified."""
senses = list(findall(entry, 'sense'))
# add quote elements
senses = [(s, cit, q) for s in senses for cit in findall(s, 'cit')
for q in findall(cit, 'quote')]
if len(senses) <= 1:
return
changed = False
# pair each sense with another and compare their content
for trans1, trans2 in itertools.combinations(senses, 2):
sense1, _cit1, quote1 = trans1
sense2, cit2, quote2 = trans2
# translation could have been removed by a previous pairing
if quote1.text == quote2.text and usages_match(sense1, sense2):
try:
cit2.remove(quote2)
except ValueError:
continue # already removed
changed = True
return changed
def exec(command):
"""Execute a command or fail straight away."""
ret = os.system(command)
if ret:
sys.stderr.write("Process exited with %i: %s" % (ret, command))
sys.exit(ret)
#pylint: disable=too-few-public-methods
class XmlParserWrapper:
"""This thin wrapper guards the parsing process. It manually finds the TEI
element and copies everything before and afterwards *verbatim*. This is due
to the inability of the ElementTree parser to handle multiple "root
elements", for instance comments before or after the root node or '<!'
declarations.
"""
def __init__(self, file_name):
with open(file_name, encoding='utf-8') as file:
content = file.read()
tei_start = content.find('<TEI')
if tei_start < 0:
raise ValueError("Couldn't find string `<TEI` in the XML file. Please extend this parser.")
self.before_root = content[:tei_start]
content = content[tei_start:]
tei_end = content.find('</TEI>')
if tei_end < 0:
raise ValueError("Couldn't find `</TEI>` in the input file, please extend the parser.")
tei_end += len('</TEI>')
self.after_root = content[tei_end:]
content = content[:tei_end]
parser = ET.XMLParser(target = CommentedTreeBuilder())
try:
parser.feed(content)
except ET.ParseError as e:
sys.stderr.write("Error while parsing input file\n")
sys.stderr.write(str(e) + '\n')
sys.exit(15)
self.root = parser.close()
def write(self, file_name):
"""Write the XML element tree to a file, with hopefully a very similar
formatting as before."""
tree = ET.ElementTree(self.root)
in_mem = io.BytesIO()
tree.write(in_mem, encoding="UTF-8")
in_mem.seek(0)
with open(file_name, 'wb') as file:
file.write(self.before_root.encode('UTF-8'))
file.write(in_mem.read())
file.write(self.after_root.encode('UTF-8'))
if not self.after_root.endswith('\n'):
file.write(b'\n')
def main():
parser = HelpfulParser("deduplicator", description=("Fnd and remove "
"duplicated translations and empty TEI nodes"))
parser.add_argument("-s", "--detect_changes", dest="detect_changes",
help=("check whether duplicates or empty nodes can be detected "
"and exit with exit code 42 if the first change would "
"need to be made"),
action="store_true", default=False)
parser.add_argument('dictionary_path', help='input TEI file', nargs="+")
args = parser.parse_args(sys.argv[1:])
# register TEI name space without prefix to dump the *same* XML file
ET.register_namespace('', 'http://www.tei-c.org/ns/1.0')
dictionary_path = args.dictionary_path[0]
tree = XmlParserWrapper(dictionary_path)
changed = False
if args.detect_changes: # abort if first change detected
for entry in tei_iter(tree.root, 'entry'):
changed = changed or rm_doubled_senses(entry)
# this one is dangerous
#changed = changed or rm_doubled_quotes(entry)
# the processing above might leave empty parent nodes, remove those
changed = changed or rm_empty_nodes(entry)
if args.detect_changes and changed:
print(("E1: Found duplicated entries or empty XML nodes. Try "
"`make rm_duplicates`."))
sys.exit(42)
else: # always apply changes
for entry in tei_iter(tree.root, 'entry'):
changed1 = rm_doubled_senses(entry)
changed2 = rm_doubled_quotes(entry)
# the processing above might leave empty parent nodes, remove those
changed3 = rm_empty_nodes(entry)
if changed1 or changed2 or changed3:
changed = True
if args.detect_changes and changed:
print(("E1: Found duplicated entries or empty XML nodes. Try "
"`make rm_duplicates`."))
if changed:
output_fn = os.path.join('build', 'tei',
dictionary_path.replace('.tei', '-dedup.tei'))
exec('mkdir -p build/tei')
tree.write(output_fn)
# get a human-readable diff of the changes
c5 = lambda x: shlex.quote(x.replace('.tei', '.c5'))
shutil.copy('freedict-P5.dtd', os.path.dirname(output_fn))
exec('xsltproc $FREEDICT_TOOLS/xsl/tei2c5.xsl %s > %s' % (output_fn,
c5(output_fn)))
# convert original dictionary to c5
exec('make --no-print-directory build-dictd')
# execute diff without checking the return type
if not shutil.which('less'):
exec('diff -u build/dictd/%s %s' % (c5(dictionary_path), c5(output_fn)))
else:
os.system('diff -u build/dictd/%s %s | less' % (c5(dictionary_path), c5(output_fn)))
if input("Do you want to overwrite %s with the new version (y|n): " % \
dictionary_path) == 'y':
shutil.copy(output_fn, dictionary_path)
if __name__ == '__main__':
main()
|