File: rm_duplicates.py

package info (click to toggle)
freedict-tools 0.7.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,520 kB
  • sloc: python: 2,869; haskell: 1,999; perl: 1,509; yacc: 502; sh: 435; sed: 392; makefile: 141; xml: 10
file content (302 lines) | stat: -rw-r--r-- 12,409 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
#!/usr/bin/env python3
"""
This script attempts to deduplicate entries in a dictionary or to detect whether
duplicates are present.
In the normal operation mode, the questionable entries are removed, the result
is written to build/tei/<name>-dedup.tei and then converted to the C5 format.
A diff against the C5 version of the original dictionary is then shown, so that
the human in front of this very computer can reason about the changes.

There is also a mode where the script tries to find a duplicated translation and
exits upon the first match.

Dependencies: xsltproc, diff, python >= 3.4.

The script tries to preserve the original XML, but may do the following changes:

-   Manual sense enumeration (using the attribute `n`) may be stripped.
    Because of the removal of doubled senses, the enumeration might be broken
    and it's automatically inserted by the output formatters, anyway.
-   XML declarations may get lost, feel free to add support for it. Comments
    work fine.
"""

import argparse
import io
import itertools
import os
import shlex
import shutil
import sys
import xml.etree.ElementTree as ET

# The TEI name space in the `{uri}` notation ElementTree uses for qualified
# tag names; Python's parser doesn't handle name spaces nicely.
TEI_NS = '{http://www.tei-c.org/ns/1.0}'
# prefix -> URI mapping for XPath-like queries such as `.//t:quote`
NS = {'t': TEI_NS[1:-1]}


def findall(x, y):
    """Return the direct children of `x` with the TEI tag `y` (given without
    name space)."""
    return x.findall(TEI_NS + y)


def tei_iter(x, y):
    """Iterate over `x` and all its descendants with the TEI tag `y` (given
    without name space)."""
    return x.iter(TEI_NS + y)

class HelpfulParser(argparse.ArgumentParser):
    """Argument parser which, on a usage error, prints the error message
    followed by the complete help text of the program instead of only the short
    usage line, and then exits with status 2."""

    def __init__(self, name, description=None):
        # `name` is used as the program name in the usage output
        argparse.ArgumentParser.__init__(self, prog=name,
                description=description)

    def error(self, message):
        """Write the error message to stderr, print the full help and terminate
        with exit status 2."""
        print('Error: ' + message, file=sys.stderr)
        self.print_help()
        raise SystemExit(2)


class CommentedTreeBuilder(ET.TreeBuilder):
    """A TreeBuilder subclass which keeps XML comments from the source in the
    resulting tree. Each comment becomes an element with the tag `ET.Comment`
    and the comment's content as its text, so that it is written back when the
    tree is serialised."""

    def comment(self, data):
        """Handle a comment reported by the parser by inserting it into the
        tree like an ordinary element."""
        comment_tag = ET.Comment
        self.start(comment_tag, {})
        self.data(data)
        self.end(comment_tag)

# These helpers allow the identification of similar nodes (comparison of quote
# or usg)
def nodes_eq(node1, node2, tag=None):
    """Check whether both given nodes contain equal text.

    If `tag` (a TEI tag name without name space) is given, not the nodes
    themselves are compared, but the first descendant with that tag found in
    each of them. If neither node has such a descendant they count as equal, if
    only one has it they count as different.
    Without `tag`, the text of the two nodes is compared directly.

    Returns True if the compared texts are equal, False otherwise."""
    if tag:
        path = './/' + TEI_NS + tag
        node1 = node1.find(path)
        node2 = node2.find(path)
        if node1 is None and node2 is None:
            return True # means equality, differences are searched
        if node1 is None or node2 is None:
            return False
    # without a tag, the nodes are compared as they are; their own tag must not
    # be used for a lookup, it's already name space-qualified and would never
    # match one of their descendants
    return node1.text == node2.text


def usages_match(n1, n2):
    """Return True if the first `usg` descendant of both nodes has the same
    text, or if neither node has a `usg` descendant."""
    return nodes_eq(n1, n2, 'usg')
def translations_of(sense):
    """Return the list of all `quote` nodes below the given sense or, if there
    are none, the list of all `def` nodes below it (which may be empty)."""
    return sense.findall('.//t:quote', NS) or sense.findall('.//t:def', NS)

def rm_doubled_senses(entry):
    """Some entries have multiple senses. A few of them list exactly the same
    translations, remove these.

    Two senses are treated as duplicates if they have the same number of
    translation nodes (see `translations_of`) and the same set of translation
    texts. The sense which comes first in the entry is kept.

    This function returns True if the entry has been altered, False
    otherwise."""
    senses = list(findall(entry, 'sense'))
    if len(senses) < 2:
        return False # nothing to compare with

    # pair each XML node with its list of translation nodes; a list keeps the
    # document order, so that always the later duplicate is removed
    translated = [(sense, translations_of(sense)) for sense in senses]
    changed = False
    # pair each sense with another and compare their content
    for (_sense1, trans1), (sense2, trans2) in itertools.combinations(
            translated, 2):
        if len(trans1) != len(trans2):
            continue
        # two senses without any translation aren't duplicates of each other,
        # they may differ in other content; truly empty senses are left to the
        # empty node removal
        if not trans1:
            continue
        if {t.text for t in trans1} != {t.text for t in trans2}:
            continue # don't match
        # both senses list *exactly* the same translations
        try:
            entry.remove(sense2) # remove sense from entry
        except ValueError: # already removed by an earlier pairing
            continue
        changed = True
    return changed

def node_is_empty(node):
    """Check that this node and all its children contain no text (apart from
    white space). Text following a child within this node (the child's tail)
    counts as text of this node.

    Returns True if no text was found, False otherwise. Attributes are not taken
    into account."""
    # text is None for elements without any text content
    if node.text and node.text.strip():
        return False
    for child in node:
        if child.tail and child.tail.strip():
            return False
        if not node_is_empty(child):
            return False
    return True

def rm_empty_nodes(entry):
    """This function removes nodes below `entry` which contain no text
    (according to `node_is_empty`) and are hence without semantic. The entry
    itself is never removed. It also strips the manual enumeration of senses
    (attribute `n`) if it has gaps, since it's generated by the output formatter
    anyway.

    This function returns True if an empty node has been removed or the
    enumeration has been stripped, False otherwise."""
    changed = False
    # parent nodes might be empty after their empty children have been
    # removed, so repeat until a pass doesn't remove anything; each repeated
    # pass has removed at least one node, so this terminates
    while True:
        removed = False
        # breadth-first traversal, the list grows while it's iterated
        nodes = [(None, entry)]
        for parent, node in nodes:
            if node_is_empty(node):
                # the entry itself has no parent and is kept
                if parent is not None:
                    # NOTE(review): removing an element also drops the text
                    # following it (its tail); presumably that's only white
                    # space here -- confirm
                    parent.remove(node)
                    removed = True
            else:
                nodes.extend((node, child) for child in node)
        if not removed:
            break # nothing removed, no further iterations
        changed = True

    # try to strip the enumeration of senses if they aren't adjacent anymore;
    # map sense: number first, senses without a number are discarded
    sense_numbers = {}
    for sense in tei_iter(entry, 'sense'):
        label = sense.attrib.get('n')
        if not label:
            continue
        try:
            sense_numbers[sense] = int(label)
        except ValueError:
            # not a number (e.g. a letter), can't take part in the gap
            # detection and is left as it is
            continue
    if sense_numbers:
        present = set(sense_numbers.values())
        # if not all values between 1 and the highest sense number are present,
        # the enumeration is broken and can be stripped
        if not all(n in present for n in range(1, max(present))):
            for node in sense_numbers:
                # strip manual enumeration, handled by output formatters and
                # might be wrong after node removal
                del node.attrib['n']
            changed = True
    return changed

def rm_doubled_quotes(entry):
    """Some entries have doubled quotes (translations) within different senses.
    Remove the doubled quotes.

    Only `quote` nodes which are direct children of a `cit` node, which in turn
    is a direct child of a `sense` node of the entry, are considered. A quote
    is removed if an earlier quote has the same text and the senses of both
    quotes match in their usage (see `usages_match`).

    This function returns True if the entry has been modified, False
    otherwise."""
    # flatten to (sense, cit, quote) triples in document order
    quotes = [(sense, cit, quote) for sense in findall(entry, 'sense')
            for cit in findall(sense, 'cit')
            for quote in findall(cit, 'quote')]
    if len(quotes) < 2:
        return False # nothing to compare with
    changed = False
    # pair each translation with another and compare their content
    for trans1, trans2 in itertools.combinations(quotes, 2):
        sense1, _cit1, quote1 = trans1
        sense2, cit2, quote2 = trans2
        if quote1.text != quote2.text or not usages_match(sense1, sense2):
            continue
        try:
            cit2.remove(quote2)
        except ValueError:
            continue # already removed by a previous pairing
        changed = True
    return changed

def exec(command):
    """Execute a command through the shell or fail straight away.

    If the command exits with a non-zero status, a message is written to stderr
    and this script terminates with the exit status of the command, or with 1
    if the command has been killed by a signal.
    Note that this function shadows the built-in `exec` within this module."""
    # imported here, it's only required by this function
    import subprocess
    # unlike os.system, this returns the actual exit status and not the encoded
    # wait status; passing the latter on to sys.exit turned e.g. exit status 1
    # (encoded as 256) into exit status 0
    ret = subprocess.call(command, shell=True)
    if ret:
        sys.stderr.write("Process exited with %i: %s\n" % (ret, command))
        # a negative value means "terminated by signal"
        sys.exit(ret if ret > 0 else 1)

#pylint: disable=too-few-public-methods
class XmlParserWrapper:
    """This thin wrapper guards the parsing process.  It manually finds the TEI
    element and copies everything before and afterwards *verbatim*. This is due
    to the inability of the ElementTree parser to handle multiple "root
    elements", for instance comments before or after the root node or '<!'
    declarations.

    Attributes:
        before_root: text of the file before the first `<TEI`
        after_root: text of the file after the first `</TEI>`
        root: the parsed TEI element, with comments retained
    """
    def __init__(self, file_name):
        """Read and parse the given file, which is expected to be encoded in
        UTF-8.

        Raises ValueError if no `<TEI` or `</TEI>` can be found. If the XML
        can't be parsed, an error is written to stderr and the script exits with
        status 15."""
        with open(file_name, encoding='utf-8') as file:
            content = file.read()
        tei_start = content.find('<TEI')
        if tei_start < 0:
            raise ValueError("Couldn't find string `<TEI` in the XML file.  Please extend this parser.")
        self.before_root = content[:tei_start]
        content = content[tei_start:]
        tei_end = content.find('</TEI>')
        if tei_end < 0:
            raise ValueError("Couldn't find `</TEI>` in the input file, please extend the parser.")
        tei_end += len('</TEI>')
        self.after_root = content[tei_end:]
        content = content[:tei_end]
        parser = ET.XMLParser(target=CommentedTreeBuilder())
        try:
            parser.feed(content)
            # some errors (e.g. an unclosed token) are only reported when the
            # parser is closed, so this belongs into the guarded section
            self.root = parser.close()
        except ET.ParseError as e:
            sys.stderr.write("Error while parsing input file\n")
            sys.stderr.write(str(e) + '\n')
            sys.exit(15)

    def write(self, file_name):
        """Write the XML element tree to a file, with hopefully a very similar
        formatting as before. The text before and after the TEI element is
        written back unchanged, a final line break is added if missing."""
        in_mem = io.BytesIO()
        ET.ElementTree(self.root).write(in_mem, encoding="UTF-8")
        with open(file_name, 'wb') as file:
            file.write(self.before_root.encode('UTF-8'))
            file.write(in_mem.getvalue())
            file.write(self.after_root.encode('UTF-8'))
            if not self.after_root.endswith('\n'):
                file.write(b'\n')


def main():
    """Entry point of the script.

    Parses the command line and reads the first given dictionary. With
    `--detect_changes`, the script exits with status 42 as soon as an entry
    with duplicated senses or empty nodes is found and never writes anything.
    Otherwise all entries are cleaned up; if something changed, the result is
    written to build/tei/<name>-dedup.tei, converted to the C5 format, a diff
    against the C5 version of the original dictionary is shown and the user is
    asked whether the original file should be overwritten."""
    parser = HelpfulParser("deduplicator", description=("Find and remove "
        "duplicated translations and empty TEI nodes"))
    parser.add_argument("-s", "--detect_changes", dest="detect_changes",
              help=("check whether duplicates or empty nodes can be detected "
                  "and exit with exit code 42 if the first change would "
                  "need to be made"),
                  action="store_true", default=False)
    parser.add_argument('dictionary_path', help='input TEI file', nargs="+")
    args = parser.parse_args(sys.argv[1:])

    # register TEI name space without prefix to dump the *same* XML file
    ET.register_namespace('', 'http://www.tei-c.org/ns/1.0')
    # NOTE(review): more than one path is accepted, but only the first one is
    # processed
    dictionary_path = args.dictionary_path[0]
    tree = XmlParserWrapper(dictionary_path)

    if args.detect_changes: # abort if first change detected
        for entry in tei_iter(tree.root, 'entry'):
            # rm_doubled_quotes is dangerous and hence not used for detection;
            # removing doubled senses might leave empty parent nodes, so these
            # are checked afterwards
            if rm_doubled_senses(entry) or rm_empty_nodes(entry):
                print(("E1: Found duplicated entries or empty XML nodes. Try "
                        "`make rm_duplicates`."))
                sys.exit(42)
        return # nothing found and nothing to write in this mode

    # always apply changes
    changed = False
    for entry in tei_iter(tree.root, 'entry'):
        changed1 = rm_doubled_senses(entry)
        changed2 = rm_doubled_quotes(entry)
        # the processing above might leave empty parent nodes, remove those
        changed3 = rm_empty_nodes(entry)
        if changed1 or changed2 or changed3:
            changed = True
    if not changed:
        return

    # only the file name is used for the files below build/, so that an input
    # path containing directories doesn't end up somewhere within build/tei
    stem = os.path.basename(dictionary_path)
    if stem.endswith('.tei'):
        stem = stem[:-len('.tei')]
    output_dir = os.path.join('build', 'tei')
    output_fn = os.path.join(output_dir, stem + '-dedup.tei')
    output_c5 = os.path.join(output_dir, stem + '-dedup.c5')
    original_c5 = os.path.join('build', 'dictd', stem + '.c5')
    os.makedirs(output_dir, exist_ok=True)
    tree.write(output_fn)
    # get a human-readable diff of the changes
    shutil.copy('freedict-P5.dtd', output_dir)
    exec('xsltproc "$FREEDICT_TOOLS"/xsl/tei2c5.xsl %s > %s' % (
        shlex.quote(output_fn), shlex.quote(output_c5)))
    # convert original dictionary to c5
    exec('make --no-print-directory build-dictd')
    # execute diff without checking the return value: diff exits with 1 if the
    # files differ, which is the expected case here and must not abort the
    # script
    diff_command = 'diff -u %s %s' % (shlex.quote(original_c5),
            shlex.quote(output_c5))
    if shutil.which('less'):
        diff_command += ' | less'
    os.system(diff_command)
    if input("Do you want to overwrite %s with the new version (y|n): " % \
            dictionary_path) == 'y':
        shutil.copy(output_fn, dictionary_path)


# run the deduplication only if this file is executed as a script, not if it
# is imported as a module
if __name__ == '__main__':
    main()