File: office.py

package info (click to toggle)
mat2 0.14.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 12,428 kB
  • sloc: python: 3,758; makefile: 7
file content (628 lines) | stat: -rw-r--r-- 27,822 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
import random
import uuid
import logging
import os
import re
import zipfile
from typing import Pattern, Any, Tuple, Dict

import xml.etree.ElementTree as ET  # type: ignore

from .archive import ZipParser

# pylint: disable=line-too-long


def _parse_xml(full_path: str) -> Tuple[ET.ElementTree, Dict[str, str]]:
    """ Parse the XML file at `full_path`, keeping track of its namespaces.

    Every namespace prefix declared in the document is registered globally
    with ElementTree (via `ET.register_namespace`) and collected in a
    `prefix -> uri` dict, usable as the `namespaces` argument of
    `find`/`iterfind`.

    Returns a `(tree, namespace map)` tuple; `ET.ParseError` is raised if
    the file isn't well-formed.
    """
    prefixes: Dict[str, str] = {}
    for _event, declaration in ET.iterparse(full_path, events=("start-ns", )):
        prefix, uri = declaration
        # ElementTree keeps the ns[0-9]+ prefixes for its own usage, so
        # such a prefix is renamed into mat[0-9]+ before being registered.
        if re.match('^ns[0-9]+$', prefix, re.I):  # pragma: no cover
            prefix = 'mat' + prefix[2:]

        prefixes[prefix] = uri
        ET.register_namespace(prefix, uri)

    tree = ET.parse(full_path)
    return tree, prefixes


def _sort_xml_attributes(full_path: str) -> bool:
    """ Make the ordering inside the XML file at `full_path` deterministic,
    because it's possible to fingerprint producers (MS Office, Libreoffice, …)
    since they are all using different orders.

    Two things are sorted, and the file is rewritten in place as utf-8:
      - the attributes of every element, lexicographically by name;
      - the children of each child of the root, by tag and then by their
        `desc` attribute.

    Always returns True; `ET.ParseError` is raised if the file isn't
    well-formed XML.
    """
    tree = ET.parse(full_path)
    root = tree.getroot()

    # Since Python 3.8, ElementTree serializes attributes in the order they
    # were stored instead of sorting them, so they are sorted explicitly.
    for element in root.iter():
        attrib = element.attrib
        if len(attrib) > 1:
            ordered = sorted(attrib.items())
            attrib.clear()
            attrib.update(ordered)

    for c in root:
        # A missing `desc` counts as an empty string: None can't be ordered
        # against a str, and sorted() would raise a TypeError on siblings
        # sharing a tag when only some of them have the attribute.
        c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc', '')))

    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True


class MSOfficeParser(ZipParser):
    """
    Parser for the Office Open XML documents listed in `mimetypes`.

    The methods modifying XML documents are usually doing so in two loops:
        1. finding the tag/attributes to remove;
        2. actually editing the document
    since it's tricky to modify the XML while iterating on it.
    """
    mimetypes = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }
    # Archive members declared with one of those content types in
    # `[Content_Types].xml` are added to `self.files_to_keep`.
    content_types_to_keep = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml',  # /word/endnotes.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml',  # /word/footnotes.xml
        'application/vnd.openxmlformats-officedocument.extended-properties+xml',  # /docProps/app.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml',  # /word/document.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml',  # /word/fontTable.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml',  # /word/footer.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml',  # /word/header.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml',  # /word/styles.xml
        'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml',  # /word/numbering.xml (used for bullet point formatting)
        'application/vnd.openxmlformats-officedocument.theme+xml',  # /word/theme/theme[0-9].xml (used for font and background coloring, etc.)
        'application/vnd.openxmlformats-package.core-properties+xml',  # /docProps/core.xml

        # for more complicated powerpoints
        'application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml',
        'application/vnd.openxmlformats-officedocument.presentationml.notesMaster+xml',
        'application/vnd.openxmlformats-officedocument.presentationml.handoutMaster+xml',
        'application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml',
        'application/vnd.openxmlformats-officedocument.drawingml.diagramLayout+xml',
        'application/vnd.openxmlformats-officedocument.drawingml.diagramStyle+xml',
        'application/vnd.openxmlformats-officedocument.drawingml.diagramColors+xml',
        'application/vnd.ms-office.drawingml.diagramDrawing+xml',

        # Do we want to keep the following ones?
        'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
    }

    def __init__(self, filename):
        """ Set up the keep/omit lists for `filename`.

        Raises ValueError if the archive has no `[Content_Types].xml`
        member, or if this member isn't well-formed XML.
        """
        super().__init__(filename)

        # MSOffice documents are using various counters for cross-references,
        # we collect them all, to make sure that they're effectively counters,
        # and not unique id used for fingerprinting.
        self.__counters = {
            'cNvPr': set(),
            'rid': set(),
        }

        self.files_to_keep = set(map(re.compile, {  # type: ignore
            r'^\[Content_Types\]\.xml$',
            r'^_rels/\.rels$',
            r'^xl/sharedStrings\.xml$',  # https://docs.microsoft.com/en-us/office/open-xml/working-with-the-shared-string-table
            r'^xl/calcChain\.xml$',
            r'^(?:word|ppt|xl)/_rels/(document|workbook|presentation)\.xml\.rels$',
            r'^(?:word|ppt|xl)/_rels/footer[0-9]*\.xml\.rels$',
            r'^(?:word|ppt|xl)/_rels/header[0-9]*\.xml\.rels$',
            r'^(?:word|ppt|xl)/charts/_rels/chart[0-9]+\.xml\.rels$',
            r'^(?:word|ppt|xl)/charts/colors[0-9]+\.xml$',
            r'^(?:word|ppt|xl)/charts/style[0-9]+\.xml$',
            r'^(?:word|ppt|xl)/drawings/_rels/drawing[0-9]+\.xml\.rels$',
            r'^(?:word|ppt|xl)/styles\.xml$',
            # TODO: randomize axId ( https://docs.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/089f849f-fcd6-4fa0-a281-35aa6a432a16 )
            r'^(?:word|ppt|xl)/charts/chart[0-9]*\.xml$',
            r'^xl/workbook\.xml$',
            r'^xl/worksheets/sheet[0-9]+\.xml$',
            r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',
            r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',
            r'^(?:word|ppt|xl)/tableStyles\.xml$',
            r'^(?:word|ppt|xl)/tables/table[0-9]+\.xml$',
            r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$',
            r'^ppt/slides/slide[0-9]*\.xml$',
            # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
            r'^(?:word|ppt|xl)/stylesWithEffects\.xml$',
            r'^ppt/presentation\.xml$',
            # TODO: check if p:bgRef can be randomized
            r'^ppt/slideMasters/slideMaster[0-9]+\.xml',
            r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
            r'^xl/worksheets/_rels/sheet[0-9]+\.xml\.rels',
            r'^(?:word|ppt|xl)/drawings/vmlDrawing[0-9]+\.vml',
            r'^(?:word|ppt|xl)/drawings/drawing[0-9]+\.xml',
            r'^(?:word|ppt|xl)/embeddings/Microsoft_Excel_Worksheet[0-9]+\.xlsx',
            # rels for complicated powerpoints
            r'^ppt/notesSlides/_rels/notesSlide[0-9]+\.xml\.rels',
            r'^ppt/notesMasters/_rels/notesMaster[0-9]+\.xml\.rels',
            r'^ppt/handoutMasters/_rels/handoutMaster[0-9]+\.xml\.rels',
        }))
        self.files_to_omit = set(map(re.compile, {  # type: ignore
            r'^\[trash\]/',
            r'^customXml/',
            r'webSettings\.xml$',
            r'^docProps/custom\.xml$',
            r'^docProps/thumbnail.wmf$',
            r'^(?:word|ppt|xl)/printerSettings/',
            r'^(?:word|ppt|xl)/theme',
            r'^(?:word|ppt|xl)/people\.xml$',
            r'^(?:word|ppt|xl)/persons/person\.xml$',
            r'^(?:word|ppt|xl)/numbering\.xml$',
            r'^(?:word|ppt|xl)/tags/',
            r'^(?:word|ppt|xl)/glossary/',
            # View properties like view mode, last viewed slide etc
            r'^(?:word|ppt|xl)/viewProps\.xml$',
            # Additional presentation-wide properties like printing properties,
            # presentation show properties etc.
            r'^(?:word|ppt|xl)/presProps\.xml$',
            r'^(?:word|ppt|xl)/comments[0-9]*\.xml$',
            r'^(?:word|ppt|xl)/threadedComments/threadedComment[0-9]*\.xml$',
            r'^(?:word|ppt|xl)/commentsExtended\.xml$',
            r'^(?:word|ppt|xl)/commentsExtensible\.xml$',
            r'^(?:word|ppt|xl)/commentsIds\.xml$',
            # we have an allowlist in self.files_to_keep,
            # so we can trash everything else
            r'^(?:word|ppt|xl)/_rels/',
            r'docMetadata/LabelInfo\.xml$'
        }))

        if self.__fill_files_to_keep_via_content_types() is False:
            raise ValueError("Unable to use the [Content_Types].xml of %s" % filename)

    def __fill_files_to_keep_via_content_types(self) -> bool:
        """ There is a super-handy `[Content_Types].xml` file
        in MS Office archives, describing what each other file contains.
        The self.content_types_to_keep member contains a type allowlist,
        so we're using it to fill the self.files_to_keep one.

        Returns False if `[Content_Types].xml` is missing from the archive
        or can't be parsed, True otherwise.
        """
        with zipfile.ZipFile(self.filename) as zin:
            if '[Content_Types].xml' not in zin.namelist():
                return False
            xml_data = zin.read('[Content_Types].xml')

        self.content_types: Dict[str, str] = dict()
        try:
            tree = ET.fromstring(xml_data)
        except ET.ParseError:
            return False

        for c in tree:
            if 'PartName' not in c.attrib or 'ContentType' not in c.attrib:  # pragma: no cover
                continue
            if c.attrib['ContentType'] not in self.content_types_to_keep:
                continue
            fname = c.attrib['PartName'][1:]  # remove leading `/`
            # the name is escaped, to only match this very file
            re_fname = re.compile('^' + re.escape(fname) + '$')
            self.files_to_keep.add(re_fname)  # type: ignore
        return True

    def __get_removed_fnames(self) -> set:
        """ Return the names of the archive members matching at least one
        entry of `self.files_to_omit`, without matching any entry
        of `self.files_to_keep`.
        """
        removed_fnames = set()
        with zipfile.ZipFile(self.filename) as zin:
            for fname in zin.namelist():
                if not any(r.search(fname) for r in self.files_to_omit):
                    continue
                if any(r.search(fname) for r in self.files_to_keep):
                    continue  # the file is in the allowlist
                removed_fnames.add(fname)
        return removed_fnames

    @staticmethod
    def __remove_rsid(full_path: str) -> bool:
        """ The method will remove "revision session ID".  We're using '}rsid'
        instead of proper parsing, since rsid can have multiple forms, like
        `rsidRDefault`, `rsidR`, `rsids`, …

        Returns False if the file can't be parsed, True otherwise.

        For more details, see
        - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
        - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        # rsid, tags or attributes, are always under the `w` namespace
        if 'w' not in namespace:
            return True

        parent_map = {c: p for p in tree.iter() for c in p}

        elements_to_remove = list()
        for item in tree.iterfind('.//', namespace):
            if '}rsid' in item.tag.strip().lower():  # rsid as tag
                elements_to_remove.append(item)
                continue
            for key in list(item.attrib.keys()):  # rsid as attribute
                if '}rsid' in key.lower():
                    del item.attrib[key]

        for element in elements_to_remove:
            parent_map[element].remove(element)

        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True

    @staticmethod
    def __remove_nsid(full_path: str) -> bool:
        """
        nsid are random identifiers that can be used to ease the merging of
        some components of a document.  They can also be used for
        fingerprinting.

        Returns False if the file can't be parsed, True otherwise.

        See the spec for more details: https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        # The nsid tag is always under the `w` namespace
        if 'w' not in namespace:
            return True

        parent_map = {c: p for p in tree.iter() for c in p}

        # the list is built before removing anything from the tree
        elements_to_remove = list(tree.iterfind('.//w:nsid', namespace))
        for element in elements_to_remove:
            parent_map[element].remove(element)

        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True

    @staticmethod
    def __remove_revisions(full_path: str) -> bool:
        """ Drop the tracked changes: deletions (`w:del`) are removed along
        with their content, and insertions (`w:ins`) are replaced, in place,
        by their own children.

        Returns False if the file can't be parsed, True otherwise.
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        # Looking for a `w:` tag without the prefix being declared
        # would raise a SyntaxError.
        if 'w' not in namespace:
            return True

        # Revisions are either deletions (`w:del`) or
        # insertions (`w:ins`)
        del_presence = tree.find('.//w:del', namespace)
        ins_presence = tree.find('.//w:ins', namespace)
        if del_presence is None and ins_presence is None:
            return True  # No revisions are present

        parent_map = {c: p for p in tree.iter() for c in p}

        for element in list(tree.iterfind('.//w:del', namespace)):
            parent_map[element].remove(element)

        # The insertions are processed in document order, so an enclosing
        # `w:ins` is always handled before the ones nested inside it.
        for element in list(tree.iterfind('.//w:ins', namespace)):
            parent = parent_map[element]
            index = list(parent).index(element)
            children = list(element)
            # Put the children exactly where the `w:ins` tag was, to keep
            # the content of the document in its original order.
            parent[index:index + 1] = children
            for child in children:
                parent_map[child] = parent  # needed for nested `w:ins`

        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True

    @staticmethod
    def __remove_document_comment_meta(full_path: str) -> bool:
        """ Remove the `w:commentRangeStart`, `w:commentRangeEnd` and
        `w:commentReference` tags from the document.

        Returns False if the file can't be parsed, True otherwise.
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        # Looking for a `w:` tag without the prefix being declared
        # would raise a SyntaxError.
        if 'w' not in namespace:
            return True

        # iterate over the elements and add them to list
        elements_del = list()
        for tag in ('w:commentRangeStart', 'w:commentRangeEnd', 'w:commentReference'):
            elements_del.extend(tree.iterfind('.//' + tag, namespace))

        if not elements_del:
            return True  # No comment meta tags are present

        parent_map = {c: p for p in tree.iter() for c in p}

        # remove the elements
        for element in elements_del:
            parent_map[element].remove(element)

        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True

    def __remove_document_xml_rels_members(self, full_path: str) -> bool:
        """ Remove the dangling references from the word/_rels/document.xml.rels file, since MS office doesn't like them.

        Returns False if the file can't be parsed, True otherwise.
        """
        import posixpath  # archive member names always use `/`

        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        if len(namespace.items()) != 1:  # pragma: no cover
            logging.debug("Got several namespaces for Relationships: %s", namespace.items())

        removed_fnames = self.__get_removed_fnames()

        root = tree.getroot()
        for item in root.findall('{%s}Relationship' % namespace['']):
            # The targets are relative to the word/ directory, but they can
            # contain `..` (like `../customXml/item1.xml`) or be absolute,
            # so they're normalized before being compared to member names.
            name = posixpath.normpath(posixpath.join('word', item.attrib['Target'])).lstrip('/')
            if name in removed_fnames:
                root.remove(item)

        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True

    def __remove_content_type_members(self, full_path: str) -> bool:
        """ The method will remove the dangling references
        from the [Content_Types].xml file, since MS office doesn't like them

        Returns False if the file can't be parsed, True otherwise.
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        if len(namespace.items()) != 1:  # pragma: no cover
            logging.debug("Got several namespaces for Types: %s", namespace.items())

        removed_fnames = self.__get_removed_fnames()

        root = tree.getroot()
        for item in root.findall('{%s}Override' % namespace['']):
            name = item.attrib['PartName'][1:]  # remove the leading '/'
            if name in removed_fnames:
                root.remove(item)

        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True

    def _final_checks(self) -> bool:
        """ Warn if one of the collected counters doesn't look like a
        counter, that is if its maximum isn't equal to its number of
        distinct values. Always returns True for now.
        """
        for k, v in self.__counters.items():
            if v and len(v) != max(v):
                # TODO: make this an error and return False
                # once the ability to correct the counters is implemented
                logging.warning("%s contains invalid %s: %s", self.filename, k, v)
                return True
        return True

    def __collect_counters(self, full_path: str) -> None:
        """ Store in `self.__counters` the numerical part of every `rId`
        and `p:cNvPr` identifier found in the given file.
        """
        with open(full_path, encoding='utf-8') as f:
            content = f.read()

        # "relationship Id"
        for i in re.findall(r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)', content):
            self.__counters['rid'].add(int(i))
        # "connector for Non-visual property"
        for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content):
            self.__counters['cNvPr'].add(int(i))

    @staticmethod
    def __randomize_creationId(full_path: str) -> bool:
        """ Replace the `val` attribute of every `p14:creationId` tag with
        a random unsigned 32 bits integer.

        Returns False if the file can't be parsed, True otherwise.
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        if 'p14' not in namespace:
            return True  # pragma: no cover

        for item in tree.iterfind('.//p14:creationId', namespace):
            # randint() includes its upper bound, and 2**32 itself
            # doesn't fit in an unsigned 32 bits integer.
            item.set('val', '%s' % random.randint(0, 2**32 - 1))
        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True

    @staticmethod
    def __randomize_sldMasterId(full_path: str) -> bool:
        """ Replace the `id` attribute of every `p:sldMasterId` tag with
        a random value.

        Returns False if the file can't be parsed, True otherwise.
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        if 'p' not in namespace:
            return True  # pragma: no cover

        for item in tree.iterfind('.//p:sldMasterId', namespace):
            # Slide master ids are unsigned 32 bits integers that should be
            # at least 2**31 (ST_SlideMasterId in ECMA-376, to be confirmed
            # against the schema), and randint() includes its upper bound.
            item.set('id', '%s' % random.randint(2**31, 2**32 - 1))
        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True

    def _specific_cleanup(self, full_path: str) -> bool:
        """ Clean the extracted archive member located at `full_path`.

        Only non-empty `.xml` and `.xml.rels` files are processed.
        Returns False as soon as one of the cleaning steps fails,
        True otherwise.
        """
        # pylint: disable=too-many-return-statements,too-many-branches
        if os.stat(full_path).st_size == 0:  # Don't process empty files
            return True

        if not full_path.endswith(('.xml', '.xml.rels')):
            return True

        if self.__randomize_creationId(full_path) is False:
            return False

        self.__collect_counters(full_path)

        if full_path.endswith('/[Content_Types].xml'):
            # this file contains references to files that we might
            # remove, and MS Office doesn't like dangling references
            if self.__remove_content_type_members(full_path) is False:  # pragma: no cover
                return False
        elif full_path.endswith('/word/document.xml'):
            # this file contains the revisions
            if self.__remove_revisions(full_path) is False:
                return False  # pragma: no cover
            # remove comment references and ranges
            if self.__remove_document_comment_meta(full_path) is False:
                return False  # pragma: no cover
        elif full_path.endswith('/word/_rels/document.xml.rels'):
            # similar to the above, but for the document.xml.rels file
            if self.__remove_document_xml_rels_members(full_path) is False:  # pragma: no cover
                return False
        elif full_path.endswith('/docProps/app.xml'):
            # This file must be present and valid,
            # so we're removing as much as we can.
            with open(full_path, 'wb') as f:
                f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
                f.write(b'<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">')
                f.write(b'</Properties>')
        elif full_path.endswith('/docProps/core.xml'):
            # This file must be present and valid,
            # so we're removing as much as we can.
            with open(full_path, 'wb') as f:
                f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
                f.write(b'<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties">')
                f.write(b'</cp:coreProperties>')
        elif full_path.endswith('/ppt/tableStyles.xml'):  # pragma: no cover
            # This file must be present and valid,
            # so we're removing as much as we can.
            with open(full_path, 'wb') as f:
                f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
                uid = str(uuid.uuid4()).encode('utf-8')
                f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid)
        elif full_path.endswith('ppt/presentation.xml'):
            if self.__randomize_sldMasterId(full_path) is False:
                return False  # pragma: no cover

        if self.__remove_rsid(full_path) is False:
            return False  # pragma: no cover

        if self.__remove_nsid(full_path) is False:
            return False  # pragma: no cover

        try:
            _sort_xml_attributes(full_path)
        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        # This is awful, I'm sorry.
        #
        # Microsoft Office isn't happy when we have the `mc:Ignorable`
        # tag containing namespaces that aren't present in the xml file,
        # so instead of trying to remove this specific tag with etree,
        # we're removing it, with a regexp.
        #
        # Since we're the ones producing this file, via the call to
        # _sort_xml_attributes, there won't be any "funny tricks".
        # Worst case, the tag isn't present, and everything is fine.
        #
        # see: https://docs.microsoft.com/en-us/dotnet/framework/wpf/advanced/mc-ignorable-attribute
        with open(full_path, 'rb') as f:
            text = f.read()
            out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, count=1)
        with open(full_path, 'wb') as f:
            f.write(out)

        return True

    def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
        """
        Return a `tag -> content` dict for the `docProps/*.xml` members,
        and an empty dict for every other member.

        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.
        """
        if not file_path.startswith('docProps/') or not file_path.endswith('.xml'):
            return {}

        with open(full_path, encoding='utf-8') as f:
            try:
                results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I | re.M)
                return dict(results)
            except (TypeError, UnicodeDecodeError):
                # We didn't manage to parse the xml file
                return {file_path: 'harmful content', }


class LibreOfficeParser(ZipParser):
    """ Parser for the OpenDocument files listed in `mimetypes`. """
    mimetypes = {
        'application/vnd.oasis.opendocument.text',
        'application/vnd.oasis.opendocument.spreadsheet',
        'application/vnd.oasis.opendocument.presentation',
        'application/vnd.oasis.opendocument.graphics',
        'application/vnd.oasis.opendocument.chart',
        'application/vnd.oasis.opendocument.formula',
        'application/vnd.oasis.opendocument.image',
    }

    def __init__(self, filename):
        """ Set up the keep/omit lists of archive members for `filename`. """
        super().__init__(filename)

        self.files_to_keep = set(map(re.compile, {  # type: ignore
            r'^META-INF/manifest\.xml$',
            r'^content\.xml$',
            r'^manifest\.rdf$',
            r'^mimetype$',
            r'^settings\.xml$',
            r'^styles\.xml$',
        }))
        self.files_to_omit = set(map(re.compile, {  # type: ignore
            r'^meta\.xml$',
            r'^layout-cache$',
            r'^Configurations2/',
            r'^Thumbnails/',
        }))

    @staticmethod
    def __remove_revisions(full_path: str) -> bool:
        """ Remove every `text:tracked-changes` tag located below
        an `office:text` one.

        Returns False if the file can't be parsed, True otherwise.
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError as e:
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        # Without both prefixes, there are no revisions in the current
        # file, and looking for an undeclared prefix raises a SyntaxError.
        if 'office' not in namespace or 'text' not in namespace:
            return True

        # The elements are collected before editing the tree, since it's
        # tricky to modify the XML while iterating on it.
        for text in list(tree.getroot().iterfind('.//office:text', namespace)):
            # `.//` matches at any depth, so the changes have to be removed
            # from their actual parent, which isn't necessarily `text`.
            parent_map = {c: p for p in text.iter() for c in p}
            for changes in list(text.iterfind('.//text:tracked-changes', namespace)):
                parent_map[changes].remove(changes)

        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True

    def _specific_cleanup(self, full_path: str) -> bool:
        """ Clean the extracted archive member located at `full_path`.

        Only non-empty `.xml` files are processed: the revisions are removed
        from `content.xml`, then the file is passed to
        `_sort_xml_attributes`.
        Returns False if the file can't be parsed, True otherwise.
        """
        if os.stat(full_path).st_size == 0:  # Don't process empty files
            return True

        basename = os.path.basename(full_path)
        if not basename.endswith('.xml'):
            return True

        if basename == 'content.xml':
            if self.__remove_revisions(full_path) is False:
                return False

        try:
            _sort_xml_attributes(full_path)
        except ET.ParseError as e:
            logging.error("Unable to parse %s: %s", full_path, e)
            return False
        return True

    def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
        """
        Return a `tag -> content` dict for the `meta.xml` member,
        and an empty dict for every other member.

        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.
        """
        if file_path != 'meta.xml':
            return {}
        with open(full_path, encoding='utf-8') as f:
            try:
                results = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>", f.read(), re.I|re.M)
                return dict(results)
            except (TypeError, UnicodeDecodeError):
                # We didn't manage to parse the xml file
                return {file_path: 'harmful content', }