Package: pikepdf / 1.17.3+dfsg-5

Fix-XXE-vulnerability-in-XMP-metadata-parsing.patch Patch series | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
From: "James R. Barlow" <james@purplerock.ca>
Date: Sat, 27 Mar 2021 00:43:21 -0700
Subject: Fix XXE vulnerability in XMP metadata parsing

For details:
https://portswigger.net/web-security/xxe

Reported by: Eric Therond (eric.therond@sonarsource.com) of Sonarsource (https://www.sonarsource.com/)

(cherry picked from commit 3f38f73218e5e782fe411ccbb3b44a793c0b343a)
---
 src/pikepdf/_xml.py            | 30 ++++++++++++++++++++++++++++++
 src/pikepdf/models/metadata.py | 10 +++++-----
 tests/test_metadata.py         | 24 ++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 5 deletions(-)
 create mode 100644 src/pikepdf/_xml.py

diff --git a/src/pikepdf/_xml.py b/src/pikepdf/_xml.py
new file mode 100644
index 0000000..f0e1c38
--- /dev/null
+++ b/src/pikepdf/_xml.py
@@ -0,0 +1,30 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Copyright (C) 2021, James R. Barlow (https://github.com/jbarlow83/)
+
+
+from typing import IO, Any, AnyStr, Union
+
+from lxml.etree import XMLParser as _UnsafeXMLParser
+from lxml.etree import parse as _parse
+
+
+class _XMLParser(_UnsafeXMLParser):
+    def __init__(self, *args, **kwargs):
+        # Prevent XXE attacks
+        # https://rules.sonarsource.com/python/type/Vulnerability/RSPEC-2755
+        kwargs['resolve_entities'] = False
+        kwargs['no_network'] = True
+        super().__init__(*args, **kwargs)
+
+
+def parse_xml(source: Union[AnyStr, IO[Any]], recover: bool = False):
+    """Wrapper around lxml's parse to provide protection against XXE attacks."""
+
+    parser = _XMLParser(recover=recover, remove_pis=False)
+    return _parse(source, parser=parser)
+
+
+__all__ = ['parse_xml']
diff --git a/src/pikepdf/models/metadata.py b/src/pikepdf/models/metadata.py
index e34c2e4..ccd9da7 100644
--- a/src/pikepdf/models/metadata.py
+++ b/src/pikepdf/models/metadata.py
@@ -15,10 +15,11 @@ from io import BytesIO
 from warnings import warn
 
 from lxml import etree
-from lxml.etree import QName, XMLParser, XMLSyntaxError, parse
+from lxml.etree import QName, XMLSyntaxError
 
 from .. import Name, Stream, String
 from .. import __version__ as pikepdf_version
+from .._xml import parse_xml
 
 XMP_NS_DC = "http://purl.org/dc/elements/1.1/"
 XMP_NS_PDF = "http://ns.adobe.com/pdf/1.3/"
@@ -334,14 +335,13 @@ class PdfMetadata(MutableMapping):
             data = XMP_EMPTY  # on some platforms lxml chokes on empty documents
 
         def basic_parser(xml):
-            return parse(BytesIO(xml))
+            return parse_xml(BytesIO(xml))
 
         def strip_illegal_bytes_parser(xml):
-            return parse(BytesIO(re_xml_illegal_bytes.sub(b'', xml)))
+            return parse_xml(BytesIO(re_xml_illegal_bytes.sub(b'', xml)))
 
         def recovery_parser(xml):
-            parser = XMLParser(recover=True)
-            return parse(BytesIO(xml), parser)
+            return parse_xml(BytesIO(xml), recover=True)
 
         def replace_with_empty_xmp(_xml=None):
             log.warning("Error occurred parsing XMP, replacing with empty XMP.")
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index 120b1a9..e9d96a0 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -587,3 +587,27 @@ def test_issue_100(trivial):
         UserWarning, match="no XMP equivalent"
     ):
         m.load_from_docinfo({'/AAPL:Example': pikepdf.Array([42])})
+
+
+def test_xxe(trivial, outdir):
+    secret = outdir / 'secret.txt'
+    secret.write_text("This is a secret")
+    trivial.Root.Metadata = Stream(
+        trivial,
+        b"""\
+<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
+<!DOCTYPE rdf:RDF [<!ENTITY xxe SYSTEM "file://%s">]>
+<x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='Image'>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+<note>
+<to>&xxe;</to>
+<from>xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx</from>
+</note>
+</rdf:RDF>
+</x:xmpmeta>
+<?xpacket end='w'?>
+    """
+        % os.fsencode(secret),
+    )
+    with trivial.open_metadata() as m:
+        assert 'This is a secret' not in str(m)