1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
|
from html import parser, escape
from typing import Any, Optional, Dict, List, Tuple, Set
import re
import string
from . import abstract
# pylint: disable=too-many-instance-attributes
class CSSParser(abstract.AbstractParser):
"""There is no such things as metadata in CSS files,
only comments of the form `/* … */`, so we're removing the laters."""
mimetypes = {'text/css', }
flags = re.MULTILINE | re.DOTALL
def remove_all(self) -> bool:
with open(self.filename, encoding='utf-8') as f:
try:
content = f.read()
except UnicodeDecodeError: # pragma: no cover
raise ValueError
cleaned = re.sub(r'/\*.*?\*/', '', content, count=0, flags=self.flags)
with open(self.output_filename, 'w', encoding='utf-8') as f:
f.write(cleaned)
return True
def get_meta(self) -> Dict[str, Any]:
metadata = {}
with open(self.filename, encoding='utf-8') as f:
try:
content = f.read()
except UnicodeDecodeError: # pragma: no cover
raise ValueError
cssdoc = re.findall(r'/\*(.*?)\*/', content, self.flags)
for match in cssdoc:
for line in match.splitlines():
try:
k, v = line.split(':')
metadata[k.strip(string.whitespace + '*')] = v.strip()
except ValueError:
metadata['harmful data'] = line.strip()
return metadata
class AbstractHTMLParser(abstract.AbstractParser):
tags_blocklist: Set[str] = set()
# In some html/xml-based formats some tags are mandatory,
# so we're keeping them, but are discarding their content
tags_required_blocklist: Set[str] = set()
def __init__(self, filename):
super().__init__(filename)
self.__parser = _HTMLParser(self.filename, self.tags_blocklist,
self.tags_required_blocklist)
with open(filename, encoding='utf-8') as f:
self.__parser.feed(f.read())
self.__parser.close()
def get_meta(self) -> Dict[str, Any]:
return self.__parser.get_meta()
def remove_all(self) -> bool:
return self.__parser.remove_all(self.output_filename)
class HTMLParser(AbstractHTMLParser):
mimetypes = {'text/html', 'application/xhtml+xml'}
tags_blocklist = {'meta', }
tags_required_blocklist = {'title', }
class DTBNCXParser(AbstractHTMLParser):
mimetypes = {'application/x-dtbncx+xml', }
tags_required_blocklist = {'title', 'doctitle', 'meta'}
class _HTMLParser(parser.HTMLParser):
"""Python doesn't have a validating html parser in its stdlib, so
we're using an internal queue to track all the opening/closing tags,
and hoping for the best.
Moreover, the parser.HTMLParser call doesn't provide a get_endtag_text
method, so we have to use get_starttag_text instead, put its result in a
LIFO, and transform it in a closing tag when needed.
Also, gotcha: the `tag` parameters are always in lowercase.
"""
def __init__(self, filename, blocklisted_tags, required_blocklisted_tags):
super().__init__()
self.filename = filename
self.__textrepr = ''
self.__meta = {}
self.__validation_queue: List[str] = list()
# We're using counters instead of booleans, to handle nested tags
self.__in_dangerous_but_required_tag = 0
self.__in_dangerous_tag = 0
if required_blocklisted_tags & blocklisted_tags: # pragma: nocover
raise ValueError("There is an overlap between %s and %s" % (
required_blocklisted_tags, blocklisted_tags))
self.tag_required_blocklist = required_blocklisted_tags
self.tag_blocklist = blocklisted_tags
def error(self, message): # pragma: no cover
""" Amusingly, Python's documentation doesn't mention that this
function needs to be implemented in subclasses of the parent class
of parser.HTMLParser. This was found by fuzzing,
triggering the following exception:
NotImplementedError: subclasses of ParserBase must override error()
"""
raise ValueError(message)
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
# Ignore the type, because mypy is too stupid to infer
# that get_starttag_text() can't return None.
original_tag = self.get_starttag_text() # type: ignore
self.__validation_queue.append(original_tag) # type: ignore
if tag in self.tag_blocklist:
self.__in_dangerous_tag += 1
if self.__in_dangerous_tag == 0:
if self.__in_dangerous_but_required_tag == 0:
self.__textrepr += original_tag
if tag in self.tag_required_blocklist:
self.__in_dangerous_but_required_tag += 1
def handle_endtag(self, tag: str):
if not self.__validation_queue:
raise ValueError("The closing tag %s doesn't have a corresponding "
"opening one in %s." % (tag, self.filename))
previous_tag = self.__validation_queue.pop()
previous_tag = previous_tag[1:-1] # remove < and >
previous_tag = previous_tag.split(' ')[0] # remove attributes
if tag != previous_tag.lower():
raise ValueError("The closing tag %s doesn't match the previous "
"tag %s in %s" %
(tag, previous_tag, self.filename))
if tag in self.tag_required_blocklist:
self.__in_dangerous_but_required_tag -= 1
if self.__in_dangerous_tag == 0:
if self.__in_dangerous_but_required_tag == 0:
# There is no `get_endtag_text()` method :/
self.__textrepr += '</' + previous_tag + '>'
if tag in self.tag_blocklist:
self.__in_dangerous_tag -= 1
def handle_data(self, data: str):
if self.__in_dangerous_but_required_tag == 0:
if self.__in_dangerous_tag == 0:
if data.strip():
self.__textrepr += escape(data)
def handle_startendtag(self, tag: str,
attrs: List[Tuple[str, Optional[str]]]):
if tag in self.tag_required_blocklist | self.tag_blocklist:
meta = {k:v for k, v in attrs}
name = meta.get('name', 'harmful metadata')
content = meta.get('content', 'harmful data')
self.__meta[name] = content
if self.__in_dangerous_tag == 0:
if tag in self.tag_required_blocklist:
self.__textrepr += '<' + tag + ' />'
return
if self.__in_dangerous_tag == 0:
if self.__in_dangerous_but_required_tag == 0:
self.__textrepr += self.get_starttag_text()
def remove_all(self, output_filename: str) -> bool:
if self.__validation_queue:
raise ValueError("Some tags (%s) were left unclosed in %s" % (
', '.join(self.__validation_queue),
self.filename))
with open(output_filename, 'w', encoding='utf-8') as f:
f.write(self.__textrepr)
return True
def get_meta(self) -> Dict[str, Any]:
if self.__validation_queue:
raise ValueError("Some tags (%s) were left unclosed in %s" % (
', '.join(self.__validation_queue),
self.filename))
return self.__meta
|