1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
|
# Copyright (c) 2022 Tulir Asokan
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from __future__ import annotations
from html.parser import HTMLParser
class HTMLNode(list):
    """A single element of a parsed HTML tree.

    The node is itself a list: its items are the child nodes, in the order
    they were appended.
    """

    # Tag name of the element.
    tag: str
    # Character data belonging to the element itself; starts out empty.
    text: str
    # Character data attached after the element; starts out empty.
    tail: str
    # Attribute name -> value mapping built from the ``attrs`` pairs.
    attrib: dict[str, str]

    def __init__(self, tag: str, attrs: list[tuple[str, str]]) -> None:
        """Create a childless node.

        :param tag: the tag name to store on the node
        :param attrs: ``(name, value)`` pairs that are turned into ``attrib``
        """
        super().__init__()
        self.attrib = dict(attrs)
        self.tag = tag
        self.text = self.tail = ""

    def __repr__(self) -> str:
        """Return a debug string with the tag, attributes, text, tail and children."""
        head = f"HTMLNode(tag='{self.tag}', attrs={self.attrib}, text='{self.text}', "
        rest = f"tail='{self.tail}', children={list(self)})"
        return head + rest
class NodeifyingParser(HTMLParser):
    """HTMLParser subclass that builds a tree of HTMLNode objects.

    ``stack`` holds the chain of currently open elements. ``stack[0]`` is a
    synthetic ``html`` root created by the constructor, and ``stack[-1]`` is
    the element that receives new children and character data.
    """

    # From https://www.w3.org/TR/html5/syntax.html#writing-html-documents-elements
    void_tags = (
        "area",
        "base",
        "br",
        "col",
        "command",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    )

    stack: list[HTMLNode]

    def __init__(self) -> None:
        """Initialise the parser with only the synthetic root on the stack."""
        super().__init__()
        self.stack = [HTMLNode("html", [])]

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str]]) -> None:
        """Append a new node to the current element and open it unless it is void."""
        node = HTMLNode(tag, attrs)
        self.stack[-1].append(node)
        # Void elements never get an end tag, so they must not become the
        # current element.
        if tag not in self.void_tags:
            self.stack.append(node)

    def handle_startendtag(self, tag: str, attrs: list[tuple[str, str]]) -> None:
        """Append a self-closing node (``<tag/>``) without opening it."""
        self.stack[-1].append(HTMLNode(tag, attrs))

    def handle_endtag(self, tag: str) -> None:
        """Close the current element if ``tag`` matches it.

        End tags that do not match the current element are ignored. The
        synthetic root is never closed: popping it (for example on a stray
        ``</html>``) would leave the stack empty and make every later
        callback, as well as ``stack[0]``, raise IndexError.
        """
        if len(self.stack) > 1 and tag == self.stack[-1].tag:
            self.stack.pop()

    def handle_data(self, data: str) -> None:
        """Attach character data to the tree.

        Data that follows a child goes to that child's ``tail``; data that
        arrives before any child goes to the current element's ``text``.
        """
        current = self.stack[-1]
        if current:
            # The current element already has children: the data follows the
            # most recent one.
            current[-1].tail += data
        else:
            current.text += data

    def error(self, message: str) -> None:
        """Ignore the reported message; this hook deliberately does nothing."""
        pass
def read_html(data: str) -> HTMLNode:
    """Parse an HTML string into a tree of nodes.

    :param data: the HTML markup to parse
    :return: the first entry of the parser's stack, i.e. the root node the
        parser created for this document
    """
    parser = NodeifyingParser()
    parser.feed(data)
    # feed() can hold back trailing input while it waits for more data, e.g.
    # text ending in an unterminated "&" such as "AT&T". close() tells the
    # parser the document is complete so that remainder is processed instead
    # of being silently dropped.
    parser.close()
    return parser.stack[0]
|