File: html_reader.py

package info (click to toggle)
mautrix-python 0.20.7-1
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 1,812 kB
  • sloc: python: 19,103; makefile: 16
file content (83 lines) | stat: -rw-r--r-- 2,119 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Copyright (c) 2022 Tulir Asokan
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from __future__ import annotations

from html.parser import HTMLParser


class HTMLNode(list):
    """A single element of a parsed HTML tree.

    The node is itself a list: its items are the child nodes, in the order
    they were appended.
    """

    # Element name as passed to the constructor.
    tag: str
    # Character data stored directly on the node; starts out empty.
    text: str
    # Character data stored as trailing text of the node; starts out empty.
    tail: str
    # Attribute mapping built from the ``attrs`` pairs.
    attrib: dict[str, str]

    def __init__(self, tag: str, attrs: list[tuple[str, str]]) -> None:
        """Create a childless node.

        Args:
            tag: Element name.
            attrs: ``(name, value)`` pairs; converted to a dict, so a later
                pair with the same name overrides an earlier one.
        """
        super().__init__()
        self.tag, self.text, self.tail = tag, "", ""
        self.attrib = dict(attrs)

    def __repr__(self) -> str:
        """Return a debug representation that recursively includes the children."""
        children = list(self)
        head = f"HTMLNode(tag='{self.tag}', attrs={self.attrib}, text='{self.text}', "
        return head + f"tail='{self.tail}', children={children})"


class NodeifyingParser(HTMLParser):
    """HTML parser that assembles the parsed markup into a tree of ``HTMLNode`` objects.

    ``stack`` is the chain of currently open elements. ``stack[0]`` is a
    synthetic ``html`` root created by the constructor and is never removed;
    every parsed element is appended to the node on top of the stack, and
    non-void elements are then pushed so their content nests inside them.
    """

    # From https://www.w3.org/TR/html5/syntax.html#writing-html-documents-elements
    void_tags = (
        "area",
        "base",
        "br",
        "col",
        "command",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    )

    def __init__(self) -> None:
        """Initialise the parser with a stack containing only the synthetic root."""
        super().__init__()
        self.stack: list[HTMLNode] = [HTMLNode("html", [])]

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str]]) -> None:
        """Attach a new node to the innermost open element and open it unless it is void.

        Args:
            tag: Element name reported by the parser.
            attrs: ``(name, value)`` attribute pairs reported by the parser.
        """
        node = HTMLNode(tag, attrs)
        self.stack[-1].append(node)
        # Void elements cannot have content, so they are never pushed.
        if tag not in self.void_tags:
            self.stack.append(node)

    def handle_startendtag(self, tag: str, attrs: list[tuple[str, str]]) -> None:
        """Attach a self-closing element (``<tag/>``) without opening it."""
        self.stack[-1].append(HTMLNode(tag, attrs))

    def handle_endtag(self, tag: str) -> None:
        """Close the innermost open element if ``tag`` matches it.

        End tags that do not match the innermost open element are ignored.
        The synthetic root at ``stack[0]`` is never popped: a stray ``</html>``
        would otherwise empty the stack and make every later ``stack[-1]`` /
        ``stack[0]`` access raise ``IndexError``.
        """
        if len(self.stack) > 1 and tag == self.stack[-1].tag:
            self.stack.pop()

    def handle_data(self, data: str) -> None:
        """Store character data on the tree.

        Data arriving after a child has been added goes to that last child's
        ``tail``; data arriving before any child goes to the open element's
        ``text``.
        """
        current = self.stack[-1]
        if current:
            current[-1].tail += data
        else:
            current.text += data

    def error(self, message: str) -> None:
        """Do nothing; any error reported through this hook is ignored.

        NOTE(review): presumably kept for Python versions whose parser base
        class still calls ``error()`` -- confirm before removing.
        """
        pass


def read_html(data: str) -> HTMLNode:
    """Parse an HTML string into a tree of ``HTMLNode`` objects.

    Args:
        data: The complete HTML markup to parse.

    Returns:
        The parser's synthetic root node (``stack[0]``), whose children are
        the top-level elements found in ``data``.
    """
    parser = NodeifyingParser()
    parser.feed(data)
    # feed() may hold back trailing input it considers incomplete (for example
    # text ending in an unterminated "&" reference such as "AT&T"); close()
    # forces that buffered remainder to be processed so it is not lost.
    parser.close()
    return parser.stack[0]