1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
|
from typing import Literal, Optional, Union, Type
# Largest input, in bytes, that preprocess_input() accepts; a longer input
# makes it raise ValueError. Note the literal is a float (equal to 2.5e9),
# so the size check compares an int length against a float.
MAX_HTML_INPUT_SIZE = 250e+7
# Either parser *class* (the class object itself, not an instance). The names
# are given as string forward references.
ParserCls = Union[Type["HTMLParser"], Type["LexborHTMLParser"]]
# An *instance* of either parser.
Parser = Union["HTMLParser", "LexborHTMLParser"]
# Labels returned by get_fragment_type(), chosen by which of the <html>,
# <head> and <body> tags are spelled out in the input text:
#   "document"                 - html, head and body all present
#   "fragment"                 - none of the three present
#   "head"                     - head only
#   "body"                     - body only
#   "head_and_body"            - head and body, but no html
#   "document_no_head"         - html and body, but no head
#   "document_no_body"         - html and head, but no body
#   "document_no_head_no_body" - html only
FRAGMENT = Literal[
"document",
"fragment",
"head",
"body",
"head_and_body",
"document_no_head",
"document_no_body",
"document_no_head_no_body",
]
def preprocess_input(html, decode_errors='ignore'):
    """Normalize HTML input to UTF-8 bytes and enforce the size limit.

    Args:
        html: The markup, either as ``str`` (encoded to UTF-8 here) or as
            ``bytes`` (passed through unchanged).
        decode_errors: Error handler handed to ``str.encode`` when ``html``
            is text. Unused for ``bytes`` input.

    Returns:
        A ``(bytes_html, html_len)`` tuple: the byte string and its length
        in bytes.

    Raises:
        TypeError: If ``html`` is neither ``str`` nor ``bytes``.
        ValueError: If the byte length exceeds ``MAX_HTML_INPUT_SIZE``.
    """
    # Only `str` is checked: `unicode` is not a builtin on Python 3, so
    # naming it in the isinstance tuple raised NameError on every call.
    # On Python 3 `str` already covers all text input.
    if isinstance(html, str):
        bytes_html = html.encode('UTF-8', errors=decode_errors)
    elif isinstance(html, bytes):
        bytes_html = html
    else:
        raise TypeError("Expected a string, but %s found" % type(html).__name__)

    # The limit applies to the encoded size, not the number of characters.
    html_len = len(bytes_html)
    if html_len > MAX_HTML_INPUT_SIZE:
        raise ValueError("The specified HTML input is too large to be processed (%d bytes)" % html_len)
    return bytes_html, html_len
def do_create_tag(tag: str, parser_cls: ParserCls):
    """Create one empty ``<tag></tag>`` element and return its node.

    The element is produced by running ``do_parse_fragment`` on the
    generated markup and taking the first node of the result.

    Raises:
        ValueError: If ``tag`` is empty (or otherwise falsy).
    """
    if tag:
        markup = f"<{tag}></{tag}>"
        nodes = do_parse_fragment(markup, parser_cls)
        return nodes[0]
    raise ValueError("Tag name cannot be empty")
def get_fragment_type(
    html: str,
    parser_cls: ParserCls,
    tree: Optional[Parser] = None,
) -> FRAGMENT:
    """Classify ``html`` by which of ``<html>``, ``<head>``, ``<body>`` it contains.

    The decision is purely textual: the input is scanned for the opening
    of each of the three tags, in any letter case, and the combination
    found selects the label.

    Args:
        html: The markup to inspect.
        parser_cls: Not consulted; kept so existing callers keep working.
        tree: Not consulted; kept so existing callers keep working.

    Returns:
        One of the ``FRAGMENT`` labels: ``"document"`` when all three tags
        are present, ``"fragment"`` when none is, and the matching
        partial label otherwise.
    """
    # NOTE: earlier code parsed `html` with `parser_cls` when no tree was
    # given, but the resulting tree was never read, so that parse is gone.
    import re

    # The look-ahead keeps `<header` from being mistaken for `<head`.
    tag_re = re.compile(r"<html|<body|<head(?!er)", re.IGNORECASE)

    seen = set()
    for match in tag_re.finditer(html):
        # The pattern is case-insensitive, so fold each hit before recording
        # it; comparing the raw text with "<html" used to miss "<HTML>".
        seen.add(match[0].lower())
        if len(seen) == 3:
            # All three tags found; nothing further can change the answer.
            break

    has_html = "<html" in seen
    has_head = "<head" in seen
    has_body = "<body" in seen

    if has_html:
        if has_head:
            return "document" if has_body else "document_no_body"
        return "document_no_head" if has_body else "document_no_head_no_body"
    if has_head:
        return "head_and_body" if has_body else "head"
    return "body" if has_body else "fragment"
def do_parse_fragment(html: str, parser_cls: ParserCls):
    """
    Parse HTML into a list of nodes that mirrors the markup as written.

    A plain HTMLParser fills in `<html>`, `<head>` and `<body>` when the
    input lacks them. This function returns only nodes for what the input
    actually contained, without those added tags.
    """
    markup = html.strip()
    tree = parser_cls(markup)
    kind = get_fragment_type(markup, parser_cls, tree)

    # Step 1: prune the sections the input did not spell out. The head is
    # always handled before the body. `decompose` is relied on to discard
    # the node - NOTE(review): confirm against the parser API.
    if kind in ("document_no_head", "document_no_head_no_body", "body"):
        tree.head.decompose(recursive=True)
    if kind in ("document_no_body", "document_no_head_no_body", "head"):
        tree.body.decompose(recursive=True)

    # Step 2: choose what to hand back for this kind of input.
    document_kinds = (
        "document",
        "document_no_head",
        "document_no_body",
        "document_no_head_no_body",
    )
    if kind in document_kinds:
        return [tree.root]
    if kind == "head":
        return [tree.head]
    if kind == "body":
        return [tree.body]
    if kind == "head_and_body":
        return [tree.head, tree.body]

    # Plain fragment: collect what iterating head, then body, yields
    # (text nodes included).
    nodes = list(tree.head.iter(include_text=True))
    nodes.extend(tree.body.iter(include_text=True))
    return nodes
|