1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
|
include "../utils.pxi"
import re
def create_tag(tag: str):
    """
    Build a single, empty node for the HTML tag called ``tag``.

    For example, ``"div"`` is turned into the markup ``"<div></div>"``, which
    is parsed as a fragment; the ``root`` of that fragment parser is returned.

    Use `LexborHTMLParser().create_node(..)` instead when the node has to be
    tied to a specific parser instance.
    """
    # An opening and a closing tag with nothing in between.
    markup = f"<{tag}></{tag}>"
    fragment_parser = LexborHTMLParser(markup, is_fragment=True)
    return fragment_parser.root
def parse_fragment(html: str):
    """
    Parse ``html`` into a list of Nodes that correspond to the given markup.

    Unlike HTMLParser, which adds `<html>`, `<head>`, and `<body>` tags when
    they are missing, this function does not add these tags.
    """
    # Delegate to the shared fragment helper, using the Lexbor parser class.
    parser_cls = LexborHTMLParser
    return do_parse_fragment(html, parser_cls)
def extract_html_comment(text: str) -> str:
    """Return the body of an HTML comment, without its delimiters.

    Args:
        text: Raw HTML comment, including the ``<!--`` and ``-->`` markers.
            Whitespace before the opening marker and after the closing marker
            is tolerated.

    Returns:
        The text between the markers with surrounding whitespace stripped.

    Raises:
        ValueError: If ``text`` as a whole does not have the shape
            ``<!-- ... -->``.

    Examples:
        >>> extract_html_comment("<!-- hello -->")
        'hello'
    """
    # DOTALL lets the body span several lines; fullmatch requires the whole
    # input (apart from outer whitespace) to be the comment.
    found = re.fullmatch(r"\s*<!--\s*(.*?)\s*-->\s*", text, flags=re.DOTALL)
    if found is None:
        msg = "Input is not a valid HTML comment"
        raise ValueError(msg)

    body = found.group(1)
    return body.strip()
cdef inline bint is_empty_text_node(lxb_dom_node_t *text_node):
    """
    Tell whether a node is a text node holding nothing but HTML ASCII whitespace.

    Parameters
    ----------
    text_node : lxb_dom_node_t *
        Pointer to the node to inspect; may be ``NULL``.

    Returns
    -------
    bint
        ``False`` if ``text_node`` is ``NULL`` or is not of type
        ``LXB_DOM_NODE_TYPE_TEXT``. Otherwise the result of
        ``_is_whitespace_only`` applied to the node's character data
        (its ``data`` pointer and ``length``).
    """
    cdef lxb_dom_character_data_t *char_data
    cdef lexbor_str_t *payload

    # Guard clauses: only real text nodes are examined any further.
    if text_node == NULL:
        return False
    if text_node.type != LXB_DOM_NODE_TYPE_TEXT:
        return False

    # Reinterpret the node as character data to reach its string buffer.
    char_data = <lxb_dom_character_data_t *> text_node
    payload = &char_data.data
    return _is_whitespace_only(payload.data, payload.length)
cdef inline bint _is_whitespace_only(const lxb_char_t *buffer, size_t buffer_length) nogil:
    """
    Report whether every byte of a buffer is HTML ASCII whitespace.

    Parameters
    ----------
    buffer : const lxb_char_t *
        Pointer to the first byte to examine; may be ``NULL``.
    buffer_length : size_t
        Number of bytes readable from ``buffer``.

    Returns
    -------
    bint
        ``True`` when ``buffer`` is ``NULL``, when ``buffer_length`` is zero,
        or when all bytes are one of space (0x20), tab (0x09), line feed
        (0x0A), form feed (0x0C), or carriage return (0x0D). ``False`` as soon
        as any other byte is seen.

    Notes
    -----
    Follows the same character set as Lexbor's ``lexbor_utils_whitespace``
    macro. The function is declared ``nogil`` and uses only C-level
    operations.
    """
    cdef size_t offset
    cdef lxb_char_t ch

    # A missing or empty buffer counts as whitespace-only.
    if buffer == NULL or buffer_length == 0:
        return True

    for offset in range(buffer_length):
        ch = buffer[offset]
        # Stop at the first byte outside the whitespace set.
        if not (ch == ' ' or ch == '\t' or ch == '\n'
                or ch == '\f' or ch == '\r'):
            return False

    return True
|