1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
|
cimport cython
from cpython.exc cimport PyErr_SetObject
from cpython.list cimport PyList_GET_SIZE
@cython.final
cdef class LexborCSSSelector:
    """Run CSS selector queries against lexbor DOM nodes.

    The instance owns three lexbor objects (``parser``, ``css_selectors`` and
    ``selectors``) that are created in ``__init__`` and destroyed in
    ``__dealloc__``.  ``results`` and ``current_node`` are scratch state that
    the module-level callbacks read and write while a query is running.
    """

    def __init__(self):
        self._create_css_parser()
        self.results = []
        self.current_node = None

    cdef int _create_css_parser(self) except -1:
        """Create and initialise the lexbor parser and selector objects.

        Returns 0 on success.  When any lexbor ``*_init`` call reports a
        status other than ``LXB_STATUS_OK`` a ``SelectolaxError`` is set and
        -1 is returned.
        """
        cdef lxb_status_t status

        self.parser = lxb_css_parser_create()
        status = lxb_css_parser_init(self.parser, NULL)
        if status != LXB_STATUS_OK:
            PyErr_SetObject(SelectolaxError, "Can't initialize CSS parser.")
            return -1

        self.css_selectors = lxb_css_selectors_create()
        status = lxb_css_selectors_init(self.css_selectors)
        if status != LXB_STATUS_OK:
            PyErr_SetObject(SelectolaxError, "Can't initialize CSS selector.")
            return -1

        lxb_css_parser_selectors_set(self.parser, self.css_selectors)

        self.selectors = lxb_selectors_create()
        status = lxb_selectors_init(self.selectors)
        if status != LXB_STATUS_OK:
            PyErr_SetObject(SelectolaxError, "Can't initialize CSS selector.")
            return -1

        # Configure the engine only after init has been confirmed to succeed,
        # so a failed create/init is never followed by a write to the object.
        lxb_selectors_opt_set(self.selectors, LXB_SELECTORS_OPT_MATCH_ROOT)
        return 0

    cpdef list find(self, str query, LexborNode node):
        """Return a list of all nodes reported as matching ``query`` for ``node``."""
        return self._find(query, node, 0)

    cpdef list find_first(self, str query, LexborNode node):
        """Return a list with the first reported match for ``query``.

        The list is empty when nothing matched.
        """
        return self._find(query, node, 1)

    cpdef list _find(self, str query, LexborNode node, bint only_first):
        """Parse ``query`` and collect the matching nodes into a new list.

        Parameters
        ----------
        query : str
            CSS selector text.
        node : LexborNode
            Node the search is started from.
        only_first : bint
            When true, the callback that returns ``LXB_STATUS_STOP`` after
            recording one node is used.

        Raises
        ------
        TypeError
            If ``query`` is not a ``str`` (for example ``None``).
        SelectolaxError
            If lexbor cannot parse the selector.
        """
        cdef lxb_css_selector_list_t * selectors_list
        cdef lxb_status_t status

        if not isinstance(query, str):
            raise TypeError("Query must be a string.")

        bytes_query = query.encode(_ENCODING)
        selectors_list = lxb_css_selectors_parse(
            self.parser, <lxb_char_t *> bytes_query, <size_t> len(bytes_query)
        )
        if selectors_list == NULL:
            raise SelectolaxError("Can't parse CSS selector.")

        # The callbacks read ``current_node`` and append to ``results``.
        self.current_node = node
        self.results = []
        try:
            # NOTE(review): unlike ``any_matches`` the returned status is not
            # checked here; whatever the callbacks collected is returned.
            if only_first:
                status = lxb_selectors_find(
                    self.selectors, node.node, selectors_list,
                    <lxb_selectors_cb_f> css_finder_callback_first, <void *> self
                )
            else:
                status = lxb_selectors_find(
                    self.selectors, node.node, selectors_list,
                    <lxb_selectors_cb_f> css_finder_callback, <void *> self
                )
            return list(self.results)
        finally:
            # Always drop the scratch state and free the parsed selector list,
            # even if building the result list fails.
            self.results = []
            self.current_node = None
            lxb_css_selector_list_destroy_memory(selectors_list)

    cpdef int any_matches(self, str query, LexborNode node) except -1:
        """Return 1 if ``query`` matches at least once for ``node``, else 0.

        Raises
        ------
        TypeError
            If ``query`` is not a ``str`` (for example ``None``).
        SelectolaxError
            If lexbor cannot parse the selector, or if the search returns a
            status other than ``LXB_STATUS_OK``.
        """
        cdef lxb_css_selector_list_t * selectors_list
        cdef lxb_status_t status
        cdef int result

        if not isinstance(query, str):
            raise TypeError("Query must be a string.")

        bytes_query = query.encode(_ENCODING)
        # The length handed to lexbor must be the size of the encoded buffer;
        # the character count of ``query`` can be smaller than that.
        selectors_list = lxb_css_selectors_parse(
            self.parser, <lxb_char_t *> bytes_query, <size_t> len(bytes_query)
        )
        if selectors_list == NULL:
            PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
            return -1

        self.results = []
        status = lxb_selectors_find(
            self.selectors, node.node, selectors_list,
            <lxb_selectors_cb_f> css_matcher_callback, <void *> self
        )
        # The matcher callback appends one entry on the first match.
        result = PyList_GET_SIZE(self.results) > 0

        # Clean up on both the success and the failure path.
        self.results = []
        lxb_css_selector_list_destroy_memory(selectors_list)

        if status != LXB_STATUS_OK:
            PyErr_SetObject(SelectolaxError, "Can't parse CSS selector.")
            return -1
        return result

    def __dealloc__(self):
        # Each pointer is checked because object creation may not have
        # completed for all three members.
        if self.selectors != NULL:
            lxb_selectors_destroy(self.selectors, True)
        if self.parser != NULL:
            lxb_css_parser_destroy(self.parser, True)
        if self.css_selectors != NULL:
            lxb_css_selectors_destroy(self.css_selectors, True)
cdef class LexborSelector:
    """An advanced CSS selector that supports additional operations.

    Think of it as a toolkit that mimics some of the features of XPath.

    Please note, this is an experimental feature that can change in the future.
    """
    cdef LexborNode node
    cdef list nodes

    def __init__(self, LexborNode node, query):
        self.node = node
        # An empty/None query keeps the starting node itself as the only match.
        self.nodes = self.node.parser.selector.find(query, self.node) if query else [node, ]

    cpdef css(self, str query):
        """Evaluate CSS selector against current scope."""
        raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")

    @property
    def matches(self) -> list:
        """Returns all possible matches"""
        return self.nodes

    @property
    def any_matches(self) -> bool:
        """Returns True if there are any matches"""
        return bool(self.nodes)

    def text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> LexborSelector:
        """Filter all current matches given text.

        Keeps only the nodes whose ``text(deep=..., separator=..., strip=...)``
        is non-empty and contains ``text``.  The selector is modified in place
        and returned so calls can be chained.
        """
        cdef list nodes = []
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> bool:
        """Returns True if any node in the current search scope contains specified text"""
        cdef LexborNode node
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                return True
        return False

    def attribute_longer_than(self, str attribute, int length, str start = None) -> LexborSelector:
        """Filter all current matches by attribute length.

        Similar to `string-length` in XPath.

        Nodes whose ``attribute`` is missing or empty are dropped.  When
        ``start`` is given and occurs in the value, only the part after its
        first occurrence is measured.  The selector is modified in place and
        returned so calls can be chained.
        """
        cdef list nodes = []
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            if not attr:
                continue
            if start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_attribute_longer_than(self, str attribute, int length, str start = None) -> bool:
        """Returns True if any node has the given attribute longer than a specified length.

        Similar to `string-length` in XPath.

        Nodes whose ``attribute`` is missing or empty are skipped, matching
        ``attribute_longer_than``.  When ``start`` is given and occurs in the
        value, only the part after its first occurrence is measured.
        """
        cdef LexborNode node
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            # A missing attribute (or one without a value) yields None; skip
            # it instead of calling len() on it.
            if not attr:
                continue
            if start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                return True
        return False

    def __bool__(self):
        return bool(self.nodes)
cdef lxb_status_t css_finder_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
    """Callback used for ``find``: record ``node`` and let the search continue.

    ``ctx`` is the ``LexborCSSSelector`` that started the search; the matched
    node is wrapped in a ``LexborNode`` bound to the parser of that selector's
    ``current_node`` and appended to its ``results`` list.
    """
    cdef LexborCSSSelector selector = <LexborCSSSelector> ctx
    cdef LexborNode wrapped = LexborNode.new(node, selector.current_node.parser)
    selector.results.append(wrapped)
    return LXB_STATUS_OK
cdef lxb_status_t css_finder_callback_first(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
    """Callback used for ``find_first``: record ``node`` and return ``LXB_STATUS_STOP``.

    ``ctx`` is the ``LexborCSSSelector`` that started the search; the matched
    node is wrapped in a ``LexborNode`` bound to the parser of that selector's
    ``current_node`` and appended to its ``results`` list.
    """
    cdef LexborCSSSelector selector = <LexborCSSSelector> ctx
    cdef LexborNode wrapped = LexborNode.new(node, selector.current_node.parser)
    selector.results.append(wrapped)
    return LXB_STATUS_STOP
cdef lxb_status_t css_matcher_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
    """Callback used for ``any_matches``: flag that a match occurred.

    ``ctx`` is the ``LexborCSSSelector`` that started the search.  The node
    itself is not needed, so only a ``True`` marker is appended to the
    selector's ``results`` list before ``LXB_STATUS_STOP`` is returned.
    """
    cdef LexborCSSSelector cls = <LexborCSSSelector> ctx
    cls.results.append(True)
    return LXB_STATUS_STOP
|