1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
|
cimport cython
from cpython.exc cimport PyErr_SetObject
@cython.final
cdef class CSSSelector:
    """A CSS selector parsed with MyCSS and evaluated through the Modest finder.

    The selector text is parsed once in ``__init__``; ``find`` can then be
    called to run it against a MyHTML node. All C resources created during
    initialisation are released in ``__dealloc__``.
    """

    # Pointer into ``selector_bytes``; only valid while that object is alive.
    cdef char *c_selector
    cdef mycss_entry_t *css_entry
    cdef modest_finder_t *finder
    cdef mycss_selectors_list_t *selectors_list
    # Owns the UTF-8 encoded selector so ``c_selector`` does not dangle once
    # ``__init__`` returns.
    cdef bytes selector_bytes

    def __init__(self, str selector):
        """Encode ``selector`` as UTF-8, parse it and create the finder.

        Raises:
            RuntimeError: if the MyCSS parser objects cannot be initialised.
            ValueError: if the selector cannot be parsed.
        """
        cdef bytes encoded = selector.encode('UTF-8')
        # Keep a reference on the instance: the char* below points into this
        # bytes object's buffer.
        self.selector_bytes = encoded
        self.c_selector = encoded

        # In order to propagate errors these methods should return no value
        self._create_css_parser()
        self._prepare_selector(self.css_entry, self.c_selector, len(self.c_selector))
        self.finder = modest_finder_create_simple()

    cdef myhtml_collection_t* find(self, myhtml_tree_node_t* scope):
        """Find all possible matches.

        Returns the collection pointer filled in by the finder, or NULL when
        the finder did not set one. Callers in this file release the result
        with ``myhtml_collection_destroy``.
        """
        cdef myhtml_collection_t *collection = NULL

        modest_finder_by_selectors_list(self.finder, scope, self.selectors_list, &collection)
        return collection

    cdef int _create_css_parser(self) except -1:
        """Create and initialise the MyCSS object and its entry.

        On success ``self.css_entry`` is set. On failure everything allocated
        here is released again, ``self.css_entry`` is left NULL and a
        ``RuntimeError`` is set.
        """
        cdef mystatus_t status
        cdef mycss_t *mycss = mycss_create()

        if mycss == NULL:
            PyErr_SetObject(RuntimeError, "Can't init MyCSS object.")
            return -1

        status = mycss_init(mycss)
        if status != 0:
            # Nothing else references ``mycss`` yet, so free it here.
            mycss_destroy(mycss, 1)
            PyErr_SetObject(RuntimeError, "Can't init MyCSS object.")
            return -1

        self.css_entry = mycss_entry_create()
        if self.css_entry == NULL:
            mycss_destroy(mycss, 1)
            PyErr_SetObject(RuntimeError, "Can't init MyCSS Entry object.")
            return -1

        status = mycss_entry_init(mycss, self.css_entry)
        if status != 0:
            # Release both objects explicitly and clear the attribute so that
            # __dealloc__ does not touch a half-initialised entry.
            mycss_entry_destroy(self.css_entry, 1)
            self.css_entry = NULL
            mycss_destroy(mycss, 1)
            PyErr_SetObject(RuntimeError, "Can't init MyCSS Entry object.")
            return -1
        return 0

    cdef int _prepare_selector(self, mycss_entry_t *css_entry, const char *selector, size_t selector_size) except -1:
        """Parse ``selector`` into ``self.selectors_list``.

        Sets ``ValueError`` and returns -1 when parsing yields no list or the
        list carries the "bad selector" flag.
        """
        cdef mystatus_t out_status

        self.selectors_list = mycss_selectors_parse(
            mycss_entry_selectors(css_entry),
            myencoding_t.MyENCODING_UTF_8,
            selector,
            selector_size,
            &out_status,
        )

        # Bitwise test of the flag; a logical ``and`` would treat any non-zero
        # flags value as "bad".
        if (self.selectors_list == NULL) or (self.selectors_list.flags & MyCSS_SELECTORS_FLAGS_SELECTOR_BAD):
            PyErr_SetObject(ValueError, "Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
            return -1
        return 0

    def __dealloc__(self):
        # This also runs when __init__ raised part-way through (or never ran),
        # so every pointer may still be NULL and must be checked before use.
        cdef mycss_t *mycss = NULL

        if self.css_entry != NULL:
            # Read the owner before the entry itself is destroyed.
            mycss = self.css_entry.mycss
            if self.selectors_list != NULL:
                mycss_selectors_list_destroy(mycss_entry_selectors(self.css_entry), self.selectors_list, 1)

        if self.finder != NULL:
            modest_finder_destroy(self.finder, 1)

        if self.css_entry != NULL:
            mycss_entry_destroy(self.css_entry, 1)
        if mycss != NULL:
            mycss_destroy(mycss, 1)
cdef class Selector:
    """An advanced CSS selector that supports additional operations.

    Think of it as a toolkit that mimics some of the features of XPath.

    Filtering methods (``css``, ``text_contains``, ``attribute_longer_than``)
    narrow the current match list in place and return ``self`` so calls can be
    chained; the ``any_*`` methods only test the current matches.

    Please note, this is an experimental feature that can change in the future.
    """
    cdef Node node
    cdef list nodes

    def __init__(self, Node node, str query):
        """custom init, because __cinit__ doesn't accept C types"""
        self.node = node
        # An empty/None query means "start from the node itself".
        self.nodes = find_nodes(node.parser, node.node, query) if query else [node, ]

    cpdef css(self, str query):
        """Evaluate CSS selector against current scope."""
        cdef Node current_node
        nodes = list()
        for node in self.nodes:
            current_node = node
            nodes.extend(find_nodes(self.node.parser, current_node.node, query))
        self.nodes = nodes
        return self

    @property
    def matches(self):
        """Returns all possible matches"""
        return self.nodes

    @property
    def any_matches(self):
        """Returns True if there are any matches"""
        return bool(self.nodes)

    def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
        """Filter all current matches given text."""
        nodes = []
        cdef Node node
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
        """Returns True if any node in the current search scope contains specified text"""
        cdef Node node
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                return True
        return False

    def attribute_longer_than(self, str attribute, int length, str start = None):
        """Filter all current matches by attribute length.

        If ``start`` is given and occurs in the attribute value, only the part
        after its first occurrence is measured. Nodes for which the attribute
        lookup yields ``None`` are dropped.

        Similar to `string-length` in XPath.
        """
        nodes = []
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            if attr is None:
                # No value to measure; len(None) would raise TypeError.
                continue
            if start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_attribute_longer_than(self, str attribute, int length, str start = None):
        """Returns True if any match has the attribute longer than a specified length.

        If ``start`` is given and occurs in the attribute value, only the part
        after its first occurrence is measured. Nodes for which the attribute
        lookup yields ``None`` are ignored.

        Similar to `string-length` in XPath.
        """
        cdef Node node
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            if attr is None:
                # No value to measure; len(None) would raise TypeError.
                continue
            if start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                return True
        return False

    def __bool__(self):
        return bool(self.nodes)
cdef find_nodes(HTMLParser parser, myhtml_tree_node_t *node, str query):
    """Run the CSS ``query`` against ``node`` and return the matches as a list of ``Node``.

    Returns an empty list when the selector produces no collection.
    Errors raised while building the ``CSSSelector`` propagate to the caller.
    """
    cdef myhtml_collection_t *collection
    cdef CSSSelector selector = CSSSelector(query)
    cdef Node n
    cdef list result = []
    cdef size_t i

    collection = selector.find(node)
    if collection == NULL:
        return result

    try:
        for i in range(collection.length):
            n = Node.new(collection.list[i], parser)
            result.append(n)
    finally:
        # Release the C collection even if wrapping a node raised.
        myhtml_collection_destroy(collection)
    return result
cdef bool find_matches(HTMLParser parser, myhtml_tree_node_t *node, tuple selectors):
    """Return True as soon as any query in ``selectors`` matches under ``node``.

    Each query is compiled into its own ``CSSSelector``; the resulting
    collection is only inspected for its length and then destroyed.
    Returns False when no query yields a non-empty collection.
    """
    cdef myhtml_collection_t *found
    cdef CSSSelector compiled
    cdef int match_count
    cdef str query

    for query in selectors:
        compiled = CSSSelector(query)
        found = compiled.find(node)
        if found != NULL:
            # Copy the length out before the collection is freed.
            match_count = found.length
            myhtml_collection_destroy(found)
            if match_count > 0:
                return True
    return False
|