File: selection.pxi

package info (click to toggle)
python-selectolax 0.4.6-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 708 kB
  • sloc: python: 2,239; makefile: 225
file content (217 lines) | stat: -rw-r--r-- 8,086 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
cimport cython
from cpython.exc cimport PyErr_SetObject
from cpython.list cimport PyList_GET_SIZE


@cython.final
cdef class LexborCSSSelector:
    """Owns the lexbor CSS parser/selector objects and runs queries with them.

    The native objects are created once in ``__init__`` and released in
    ``__dealloc__``. Matches are collected through C callbacks that append to
    ``self.results`` while a query is running.
    """

    def __init__(self):
        self._create_css_parser()
        self.results = []
        self.current_node = None

    cdef int _create_css_parser(self) except -1:
        """Create and initialise the lexbor parser, CSS selectors and matcher.

        Returns 0 on success. On failure a ``SelectolaxError`` is set and -1
        is returned; objects created so far are released by ``__dealloc__``.
        """
        cdef lxb_status_t status

        self.parser = lxb_css_parser_create()
        status = lxb_css_parser_init(self.parser, NULL)

        if status != LXB_STATUS_OK:
            PyErr_SetObject(SelectolaxError, "Can't initialize CSS parser.")
            return -1

        self.css_selectors = lxb_css_selectors_create()
        status = lxb_css_selectors_init(self.css_selectors)

        if status != LXB_STATUS_OK:
            PyErr_SetObject(SelectolaxError, "Can't initialize CSS selector.")
            return -1

        lxb_css_parser_selectors_set(self.parser, self.css_selectors)

        self.selectors = lxb_selectors_create()
        status = lxb_selectors_init(self.selectors)
        if status != LXB_STATUS_OK:
            PyErr_SetObject(SelectolaxError, "Can't initialize CSS selector.")
            return -1

        # Only configure the matcher once it is known to be initialised;
        # setting options on a failed/NULL object would be unsafe.
        lxb_selectors_opt_set(self.selectors, LXB_SELECTORS_OPT_MATCH_ROOT)
        return 0

    cpdef list find(self, str query, LexborNode node):
        """Return every node matching ``query`` in the search rooted at ``node``."""
        return self._find(query, node, 0)

    cpdef list find_first(self, str query, LexborNode node):
        """Return a list holding at most the first node matching ``query``."""
        return self._find(query, node, 1)

    cpdef list _find(self, str query, LexborNode node, bint only_first):
        """Parse ``query`` and collect matching nodes starting from ``node``.

        Parameters
        ----------
        query : str
            CSS selector text.
        node : LexborNode
            Node the search starts from.
        only_first : bint
            When true, the callback that stops after the first match is used.

        Raises
        ------
        TypeError
            If ``query`` is not a string (e.g. ``None``).
        SelectolaxError
            If the selector cannot be parsed.
        """
        cdef lxb_css_selector_list_t * selectors_list
        cdef bytes bytes_query
        cdef list matched

        if not isinstance(query, str):
            raise TypeError("Query must be a string.")

        bytes_query = query.encode(_ENCODING)
        selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t> len(bytes_query))

        if selectors_list == NULL:
            raise SelectolaxError("Can't parse CSS selector.")

        # The callbacks read ``current_node`` and append to ``results``.
        self.current_node = node
        self.results = []
        try:
            # The returned status is intentionally not inspected here, which
            # keeps the original behaviour of returning whatever was collected.
            if only_first:
                lxb_selectors_find(self.selectors, node.node, selectors_list,
                                   <lxb_selectors_cb_f> css_finder_callback_first, <void *> self)
            else:
                lxb_selectors_find(self.selectors, node.node, selectors_list,
                                   <lxb_selectors_cb_f> css_finder_callback, <void *> self)
            matched = list(self.results)
        finally:
            # Always drop per-query state and free the parsed selector list,
            # even if something above raised.
            self.results = []
            self.current_node = None
            lxb_css_selector_list_destroy_memory(selectors_list)
        return matched

    cpdef int any_matches(self, str query, LexborNode node) except -1:
        """Return 1 if ``query`` matches anything starting from ``node``, else 0.

        Raises
        ------
        TypeError
            If ``query`` is not a string (e.g. ``None``).
        SelectolaxError
            If the selector cannot be parsed or the search reports a failure.
        """
        cdef lxb_css_selector_list_t * selectors_list
        cdef lxb_status_t status
        cdef bytes bytes_query
        cdef int result

        if not isinstance(query, str):
            raise TypeError("Query must be a string.")

        bytes_query = query.encode(_ENCODING)
        # The length must be the size of the encoded buffer, not the number of
        # characters in ``query``; they differ for non-ASCII selectors.
        selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t> len(bytes_query))

        if selectors_list == NULL:
            raise SelectolaxError("Can't parse CSS selector.")

        self.results = []
        try:
            status = lxb_selectors_find(self.selectors, node.node, selectors_list,
                                        <lxb_selectors_cb_f> css_matcher_callback, <void *> self)
            if status != LXB_STATUS_OK:
                raise SelectolaxError("Can't parse CSS selector.")
            result = PyList_GET_SIZE(self.results) > 0
        finally:
            # Reset the scratch list and free the selector list on every path.
            self.results = []
            lxb_css_selector_list_destroy_memory(selectors_list)
        return result

    def __dealloc__(self):
        # Each pointer is checked because initialisation may have stopped
        # part-way through ``_create_css_parser``.
        if self.selectors != NULL:
            lxb_selectors_destroy(self.selectors, True)
        if self.parser != NULL:
            lxb_css_parser_destroy(self.parser, True)
        if self.css_selectors != NULL:
            lxb_css_selectors_destroy(self.css_selectors, True)


cdef class LexborSelector:
    """An advanced CSS selector that supports additional operations.

    Think of it as a toolkit that mimics some of the features of XPath.

    Please note, this is an experimental feature that can change in the future.
    """
    cdef LexborNode node
    cdef list nodes

    def __init__(self, LexborNode node, query):
        self.node = node
        # An empty/None query means "operate on the node itself".
        self.nodes = self.node.parser.selector.find(query, self.node) if query else [node, ]

    cpdef css(self, str query):
        """Evaluate CSS selector against current scope."""
        raise NotImplementedError("This features is not supported by the lexbor backend. Please use Modest backend.")

    @property
    def matches(self) -> list:
        """Returns all possible matches"""
        return self.nodes

    @property
    def any_matches(self) -> bool:
        """Returns True if there are any matches"""
        return bool(self.nodes)

    def text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> LexborSelector:
        """Filter all current matches given text.

        Keeps only the nodes whose text (as returned by ``node.text`` with the
        given ``deep``, ``separator`` and ``strip`` options) contains ``text``.
        Returns ``self`` so calls can be chained.
        """
        cdef list nodes = []
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False) -> bool:
        """Returns True if any node in the current search scope contains specified text"""
        cdef LexborNode node
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                return True
        return False

    def attribute_longer_than(self, str attribute, int length, str start  = None) -> LexborSelector:
        """Filter all current matches by attribute length.

        Nodes whose ``attribute`` is missing or empty are dropped. When
        ``start`` is given and occurs in the value, only the part after its
        first occurrence is measured.

        Similar to `string-length` in XPath.
        """
        cdef list nodes = []
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            if not attr:
                continue
            if start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_attribute_longer_than(self, str attribute, int length, str start  = None) -> bool:
        """Returns True if any node has the attribute longer than a specified length.

        Nodes whose ``attribute`` is missing or empty are skipped. When
        ``start`` is given and occurs in the value, only the part after its
        first occurrence is measured.

        Similar to `string-length` in XPath.
        """
        cdef LexborNode node
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            if not attr:
                # Missing attribute (None) or empty value: nothing to measure.
                # Without this guard ``len(None)`` would raise TypeError.
                continue
            if start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                return True
        return False

    def __bool__(self):
        return bool(self.nodes)


cdef lxb_status_t css_finder_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
    # Match callback handed to lxb_selectors_find when collecting all matches.
    # ``ctx`` is the LexborCSSSelector that started the search; the matched
    # node is wrapped and appended to its ``results`` list.
    cdef LexborCSSSelector selector = <LexborCSSSelector> ctx
    cdef LexborNode wrapped = LexborNode.new(node, selector.current_node.parser)
    selector.results.append(wrapped)
    return LXB_STATUS_OK

cdef lxb_status_t css_finder_callback_first(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
    # Match callback handed to lxb_selectors_find when only the first match is
    # wanted. Records the matched node on the owning LexborCSSSelector and
    # returns LXB_STATUS_STOP (presumably ending the search; confirm against
    # the lexbor documentation).
    cdef LexborCSSSelector selector = <LexborCSSSelector> ctx
    cdef LexborNode wrapped = LexborNode.new(node, selector.current_node.parser)
    selector.results.append(wrapped)
    return LXB_STATUS_STOP


cdef lxb_status_t css_matcher_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec, void *ctx):
    # Match callback used by LexborCSSSelector.any_matches. It does not wrap
    # the node; it only records that a match happened by appending True to
    # the selector's ``results`` list, then returns LXB_STATUS_STOP
    # (presumably ending the search early; confirm against lexbor docs).
    cdef LexborCSSSelector cls
    cls = <LexborCSSSelector> ctx
    cls.results.append(True)
    return LXB_STATUS_STOP