File: selection.pxi

package info (click to toggle)
python-selectolax 0.4.6-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 708 kB
  • sloc: python: 2,239; makefile: 225
file content (195 lines) | stat: -rw-r--r-- 6,508 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
cimport cython
from cpython.exc cimport PyErr_SetObject


@cython.final
cdef class CSSSelector:
    """A CSS selector parsed with MyCSS and ready to be run with a Modest finder.

    The instance owns the MyCSS objects, the parsed selectors list and the
    finder it creates, and releases them in ``__dealloc__``.
    """

    cdef char *c_selector
    cdef mycss_entry_t *css_entry
    cdef modest_finder_t *finder
    cdef mycss_selectors_list_t *selectors_list
    # Owns the UTF-8 encoded selector so that ``c_selector`` stays valid for
    # the whole lifetime of the instance instead of dangling after __init__.
    cdef bytes selector_bytes

    def __init__(self, str selector):
        """Parse `selector` and prepare everything needed to run it.

        Raises:
            RuntimeError: if the MyCSS parser objects can't be initialised.
            ValueError: if `selector` is not a valid CSS selector.
        """
        cdef bytes selector_pybyte = selector.encode('UTF-8')
        # ``c_selector`` points into this bytes object, so keep a reference to it.
        self.selector_bytes = selector_pybyte
        self.c_selector = selector_pybyte

        # In order to propagate errors these methods should return no value
        self._create_css_parser()
        self._prepare_selector(self.css_entry, self.c_selector, len(self.c_selector))
        self.finder = modest_finder_create_simple()

    cdef myhtml_collection_t* find(self, myhtml_tree_node_t* scope):
        """Find all possible matches.

        Returns the collection filled in by the finder, or NULL when the finder
        leaves it untouched. Callers in this file release a non-NULL result
        with ``myhtml_collection_destroy``.
        """
        cdef myhtml_collection_t *collection = NULL

        modest_finder_by_selectors_list(self.finder, scope, self.selectors_list, &collection)
        return collection

    cdef int _create_css_parser(self) except -1:
        """Create and initialise the MyCSS object and its entry.

        Returns 0 on success; sets ``RuntimeError`` and returns -1 otherwise.
        """
        cdef mystatus_t status

        cdef mycss_t *mycss = mycss_create()
        status = mycss_init(mycss)

        if status != 0:
            PyErr_SetObject(RuntimeError, "Can't init MyCSS object.")
            return -1

        self.css_entry = mycss_entry_create()
        status = mycss_entry_init(mycss, self.css_entry)

        if status != 0:
            PyErr_SetObject(RuntimeError, "Can't init MyCSS Entry object.")
            return -1
        return 0

    cdef int _prepare_selector(self, mycss_entry_t *css_entry, const char *selector, size_t selector_size) except -1:
        """Parse the selector text into ``self.selectors_list``.

        Returns 0 on success; sets ``ValueError`` and returns -1 when the
        parser yields no list or flags the list as bad.
        """
        cdef mystatus_t out_status
        self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry), myencoding_t.MyENCODING_UTF_8,
                                                    selector, selector_size, &out_status)

        # ``flags`` is a bit field: test the BAD bit with ``&``. A logical
        # ``and`` would treat every non-zero flags value as a bad selector.
        if (self.selectors_list == NULL) or ((self.selectors_list.flags & MyCSS_SELECTORS_FLAGS_SELECTOR_BAD) != 0):
            PyErr_SetObject(ValueError, "Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))
            return -1
        return 0

    def __dealloc__(self):
        """Release the C resources, tolerating a partially initialised object.

        ``__init__`` may have raised half-way through (or never run), so every
        pointer is checked before it is used.
        """
        cdef mycss_t *mycss

        if self.css_entry != NULL and self.selectors_list != NULL:
            mycss_selectors_list_destroy(mycss_entry_selectors(self.css_entry), self.selectors_list, 1)
        if self.finder != NULL:
            modest_finder_destroy(self.finder, 1)

        if self.css_entry != NULL:
            # Read the owner before the entry itself is destroyed.
            mycss = self.css_entry.mycss
            mycss_entry_destroy(self.css_entry, 1)
            if mycss != NULL:
                mycss_destroy(mycss, 1)


cdef class Selector:
    """An advanced CSS selector that supports additional operations.

    Think of it as a toolkit that mimics some of the features of XPath.

    Filtering methods narrow down the current list of matches in place and
    return ``self`` so that calls can be chained; the ``any_*`` methods only
    inspect the current matches and return a boolean.

    Please note, this is an experimental feature that can change in the future.
    """
    cdef Node node
    cdef list nodes

    def __init__(self, Node node, str query):
        """custom init, because __cinit__ doesn't accept C types

        With an empty or ``None`` `query` the only match is `node` itself.
        """
        self.node = node
        self.nodes = find_nodes(node.parser, node.node, query) if query else [node, ]

    cpdef css(self, str query):
        """Evaluate CSS selector against current scope.

        Every current match is searched with `query`; the results replace the
        current matches. Returns ``self``.
        """
        cdef Node current_node
        nodes = list()
        for node in self.nodes:
            current_node = node
            nodes.extend(find_nodes(self.node.parser, current_node.node, query))
        self.nodes = nodes
        return self

    @property
    def matches(self):
        """Returns all possible matches"""
        return self.nodes

    @property
    def any_matches(self):
        """Returns True if there are any matches"""
        return bool(self.nodes)

    def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
        """Filter all current matches given text.

        Keeps only the nodes whose text (obtained with the given `deep`,
        `separator` and `strip` options) contains `text`. Returns ``self``.
        """
        nodes = []
        cdef Node node
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
        """Returns True if any node in the current search scope contains specified text"""
        cdef Node node
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                return True
        return False

    def attribute_longer_than(self, str attribute, int length, str start=None):
        """Filter all current matches by attribute length.

        Similar to `string-length` in XPath.

        When `start` is given and occurs in the attribute value, only the part
        after its first occurrence is measured. Nodes for which the attribute
        lookup yields ``None`` never match. Returns ``self``.
        """
        nodes = []
        cdef Node node
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            if attr is None:
                # Nothing to measure; `len(None)` would raise TypeError.
                continue
            if attr and start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_attribute_longer_than(self, str attribute, int length, str start=None):
        """Returns True if any current match has the attribute longer than a specified length.

        Similar to `string-length` in XPath.

        When `start` is given and occurs in the attribute value, only the part
        after its first occurrence is measured. Nodes for which the attribute
        lookup yields ``None`` never match.
        """
        cdef Node node
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            if attr is None:
                # Nothing to measure; `len(None)` would raise TypeError.
                continue
            if attr and start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                return True
        return False

    def __bool__(self):
        """A selector is truthy when it currently has at least one match."""
        return bool(self.nodes)

cdef find_nodes(HTMLParser parser, myhtml_tree_node_t *node, str query):
    """Return a list of `Node` objects matching the CSS `query` under `node`.

    An empty list is returned when the selector yields no collection.
    Errors raised while compiling `query` (see `CSSSelector`) propagate.
    """
    cdef myhtml_collection_t *collection
    cdef CSSSelector selector = CSSSelector(query)
    cdef list result = []
    cdef size_t i

    collection = selector.find(node)
    if collection == NULL:
        return result

    try:
        for i in range(collection.length):
            result.append(Node.new(collection.list[i], parser))
    finally:
        # Release the C collection even if wrapping a node raised.
        myhtml_collection_destroy(collection)
    return result


cdef bool find_matches(HTMLParser parser, myhtml_tree_node_t *node, tuple selectors):
    """Tell whether at least one of the CSS `selectors` matches under `node`.

    Selectors are tried in order and the search stops at the first one that
    yields a non-empty collection. `parser` is accepted but not used here.
    """
    cdef myhtml_collection_t *found
    cdef CSSSelector compiled
    cdef int match_count
    cdef str css_query

    for css_query in selectors:
        compiled = CSSSelector(css_query)
        found = compiled.find(node)
        if found != NULL:
            # Read the size first: the collection is released right after.
            match_count = found.length
            myhtml_collection_destroy(found)
            if match_count > 0:
                return True
    return False