File: xpathfuncs.py

package info (click to toggle)
python-parsel 1.6.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 332 kB
  • sloc: python: 1,467; makefile: 213; sh: 8
file content (61 lines) | stat: -rw-r--r-- 1,780 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import re
from lxml import etree

from six import string_types

from w3lib.html import HTML5_WHITESPACE

# Pattern source: one or more consecutive characters taken from the
# HTML5_WHITESPACE set, wrapped in a regex character class.
regex = '[{}]+'.format(HTML5_WHITESPACE)
_html5_whitespace_re = re.compile(regex)
# Bound ``sub`` of the compiled pattern; call as
# ``replace_html5_whitespaces(replacement, text)``.
replace_html5_whitespaces = _html5_whitespace_re.sub


def set_xpathfunc(fname, func):
    """Register or remove a custom XPath extension function.

    When ``func`` is a callable it is stored under the ``fname`` identifier
    in lxml's default (``None``) function namespace.  It will then be called
    for every matching node, receiving a ``context`` parameter followed by
    any parameters passed from the corresponding XPath expression.

    When ``func`` is ``None`` the entry registered under ``fname`` is
    deleted from that namespace instead.

    See more `in lxml documentation`_.

    .. _`in lxml documentation`: http://lxml.de/extensions.html#xpath-extension-functions

    """
    namespace = etree.FunctionNamespace(None)
    if func is None:
        # Passing None means "unregister": drop the existing entry.
        del namespace[fname]
        return
    namespace[fname] = func


def setup():
    """Register this module's XPath extension functions.

    Installs ``has_class`` under the XPath function name ``has-class``.

    """
    set_xpathfunc(fname='has-class', func=has_class)


def has_class(context, *classes):
    """has-class function.

    Return True if every name in ``classes`` occurs as a whitespace-delimited
    token in the ``class`` attribute of the context node, and False otherwise
    (including when the node has no ``class`` attribute at all).

    Arguments are validated only while ``context.eval_context`` carries no
    truthy ``'args_checked'`` entry; after a successful validation that entry
    is set to True so later calls sharing the same ``eval_context`` skip it.

    Raises ValueError if, during validation, no arguments were given or any
    argument is not a string.

    """
    eval_state = context.eval_context
    if not eval_state.get('args_checked'):
        if not classes:
            raise ValueError(
                'XPath error: has-class must have at least 1 argument')
        if not all(isinstance(name, string_types) for name in classes):
            raise ValueError(
                'XPath error: has-class arguments must be strings')
        eval_state['args_checked'] = True

    class_attr = context.context_node.get('class')
    if class_attr is None:
        return False

    # Pad with spaces and collapse each run of HTML5 whitespace to a single
    # space, so that every class token ends up delimited by exactly one space
    # on both sides and can be found with a plain substring test.
    padded = replace_html5_whitespaces(' ', ' ' + class_attr + ' ')
    return all(' ' + name + ' ' in padded for name in classes)