1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
|
"""
Link extractor based on lxml.html
"""
import operator
from functools import partial
from urllib.parse import urljoin
import lxml.etree as etree
from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url, safe_url_string
from scrapy.link import Link
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list
from scrapy.utils.response import get_base_url
# from lxml/src/lxml/html/__init__.py
# Namespace URI compared against the ``{namespace}`` prefix of element tags;
# ``_nons`` strips the prefix when it matches this value.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
# Precompiled XPath ``string()`` expression.  It is called on an element to
# obtain the text that becomes the extracted link's text.
_collect_string_content = etree.XPath("string()")
def _nons(tag):
if isinstance(tag, str):
if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE) + 1] == XHTML_NAMESPACE:
return tag.split('}')[-1]
return tag
def _identity(x):
return x
def _canonicalize_link_url(link):
    """Return the canonical form of ``link.url``, keeping any fragment.

    Used as a de-duplication key for links whose URLs have not been
    canonicalized yet.
    """
    raw_url = link.url
    return canonicalize_url(raw_url, keep_fragments=True)
class LxmlParserLinkExtractor:
    """Collect :class:`~scrapy.link.Link` objects from a parsed lxml tree.

    :param tag: tag name to match, or a predicate called with the tag name
        (XHTML namespace already stripped) that returns true for a match.
    :param attr: attribute name to match, or a predicate called with the
        attribute name.
    :param process: optional callable applied to each absolute attribute
        value; returning ``None`` drops the link.  Non-callable values are
        ignored and the value is used unchanged.
    :param unique: whether the extracted links are de-duplicated.
    :param strip: whether HTML5 whitespace is stripped from attribute values
        before they are resolved.
    :param canonicalized: whether link URLs are already canonical.  If so the
        URL itself is the de-duplication key; otherwise the key is the
        canonicalized URL.
    """

    def __init__(
        self, tag="a", attr="href", process=None, unique=False, strip=True, canonicalized=False
    ):
        # Plain values are turned into equality predicates so the scanning
        # code can always just call ``scan_tag`` / ``scan_attr``.
        self.scan_tag = tag if callable(tag) else partial(operator.eq, tag)
        self.scan_attr = attr if callable(attr) else partial(operator.eq, attr)
        self.process_attr = process if callable(process) else _identity
        self.unique = unique
        self.strip = strip
        self.link_key = operator.attrgetter("url") if canonicalized else _canonicalize_link_url

    def _iter_links(self, document):
        """Yield ``(element, attribute_name, attribute_value)`` for every
        attribute accepted by ``scan_attr`` on every element whose tag is
        accepted by ``scan_tag``.
        """
        for el in document.iter(etree.Element):
            if not self.scan_tag(_nons(el.tag)):
                continue
            attribs = el.attrib
            for attrib in attribs:
                if not self.scan_attr(attrib):
                    continue
                yield (el, attrib, attribs[attrib])

    def _extract_links(self, selector, response_url, response_encoding, base_url):
        """Build the list of links found under ``selector.root``.

        Attribute values are resolved against *base_url*, passed through
        ``process_attr``, made safe with *response_encoding* and finally
        resolved against *response_url*.  Values that cannot be turned into a
        URL (``ValueError``) and values for which ``process_attr`` returns
        ``None`` are skipped.  The result is de-duplicated when ``unique`` is
        set.
        """
        links = []
        # hacky way to get the underlying lxml parsed document
        for el, _attr_name, attr_val in self._iter_links(selector.root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            try:
                if self.strip:
                    attr_val = strip_html5_whitespace(attr_val)
                attr_val = urljoin(base_url, attr_val)
            except ValueError:
                continue  # skipping bogus links
            url = self.process_attr(attr_val)
            if url is None:
                continue
            try:
                url = safe_url_string(url, encoding=response_encoding)
                # to fix relative links after process_value
                url = urljoin(response_url, url)
            except ValueError:
                # A malformed URL must not abort extraction of the remaining
                # links; skip it like the bogus links above.
                continue
            link = Link(
                url,
                _collect_string_content(el) or '',
                nofollow=rel_has_nofollow(el.get('rel')),
            )
            links.append(link)
        return self._deduplicate_if_needed(links)

    def extract_links(self, response):
        """Return the links extracted from the whole *response*."""
        base_url = get_base_url(response)
        return self._extract_links(response.selector, response.url, response.encoding, base_url)

    def _process_links(self, links):
        """Normalize and filter extracted links.

        Subclasses should override it if necessary.
        """
        return self._deduplicate_if_needed(links)

    def _deduplicate_if_needed(self, links):
        """Return *links* de-duplicated by ``link_key`` when ``unique`` is
        set, otherwise return *links* unchanged.
        """
        if self.unique:
            return unique_list(links, key=self.link_key)
        return links
class LxmlLinkExtractor(FilteringLinkExtractor):
    """Link extractor that scans the configured tags and attributes with an
    :class:`LxmlParserLinkExtractor` and hands the filtering options
    (``allow``, ``deny``, ``allow_domains``, ``deny_domains``,
    ``restrict_xpaths``, ``restrict_css``, ``deny_extensions``,
    ``restrict_text``, ``canonicalize``) to the base class.

    ``tags`` and ``attrs`` may each be a single value or an iterable of
    values.  ``process_value`` and ``strip`` are forwarded to the parser
    extractor.  ``unique`` controls whether duplicate links are removed.
    """

    def __init__(
        self,
        allow=(),
        deny=(),
        allow_domains=(),
        deny_domains=(),
        restrict_xpaths=(),
        tags=('a', 'area'),
        attrs=('href',),
        canonicalize=False,
        unique=True,
        process_value=None,
        deny_extensions=None,
        restrict_css=(),
        strip=True,
        restrict_text=None,
    ):
        # Sets make the per-element membership predicates below cheap.
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        lx = LxmlParserLinkExtractor(
            tag=partial(operator.contains, tags),
            attr=partial(operator.contains, attrs),
            unique=unique,
            process=process_value,
            strip=strip,
            canonicalized=canonicalize,
        )
        # Remembered so that ``extract_links`` can honour ``unique=False``
        # when merging the links of several sub-documents.
        self._filter_duplicates = unique
        super().__init__(
            link_extractor=lx,
            allow=allow,
            deny=deny,
            allow_domains=allow_domains,
            deny_domains=deny_domains,
            restrict_xpaths=restrict_xpaths,
            restrict_css=restrict_css,
            canonicalize=canonicalize,
            deny_extensions=deny_extensions,
            restrict_text=restrict_text,
        )

    def extract_links(self, response):
        """Returns a list of :class:`~scrapy.link.Link` objects from the
        specified :class:`response <scrapy.http.Response>`.

        Only links that match the settings passed to the ``__init__`` method of
        the link extractor are returned.

        Duplicate links are omitted unless the extractor was created with
        ``unique=False``, in which case every extracted link is returned.
        """
        base_url = get_base_url(response)
        if self.restrict_xpaths:
            # Extract only from the regions selected by the XPath expressions.
            docs = [
                subdoc
                for x in self.restrict_xpaths
                for subdoc in response.xpath(x)
            ]
        else:
            docs = [response.selector]
        all_links = []
        for doc in docs:
            links = self._extract_links(doc, response.url, response.encoding, base_url)
            all_links.extend(self._process_links(links))
        if self._filter_duplicates:
            # Regions may overlap, so the same link can appear more than once.
            return unique_list(all_links)
        return all_links
|