"""
This module implements the HTMLImageLinkExtractor for extracting
image links only.
"""
from urlparse import urljoin
from scrapy.link import Link
from scrapy.utils.url import canonicalize_url
from scrapy.utils.python import unicode_to_str, flatten
from scrapy.selector.libxml2sel import XPathSelectorList, HtmlXPathSelector


class HTMLImageLinkExtractor(object):
    '''HTMLImageLinkExtractor objects extract image links from HTML pages,
    given certain XPath locations.

    Locations can be passed as a list/tuple of XPath expressions (or as
    selectors) when instantiating the extractor. If no locations are
    specified, the default pattern '//img' is used.

    A usage sketch is included at the end of this module.
    '''

    def __init__(self, locations=None, unique=True, canonicalize=True):
        # If no locations are given, fall back to the documented '//img' default
        self.locations = flatten([locations]) if locations else ['//img']
        self.unique = unique
        self.canonicalize = canonicalize

    def extract_from_selector(self, selector, encoding, parent=None):
        ret = []

        def _add_link(url_sel, alt_sel=None):
            url = flatten([url_sel.extract()])
            alt = flatten([alt_sel.extract()]) if alt_sel else (u'',)
            if url:
                ret.append(Link(unicode_to_str(url[0], encoding), alt[0]))

        if selector.xmlNode.type == 'element':
            if selector.xmlNode.name == 'img':
                # <img>: the link URL is its src, the link text its alt (or title)
                _add_link(selector.select('@src'),
                          selector.select('@alt') or selector.select('@title'))
            else:
                # Any other element: recurse into its children looking for images
                children = selector.select('child::*')
                if len(children):
                    for child in children:
                        ret.extend(self.extract_from_selector(child, encoding, parent=selector))
                elif selector.xmlNode.name == 'a' and not parent:
                    # A childless <a> selected directly by a location (not reached
                    # through recursion): use its href as the link
                    _add_link(selector.select('@href'), selector.select('@title'))
        else:
            # Non-element node, e.g. an attribute selected by the location XPath
            _add_link(selector)

        return ret

    def extract_links(self, response):
        xs = HtmlXPathSelector(response)

        # Resolve relative URLs against the page's <base href>, if present
        base_url = xs.select('//base/@href').extract()
        base_url = (urljoin(response.url, base_url[0].encode(response.encoding))
                    if base_url else response.url)

        links = []
        for location in self.locations:
            if isinstance(location, basestring):
                selectors = xs.select(location)
            elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
                selectors = [location] if isinstance(location, HtmlXPathSelector) else location
            else:
                continue

            for selector in selectors:
                links.extend(self.extract_from_selector(selector, response.encoding))

        # Make every URL absolute, then optionally deduplicate and canonicalize
        seen, ret = set(), []
        for link in links:
            link.url = urljoin(base_url, link.url)
            if self.unique:
                if link.url in seen:
                    continue
                else:
                    seen.add(link.url)
            if self.canonicalize:
                link.url = canonicalize_url(link.url)
            ret.append(link)

        return ret

    def matches(self, url):
        # This extractor does not filter by URL pattern, so it never claims a match
        return False
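

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the extractor). It assumes an
# already-downloaded HTML response object; the XPath location and the helper
# name below are examples chosen for the sketch, not fixed parts of any API.
# ---------------------------------------------------------------------------
def _example_extract_image_urls(response):
    """Return the absolute URLs of all images inside div#content of `response`."""
    extractor = HTMLImageLinkExtractor(locations=["//div[@id='content']//img"],
                                       unique=True, canonicalize=True)
    # extract_links() resolves each URL against the page's <base href> (or the
    # response URL), deduplicates and canonicalizes it, and returns Link
    # objects whose .text holds the image's alt/title text.
    return [link.url for link in extractor.extract_links(response)]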