File: image.py

package info
python-scrapy 0.8-3
  • area: main
  • in suites: squeeze
"""
This module implements the HtmlImageLinkExtractor for extracting 
image links only.
"""


from scrapy.link import Link
from scrapy.utils.url import canonicalize_url, urljoin_rfc
from scrapy.utils.python import unicode_to_str, flatten
from scrapy.selector import XPathSelectorList, HtmlXPathSelector

class HTMLImageLinkExtractor(object):
    '''HTMLImageLinkExtractor objects are intended to extract image links from HTML pages
    given certain XPath locations.

    These locations are passed as a list/tuple when instantiating the link extractor; each
    location may be an XPath expression string, an HtmlXPathSelector, or an XPathSelectorList.
    Locations of any other type are ignored, so if no valid locations are given,
    extract_links returns an empty list. See the usage sketch at the end of this file.'''

    def __init__(self, locations=None, unique=True, canonicalize=True):
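        # locations: xpath strings and/or selectors pointing at the page regions to search
        # unique: if True, duplicate URLs are dropped from the result
        # canonicalize: if True, extracted URLs are passed through canonicalize_url()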
        self.locations = flatten([locations])
        self.unique = unique
        self.canonicalize = canonicalize

    def extract_from_selector(self, selector, parent=None):
        ret = []
        def _add_link(url_sel, alt_sel=None):
            url = flatten([url_sel.extract()])
            alt = flatten([alt_sel.extract()]) if alt_sel else (u'', )
            if url:
                ret.append(Link(unicode_to_str(url[0]), alt[0]))

        if selector.xmlNode.type == 'element':
            if selector.xmlNode.name == 'img':
                # <img> element: the src attribute is the link URL, the alt
                # (or, failing that, the title) attribute is the link text
                _add_link(selector.select('@src'),
                          selector.select('@alt') or selector.select('@title'))
            else:
                # any other element: recurse into its child elements, if it has any
                children = selector.select('child::*')
                if len(children):
                    for child in children:
                        ret.extend(self.extract_from_selector(child, parent=selector))
                elif selector.xmlNode.name == 'a' and not parent:
                    # an <a> with no child elements selected directly as a location: use its href
                    _add_link(selector.select('@href'), selector.select('@title'))
        else:
            # non-element node (e.g. an attribute selected by the xpath location):
            # its extracted value is taken as the link URL
            _add_link(selector)

        return ret

    def extract_links(self, response):
        xs = HtmlXPathSelector(response)
        # relative links are resolved against the <base href>, if any, else the response URL
        base_url = xs.select('//base/@href').extract()
        base_url = unicode_to_str(base_url[0]) if base_url else unicode_to_str(response.url)

        links = []
        for location in self.locations:
            # each location may be an xpath string or an already-built selector (list)
            if isinstance(location, basestring):
                selectors = xs.select(location)
            elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
                selectors = [location] if isinstance(location, HtmlXPathSelector) else location
            else:
                continue

            for selector in selectors:
                links.extend(self.extract_from_selector(selector))

        # make URLs absolute, then optionally de-duplicate and canonicalize them
        seen, ret = set(), []
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response.encoding)
            if self.unique:
                if link.url in seen:
                    continue
                else:
                    seen.add(link.url)
            if self.canonicalize:
                link.url = canonicalize_url(link.url)
            ret.append(link)

        return ret

    def matches(self, url):
        # URL-based matching is not supported by this extractor
        return False
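
# Illustrative usage sketch (not part of the original module). It assumes the
# extractor is importable from scrapy.contrib.linkextractors.image and that
# scrapy.http.HtmlResponse accepts a url and body, as in Scrapy 0.8; adjust the
# import path if this file lives elsewhere in your tree.
#
#   from scrapy.http import HtmlResponse
#   from scrapy.contrib.linkextractors.image import HTMLImageLinkExtractor
#
#   html = '<html><body><div id="gallery"><img src="/img/a.png" alt="A"></div></body></html>'
#   response = HtmlResponse(url='http://example.com/gallery', body=html)
#
#   lx = HTMLImageLinkExtractor(locations=['//div[@id="gallery"]'])
#   for link in lx.extract_links(response):
#       print link.url, link.text    # e.g. http://example.com/img/a.png A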