File: crawl.py

Package: python-scrapy 0.8-3
"""
This module implements the CrawlSpider, which is the recommended spider to use
for scraping typical web sites that require crawling pages.

See documentation in docs/topics/spiders.rst
"""

import copy

from scrapy.http import Request
from scrapy.utils.spider import iterate_spider_output
from scrapy.contrib.spiders.init import InitSpider
from scrapy.conf import settings

class Rule(object):
    """
    A rule for crawling, which receives the following constructor arguments:

    link_extractor (required)
       A LinkExtractor which defines the policy for extracting links
    callback (optional)
       A function used to process the page once it has been downloaded. If
       callback is omitted the page is not processed, just crawled. If callback
       is a string (instead of a callable), the method of the spider class with
       that name is used as the callback function
    cb_kwargs (optional)
       A dict specifying keyword arguments to pass to the callback function
    follow (optional)
       If True, links will be followed from the pages crawled by this rule.
       It defaults to True when no callback is specified, or to False when a
       callback is specified
    process_links (optional)
       Can be either a callable, or a string with the name of a method defined
       in the spider's class.
       This method will be called with the list of extracted links matching
       this rule (if any) and must return another list of links.
    """

    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None):
        self.link_extractor = link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        if follow is None:
            self.follow = False if callback else True
        else:
            self.follow = follow
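
# Illustrative usage sketch (not part of the original Scrapy module): rules are
# typically built from a link extractor plus an optional callback name, which
# _compile_rules() below resolves to a spider method. The SgmlLinkExtractor
# import path and the URL patterns are assumptions for the example only.
#
#     from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
#
#     rules = (
#         # follow category listings without parsing them
#         Rule(SgmlLinkExtractor(allow=(r'/category/',)), follow=True),
#         # parse item pages with the spider's 'parse_item' method
#         Rule(SgmlLinkExtractor(allow=(r'/item/\d+',)), callback='parse_item',
#              cb_kwargs={'source': 'crawl'}),
#     )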

class CrawlSpider(InitSpider):
    """
    Class for spiders that crawl over web pages and extract/parse their links
    given some crawling rules.

    These crawling rules are established by setting the 'rules' class attribute,
    which is a tuple of Rule objects.
    When the spider is running, it iterates over these rules for each response
    and does what each rule dictates (extracts links if follow=True, and returns
    items/requests if a parsing callback is defined in the rule).
    """
    rules = ()

    def __init__(self):
        """Constructor takes care of compiling rules"""
        super(CrawlSpider, self).__init__()
        self._compile_rules()

    def parse(self, response):
        """This function is called by the framework core for all the
        start_urls. Do not override this function; override parse_start_url
        instead."""
        return self._response_downloaded(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        """Overrideable callback function for processing start_urls. It must
        return a list of BaseItem and/or Requests"""
        return []

    def process_results(self, response, results):
        """This overridable method is called for each result (item or request)
        returned by the spider, and it's intended to perform any last-minute
        processing required before returning the results to the framework core,
        for example setting the item GUIDs. It receives a list of results and
        the response which originated those results. It must return a list
        of results (Items or Requests)."""
        return results
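
    # Illustrative override sketch (not part of the original module): a
    # subclass might stamp an identifier on each returned item, as the
    # docstring above suggests. The dict-style item access and the
    # make_guid() helper are assumptions for the example only.
    #
    #     def process_results(self, response, results):
    #         processed = []
    #         for result in results:
    #             if not isinstance(result, Request):
    #                 result['guid'] = make_guid(response.url)
    #             processed.append(result)
    #         return processed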

    def _requests_to_follow(self, response):
        """
        This method iterates over each of the spider's rules, extracts the links
        matching each case, filters them (if needed), and yields a request for
        each unique link found in the response.
        """
        seen = set()
        for rule in self._rules:
            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            seen = seen.union(links)
            for link in links:
                r = Request(url=link.url)
                r.meta['link_text'] = link.text
                r.deferred.addCallback(self._response_downloaded, rule.callback, cb_kwargs=rule.cb_kwargs, follow=rule.follow)
                yield r

    def _response_downloaded(self, response, callback, cb_kwargs, follow):
        """
        This is where every response arrives, and where it is decided whether
        links should be extracted from it and whether it should be parsed.
        It yields requests and/or items.
        """
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True):
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item
                

    def _compile_rules(self):
        """Compile the crawling rules"""

        def get_method(method):
            if callable(method):
                return method
            elif isinstance(method, basestring):
                return getattr(self, method, None)

        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule.callback = get_method(rule.callback)
            rule.process_links = get_method(rule.process_links)
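
# Illustrative subclass sketch (not part of the original module), showing how
# the pieces above fit together. The import paths, the SgmlLinkExtractor, the
# domain_name/start_urls attributes and the example.com URL patterns are
# assumptions for this Scrapy (0.8) era and are not taken from this file.
#
#     from scrapy.contrib.spiders import CrawlSpider, Rule
#     from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
#
#     class ExampleSpider(CrawlSpider):
#         domain_name = 'example.com'
#         start_urls = ['http://www.example.com/']
#
#         rules = (
#             # follow pagination links, but don't parse those pages
#             Rule(SgmlLinkExtractor(allow=(r'/page/\d+',)), follow=True),
#             # hand product pages to parse_item (resolved by _compile_rules)
#             Rule(SgmlLinkExtractor(allow=(r'/product/\d+',)),
#                  callback='parse_item'),
#         )
#
#         def parse_start_url(self, response):
#             # optional hook for the start_urls responses (see parse() above)
#             return []
#
#         def parse_item(self, response):
#             # must return a list of items and/or Request objects
#             return []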