File: spidermanager.py

Package: python-scrapy 0.8-3 (Debian squeeze)
"""
SpiderManager is the class that locates and manages all website-specific
spiders
"""

import sys
import urlparse

from twisted.plugin import getCache
from twisted.python.rebuild import rebuild

from scrapy.spider.models import ISpider
from scrapy import log
from scrapy.conf import settings
from scrapy.utils.url import url_is_from_spider

class TwistedPluginSpiderManager(object):
    """Spider manager based in Twisted Plugin System"""

    def __init__(self):
        self.loaded = False
        self.force_domain = None    # when set, fromurl() ignores the url's domain
        self._invaliddict = {}
        self._spiders = {}          # maps domain_name -> spider object

    def fromdomain(self, domain):
        """Return the spider registered for the given domain, or None"""
        return self._spiders.get(domain)

    def fromurl(self, url):
        """Return the spider that handles the given url, if any"""
        if self.force_domain:
            return self._spiders.get(self.force_domain)
        # hostname is None for urls that lack one, so guard before using it
        domain = urlparse.urlparse(url).hostname
        if domain:
            domain = domain.replace('www.', '')
            if domain in self._spiders:         # try first locating by domain
                return self._spiders[domain]
            else:                               # else search spider by spider
                for spider in self._spiders.itervalues():
                    if url_is_from_spider(url, spider):
                        return spider
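        # Hedged illustration of the lookup order above, assuming a spider
        # registered under the hypothetical domain 'example.com':
        #
        #   manager.fromurl('http://www.example.com/x')   # dict hit after
        #                                                 # 'www.' is stripped
        #   manager.fromurl('http://shop.example.com/x')  # no dict hit; found
        #                                                 # by url_is_from_spider()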

    def list(self):
        """Return the domain names of all loaded spiders"""
        return self._spiders.keys()

    def load(self, spider_modules=None):
        """Load and register the spiders found in the given modules,
        defaulting to the SPIDER_MODULES setting
        """
        if spider_modules is None:
            spider_modules = settings.getlist('SPIDER_MODULES')
        self.spider_modules = spider_modules
        self._invaliddict = {}
        self._spiders = {}

        modules = [__import__(m, {}, {}, ['']) for m in self.spider_modules]
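        # Note: the non-empty fromlist above makes __import__ return the named
        # (leaf) module itself rather than its top-level package; a hedged
        # illustration:
        #
        #   __import__('os.path', {}, {}, [''])   # -> the os.path module
        #   __import__('os.path')                 # -> the os package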
        for module in modules:
            for spider in self._getspiders(ISpider, module):
                ISpider.validateInvariants(spider)
                self._spiders[spider.domain_name] = spider
        self.loaded = True

    def _getspiders(self, interface, package):
        """This is an override of twisted.plugin.getPlugin, because we're
        interested in catching exceptions thrown when loading spiders such as
        KeyboardInterrupt
        """
        try:
            allDropins = getCache(package)
            for dropin in allDropins.itervalues():
                for plugin in dropin.plugins:
                    adapted = interface(plugin, None)
                    if adapted is not None:
                        yield adapted
        except KeyboardInterrupt:
            sys.stderr.write("Interrupted while loading Scrapy spiders\n")
            sys.exit(2)
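        # For reference, twisted.plugin.getPlugins performs essentially the
        # same dropin iteration, but without trapping KeyboardInterrupt; this
        # override lets a Ctrl-C during spider import abort with a clean
        # message instead of a traceback.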

    def close_spider(self, spider):
        """Reload spider module to release any resources held on to by the
        spider
        """
        domain = spider.domain_name
        if domain not in self._spiders:
            return
        spider = self._spiders[domain]
        module_name = spider.__module__
        module = sys.modules[module_name]
        if hasattr(module, 'SPIDER'):
            log.msg("Reloading module %s" % module_name, spider=spider,
                level=log.DEBUG)
            new_module = rebuild(module, doLog=0)
            self._spiders[domain] = new_module.SPIDER
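
# --- Hedged usage sketch (not part of the original module) ---
# 'myproject.spiders' is a hypothetical module path; in a real project the
# SPIDER_MODULES setting normally supplies the module list.
if __name__ == '__main__':
    manager = TwistedPluginSpiderManager()
    manager.load(spider_modules=['myproject.spiders'])
    print manager.list()            # domain names of all loaded spiders
    spider = manager.fromurl('http://www.example.com/some/page')
    if spider is not None:
        print spider.domain_name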