"""
SpiderManager is the class which locates and manages all website-specific
spiders
"""
import sys
import urlparse
from twisted.plugin import getCache
from twisted.python.rebuild import rebuild
from scrapy.spider.models import ISpider
from scrapy import log
from scrapy.conf import settings
from scrapy.utils.url import url_is_from_spider
class TwistedPluginSpiderManager(object):
    """Spider manager based on the Twisted Plugin System.

    Locates, loads and keeps track of all website-specific spiders,
    indexed by their ``domain_name``.
    """

    def __init__(self):
        self.loaded = False       # becomes True after a successful load()
        self.force_domain = None  # when set, fromurl() always resolves to this domain
        self._invaliddict = {}
        self._spiders = {}        # domain_name -> spider instance

    def fromdomain(self, domain):
        """Return the spider registered for *domain*, or None if unknown."""
        return self._spiders.get(domain)

    def fromurl(self, url):
        """Return the spider that handles *url*, or None if no spider matches.

        Lookup order: forced domain (if set), then the URL's hostname with
        any leading "www." stripped, then a spider-by-spider scan using
        url_is_from_spider().
        """
        if self.force_domain:
            return self._spiders.get(self.force_domain)
        domain = urlparse.urlparse(url).hostname
        # hostname is None for malformed or non-network URLs; the previous
        # str() conversion turned it into the truthy string "None", which
        # defeated the guard below.
        if not domain:
            return None
        # strip only a *leading* "www." -- a blind replace() would also
        # mangle hostnames like "mywww.example.com"
        if domain.startswith('www.'):
            domain = domain[len('www.'):]
        if domain in self._spiders:  # try first locating by domain
            return self._spiders[domain]
        # else search spider by spider
        for spider in self._spiders.values():
            if url_is_from_spider(url, spider):
                return spider

    def list(self):
        """Return the domain names of all loaded spiders."""
        return self._spiders.keys()

    def load(self, spider_modules=None):
        """(Re)load all spiders found in *spider_modules*.

        *spider_modules* defaults to the SPIDER_MODULES setting. Any
        previously loaded spiders are discarded first.
        """
        if spider_modules is None:
            spider_modules = settings.getlist('SPIDER_MODULES')
        self.spider_modules = spider_modules
        self._invaliddict = {}
        self._spiders = {}
        # fromlist=[''] forces __import__ to return the leaf module
        # rather than the top-level package
        modules = [__import__(m, {}, {}, ['']) for m in self.spider_modules]
        for module in modules:
            for spider in self._getspiders(ISpider, module):
                ISpider.validateInvariants(spider)
                self._spiders[spider.domain_name] = spider
        self.loaded = True

    def _getspiders(self, interface, package):
        """Yield plugins in *package* adaptable to *interface*.

        This is an override of twisted.plugin.getPlugins, because we're
        interested in catching exceptions thrown when loading spiders,
        such as KeyboardInterrupt.
        """
        try:
            allDropins = getCache(package)
            for dropin in allDropins.itervalues():
                for plugin in dropin.plugins:
                    adapted = interface(plugin, None)
                    if adapted is not None:
                        yield adapted
        except KeyboardInterrupt:
            sys.stderr.write("Interrupted while loading Scrapy spiders\n")
            sys.exit(2)

    def close_spider(self, spider):
        """Reload the spider's module to release any resources held on to
        by the spider.

        No-op if the spider's domain is not currently registered, or if
        its module does not expose a SPIDER attribute.
        """
        domain = spider.domain_name
        if domain not in self._spiders:
            return
        spider = self._spiders[domain]
        module_name = spider.__module__
        module = sys.modules[module_name]
        if hasattr(module, 'SPIDER'):
            log.msg("Reloading module %s" % module_name, spider=spider, \
                level=log.DEBUG)
            new_module = rebuild(module, doLog=0)
            # swap in the freshly rebuilt spider instance
            self._spiders[domain] = new_module.SPIDER