1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
|
from scrapy.spider import BaseSpider
class InitSpider(BaseSpider):
"""Base Spider with initialization facilities"""
def __init__(self, *a, **kw):
super(InitSpider, self).__init__(*a, **kw)
self._postinit_reqs = []
self._init_complete = False
self._init_started = False
def make_requests_from_url(self, url):
req = super(InitSpider, self).make_requests_from_url(url)
if self._init_complete:
return req
self._postinit_reqs.append(req)
if not self._init_started:
self._init_started = True
return self.init_request()
def initialized(self, response=None):
"""This method must be set as the callback of your last initialization
request. See self.init_request() docstring for more info.
"""
self._init_complete = True
reqs = self._postinit_reqs[:]
del self._postinit_reqs
return reqs
def init_request(self):
"""This function should return one initialization request, with the
self.initialized method as callback. When the self.initialized method
is called this spider is considered initialized. If you need to perform
several requests for initializing your spider, you can do so by using
different callbacks. The only requirement is that the final callback
(of the last initialization request) must be self.initialized.
The default implementation calls self.initialized immediately, and
means that no initialization is needed. This method should be
overridden only when you need to perform requests to initialize your
spider
"""
return self.initialized()
|