1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
|
from urlparse import urljoin
from scrapy import log
from scrapy.http import HtmlResponse
from scrapy.utils.response import get_meta_refresh
from scrapy.exceptions import IgnoreRequest, NotConfigured
class BaseRedirectMiddleware(object):
enabled_setting = 'REDIRECT_ENABLED'
def __init__(self, settings):
if not settings.getbool(self.enabled_setting):
raise NotConfigured
self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def _redirect(self, redirected, request, spider, reason):
ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
redirects = request.meta.get('redirect_times', 0) + 1
if ttl and redirects <= self.max_redirect_times:
redirected.meta['redirect_times'] = redirects
redirected.meta['redirect_ttl'] = ttl - 1
redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
[request.url]
redirected.dont_filter = request.dont_filter
redirected.priority = request.priority + self.priority_adjust
log.msg(format="Redirecting (%(reason)s) to %(redirected)s from %(request)s",
level=log.DEBUG, spider=spider, request=request,
redirected=redirected, reason=reason)
return redirected
else:
log.msg(format="Discarding %(request)s: max redirections reached",
level=log.DEBUG, spider=spider, request=request)
raise IgnoreRequest("max redirections reached")
def _redirect_request_using_get(self, request, redirect_url):
redirected = request.replace(url=redirect_url, method='GET', body='')
redirected.headers.pop('Content-Type', None)
redirected.headers.pop('Content-Length', None)
return redirected
class RedirectMiddleware(BaseRedirectMiddleware):
"""Handle redirection of requests based on response status and meta-refresh html tag"""
def process_response(self, request, response, spider):
if 'dont_redirect' in request.meta:
return response
if request.method == 'HEAD':
if response.status in [301, 302, 303, 307] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = request.replace(url=redirected_url)
return self._redirect(redirected, request, spider, response.status)
else:
return response
if response.status in [302, 303] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = self._redirect_request_using_get(request, redirected_url)
return self._redirect(redirected, request, spider, response.status)
if response.status in [301, 307] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = request.replace(url=redirected_url)
return self._redirect(redirected, request, spider, response.status)
return response
class MetaRefreshMiddleware(BaseRedirectMiddleware):
enabled_setting = 'METAREFRESH_ENABLED'
def __init__(self, settings):
super(MetaRefreshMiddleware, self).__init__(settings)
self._maxdelay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY',
settings.getint('METAREFRESH_MAXDELAY'))
def process_response(self, request, response, spider):
if 'dont_redirect' in request.meta or request.method == 'HEAD' or \
not isinstance(response, HtmlResponse):
return response
if isinstance(response, HtmlResponse):
interval, url = get_meta_refresh(response)
if url and interval < self._maxdelay:
redirected = self._redirect_request_using_get(request, url)
return self._redirect(redirected, request, spider, 'meta refresh')
return response
|