import re

from scrapy.spider import BaseSpider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip
from scrapy import log


class SitemapSpider(BaseSpider):

    sitemap_urls = ()
    sitemap_rules = [('', 'parse')]
    sitemap_follow = ['']

    def __init__(self, *a, **kw):
        super(SitemapSpider, self).__init__(*a, **kw)
        # Compile (pattern, callback) pairs; string callbacks are resolved
        # by name to bound methods on this spider.
        self._cbs = []
        for r, c in self.sitemap_rules:
            if isinstance(c, basestring):
                c = getattr(self, c)
            self._cbs.append((regex(r), c))
        self._follow = [regex(x) for x in self.sitemap_follow]

    def start_requests(self):
        return [Request(x, callback=self._parse_sitemap)
                for x in self.sitemap_urls]

    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            # Discover sitemap URLs advertised in robots.txt.
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            if isinstance(response, XmlResponse):
                body = response.body
            elif is_gzipped(response):
                body = gunzip(response.body)
            else:
                log.msg("Ignoring non-XML sitemap: %s" % response, log.WARNING)
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                # A sitemap index lists other sitemaps; recurse into those
                # whose URL matches one of the sitemap_follow patterns.
                for loc in iterloc(s):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                # A urlset lists page URLs; dispatch each one to the
                # callback of the first rule whose pattern matches it.
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break


def is_gzipped(response):
    ctype = response.headers.get('Content-Type', '')
    return ctype in ('application/x-gzip', 'application/gzip')


def regex(x):
    if isinstance(x, basestring):
        return re.compile(x)
    return x


def iterloc(it):
    for d in it:
        yield d['loc']
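
# ---------------------------------------------------------------------------
# Example usage (not part of the module above): a minimal sketch of a
# SitemapSpider subclass. The spider name, URLs, patterns, and callback
# names here are hypothetical, assuming a site whose robots.txt advertises
# its sitemaps.
class ExampleSitemapSpider(SitemapSpider):
    name = 'example_sitemap'

    # Starting from robots.txt lets sitemap_urls_from_robots() discover
    # the site's sitemaps automatically.
    sitemap_urls = ['http://www.example.com/robots.txt']

    # Rules are tried in order and only the first match is used (note the
    # break in _parse_sitemap), so put specific patterns first.
    sitemap_rules = [
        ('/product/', 'parse_product'),
        ('', 'parse'),
    ]

    # When a sitemap index is encountered, only sub-sitemaps whose URL
    # matches one of these patterns are followed.
    sitemap_follow = ['/sitemap_shop']

    def parse_product(self, response):
        pass  # extract product fields here

    def parse(self, response):
        pass  # fallback handler for all other URLs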