1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
|
import re
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from $project_name.items import ${ProjectName}Item
class $classname(CrawlSpider):
domain_name = '$site'
start_urls = ['http://www.$site/']
rules = (
Rule(SgmlLinkExtractor(allow=(r'Items/', )), 'parse_item', follow=True),
)
def parse_item(self, response):
xs = HtmlXPathSelector(response)
i = ${ProjectName}Item()
#i['site_id'] = xs.select('//input[@id="sid"]/@value').extract()
#i['name'] = xs.select('//div[@id="name"]').extract()
#i['description'] = xs.select('//div[@id="description"]').extract()
return i
# Module-level instance of the spider class defined above.
# NOTE(review): presumably this is the name the Scrapy spider loader looks up -- confirm.
SPIDER = $classname()
|