File: crawl.tmpl

package info (click to toggle)
python-scrapy 0.8-3
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 2,904 kB
  • ctags: 2,981
  • sloc: python: 15,349; xml: 199; makefile: 68; sql: 64; sh: 34
file content (24 lines) | stat: -rw-r--r-- 787 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import re

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from $project_name.items import ${ProjectName}Item

class $classname(CrawlSpider):
    """Scaffold for a link-following spider.

    This file is a template: the dollar-placeholders (class name, site,
    item class) are filled in by Scrapy's spider-generation machinery
    via string substitution at project-creation time.
    """
    # Scrapy 0.x identifies spiders by domain_name (the 'name' attribute
    # replaced it in later releases).
    domain_name = '$site'
    start_urls = ['http://www.$site/']

    # Follow every link whose URL matches r'Items/', parse each matched
    # page with parse_item, and keep crawling links found on those pages.
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'Items/', )), 'parse_item', follow=True),
    )

    def parse_item(self, response):
        """Build and return one item from a crawled page.

        The commented-out assignments below are starter examples for the
        generated project: uncomment them and adapt the XPath expressions
        to the structure of the target site.
        """
        xs = HtmlXPathSelector(response)
        i = ${ProjectName}Item()
        #i['site_id'] = xs.select('//input[@id="sid"]/@value').extract()
        #i['name'] = xs.select('//div[@id="name"]').extract()
        #i['description'] = xs.select('//div[@id="description"]').extract()
        return i

SPIDER = $classname()