File: crawl.tmpl

package info (click to toggle)
python-scrapy 0.8-3
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 2,904 kB
  • ctags: 2,981
  • sloc: python: 15,349; xml: 199; makefile: 68; sql: 64; sh: 34
file content (24 lines) | stat: -rw-r--r-- 787 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import re

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from $project_name.items import ${ProjectName}Item

class $classname(CrawlSpider):
    """Scaffold for a link-following spider.

    This file is a template: the dollar-placeholders (class name, site,
    item class) are filled in by Scrapy's spider-generation machinery
    via string substitution at project-creation time.
    """
    # Scrapy 0.x identifies spiders by domain_name (the 'name' attribute
    # replaced it in later releases).
    domain_name = '$site'
    start_urls = ['http://www.$site/']

    # Follow every link whose URL matches r'Items/', parse each matched
    # page with parse_item, and keep crawling links found on those pages.
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'Items/', )), 'parse_item', follow=True),
    )

    def parse_item(self, response):
        """Build and return one item from a crawled page.

        The commented-out assignments below are starter examples for the
        generated project: uncomment them and adapt the XPath expressions
        to the structure of the target site.
        """
        xs = HtmlXPathSelector(response)
        i = ${ProjectName}Item()
        #i['site_id'] = xs.select('//input[@id="sid"]/@value').extract()
        #i['name'] = xs.select('//div[@id="name"]').extract()
        #i['description'] = xs.select('//div[@id="description"]').extract()
        return i

SPIDER = $classname()