File: wenxuecity-znjy.recipe

Package: calibre 2.75.1+dfsg-1 (Debian stretch, main)
#!/usr/bin/env python2

__license__ = 'GPL v3'
__copyright__ = '2010, Derek Liang <Derek.liang.ca @@@at@@@ gmail.com>'
'''
wenxuecity.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe


class TheCND(BasicNewsRecipe):

    title = 'wenxuecity - znjy'
    __author__ = 'Derek Liang'
    description = ''
    INDEX = 'http://bbs.wenxuecity.com/znjy/?elite=1'
    language = 'zh'
    conversion_options = {'linearize_tables': True}

    remove_tags_before = dict(name='div', id='message')
    remove_tags_after = dict(name='div', id='message')
    remove_tags = [dict(name='div', id='postmeta'),
                   dict(name='div', id='footer')]
    no_stylesheets = True

    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]

    def print_version(self, url):
        # Appending '?print' requests the printer-friendly version of a post.
        return url + '?print'

    def parse_index(self):
        soup = self.index_to_soup(self.INDEX)

        feeds = []
        articles = {}

        # Each post on the elite index page is linked via an <a class="post"> tag.
        for a in soup.findAll('a', attrs={'class': 'post'}):
            url = a['href']
            if url.startswith('/'):
                url = 'http://bbs.wenxuecity.com' + url
            title = self.tag_to_string(a)
            self.log('\tFound article: ', title, ' at:', url)
            # The post date appears as m/d/yy in the text of the link's parent node.
            dateReg = re.search(r'(\d\d?)/(\d\d?)/(\d\d)',
                                self.tag_to_string(a.parent))
            # Re-order to yy/mm/dd so that string sorting is chronological.
            date = '%(y)s/%(m)02d/%(d)02d' % {'y': dateReg.group(3),
                                              'm': int(dateReg.group(1)), 'd': int(dateReg.group(2))}
            if date not in articles:
                articles[date] = []
            articles[date].append(
                {'title': title, 'url': url, 'description': '', 'date': ''})
            self.log('\t\tAppend to : ', date)

        self.log('Articles grouped by date: ', articles)
        # Keep only the most recent date; the yy/mm/dd keys sort chronologically.
        mostCurrent = sorted(articles).pop()
        self.title = '文学城 - 子女教育 - ' + mostCurrent

        feeds.append((self.title, articles[mostCurrent]))

        return feeds

    def populate_article_metadata(self, article, soup, first):
        # Only logs the article header; the metadata itself is left untouched.
        header = soup.find('h3')
        self.log('header: ' + self.tag_to_string(header))
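
Usage note (not part of the packaged file): a recipe like this is normally exercised with calibre's ebook-convert tool in test mode. The snippet below is a minimal sketch, assuming calibre is installed and its command-line tools are on the PATH; the output filename is illustrative.

# Minimal sketch: drive calibre's recipe test run from Python.
# Assumes `ebook-convert` (shipped with calibre) is on the PATH;
# 'wenxuecity-znjy.epub' is an illustrative output name.
import subprocess

subprocess.check_call([
    'ebook-convert',
    'wenxuecity-znjy.recipe',  # the recipe file shown above
    'wenxuecity-znjy.epub',    # output e-book
    '--test',                  # fetch only a couple of articles per feed
    '-vv',                     # verbose logging, useful for inspecting parse_index
])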