File: to_rss.py

package info (click to toggle)
puddletag 2.5.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 29,888 kB
  • sloc: python: 24,938; javascript: 21,828; xml: 964; makefile: 129; sh: 85
file content (51 lines) | stat: -rw-r--r-- 1,495 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
    # -*- coding: utf-8 -*-
import sys

from htmllib import HTMLParser
import formatter, re
from htmlentitydefs import name2codepoint as n2cp

def convert_entities(s):
    """Replace HTML character references in *s* with the characters they name.

    Decimal references (``&#233;``) and named references (``&eacute;``) are
    decoded. References that cannot be resolved -- unknown names or code
    points outside the valid range -- are left in the text unchanged.

    Decoding happens in a single pass, so text produced by one replacement is
    never decoded a second time (``&#38;lt;`` becomes ``&lt;``, not ``<``).

    :param s: text that may contain character references.
    :returns: *s* with every resolvable reference decoded.
    """
    # Imported locally so the function works on Python 2 and Python 3 alike.
    try:
        from html.entities import name2codepoint
    except ImportError:
        from htmlentitydefs import name2codepoint
    try:
        to_char = unichr  # Python 2: chr() only covers 0-255
    except NameError:
        to_char = chr

    def decode(match):
        number, name = match.groups()
        try:
            if number is not None:
                return to_char(int(number))
            return to_char(name2codepoint[name])
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name or impossible code point: keep the
            # original reference text.
            return match.group(0)

    return re.sub(r'&(?:#(\d+)|(\w+));', decode, s)

class RSSProcessor(HTMLParser):
    """Re-serialise parsed HTML, leaving out ``headerlink`` anchors.

    The handler callbacks rebuild the markup fragment by fragment in
    ``self.text`` (a list of strings). An ``<a>`` element whose ``class``
    attribute is exactly ``headerlink`` is dropped together with everything
    up to its closing ``</a>``.

    The callback signatures are those invoked by the ``htmllib``/``sgmllib``
    style base parser this class derives from.
    """

    def reset(self):
        """Discard collected output and state, then reset the base parser."""
        self.text = []
        # True while between a headerlink <a ...> and its closing </a>.
        self.__in_hlink = False
        HTMLParser.reset(self)

    def _emit(self, fragment):
        """Append *fragment* to the output unless inside a headerlink."""
        if not self.__in_hlink:
            self.text.append(fragment)

    def _emit_starttag(self, tag, attr):
        """Emit the serialised start tag *tag* with its (name, value) pairs."""
        if attr:
            pairs = ' '.join('%s="%s"' % pair for pair in attr)
            self._emit('<%s %s>' % (tag, pairs))
        else:
            self._emit('<%s>' % tag)

    def handle_data(self, text):
        """Collect character data (suppressed inside a headerlink)."""
        self._emit(text)

    def handle_charref(self, ref):
        """Re-emit a numeric character reference, e.g. ``&#182;``.

        *ref* is the reference without the ``&#`` prefix and ``;`` suffix;
        both are restored so the output stays a well-formed reference.
        """
        self._emit('&#%s;' % ref)

    def handle_starttag(self, tag, method, attr):
        """Emit a start tag, or begin skipping if it opens a headerlink.

        *method* (the handler the base parser would otherwise run) is
        intentionally not called: the tag is only echoed.
        """
        if tag == 'a' and attr and dict(attr).get('class', '') == "headerlink":
            self.__in_hlink = True
            return
        self._emit_starttag(tag, attr)

    def unknown_starttag(self, tag, attr):
        """Echo start tags the base parser has no handler for.

        Mirrors ``unknown_endtag`` so such elements stay balanced in the
        output instead of losing only their opening tag.
        """
        self._emit_starttag(tag, attr)

    def handle_endtag(self, tag, method):
        """Emit an end tag, or stop skipping if it closes a headerlink."""
        if tag == 'a' and self.__in_hlink:
            self.__in_hlink = False
            return
        self._emit('</%s>' % tag)

    def unknown_endtag(self, tag):
        """Echo end tags the base parser has no handler for."""
        self._emit('</%s>' % tag)

def fix_rss(text):
    """Return *text* with every ``headerlink`` anchor element removed.

    An anchor is matched when ``class="headerlink"`` (single or double
    quotes) is the first attribute of the ``<a>`` tag; everything up to and
    including the next ``</a>`` is deleted.

    :param text: HTML/RSS markup as a string.
    :returns: the markup without headerlink anchors.
    """
    # DOTALL lets an anchor that spans several lines match; ``[\s>]`` accepts
    # any whitespace after the class attribute as well as an anchor that has
    # no further attributes.
    return re.sub(r'''<a class=['"]headerlink['"][\s>].*?</a>''', '', text,
                  flags=re.DOTALL)

if __name__ == '__main__':
    # Script entry point: print the file named by the first command-line
    # argument with its headerlink anchors removed.
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE' % sys.argv[0])
    # Context manager so the file handle is closed as soon as it is read.
    with open(sys.argv[1], 'r') as source:
        print(fix_rss(source.read()))