File: converthtml.py

package info (click to toggle)
ludevit 7
  • links: PTS
  • area: main
  • in suites: squeeze, wheezy
  • size: 200 kB
  • ctags: 150
  • sloc: python: 979; makefile: 46
file content (196 lines) | stat: -rwxr-xr-x 6,263 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/python


import sys, re, urlparse, urllib

class NullParser:
    """Pass-through parser: accumulates fed data and hands it back verbatim."""

    def __init__(self):
        self.output_buffer = ''

    def feed(self, data):
        # No parsing at all -- just stash the raw text.
        self.output_buffer = self.output_buffer + data

    def close(self):
        # Nothing is ever buffered mid-parse, so there is nothing to flush.
        pass

    def pull(self):
        # Hand back everything collected so far and start afresh.
        pending, self.output_buffer = self.output_buffer, ''
        return pending


class BasicParser:
    """Minimal character-by-character HTML tokenizer.

    Splits the input stream into tag strings ('<'...'>') and the text
    between them, dispatching each to process_tag()/process_data(),
    which subclasses override.
    """

    def __init__(self):
        self.in_tag = False       # are we currently inside '<...>'?
        self.current_tag = ''     # partial tag text collected so far
        self.current_data = ''    # partial inter-tag text collected so far

    def feed(self, data):
        self.process(data)

    def process(self, txt):
        for ch in txt:
            if self.in_tag:
                self.current_tag += ch
                if ch == '>':
                    # Tag complete: hand it off and return to data mode.
                    self.process_tag(self.current_tag)
                    self.in_tag = False
                    self.current_tag = ''
            else:
                assert not self.current_tag
                if ch == '<':
                    # The data run ends here: emit it and start a new tag.
                    self.process_data(self.current_data)
                    self.current_data = ''
                    self.in_tag = True
                    self.current_tag = ch  # keep the '<' as part of the tag
                else:
                    self.current_data += ch

    def close(self):
        # Flush whatever is still pending at end of input; an unterminated
        # '<' at the end of the document is emitted as a partial tag.
        if not self.in_tag:
            self.process_data(self.current_data)
        else:
            assert self.current_tag
            self.process_tag(self.current_tag)

    def process_tag(self, tagstr):
        "hook: override in subclasses"
        return tagstr

    def process_data(self, datastr):
        "hook: override in subclasses"
        return datastr


class CopyParser(BasicParser):
    """Parser that reproduces its input unchanged into an output buffer."""

    def __init__(self):
        self.output_buffer = ''
        BasicParser.__init__(self)

    def process_tag(self, tagstr):
        # Tags are copied through verbatim.
        self.output_buffer += tagstr

    def process_data(self, datastr):
        # Inter-tag text is copied through verbatim as well.
        self.output_buffer += datastr

    def pull(self):
        # Drain and return everything accumulated since the last pull().
        out, self.output_buffer = self.output_buffer, ''
        return out

class CopyAndModifyParser(CopyParser):
    """CopyParser variant that lets subclasses rewrite tags and text.

    Keeps track of whether the parser is currently inside a <script> or
    <style> element, because their content must pass through untouched.
    """

    def __init__(self):
        self.in_script = False
        self.in_style = False
        CopyParser.__init__(self)

    def process_tag(self, tagstr):
        self.output_buffer += self.modify_tag(tagstr)
        # An opening <script>/<style> (that is not self-closing '.../>')
        # turns raw mode on; the matching closing tag turns it off again.
        if re.match(r'(?is)<script\b(?!.*?/\s*>)', tagstr):
            self.in_script = True
        elif re.match(r'(?is)</script\b', tagstr):
            self.in_script = False
        if re.match(r'(?is)<style\b(?!.*?/\s*>)', tagstr):
            self.in_style = True
        elif re.match(r'(?is)</style\b', tagstr):
            self.in_style = False

    def process_data(self, datastr):
        # Text inside <script>/<style> is code/CSS, not prose: copy verbatim.
        if self.in_script or self.in_style:
            self.output_buffer += datastr
        else:
            self.output_buffer += self.modify_data(datastr)

    def modify_tag(self, tagstr):
        "hook: override in subclasses"
        return tagstr

    def modify_data(self, datastr):
        "hook: override in subclasses"
        return datastr

# Inspired by feedparser by Mark Pilgrim.
# Maps an HTML tag name to the attributes of that tag whose values are
# URIs and may therefore be relative (candidates for resolving against
# a base URL).
relative_uris = {
                     'a': ('href',),
                     'applet': ('codebase',),
                     'area': ('href',),
                     'blockquote': ('cite',),
                     'body': ('background',),
                     'del': ('cite',),
                     'form': ('action',),
                     'frame': ('longdesc', 'src'),
                     'iframe': ('longdesc', 'src'),
                     'head': ('profile',),
                     'img': ('longdesc', 'src', 'usemap'),
                     'input': ('src', 'usemap'),
                     'ins': ('cite',),
                     'link': ('href',),
                     'object': ('classid', 'codebase', 'data', 'usemap'),
                     'q': ('cite',),
                     'script': ('src',)
                }

# Collapses redundant slashes right after the scheme
# (e.g. "http:////host/p" -> "http://host/p") before joining.
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')

def _urljoin(base, uri):
    """Resolve uri against base, after normalising extra leading slashes."""
    cleaned = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, cleaned)

def get_uri_tag_value(tagstr, k):
    """Locate attribute `k` inside an HTML tag string.

    Returns a tuple (start, end, value), where tagstr[start:end] spans the
    whole `k=...` assignment (quotes included, if any) and `value` is the
    attribute value with quotes stripped -- or None when the attribute is
    not present in the tag.
    """
    key = re.escape(k)  # be safe even if k ever contains regex metacharacters
    # Preferred form: k='value' or k="value".
    m = re.search(r"""\b%s\=['"](.*?)['"]""" % key, tagstr, re.I | re.S)
    if not m:
        # Unquoted form: the value runs until whitespace or the closing '>'.
        # (BUG FIX: the old pattern demanded whitespace before '>', so it
        # missed common tags like <a href=foo>.)
        m = re.search(r"""\b%s\=([^\s>]+)""" % key, tagstr, re.I | re.S)
    if not m:  # nothing found
        return None
    return m.start(), m.end(), m.group(1)


class ModifyHrefParser(CopyAndModifyParser):
    """Rewrites each <a href="..."> so the link goes through our CGI script.

    cgi_url -- base URL of the CGI redirector; falsy (None/'') disables
               rewriting and the parser becomes a plain copier.
    base_url -- base used to absolutise relative hrefs before rewriting.
    """

    # BUG FIX: the original body mixed tabs and spaces (a TabError under
    # Python 3 and fragile under 2); indentation is now spaces only.
    # Defaults were added so MyParser(base_url=...) in __main__ works;
    # existing positional callers are unaffected.
    def __init__(self, cgi_url=None, base_url=None):
        self.cgi_url = cgi_url
        self.base_url = base_url
        CopyAndModifyParser.__init__(self)

    def modify_tag(self, tagstr):
        "NOT to be subclassed"
        if not self.cgi_url:
            return tagstr
        m = re.search(r'<([A-Za-z]+?)\b', tagstr, re.S)
        if m:
            tag = m.group(1)
            if tag.lower() == 'a':
                spanval = get_uri_tag_value(tagstr, 'href')
                if spanval:  # found -- replace the reference in place
                    start, end, val = spanval
                    url = _urljoin(self.base_url, val)
                    newval = self.cgi_url + '?' + 'url=' + urllib.quote_plus(url, safe='')
                    tagstr = tagstr[:start] + 'href' + '="' + newval + '"' + tagstr[end:]
        return tagstr



class MyParser(ModifyHrefParser):
    """Demo parser: CGI link rewriting plus crude text mangling."""

    def modify_data(self, datastr):
        # Flatten every lowercase letter in displayable text to 'a'.
        return re.sub('[a-z]', 'a', datastr)

if __name__ == '__main__':
    # Stream a.html through the parser in chunks, writing the transformed
    # document to stdout as output becomes available.
    filehandle = open('a.html', 'r')
    # BUG FIX: the original called MyParser(base_url=...) without the
    # required cgi_url argument and crashed with a TypeError.  An empty
    # cgi_url disables href rewriting, preserving the rest of the demo.
    parser = MyParser(cgi_url='', base_url='http://www.sme.sk')
    while True:
        data = filehandle.read(1000)  # read the input in 1000-byte chunks
        if not data:
            break  # reached end of file
        parser.feed(data)
        sys.stdout.write(parser.pull())  # emit output incrementally
    parser.close()  # flush any partial tag/data still buffered
    sys.stdout.write(parser.pull())
    filehandle.close()