File: fetch.py

#!/usr/bin/python
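# Fetch a remote page, run its text through the Ludevit transliterator and
# stream the result back as an HTTP-style response: headers, a blank line,
# then the translated body.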

import urllib2
import re
import socket
from htmlentitydefs import name2codepoint

# one minute default timeout for everything - ugly, but urllib2 does not expose timeout API for sockets...
socket.setdefaulttimeout(60)

from ludevit_trans.translator import Translator
from ludevit_trans import tables_ludevit

from converthtml import ModifyHrefParser, NullParser


# read page in chunks of this size
CHUNKSIZE = 5000

# size of the first chunk, used to guess charset and add base url
FIRSTCHUNKSIZE = 5000

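# URL of the transliterating CGI; it is passed to ModifyHrefParser as cgi_url,
# presumably so that links in the fetched page are rewritten to point back
# through the translator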
BASE_CGI='http://www.juls.savba.sk/ludevit/'

def _replace_entity(m):
    "replace a single matched HTML entity with the corresponding unicode character"
    s = m.group(1)
    if s[0] == u'#':
        # numeric character reference (decimal or hexadecimal)
        s = s[1:]
        try:
            if s[0] in u'xX':
                c = int(s[1:], 16)
            else:
                c = int(s)
            return unichr(c)
        except ValueError:
            return m.group(0)
    else:
        # named entity; leave unknown names untouched
        try:
            return unichr(name2codepoint[s])
        except (ValueError, KeyError):
            return m.group(0)

_entity_re = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape_entities(s):
    "expand HTML character entities in s into plain unicode characters"
    return _entity_re.sub(_replace_entity, s)


class LudevitParser(ModifyHrefParser):
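    "HTML parser that transliterates page text; href rewriting comes from ModifyHrefParser"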

    def __init__(self, encoding, cgi_url, base_url):
        self.encoding = encoding
        ModifyHrefParser.__init__(self, cgi_url, base_url)
        self.translator = Translator(tables_ludevit.table_voc, tables_ludevit.table_ort, tables_ludevit.postprocess)

    def modify_data(self, datastr):
        # re-encoding here is slower than encoding the whole chunk before
        # feeding it to the parser, but it copes better with the rare case of
        # non-ASCII characters in href URLs...
        txt = datastr.decode(self.encoding, 'replace')
        txt = unescape_entities(txt)
        tran = self.translator.translate_text(txt)
        tran = tran.encode(self.encoding, 'xmlcharrefreplace')
        return tran


def guess_charset_from_meta(txt):
    charset = None
    m = re.search(r'meta\s*http-equiv\="Content-Type"\s*content\="text/html;\s*charset\=(.+?)"', txt, re.I+re.S)
    if m:
        charset = m.group(1)
    return charset

def guess_if_base(txt):
    "find out if there is a BASE URL in the html page"
    return re.search(r'base\s*href\=', txt, re.I+re.S)


def report_error(text):
    headers = 'Content-Type: text/plain\r\n'
    body = 'An error has occurred: ' + text +'\nOops.\n'
    return headers, body    

def prepare_page(url, user_agent):
    """open url and read the first bytes (to guess the charset and to look for <base>);
    returns (do_translate, charset, headers, first_chunk, f, do_add_base_url)"""

    do_add_base_url = False
    
    # sanity checks
    if len(url)>512:
        charset = 'us-ascii'
        headers, first_chunk = report_error('Overlong URL')
        f = None
        return False, charset, headers, first_chunk, f, do_add_base_url
    protocol = url[:10]
    if ':' not in protocol:
        charset = 'us-ascii'
        headers, first_chunk = report_error('Invalid protocol')
        f = None
        return False, charset, headers, first_chunk, f, do_add_base_url
    protocol = protocol.split(':')[0]
    protocol = protocol.lower()
    if protocol not in ['http', 'https', 'ftp', 'gopher']:
        charset = 'us-ascii'
        headers, first_chunk = report_error('Unsupported protocol')
        f = None
        return False, charset, headers, first_chunk, f, do_add_base_url
    req = urllib2.Request(url)
    req.add_header('User-Agent', user_agent)
    try:
        f = urllib2.urlopen(req)
    except urllib2.HTTPError:
        # HTTP error: redirect the client back to the original url
        headers = 'Location: %s\r\n' % url
        charset = 'us-ascii'
        first_chunk = ''
        f = None
        return False, charset, headers, first_chunk, f, do_add_base_url
    except urllib2.URLError, exc:
        charset = 'us-ascii'
        headers, first_chunk = report_error(str(exc))
        f = None
        return False, charset, headers, first_chunk, f, do_add_base_url

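    # forward the original response headers, but drop Content-Length: the
    # transliterated body will generally differ in length from the original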
    resp_info = f.info()
    del resp_info['Content-Length']
    headers = ''.join(resp_info.headers)
    ct = f.info().get('Content-Type', '')
    do_translate = ct.lower().startswith('text')
    if not do_translate:
        return False, None, headers, '', f, do_add_base_url

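    # prefer the charset announced in the Content-Type header; fall back to a
    # <meta> tag found in the first chunk of the page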
    charset_from_headers = None
    if ct:
        fields = ct.split(';')
        for field in fields:
            fs = field.strip()
            if fs.lower().startswith('charset='):
                charset_from_headers = fs[len('charset='):].strip().lower()
                break
    first_chunk = f.read(FIRSTCHUNKSIZE)

    charset_from_meta = guess_charset_from_meta(first_chunk)
    charset = charset_from_headers or charset_from_meta
#    if charset_from_meta and charset_from_headers and (charset_from_meta != charset_from_headers):
        # we should honour the charset from the headers, as per the http standard
        # this code was clever, but e.g. www.nku.gov.sk fails the test
        #if 'windows-1250' in [charset_from_meta, charset_from_headers]:
        #    charset = 'windows-1250'
        #elif 'iso-8859-2' in [charset_from_meta, charset_from_headers]:
        #    charset = 'iso-8859-2'
        # there could be a cp852 or MacRoman2 test here, but who uses such encodings nowadays?
        # some do, but let's assume they do not differ in headers and meta...
#        else:
#            charset = charset_from_meta # not standard conforming, but probably better

    # fallback, if everything failed
    if not charset:
        charset = 'windows-1250'

    # if a base url is already present in the original html, do not add another one...
    if ct.lower().startswith('text/html') and not guess_if_base(first_chunk):
        do_add_base_url = True

    return True, charset, headers, first_chunk, f, do_add_base_url


def add_base_url(chunk, base):
    "try to find the <head> element and add a <base href=...> tag into it"
    bastag = '<base href="%s" />' % base
    headtag = '<head>' + bastag + '</head>'
    if re.search(r'<head\b.*?>', chunk, re.I+re.S):
        r = re.sub(r'(?i)(<head\b.*?>)', r'\1'+bastag, chunk)
    # no <head>: try to add one just after <html>
    elif re.search(r'<html\b.*?>', chunk, re.I+re.S):
        r = re.sub(r'(?i)(<html\b.*?>)', r'\1'+headtag, chunk)
    elif chunk.startswith('<!'):
        r = re.sub(r'(<\!.*?>)', r'\1'+headtag, chunk)
    else:
        # no <head>, no <!doctype>, no <html>... just prepend it
        r = headtag + chunk

    return r

def translate_page(url, user_agent):
    pp = prepare_page(url, user_agent)
    do_translate, charset, headers, chunk, f, do_add_base_url = pp
    yield headers
    yield '\r\n'

    if do_translate:
        if do_add_base_url:
            # use the page as base url
            base = f.geturl() # in case of redirect
            chunk = add_base_url(chunk, base)
        else:
            base = ''
        parser = LudevitParser(charset, BASE_CGI, base)
    else:
        parser = NullParser()

    while f: # f can be None, signalling that the url was not successfully opened
        newchunk = f.read(CHUNKSIZE)
        if not newchunk:
            break
        chunk += newchunk
        # we have to be careful not to tear utf-8 characters apart...
        if ord(chunk[-1])<128:
            parser.feed(chunk)
            chunk = ''
            yield parser.pull()
    parser.feed(chunk)
    parser.close()
    yield parser.pull()


if __name__=='__main__':
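    # minimal command-line test: fetch.py <url> <user-agent>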

    import sys
    url = sys.argv[1]
    user_agent = sys.argv[2]
    for c in translate_page(url, user_agent):
        sys.stdout.write(c)