File: _html.py

package info (click to toggle)
python-mechanize 1%3A0.4.10%2Bds-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,316 kB
  • sloc: python: 16,656; makefile: 11; sh: 4
file content (338 lines) | stat: -rw-r--r-- 10,872 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
from __future__ import absolute_import

import codecs
import copy
import re

from ._form import parse_forms
from ._headersutil import is_html as _is_html
from ._headersutil import split_header_words
from ._rfc3986 import clean_url, urljoin
from .polyglot import is_string

# Encoding used as the default ``default_encoding`` argument throughout this
# module.
DEFAULT_ENCODING = "utf-8"
# Regular expression sources that locate an encoding declaration inside a
# document.  Each one captures the encoding name in group 1.  They are kept
# as text here; compile_pats() turns them into compiled (text or bytes)
# patterns with re.IGNORECASE.
_encoding_pats = (
    # XML declaration
    r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
    # HTML 5 charset
    r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
    # HTML 4 Pragma directive
    r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
)


def compile_pats(binary):
    """Yield every encoding-declaration pattern as a compiled regex.

    :param binary: If true, each pattern source is ASCII-encoded before
        compilation so the resulting regex searches ``bytes``; otherwise the
        regex searches text.

    All patterns are compiled case-insensitively.
    """
    for source in _encoding_pats:
        pattern = source.encode('ascii') if binary else source
        yield re.compile(pattern, flags=re.IGNORECASE)


class LazyEncodingPats(object):
    """Callable that compiles the encoding patterns on first use.

    The compiled patterns are stored on the instance (separately for the
    text and the bytes flavour) and reused by later calls.
    """

    def __call__(self, binary=False):
        """Yield the compiled patterns, compiling and caching them if needed.

        :param binary: Select the bytes patterns instead of the text ones.
        """
        if binary:
            slot = 'binary_pats'
        else:
            slot = 'unicode_pats'
        cached = getattr(self, slot, None)
        if cached is None:
            # First request for this flavour: compile once and remember.
            cached = tuple(compile_pats(binary))
            setattr(self, slot, cached)
        for compiled in cached:
            yield compiled


# Shared module-level instance; find_declared_encoding() calls it to obtain
# the compiled patterns.
lazy_encoding_pats = LazyEncodingPats()


def find_declared_encoding(raw, limit=50*1024):
    """Return the character encoding declared inside a document, if any.

    The start of the document is searched with each encoding pattern in
    turn (XML declaration, HTML 5 ``<meta charset>``, HTML 4 ``http-equiv``
    pragma) and the first match wins.

    :param raw: The document contents, either ``bytes`` or text.
    :param limit: Only the first ``limit`` bytes/characters are searched.
    :return: The declared encoding name as text, or None if no declaration
        was found.
    """
    prefix = raw[:limit]
    is_binary = isinstance(raw, bytes)
    for pat in lazy_encoding_pats(is_binary):
        m = pat.search(prefix)
        if m is not None:
            ans = m.group(1)
            if is_binary:
                # The match is bytes; hand back text like the unicode path.
                ans = ans.decode('ascii', 'replace')
            # Return for both bytes and text input.  Previously the return
            # was nested under ``if is_binary`` so text input never produced
            # a result.
            return ans
    return None


def elem_text(elem):
    """Yield the non-empty text fragments found under ``elem`` in document order.

    The element's own ``text`` comes first; then, for each child, the
    fragments of that child (recursively) followed by the child's ``tail``.
    """
    leading = elem.text
    if leading:
        yield leading
    for node in elem:
        for fragment in elem_text(node):
            yield fragment
        trailing = node.tail
        if trailing:
            yield trailing


def iterlinks(root, base_url):
    """Yield a Link for every link-bearing element below ``root``.

    ``a`` and ``area`` elements contribute their ``href``, ``iframe``
    elements their ``src``; elements whose attribute is missing or empty are
    skipped.  A ``base`` element with a non-empty ``href`` replaces the base
    URL used for the links that follow it.

    :param root: Parsed document tree (must provide ``iter``).
    :param base_url: Base URL handed to each Link until a ``base`` element
        overrides it.
    """
    url_attr_for = {"a": "href", "area": "href", "iframe": "src"}
    for node in root.iter('*'):
        if not is_string(node.tag):
            # Nodes whose tag is not a string are ignored.
            continue
        name = node.tag.lower()
        attr_name = url_attr_for.get(name)
        if attr_name is None:
            if name == 'base':
                new_base = node.get('href')
                if new_base:
                    base_url = new_base
            continue
        target = node.get(attr_name)
        if not target:
            continue
        cleaned = clean_url(target)
        label = compress_whitespace(u''.join(elem_text(node)))
        yield Link(base_url, cleaned, label, name, node.items())


def compress_whitespace(text):
    """Collapse each whitespace run to one space and trim both ends.

    A false value (``None`` or an empty string) yields an empty string.
    """
    source = text if text else ''
    collapsed = re.sub(r'\s+', ' ', source)
    return collapsed.strip()


def get_encoding_from_response(response, verify=True):
    """Return the charset named by the response's Content-Type headers.

    :param response: Response object, or a false value (which gives None).
    :param verify: If true, a charset is only accepted when
        ``codecs.lookup`` knows it; unknown names are skipped.
    :return: The first acceptable charset value, or None.
    """
    if not response:
        return None
    # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
    # headers may be in the response.  HTTP-EQUIV headers come last,
    # so try in order from first to last.
    for header in response.info().getheaders("content-type"):
        for key, value in split_header_words([header])[0]:
            if key != "charset":
                continue
            if verify:
                try:
                    codecs.lookup(value)
                except LookupError:
                    # Unknown codec name: keep looking.
                    continue
            return value
    return None


class EncodingFinder:
    """Determine a response's encoding, with a configurable fallback."""

    def __init__(self, default_encoding):
        """Remember ``default_encoding`` as the fallback value."""
        self._default_encoding = default_encoding

    def encoding(self, response):
        """Return the encoding from the response headers, else the fallback."""
        detected = get_encoding_from_response(response)
        if detected:
            return detected
        return self._default_encoding


class ResponseTypeFinder:
    """Decide whether a response holds an HTML document."""

    def __init__(self, allow_xhtml):
        """Store the ``allow_xhtml`` flag forwarded to the HTML check."""
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        """Return the result of the HTML check for ``response``.

        The check receives the Content-Type headers, the response URL and
        the ``allow_xhtml`` flag.  ``encoding`` is currently unused.
        """
        # XXX encoding
        return _is_html(
            response.info().getheaders("content-type"),
            response.geturl(),
            self._allow_xhtml,
        )


class Link:
    '''
    A single hyperlink extracted from an HTML document

    :ivar absolute_url: The link URL joined onto ``base_url``
    :ivar url: The link URL as passed to the constructor
    :ivar base_url: The base URL against which this link is resolved
    :ivar text: The link text
    :ivar tag: The link tag name
    :ivar attrs: The tag attributes

    '''
    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = urljoin(base_url, url)
        self.url = url
        self.text = text
        self.tag = tag
        self.attrs = attrs

    def __eq__(self, other):
        # Two links are equal when url, text and tag match and their
        # attributes compare equal as dicts.  Anything lacking one of these
        # attributes is simply unequal.
        try:
            same_fields = all(
                not (getattr(self, field) != getattr(other, field))
                for field in ("url", "text", "tag"))
            return same_fields and not (
                dict(self.attrs) != dict(other.attrs))
        except AttributeError:
            return False

    def __ne__(self, other):
        equal = self.__eq__(other)
        return not equal

    def __repr__(self):
        fields = (self.base_url, self.url, self.text, self.tag, self.attrs)
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % fields


def content_parser(data,
                   url=None,
                   response_info=None,
                   transport_encoding=None,
                   default_encoding=DEFAULT_ENCODING,
                   is_html=True):
    '''
    Turn a document (a bytes object) into an etree representation such as
    :py:mod:`xml.etree.ElementTree` or `lxml.etree`

    ``html5_parser`` is used when it can be imported; otherwise the parser
    falls back to ``html5lib``. Nothing is parsed (``None`` is returned) when
    ``is_html`` is false.

    :param bytes data: The data to parse
    :param url: The URL of the document being parsed or None
    :param response_info: Information about the document
        (contains all HTTP headers as :class:`HTTPMessage`)
    :param transport_encoding: The character encoding for the document being
        parsed as specified in the HTTP headers or None.
    :param default_encoding: The character encoding to use if no encoding
        could be detected and no transport_encoding is specified
    :param is_html: If the document is to be parsed as HTML.
    '''
    if not is_html:
        return None
    try:
        from html5_parser import parse
    except Exception:
        # Fallback parser; the encoding hint is only passed along when the
        # input is bytes and a transport encoding is known.
        from html5lib import parse
        options = {'namespaceHTMLElements': False}
        if transport_encoding and isinstance(data, bytes):
            options['transport_encoding'] = transport_encoding
        return parse(data, **options)
    return parse(data, transport_encoding=transport_encoding)


def get_title(root):
    """Return the whitespace-compressed text of the first non-empty ``title``.

    Returns None when no ``title`` element under ``root`` has any text left
    after compression.
    """
    for node in root.iter('title'):
        cleaned = compress_whitespace(node.text)
        if not cleaned:
            continue
        return cleaned
    return None


# Unique sentinel meaning "not computed yet"; Factory compares its cached
# attributes against it with ``is``.
lazy = object()


class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    """

    def __init__(
            self,
            default_encoding=DEFAULT_ENCODING,
            allow_xhtml=False, ):
        """

        Pass keyword arguments only.

        :param default_encoding: Encoding used when the response headers do
            not name a usable one.
        :param allow_xhtml: Forwarded to the response type finder.

        """
        self._encoding_finder = EncodingFinder(default_encoding)
        self.form_encoding = default_encoding
        self._response_type_finder = ResponseTypeFinder(
            allow_xhtml=allow_xhtml)
        self._content_parser = content_parser
        self._current_forms = self._current_links = self._current_title = lazy
        self._current_global_form = self._root = lazy
        self._raw_data = b''
        self.is_html, self.encoding = False, DEFAULT_ENCODING

        self.set_response(None)

    def set_content_parser(self, val):
        """Replace the callable used to parse response data into a tree.

        It is called with the same arguments as :func:`content_parser`.
        """
        self._content_parser = val

    def set_request_class(self, request_class):
        """Set request class (mechanize.Request by default).

        HTMLForm instances returned by .forms() will return instances of this
        class when .click()ed.

        """
        self._request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by mechanize.urlopen().

        A shallow copy of the response is stored, every cached result (root,
        title, forms, links, global form) is discarded, and ``encoding`` and
        ``is_html`` are recomputed.

        """
        self._response = copy.copy(response)
        self._current_forms = self._current_links = self._current_title = lazy
        self._current_global_form = self._root = lazy
        self.encoding = self._encoding_finder.encoding(self._response)
        self.is_html = self._response_type_finder.is_html(
            self._response, self.encoding) if self._response else False

    @property
    def root(self):
        """The parsed document tree, built on first access and then cached.

        As a side effect of parsing, ``form_encoding`` is set to the encoding
        declared in the document, else the one from the HTTP headers, else
        the default encoding.
        """
        if self._root is lazy:
            response = self._response
            raw = self._response.read() if self._response else b''
            default_encoding = self._encoding_finder._default_encoding
            transport_encoding = get_encoding_from_response(
                response, verify=False)
            declared_encoding = find_declared_encoding(raw)
            self.form_encoding = (
                declared_encoding or transport_encoding or default_encoding)
            self._root = self._content_parser(
                raw,
                url=response.geturl() if response else None,
                response_info=response.info() if response else None,
                default_encoding=default_encoding,
                is_html=self.is_html,
                transport_encoding=transport_encoding)
        return self._root

    @property
    def title(self):
        """The page title, or an empty string when there is none."""
        if self._current_title is lazy:
            root = self.root
            self._current_title = get_title(
                root) if root is not None else None
        return self._current_title or u''

    @property
    def global_form(self):
        """The global form; computed together with the forms on first use."""
        if self._current_global_form is lazy:
            self.forms()
        return self._current_global_form

    def forms(self):
        """ Return tuple of HTMLForm-like objects. """
        # this implementation sets .global_form as a side-effect
        if self._current_forms is lazy:
            self._current_forms, self._current_global_form = self._get_forms()
        return self._current_forms

    def links(self):
        """Return tuple of mechanize.Link-like objects.  """
        if self._current_links is lazy:
            self._current_links = self._get_links()
        # Return the cached tuple; recomputing here would re-walk the whole
        # tree on every call and make the cache pointless.
        return self._current_links

    def _get_links(self):
        """Collect the links of the current document (empty if no tree)."""
        if self.root is None:
            return ()
        return tuple(iterlinks(self.root, self._response.geturl()))

    def _get_forms(self):
        """Parse the forms of the current document.

        Returns ``((), None)`` when there is no tree, otherwise whatever
        ``parse_forms`` returns (unpacked by forms() as forms and global
        form).
        """
        if self.root is None:
            return (), None
        return parse_forms(self.root,
                           self._response.geturl(), self._request_class,
                           encoding=self.form_encoding)