1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
|
from __future__ import absolute_import
import codecs
import copy
import re
from ._form import parse_forms
from ._headersutil import is_html as _is_html
from ._headersutil import split_header_words
from ._rfc3986 import clean_url, urljoin
from .polyglot import is_string
# Encoding used as the default for content_parser() and Factory when the
# caller does not supply one.
DEFAULT_ENCODING = "utf-8"

# Regular expression sources used to sniff the character encoding that a
# document declares about itself. In every pattern, group 1 captures the
# encoding name. The sources are kept as text here; compile_pats() turns them
# into case-insensitive patterns (optionally encoded to bytes first).
_encoding_pats = (
    # XML declaration
    r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
    # HTML 5 charset
    r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
    # HTML 4 Pragma directive
    r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
)
def compile_pats(binary):
    """Yield the encoding-declaration patterns as compiled regexes.

    :param binary: When true, each pattern source is encoded to ASCII bytes
        before compilation so the result can search ``bytes`` input.
    :return: A generator of case-insensitive compiled patterns, in the same
        order as ``_encoding_pats``.
    """
    for source in _encoding_pats:
        pattern = source.encode('ascii') if binary else source
        yield re.compile(pattern, flags=re.IGNORECASE)
class LazyEncodingPats(object):
    """Callable that compiles the encoding patterns on first use.

    The compiled patterns are cached on the instance, separately for the
    bytes and the text variants, in the ``binary_pats`` and ``unicode_pats``
    attributes.
    """

    def __call__(self, binary=False):
        """Yield the compiled patterns for bytes (``binary``) or text input."""
        cache_name = 'unicode_pats'
        if binary:
            cache_name = 'binary_pats'
        compiled = getattr(self, cache_name, None)
        if compiled is None:
            # First request for this variant: compile once and remember.
            compiled = tuple(compile_pats(binary))
            setattr(self, cache_name, compiled)
        for pattern in compiled:
            yield pattern


# Module-wide shared instance, so the patterns are compiled at most once per
# variant.
lazy_encoding_pats = LazyEncodingPats()
def find_declared_encoding(raw, limit=50*1024):
    """Return the encoding name declared inside the document, if any.

    Only the first ``limit`` items of ``raw`` are searched. The patterns are
    tried in order and the first one that matches wins.

    :param raw: The document, as ``bytes`` or as text.
    :param limit: How much of the start of ``raw`` to examine.
    :return: The declared encoding name as text, or None when no declaration
        is found. For ``bytes`` input the name is decoded as ASCII, with
        undecodable bytes replaced.
    """
    head = raw[:limit]
    binary = isinstance(raw, bytes)
    for pattern in lazy_encoding_pats(binary):
        match = pattern.search(head)
        if match is None:
            continue
        declared = match.group(1)
        if binary:
            declared = declared.decode('ascii', 'replace')
        return declared
    return None
def elem_text(elem):
    """Yield every text fragment inside ``elem`` in document order.

    The element's own ``text`` comes first. Then, for each descendant, its
    ``text`` is yielded, followed by its subtree, followed by its ``tail``.
    The ``tail`` of ``elem`` itself is not part of its content and is not
    yielded. Empty or missing fragments are skipped.

    The tree is walked with an explicit stack rather than recursion, so a
    deeply nested document cannot exhaust the interpreter's recursion limit.

    :param elem: An etree-style element (iterable over its children, with
        ``text`` and ``tail`` attributes).
    """
    if elem.text:
        yield elem.text
    # Each entry pairs a node with the iterator over its remaining children.
    stack = [(elem, iter(elem))]
    while stack:
        node, children = stack[-1]
        try:
            child = next(children)
        except StopIteration:
            stack.pop()
            # A non-empty stack means ``node`` is a descendant, not the
            # starting element, so its tail belongs to the output.
            if stack and node.tail:
                yield node.tail
            continue
        if child.text:
            yield child.text
        stack.append((child, iter(child)))
def iterlinks(root, base_url):
    """Yield a :class:`Link` for every link-bearing element under ``root``.

    ``a`` and ``area`` elements contribute their ``href``, ``iframe``
    elements their ``src``. A ``base`` element with a non-empty ``href``
    replaces the base URL used for the links that follow it.

    :param root: The parsed document tree.
    :param base_url: The base URL passed to each :class:`Link` until a
        ``base`` element overrides it.
    """
    url_attr_for = {"a": "href", "area": "href", "iframe": "src"}
    for node in root.iter('*'):
        raw_name = node.tag
        # Skip nodes whose tag is not a string.
        if not is_string(raw_name):
            continue
        name = raw_name.lower()
        if name in url_attr_for:
            target = node.get(url_attr_for[name])
            if not target:
                continue
            url = clean_url(target)
            label = compress_whitespace(u''.join(elem_text(node)))
            yield Link(base_url, url, label, name, node.items())
        elif name == 'base':
            new_base = node.get('href')
            if new_base:
                base_url = new_base
def compress_whitespace(text):
    """Collapse each run of whitespace in ``text`` to one space and strip the ends.

    A false value (such as None or an empty string) is treated as ``''``.
    """
    if not text:
        text = ''
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
def get_encoding_from_response(response, verify=True):
    """Return the charset named in the response's content-type headers.

    :param response: A response object, or a false value (e.g. None).
    :param verify: When true, a charset is only returned if
        :func:`codecs.lookup` knows it; unknown charsets are skipped.
    :return: The first acceptable charset value, or None when ``response``
        is false or no acceptable charset is present.
    """
    if not response:
        return None
    # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
    # headers may be in the response. HTTP-EQUIV headers come last,
    # so try in order from first to last.
    for header in response.info().getheaders("content-type"):
        for key, value in split_header_words([header])[0]:
            if key != "charset":
                continue
            if not verify:
                return value
            try:
                codecs.lookup(value)
            except LookupError:
                # Unknown codec: keep looking for a usable charset.
                continue
            return value
    return None
class EncodingFinder:
    """Determine a response's encoding, falling back to a fixed default."""

    def __init__(self, default_encoding):
        # Returned by encoding() when the response does not name a charset.
        self._default_encoding = default_encoding

    def encoding(self, response):
        """Return the charset from ``response``'s headers, or the default."""
        detected = get_encoding_from_response(response)
        if detected:
            return detected
        return self._default_encoding
class ResponseTypeFinder:
    """Decide whether a response should be treated as HTML."""

    def __init__(self, allow_xhtml):
        # Forwarded unchanged to the header-based HTML check in is_html().
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        """Return the HTML check's verdict for ``response``.

        The decision is based on the content-type headers and the URL of
        the response.
        """
        # XXX encoding (the ``encoding`` argument is currently unused)
        content_types = response.info().getheaders("content-type")
        return _is_html(content_types, response.geturl(), self._allow_xhtml)
class Link:
    '''
    A link in a HTML document

    :ivar absolute_url: The absolutized link URL
    :ivar url: The link URL
    :ivar base_url: The base URL against which this link is resolved
    :ivar text: The link text
    :ivar tag: The link tag name
    :ivar attrs: The tag attributes
    '''

    def __init__(self, base_url, url, text, tag, attrs):
        # ``text`` and ``base_url`` may be None; the others may not.
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = urljoin(base_url, url)
        self.url, self.text, self.tag, self.attrs = url, text, tag, attrs

    def __eq__(self, other):
        '''
        Two links are equal when their url, text and tag match and their
        attributes are equal as dictionaries. ``base_url`` and
        ``absolute_url`` do not take part in the comparison. An object
        lacking any of the compared attributes is never equal.
        '''
        try:
            for name in ("url", "text", "tag"):
                if getattr(self, name) != getattr(other, name):
                    return False
            # Compared as dicts so that the order of attributes is ignored.
            return dict(self.attrs) == dict(other.attrs)
        except AttributeError:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)
def content_parser(data,
                   url=None,
                   response_info=None,
                   transport_encoding=None,
                   default_encoding=DEFAULT_ENCODING,
                   is_html=True):
    '''
    Parse data (a bytes object) into an etree representation such as
    :py:mod:`xml.etree.ElementTree` or `lxml.etree`

    ``html5_parser`` is used when it can be imported; otherwise the parser
    falls back to ``html5lib``. This implementation does not use ``url``,
    ``response_info`` or ``default_encoding``.

    :param bytes data: The data to parse
    :param url: The URL of the document being parsed or None
    :param response_info: Information about the document
        (contains all HTTP headers as :class:`HTTPMessage`)
    :param transport_encoding: The character encoding for the document being
        parsed as specified in the HTTP headers or None.
    :param default_encoding: The character encoding to use if no encoding
        could be detected and no transport_encoding is specified
    :param is_html: If the document is to be parsed as HTML.
    :return: The parsed document, or None when ``is_html`` is false.
    '''
    if not is_html:
        return None
    try:
        from html5_parser import parse
    except Exception:
        # html5_parser could not be imported: fall back to html5lib.
        from html5lib import parse as html5lib_parse
        options = {'namespaceHTMLElements': False}
        # The transport encoding is only forwarded for byte input.
        if isinstance(data, bytes) and transport_encoding:
            options['transport_encoding'] = transport_encoding
        return html5lib_parse(data, **options)
    else:
        return parse(data, transport_encoding=transport_encoding)
def get_title(root):
    """Return the text of the first ``title`` element with non-blank text.

    Whitespace in the text is compressed. Only the element's own ``text`` is
    considered. Returns None when no ``title`` element has any text.
    """
    for node in root.iter('title'):
        candidate = compress_whitespace(node.text)
        if not candidate:
            continue
        return candidate
    return None
# Sentinel marking a cached value that has not been computed yet. None cannot
# be used for this because it is a legitimate computed value.
lazy = object()


class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or an empty string if no title was found or no
     document was parsed
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    """

    def __init__(
            self,
            default_encoding=DEFAULT_ENCODING,
            allow_xhtml=False, ):
        """
        Pass keyword arguments only.
        """
        self._encoding_finder = EncodingFinder(default_encoding)
        self.form_encoding = default_encoding
        self._response_type_finder = ResponseTypeFinder(
            allow_xhtml=allow_xhtml)
        self._content_parser = content_parser
        self._current_forms = self._current_links = self._current_title = lazy
        self._current_global_form = self._root = lazy
        self._raw_data = b''
        self.is_html, self.encoding = False, DEFAULT_ENCODING
        self.set_response(None)

    def set_content_parser(self, val):
        """Replace the callable used to parse the response body."""
        self._content_parser = val

    def set_request_class(self, request_class):
        """Set request class (mechanize.Request by default).

        HTMLForm instances returned by .forms() will return instances of this
        class when .click()ed.

        """
        self._request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by mechanize.urlopen().

        """
        self._response = copy.copy(response)
        # Everything derived from the previous response is now stale.
        self._current_forms = self._current_links = self._current_title = lazy
        self._current_global_form = self._root = lazy
        self.encoding = self._encoding_finder.encoding(self._response)
        self.is_html = self._response_type_finder.is_html(
            self._response, self.encoding) if self._response else False

    @property
    def root(self):
        """The parsed document tree, built on first access and then cached.

        As a side effect of parsing, ``form_encoding`` is set to the
        encoding declared in the document, else the transport encoding,
        else the default encoding.
        """
        if self._root is lazy:
            response = self._response
            raw = self._response.read() if self._response else b''
            default_encoding = self._encoding_finder._default_encoding
            transport_encoding = get_encoding_from_response(
                response, verify=False)
            declared_encoding = find_declared_encoding(raw)
            self.form_encoding = (
                declared_encoding or transport_encoding or default_encoding)
            self._root = self._content_parser(
                raw,
                url=response.geturl() if response else None,
                response_info=response.info() if response else None,
                default_encoding=default_encoding,
                is_html=self.is_html,
                transport_encoding=transport_encoding)
        return self._root

    @property
    def title(self):
        """The page title, or an empty string when there is none."""
        if self._current_title is lazy:
            self._current_title = get_title(
                self.root) if self.root is not None else None
        return self._current_title or u''

    @property
    def global_form(self):
        """The global form; computed together with forms() on first use."""
        if self._current_global_form is lazy:
            self.forms()
        return self._current_global_form

    def forms(self):
        """ Return tuple of HTMLForm-like objects. """
        # this implementation sets .global_form as a side-effect
        if self._current_forms is lazy:
            self._current_forms, self._current_global_form = self._get_forms()
        return self._current_forms

    def links(self):
        """Return tuple of mechanize.Link-like objects.

        The tuple is computed on the first call and cached until the next
        set_response().
        """
        if self._current_links is lazy:
            self._current_links = self._get_links()
        # Return the cached tuple instead of walking the tree again.
        return self._current_links

    def _get_links(self):
        """Collect the links of the current document (empty if none parsed)."""
        if self.root is None:
            return ()
        return tuple(iterlinks(self.root, self._response.geturl()))

    def _get_forms(self):
        """Return ``(forms, global_form)`` for the current document."""
        if self.root is None:
            return (), None
        return parse_forms(
            self.root, self._response.geturl(), self._request_class,
            encoding=self.form_encoding)
|