File: browser.py

package info (click to toggle)
python-mechanicalsoup 1.4.0-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 416 kB
sloc: python: 2,165; makefile: 150
file content (351 lines) | stat: -rw-r--r-- 14,987 bytes
parent folder | download | duplicates (2)
import io
import os
import tempfile
import urllib
import weakref
import webbrowser

import bs4
import bs4.dammit
import requests

from .__version__ import __title__, __version__
from .form import Form
from .utils import LinkNotFoundError, is_multipart_file_upload


class Browser:
    """Builds a low-level Browser.

    It is recommended to use :class:`StatefulBrowser` for most applications,
    since it offers more advanced features and conveniences than Browser.

    :param session: Attach a pre-existing requests Session instead of
        constructing a new one.
    :param soup_config: Configuration passed to BeautifulSoup to affect
        the way HTML is parsed. Defaults to ``{'features': 'lxml'}``.
        If overridden, it is highly recommended to `specify a parser
        <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use>`__.
        Otherwise, BeautifulSoup will issue a warning and pick one for
        you, but the parser it chooses may be different on different
        machines.
    :param requests_adapters: Configuration passed to requests, to affect
        the way HTTP requests are performed.
    :param raise_on_404: If True, raise :class:`LinkNotFoundError`
        when visiting a page triggers a 404 Not Found error.
    :param user_agent: Set the user agent header to this value.

    """
    def __init__(self, session=None, soup_config={'features': 'lxml'},
                 requests_adapters=None,
                 raise_on_404=False, user_agent=None):

        self.raise_on_404 = raise_on_404
        self.session = session or requests.Session()

        if hasattr(weakref, 'finalize'):
            self._finalize = weakref.finalize(self.session, self.close)
        else:   # pragma: no cover
            # Python < 3 does not have weakref.finalize, but these
            # versions accept calling session.close() within __del__
            self._finalize = self.close

        self.set_user_agent(user_agent)

        if requests_adapters is not None:
            for adaptee, adapter in requests_adapters.items():
                self.session.mount(adaptee, adapter)

        self.soup_config = soup_config or dict()

    @staticmethod
    def __looks_like_html(response):
        """Guesses entity type when Content-Type header is missing.
        Since Content-Type is not strictly required, some servers leave it out.
        """
        text = response.text.lstrip().lower()
        return text.startswith('<html') or text.startswith('<!doctype')

    @staticmethod
    def add_soup(response, soup_config):
        """Attaches a soup object to a requests response."""
        if ("text/html" in response.headers.get("Content-Type", "") or
                Browser.__looks_like_html(response)):
            # Note: By default (no charset provided in HTTP headers), requests
            # returns 'ISO-8859-1' which is the default for HTML4, even if HTML
            # code specifies a different encoding. In this case, we want to
            # resort to bs4 sniffing, hence the special handling here.
            http_encoding = (
                response.encoding
                if 'charset' in response.headers.get("Content-Type", "")
                else None
            )
            html_encoding = bs4.dammit.EncodingDetector.find_declared_encoding(
                response.content,
                is_html=True
            )
            # See https://www.w3.org/International/questions/qa-html-encoding-declarations.en#httphead  # noqa: E501
            # > The HTTP header has a higher precedence than the in-document
            # > meta declarations.
            encoding = http_encoding if http_encoding else html_encoding
            response.soup = bs4.BeautifulSoup(
                response.content,
                from_encoding=encoding,
                **soup_config
            )
        else:
            response.soup = None

    def set_cookiejar(self, cookiejar):
        """Replaces the current cookiejar in the requests session. Since the
        session handles cookies automatically without calling this function,
        only use this when default cookie handling is insufficient.

        :param cookiejar: Any `http.cookiejar.CookieJar
          <https://docs.python.org/3/library/http.cookiejar.html#http.cookiejar.CookieJar>`__
          compatible object.
        """
        self.session.cookies = cookiejar

    def get_cookiejar(self):
        """Gets the cookiejar from the requests session."""
        return self.session.cookies

    def set_user_agent(self, user_agent):
        """Replaces the current user agent in the requests session headers."""
        # set a default user_agent if not specified
        if user_agent is None:
            requests_ua = requests.utils.default_user_agent()
            user_agent = f'{requests_ua} ({__title__}/{__version__})'

        # the requests module uses a case-insensitive dict for session headers
        self.session.headers['User-agent'] = user_agent

    def request(self, *args, **kwargs):
        """Straightforward wrapper around `requests.Session.request
        <http://docs.python-requests.org/en/master/api/#requests.Session.request>`__.

        :return: `requests.Response
            <http://docs.python-requests.org/en/master/api/#requests.Response>`__
            object with a *soup*-attribute added by :func:`add_soup`.

        This is a low-level function that should not be called for
        basic usage (use :func:`get` or :func:`post` instead). Use it if you
        need an HTTP verb that MechanicalSoup doesn't manage (e.g. MKCOL) for
        example.
        """
        response = self.session.request(*args, **kwargs)
        Browser.add_soup(response, self.soup_config)
        return response

    def get(self, *args, **kwargs):
        """Straightforward wrapper around `requests.Session.get
        <http://docs.python-requests.org/en/master/api/#requests.Session.get>`__.

        :return: `requests.Response
            <http://docs.python-requests.org/en/master/api/#requests.Response>`__
            object with a *soup*-attribute added by :func:`add_soup`.
        """
        response = self.session.get(*args, **kwargs)
        if self.raise_on_404 and response.status_code == 404:
            raise LinkNotFoundError()
        Browser.add_soup(response, self.soup_config)
        return response

    def post(self, *args, **kwargs):
        """Straightforward wrapper around `requests.Session.post
        <http://docs.python-requests.org/en/master/api/#requests.Session.post>`__.

        :return: `requests.Response
            <http://docs.python-requests.org/en/master/api/#requests.Response>`__
            object with a *soup*-attribute added by :func:`add_soup`.
        """
        response = self.session.post(*args, **kwargs)
        Browser.add_soup(response, self.soup_config)
        return response

    def put(self, *args, **kwargs):
        """Straightforward wrapper around `requests.Session.put
        <http://docs.python-requests.org/en/master/api/#requests.Session.put>`__.

        :return: `requests.Response
            <http://docs.python-requests.org/en/master/api/#requests.Response>`__
            object with a *soup*-attribute added by :func:`add_soup`.
        """
        response = self.session.put(*args, **kwargs)
        Browser.add_soup(response, self.soup_config)
        return response

    @staticmethod
    def _get_request_kwargs(method, url, **kwargs):
        """This method exists to raise a TypeError when a method or url is
        specified in the kwargs.
        """
        request_kwargs = {"method": method, "url": url}
        request_kwargs.update(kwargs)
        return request_kwargs

    @classmethod
    def get_request_kwargs(cls, form, url=None, **kwargs):
        """Extract input data from the form."""
        method = str(form.get("method", "get"))
        action = form.get("action")
        url = urllib.parse.urljoin(url, action)
        if url is None:  # This happens when both `action` and `url` are None.
            raise ValueError('no URL to submit to')

        # read https://www.w3.org/TR/html52/sec-forms.html
        if method.lower() == "get":
            data = kwargs.pop("params", dict())
        else:
            data = kwargs.pop("data", dict())
        files = kwargs.pop("files", dict())

        # Use a list of 2-tuples to better reflect the behavior of browser QSL.
        # Requests also retains order when encoding form data in 2-tuple lists.
        data = [(k, v) for k, v in data.items()]

        multipart = form.get("enctype", "") == "multipart/form-data"

        # Process form tags in the order that they appear on the page,
        # skipping those tags that do not have a name-attribute.
        selector = ",".join(f"{tag}[name]" for tag in
                            ("input", "button", "textarea", "select"))
        for tag in form.select(selector):
            name = tag.get("name")  # name-attribute of tag

            # Skip disabled elements, since they should not be submitted.
            if tag.has_attr('disabled'):
                continue

            if tag.name == "input":
                if tag.get("type", "").lower() in ("radio", "checkbox"):
                    if "checked" not in tag.attrs:
                        continue
                    value = tag.get("value", "on")
                else:
                    # browsers use empty string for inputs with missing values
                    value = tag.get("value", "")

                # If the enctype is not multipart, the filename is put in
                # the form as a text input and the file is not sent.
                if is_multipart_file_upload(form, tag):
                    if isinstance(value, io.IOBase):
                        content = value
                        filename = os.path.basename(getattr(value, "name", ""))
                    else:
                        content = ""
                        filename = os.path.basename(value)
                    # If content is the empty string, we still pass it
                    # for consistency with browsers (see
                    # https://github.com/MechanicalSoup/MechanicalSoup/issues/250).
                    files[name] = (filename, content)
                else:
                    if isinstance(value, io.IOBase):
                        value = os.path.basename(getattr(value, "name", ""))
                    data.append((name, value))

            elif tag.name == "button":
                if tag.get("type", "").lower() in ("button", "reset"):
                    continue
                else:
                    data.append((name, tag.get("value", "")))

            elif tag.name == "textarea":
                data.append((name, tag.text))

            elif tag.name == "select":
                # If the value attribute is not specified, the content will
                # be passed as a value instead.
                options = tag.select("option")
                selected_values = [i.get("value", i.text) for i in options
                                   if "selected" in i.attrs]
                if "multiple" in tag.attrs:
                    for value in selected_values:
                        data.append((name, value))
                elif selected_values:
                    # A standard select element only allows one option to be
                    # selected, but browsers pick last if somehow multiple.
                    data.append((name, selected_values[-1]))
                elif options:
                    # Selects the first option if none are selected
                    first_value = options[0].get("value", options[0].text)
                    data.append((name, first_value))

        if method.lower() == "get":
            kwargs["params"] = data
        else:
            kwargs["data"] = data

        # The following part of the function is here to respect the
        # enctype specified by the form, i.e. force sending multipart
        # content. Since Requests doesn't have yet a feature to choose
        # enctype, we have to use tricks to make it behave as we want
        # This code will be updated if Requests implements it.
        if multipart and not files:
            # Requests will switch to "multipart/form-data" only if
            # files pass the `if files:` test, so in this case we use
            # a modified dict that passes the if test even if empty.
            class DictThatReturnsTrue(dict):
                def __bool__(self):
                    return True
                __nonzero__ = __bool__

            files = DictThatReturnsTrue()

        return cls._get_request_kwargs(method, url, files=files, **kwargs)

    def _request(self, form, url=None, **kwargs):
        """Extract input data from the form to pass to a Requests session."""
        request_kwargs = Browser.get_request_kwargs(form, url, **kwargs)
        return self.session.request(**request_kwargs)

    def submit(self, form, url=None, **kwargs):
        """Prepares and sends a form request.

        NOTE: To submit a form with a :class:`StatefulBrowser` instance, it is
        recommended to use :func:`StatefulBrowser.submit_selected` instead of
        this method so that the browser state is correctly updated.

        :param form: The filled-out form.
        :param url: URL of the page the form is on. If the form action is a
            relative path, then this must be specified.
        :param \\*\\*kwargs: Arguments forwarded to `requests.Session.request
            <http://docs.python-requests.org/en/master/api/#requests.Session.request>`__.
            If `files`, `params` (with GET), or `data` (with POST) are
            specified, they will be appended to by the contents of `form`.

        :return: `requests.Response
            <http://docs.python-requests.org/en/master/api/#requests.Response>`__
            object with a *soup*-attribute added by :func:`add_soup`.
        """
        if isinstance(form, Form):
            form = form.form
        response = self._request(form, url, **kwargs)
        Browser.add_soup(response, self.soup_config)
        return response

    def launch_browser(self, soup):
        """Launch a browser to display a page, for debugging purposes.

        :param: soup: Page contents to display, supplied as a bs4 soup object.
        """
        with tempfile.NamedTemporaryFile(delete=False, suffix='.html') as file:
            file.write(soup.encode())
        webbrowser.open('file://' + file.name)

    def close(self):
        """Close the current session, if still open."""
        if self.session is not None:
            self.session.cookies.clear()
            self.session.close()
            self.session = None

    def __del__(self):
        self._finalize()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()