File: _equiv.py

package info (click to toggle)
python-mechanize 1%3A0.4.10%2Bds-5
links: PTS, VCS
area: main
in suites: forky, sid
size: 1,316 kB
sloc: python: 16,656; makefile: 11; sh: 4
file content (350 lines) | stat: -rw-r--r-- 10,886 bytes
parent folder | download | duplicates (4)
#!/usr/bin/env python
# vim:fileencoding=utf-8
# Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import re
import string

from ._entities import html5_entities
from .polyglot import codepoint_to_chr

# Lookup tables used by the byte scanner below. Every ``*_bytes`` set holds
# length-1 byte strings so that a one-byte slice can be tested with ``in``.

# The five characters treated as whitespace (tab, LF, form feed, space, CR).
space_chars = frozenset("\t\n\u000C \r")
# The same whitespace characters, as single-byte strings.
space_chars_bytes = frozenset(ch.encode("ascii") for ch in space_chars)
# a-z and A-Z as single-byte strings.
ascii_letters_bytes = frozenset(
    letter.encode("ascii") for letter in string.ascii_letters)
# Whitespace plus the two angle brackets.
spaces_angle_brackets = space_chars_bytes.union((b">", b"<"))
# Whitespace plus the forward slash.
skip1 = space_chars_bytes.union((b"/", ))
# Lower-case tag names recognised as head-level elements.
head_elems = frozenset(
    b"html head title base script style meta link object".split())


def my_unichr(num):
    """Return the character for code point ``num``, or ``'?'`` on failure.

    The conversion is delegated to ``codepoint_to_chr``; if that raises
    ``ValueError`` or ``OverflowError`` a literal question mark is returned
    instead.
    """
    try:
        char = codepoint_to_chr(num)
    except (ValueError, OverflowError):
        char = '?'
    return char


def replace_entity(match):
    """Return the replacement text for one matched character reference.

    Meant to be used as the callback of ``re.sub``: ``match`` must expose the
    reference body (the text between ``&`` and ``;``) as group 1. The body is
    lower-cased before it is interpreted.

    Resolution order:

    * ``apos`` and ``squot`` become ``'``.
    * ``hellips`` is treated as ``hellip``.
    * Numeric references (``#65`` decimal, ``#x41`` hexadecimal): values
      above 255 are converted with ``my_unichr``; values up to 255 are
      decoded as a single cp1252 byte, falling back to ``my_unichr`` when
      cp1252 has no mapping for that byte or the number is negative.
    * Anything else is looked up in ``html5_entities``.

    References that cannot be resolved are returned as
    ``'&' + body + ';'`` using the lower-cased body.
    """
    ent = match.group(1).lower()
    if ent in {'apos', 'squot'}:
        # squot is generated by some broken CMS software
        return "'"
    if ent == 'hellips':
        ent = 'hellip'
    if ent.startswith('#'):
        try:
            # ent is already lower-cased, so only 'x' can mark hexadecimal
            if ent[1] == 'x':
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except (IndexError, ValueError):
            # '&#;' (nothing after '#') or a non-numeric body
            return '&' + ent + ';'
        if num > 255:
            return my_unichr(num)
        try:
            # Build a real one-byte string: on Python 3 chr() returns text,
            # which has no .decode(); bytes(bytearray(...)) works on 2 and 3.
            return bytes(bytearray((num,))).decode('cp1252')
        except ValueError:
            # UnicodeDecodeError (a ValueError subclass) for bytes that
            # cp1252 leaves undefined, plain ValueError for negative numbers.
            return my_unichr(num)
    try:
        return html5_entities[ent]
    except KeyError:
        pass
    return '&' + ent + ';'


class Bytes(bytes):
    """Byte string that carries a read cursor plus scanning helpers.

    The cursor starts at -1, i.e. before the first byte, and is advanced by
    iterating. All "bytes" handed out are length-1 (or empty) slices, never
    integers. Once the cursor has reached or passed the end of the data,
    reading ``position`` (and therefore most helpers) raises StopIteration.
    """

    def __init__(self, value):
        # The byte content itself is taken care of by bytes.__new__; only
        # the cursor has to be initialised here.
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        """Advance the cursor by one and return the byte it now points at."""
        self._position += 1
        idx = self._position
        if idx >= len(self):
            raise StopIteration
        if idx < 0:
            raise TypeError
        return self[idx:idx + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        """Move the cursor back by one and return the byte it now points at.

        Raises StopIteration if the cursor is at or past the end and
        TypeError if it is still before the first byte.
        """
        idx = self._position
        if idx >= len(self):
            raise StopIteration
        if idx < 0:
            raise TypeError
        idx -= 1
        self._position = idx
        return self[idx:idx + 1]

    @property
    def position(self):
        """Current cursor index, or None while the cursor is negative.

        Raises StopIteration once the cursor is at or past the end.
        """
        idx = self._position
        if idx >= len(self):
            raise StopIteration
        return idx if idx >= 0 else None

    @position.setter
    def position(self, position):
        # Refuse to move a cursor that has already run off the end
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    @property
    def current_byte(self):
        """The single byte under the cursor."""
        start = self.position
        return self[start:start + 1]

    def skip(self, chars=space_chars_bytes):
        """Advance past any run of bytes contained in ``chars``.

        Returns the first byte not in ``chars`` and leaves the cursor on it,
        or returns None with the cursor at the end if no such byte exists.
        """
        idx = self.position  # use property for the error-checking
        end = len(self)
        while idx < end:
            byte = self[idx:idx + 1]
            if byte not in chars:
                self._position = idx
                return byte
            idx += 1
        self._position = idx
        return None

    def skip_until(self, chars):
        """Advance to the first byte that is contained in ``chars``.

        Returns ``(skipped, stop_byte)`` and leaves the cursor on the stop
        byte. If no byte from ``chars`` is found, the cursor is moved to the
        end and ``(b'', b'')`` is returned.
        """
        start = idx = self.position
        end = len(self)
        while idx < end:
            byte = self[idx:idx + 1]
            if byte in chars:
                self._position = idx
                return self[start:idx], byte
            idx += 1
        self._position = idx
        return b'', b''

    def match_bytes(self, bytes):
        """Test whether the data at the cursor starts with ``bytes``.

        On success the cursor is moved forward by ``len(bytes)`` and True is
        returned; otherwise the cursor is left alone and False is returned.
        """
        start = self.position
        found = self[start:start + len(bytes)].startswith(bytes)
        if found:
            self.position += len(bytes)
        return found

    def match_bytes_pat(self, pat):
        """Like match_bytes, but ``pat`` is a compiled bytes regex.

        The pattern is matched anchored at the cursor; on success the cursor
        is moved forward by the length of the matched text.
        """
        found = pat.match(self, self.position)
        if found is None:
            return False
        self.position += len(found.group())
        return True

    def jump_to(self, bytes):
        """Move the cursor to the last byte of the next occurrence of
        ``bytes`` and return True. Raises StopIteration if there is no
        further occurrence."""
        found_at = self.find(bytes, max(0, self.position))
        if found_at < 0:
            raise StopIteration
        distance = found_at - self.position
        if self._position == -1:
            self._position = 0
        self._position += distance + len(bytes) - 1
        return True


class HTTPEquivParser(object):
    """Mini parser for detecting http-equiv headers from meta tags.

    Construct it with the raw document bytes and call the instance; the call
    returns a list of ``(name, value)`` byte-string pairs, one for each
    ``<meta>`` tag that supplied both an ``http-equiv`` and a ``content``
    attribute before scanning stopped.

    NOTE(review): the "Step N" comments in get_attribute presumably follow
    the numbered "get an attribute" algorithm of the HTML encoding prescan --
    confirm against the spec before changing the control flow.
    """

    def __init__(self, data):
        """Wrap ``data`` (the bytes to scan) in a ``Bytes`` cursor object and
        start with an empty list of collected headers."""
        self.data = Bytes(data)
        self.headers = []

    def __call__(self):
        """Scan the data and return the collected headers.

        Returns a list of ``(name, value)`` tuples of ASCII byte strings with
        character references in both parts already replaced. Headers whose
        name or value cannot be decoded from, or encoded back to, ASCII are
        left out.
        """
        mb, mbp = self.data.match_bytes, self.data.match_bytes_pat
        # (matcher, key, handler) triples, tried in this order at every
        # cursor position. The longer prefixes ("<!--", "</head") are listed
        # before the shorter ones they start with ("<!", "</", "<").
        dispatch = (
                (mb, b"<!--", self.handle_comment),
                (mbp, re.compile(b"<meta", flags=re.IGNORECASE),
                    self.handle_meta),
                (mbp, re.compile(b"</head", flags=re.IGNORECASE),
                    lambda: False),
                (mb, b"</", self.handle_possible_end_tag),
                (mb, b"<!", self.handle_other),
                (mb, b"<?", self.handle_other),
                (mb, b"<", self.handle_possible_start_tag)
        )
        # Iterating self.data advances its cursor by one byte per round; the
        # yielded byte itself is not used, the matchers read from the cursor.
        for byte in self.data:
            keep_parsing = True
            for matcher, key, method in dispatch:
                if matcher(key):
                    try:
                        # A handler returns a false value to end the scan
                        keep_parsing = method()
                        break
                    except StopIteration:
                        # The handler ran off the end of the data
                        keep_parsing = False
                        break
            if not keep_parsing:
                break

        ans = []
        entity_pat = re.compile(r'&(\S+?);')
        for name, val in self.headers:
            try:
                name, val = name.decode('ascii'), val.decode('ascii')
            except ValueError:
                # Non-ASCII bytes (UnicodeDecodeError is a ValueError)
                continue
            name = entity_pat.sub(replace_entity, name)
            val = entity_pat.sub(replace_entity, val)
            try:
                name, val = name.encode('ascii'), val.encode('ascii')
            except ValueError:
                # An entity expanded to a non-ASCII character
                continue
            ans.append((name, val))
        return ans

    def handle_comment(self):
        """Skip over comments"""
        # jump_to returns True on success and raises StopIteration when no
        # closing "-->" exists
        return self.data.jump_to(b"-->")

    def handle_meta(self):
        """Read the attributes of a ``<meta`` tag and record its header.

        As soon as both a non-empty ``http-equiv`` and a non-empty
        ``content`` attribute have been seen (in either order), the pair
        ``(lower-cased http-equiv value, content value)`` is appended to
        ``self.headers``. Always returns True so scanning continues.
        """
        if self.data.current_byte not in space_chars_bytes:
            # if we have <meta not followed by a space so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        pending_header = pending_content = None

        while True:
            # Try to find the next attribute after the current position
            attr = self.get_attribute()
            if attr is None:
                # No more attributes in this tag
                return True
            name, val = attr
            name = name.lower()
            if name == b"http-equiv":
                if val:
                    val = val.lower()
                    if pending_content:
                        self.headers.append((val, pending_content))
                        return True
                    pending_header = val
            elif name == b'content':
                if val:
                    if pending_header:
                        self.headers.append((pending_header, val))
                        return True
                    pending_content = val
        # Not reached: the loop above only exits through a return
        return True

    def handle_possible_start_tag(self):
        """Handle a ``<`` that may open a start tag."""
        return self.handle_possible_tag(False)

    def handle_possible_end_tag(self):
        """Handle a ``</`` that may open an end tag."""
        # Advance the cursor by one byte before examining the tag name
        next(self.data)
        return self.handle_possible_tag(True)

    def handle_possible_tag(self, end_tag):
        """Consume a start or end tag, including its attributes.

        Returns False, which ends the scan, when a start tag is found whose
        lower-cased name is not in ``head_elems``. Returns True in every
        other case.
        """
        data = self.data
        if data.current_byte not in ascii_letters_bytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handle_other
            if end_tag:
                data.previous()
                self.handle_other()
            return True

        tag_name, c = data.skip_until(spaces_angle_brackets)
        tag_name = tag_name.lower()
        if not end_tag and tag_name not in head_elems:
            return False
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.get_attribute()
            while attr is not None:
                attr = self.get_attribute()
        return True

    def handle_other(self):
        """Skip ahead to the next ``>``; raises StopIteration if there is
        none (via ``jump_to``)."""
        return self.data.jump_to(b">")

    def get_attribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None

        Name and value are byte strings; the value is ``b""`` for an
        attribute without a value. StopIteration from ``next(data)``
        propagates to the caller when the data ends in the middle of an
        attribute."""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(skip1)
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            # End of the tag, or end of the data
            return None
        # Step 3
        # Name and value are collected byte by byte and joined on return
        attr_name = []
        attr_value = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attr_name:
                break
            elif c in space_chars_bytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attr_name), b""
            elif c is None:
                return None
            else:
                attr_name.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            # Attribute without a value: step back so the byte just read is
            # seen again by the next call
            data.previous()
            return b"".join(attr_name), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quote_char = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quote_char:
                    # Closing quote: move past it and return the pair
                    next(data)
                    return b"".join(attr_name), b"".join(attr_value)
                # 10.4
                else:
                    attr_value.append(c)
        elif c == b">":
            return b"".join(attr_name), b""
        elif c is None:
            return None
        else:
            # First byte of an unquoted value
            attr_value.append(c)
        # Step 11
        # Unquoted value: collect bytes up to whitespace or an angle bracket
        while True:
            c = next(data)
            if c in spaces_angle_brackets:
                return b"".join(attr_name), b"".join(attr_value)
            elif c is None:
                return None
            else:
                attr_value.append(c)