File: lexer.py

package info (click to toggle)
python-html5rdf 1.2.1-2
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 3,448 kB
sloc: python: 12,794; makefile: 3
file content (211 lines) | stat: -rw-r--r-- 7,144 bytes
parent folder | download | duplicates (2)
# -*- coding: utf-8 -*-

# Copyright © 2009/2021 Andrey Vlasovskikh
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be included in all copies
# or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from __future__ import unicode_literals

__all__ = ["make_tokenizer", "TokenSpec", "Token", "LexerError"]

import re


class LexerError(Exception):
    def __init__(self, place, msg):
        self.place = place
        self.msg = msg

    def __str__(self):
        s = "cannot tokenize data"
        line, pos = self.place
        return '%s: %d,%d: "%s"' % (s, line, pos, self.msg)


class TokenSpec(object):
    """A token specification for generating a lexer via `make_tokenizer()`."""

    def __init__(self, type, pattern, flags=0):
        """Initialize a `TokenSpec` object.

        Parameters:
            type (str): User-defined type of the token (e.g. `"name"`, `"number"`,
                `"operator"`)
            pattern (str): Regexp for matching this token type
            flags (int, optional): Regexp flags, the second argument of `re.compile()`
        """
        self.type = type
        self.pattern = pattern
        self.flags = flags

    def __repr__(self):
        return "TokenSpec(%r, %r, %r)" % (self.type, self.pattern, self.flags)


class Token(object):
    """A token object that represents a substring of certain type in your text.

    You can compare tokens for equality using the `==` operator. Tokens also define
    custom `repr()` and `str()`.

    Attributes:
        type (str): User-defined type of the token (e.g. `"name"`, `"number"`,
            `"operator"`)
        value (str): Text value of the token
        start (Optional[Tuple[int, int]]): Start position (_line_, _column_)
        end (Optional[Tuple[int, int]]): End position (_line_, _column_)
    """

    def __init__(self, type, value, start=None, end=None):
        """Initialize a `Token` object."""
        self.type = type
        self.value = value
        self.start = start
        self.end = end

    def __repr__(self):
        return "Token(%r, %r)" % (self.type, self.value)

    def __eq__(self, other):
        # FIXME: Case sensitivity is assumed here
        if other is None:
            return False
        else:
            return self.type == other.type and self.value == other.value

    def _pos_str(self):
        if self.start is None or self.end is None:
            return ""
        else:
            sl, sp = self.start
            el, ep = self.end
            return "%d,%d-%d,%d:" % (sl, sp, el, ep)

    def __str__(self):
        s = "%s %s '%s'" % (self._pos_str(), self.type, self.value)
        return s.strip()

    @property
    def name(self):
        return self.value

    def pformat(self):
        return "%s %s '%s'" % (
            self._pos_str().ljust(20),  # noqa
            self.type.ljust(14),
            self.value,
        )


def make_tokenizer(specs):
    # noinspection GrazieInspection
    """Make a function that tokenizes text based on the regexp specs.

    Type: `(Sequence[TokenSpec | Tuple]) -> Callable[[str], Iterable[Token]]`

    A token spec is `TokenSpec` instance.

    !!! Note

        For legacy reasons, a token spec may also be a tuple of (_type_, _args_), where
        _type_ sets the value of `Token.type` for the token, and _args_ are the
        positional arguments for `re.compile()`: either just (_pattern_,) or
        (_pattern_, _flags_).

    It returns a tokenizer function that takes a string and returns an iterable of
    `Token` objects, or raises `LexerError` if it cannot tokenize the string according
    to its token specs.

    Examples:

    ```pycon
    >>> tokenize = make_tokenizer([
    ...     TokenSpec("space", r"\\s+"),
    ...     TokenSpec("id", r"\\w+"),
    ...     TokenSpec("op", r"[,!]"),
    ... ])
    >>> text = "Hello, World!"
    >>> [t for t in tokenize(text) if t.type != "space"]  # noqa
    [Token('id', 'Hello'), Token('op', ','), Token('id', 'World'), Token('op', '!')]
    >>> text = "Bye?"
    >>> list(tokenize(text))
    Traceback (most recent call last):
        ...
    lexer.LexerError: cannot tokenize data: 1,4: "Bye?"

    ```
    """
    compiled = []
    for spec in specs:
        if isinstance(spec, TokenSpec):
            c = spec.type, re.compile(spec.pattern, spec.flags)
        else:
            name, args = spec
            c = name, re.compile(*args)
        compiled.append(c)

    def match_specs(s, i, position):
        line, pos = position
        for type, regexp in compiled:
            m = regexp.match(s, i)
            if m is not None:
                value = m.group()
                nls = value.count("\n")
                n_line = line + nls
                if nls == 0:
                    n_pos = pos + len(value)
                else:
                    n_pos = len(value) - value.rfind("\n") - 1
                return Token(type, value, (line, pos + 1), (n_line, n_pos))
        else:
            err_line = s.splitlines()[line - 1]
            raise LexerError((line, pos + 1), err_line)

    def f(s):
        length = len(s)
        line, pos = 1, 0
        i = 0
        while i < length:
            t = match_specs(s, i, (line, pos))
            yield t
            line, pos = t.end
            i += len(t.value)

    return f


# This is an example of token specs. See also [this article][1] for a
# discussion of searching for multiline comments using regexps (including `*?`).
#
#   [1]: http://ostermiller.org/findcomment.html
_example_token_specs = [
    TokenSpec("COMMENT", r"\(\*(.|[\r\n])*?\*\)", re.MULTILINE),
    TokenSpec("COMMENT", r"\{(.|[\r\n])*?\}", re.MULTILINE),
    TokenSpec("COMMENT", r"//.*"),
    TokenSpec("NL", r"[\r\n]+"),
    TokenSpec("SPACE", r"[ \t\r\n]+"),
    TokenSpec("NAME", r"[A-Za-z_][A-Za-z_0-9]*"),
    TokenSpec("REAL", r"[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*"),
    TokenSpec("INT", r"[0-9]+"),
    TokenSpec("INT", r"\$[0-9A-Fa-f]+"),
    TokenSpec("OP", r"(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]"),
    TokenSpec("STRING", r"'([^']|(''))*'"),
    TokenSpec("CHAR", r"#[0-9]+"),
    TokenSpec("CHAR", r"#\$[0-9A-Fa-f]+"),
]
# tokenize = make_tokenizer(_example_token_specs)