"""Better tokenizing for coverage.py."""
import codecs, keyword, re, sys, token, tokenize
from coverage.backward import set # pylint: disable=W0622
from coverage.parser import generate_tokens


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream.  Unfortunately, there's
                # more to figure out.  This code::
                #
                #   usage = """\
                #       HEY THERE
                #       """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multiline string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
            last_line = ltext
            last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
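

def _phys_tokens_demo():
    """Illustrative sketch only, not part of the original module.

    Shows the extra token that phys_tokens() injects for a backslash line
    continuation, which tokenize.generate_tokens() by itself never reports.
    The fake token type 99999 and the string-based generate_tokens() call
    match how they are used elsewhere in this file.
    """
    src = "x = 1 + \\\n    2\n"
    continuations = []
    for ttype, ttext, _, _, _ in phys_tokens(generate_tokens(src)):
        if ttype == 99999:
            # The injected token: its text is the backslash-newline pair.
            continuations.append(ttext)
    return continuations        # expected: ["\\\n"]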


def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
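

def _source_token_lines_demo():
    """Illustrative sketch only, not part of the original module.

    Exercises the round-trip property described in the docstring above:
    concatenating the token texts of each line and joining the lines with
    newlines should rebuild the source, minus trailing whitespace and the
    final newline.
    """
    src = "def hello():\n    return 'hi'\n"
    lines = list(source_token_lines(src))
    # lines[0] begins [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ...]
    rebuilt = "\n".join(["".join([text for _, text in line]) for line in lines])
    return rebuilt == src.rstrip("\n")      # expected: True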


def source_encoding(source):
    """Determine the encoding for `source` (a string), according to PEP 263.

    Returns a string, the name of the encoding.

    """
    # Note: this function should never be called on Python 3, since py3 has
    # built-in tools to do this.
    assert sys.version_info < (3, 0)

    # This is mostly code adapted from Py3.2's tokenize module.

    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

    # Do this so the detect_encoding code we copied will work.
    readline = iter(source.splitlines(True)).next

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encoding():
    # It detects the encoding from the presence of a utf-8 bom or an encoding
    # cookie as specified in pep-0263.  If both a bom and a cookie are present,
    # but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    # invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.  The
    # default varied with version.

    if sys.version_info <= (2, 4):
        default = 'iso-8859-1'
    else:
        default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default
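

def _source_encoding_demo():
    """Illustrative sketch only, not part of the original module.

    Meaningful only on Python 2, since source_encoding() asserts as much:
    a PEP 263 coding cookie on the second line should be found and
    normalized by _get_normal_name.
    """
    src = "#!/usr/bin/env python\n# -*- coding: latin_1 -*-\nx = 1\n"
    return source_encoding(src)     # expected: 'iso-8859-1'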