File: code_analyzer.py

package info (click to toggle)
python-docutils 0.22%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 11,448 kB
  • sloc: python: 53,302; lisp: 14,475; xml: 1,807; javascript: 1,032; makefile: 102; sh: 96
file content (140 lines) | stat: -rw-r--r-- 4,981 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# :Author: Georg Brandl; Lea Wiemann; Günter Milde
# :Date: $Date: 2025-05-20 17:48:27 +0200 (Di, 20. Mai 2025) $
# :Copyright: This module has been placed in the public domain.

"""Lexical analysis of formal languages (i.e. code) using Pygments."""

from __future__ import annotations

__docformat__ = 'reStructuredText'

try:
    import pygments
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters.html import _get_ttype_class
    with_pygments = True
except ImportError:
    with_pygments = False

from docutils import ApplicationError

# Names of token types that carry no styling information.
# They are dropped from the list of class arguments:
unstyled_tokens = [
    'token',  # Token (base token type)
    'text',   # Token.Text
    '',       # short name for Token and Text
]
# (Add, e.g., Token.Punctuation with ``unstyled_tokens += ['punctuation']``.)


class LexerError(ApplicationError):
    """Error raised when code cannot be lexically analyzed."""


class Lexer:
    """Parse `code` lines and yield "classified" tokens.

    Arguments

      code       -- string of source code to parse,
      language   -- formal language the code is written in,
      tokennames -- either 'long', 'short', or 'none' (see below).

    Merge subsequent tokens of the same token-type.

    Iterating over an instance yields the tokens as ``(tokentype, value)``
    tuples. The value of `tokennames` configures the naming of the tokentype:

      'long':  downcased full token type name,
      'short': short name defined by pygments.token.STANDARD_TYPES
               (= class argument used in pygments html output),
      'none':  skip lexical analysis.
    """

    def __init__(self, code, language, tokennames='short') -> None:
        """Set up a lexical analyzer for `code` in `language`.

        No Pygments lexer is set up (``self.lexer`` stays ``None``) if
        `language` is '' or 'text' or if `tokennames` is 'none'.

        Raise `LexerError` if lexical analysis is requested but the
        Pygments package is missing or provides no lexer for `language`.
        """
        self.code = code
        self.language = language
        self.tokennames = tokennames
        self.lexer = None
        # get lexical analyzer for `language`:
        if language in ('', 'text') or tokennames == 'none':
            return
        if not with_pygments:
            raise LexerError('Cannot analyze code. '
                             'Pygments package not found.')
        try:
            self.lexer = get_lexer_by_name(self.language)
        except pygments.util.ClassNotFound as err:
            # Chain explicitly so the original cause shows in tracebacks.
            raise LexerError(
                'Cannot analyze code. '
                f'No Pygments lexer found for "{language}".') from err
        # Since version 1.2. (released Jan 01, 2010) Pygments has a
        # TokenMergeFilter. ``self.merge(tokens)`` in __iter__ could
        # be replaced by ``self.lexer.add_filter('tokenmerge')`` in __init__.
        # However, `merge` below also strips a final newline added by pygments.

    def merge(self, tokens):
        """Merge subsequent tokens of same token-type.

        `tokens` is an iterable of ``(tokentype, value)`` tuples.
        Yield ``(tokentype, value)`` tuples where the values of adjacent
        tokens with identical token type are concatenated.

        Also strip the final newline (added by pygments); the last token
        is dropped if it is empty after stripping.
        An empty `tokens` iterable yields nothing.
        """
        tokens = iter(tokens)
        try:
            lasttype, lastval = next(tokens)
        except StopIteration:
            # Nothing to merge. Without this guard, the StopIteration
            # would surface as a RuntimeError (PEP 479).
            return
        for ttype, value in tokens:
            if ttype is lasttype:
                lastval += value
            else:
                yield lasttype, lastval
                lasttype, lastval = ttype, value
        lastval = lastval.removesuffix('\n')
        if lastval:
            yield lasttype, lastval

    def __iter__(self):
        """Parse self.code and yield "classified" tokens.

        Yield ``(classes, value)`` tuples, where `classes` is a list of
        token type names (with the names in `unstyled_tokens` removed).
        Without a lexer, yield the complete code as one unclassified token.
        """
        if self.lexer is None:
            yield [], self.code
            return
        tokens = pygments.lex(self.code, self.lexer)
        for tokentype, value in self.merge(tokens):
            if self.tokennames == 'long':  # long CSS class args
                classes = str(tokentype).lower().split('.')
            else:  # short CSS class args
                classes = [_get_ttype_class(tokentype)]
            classes = [cls for cls in classes if cls not in unstyled_tokens]
            yield classes, value


class NumberLines:
    """Prefix every code line with a line-number token.

    Arguments

       tokens    -- iterable of ``(classes, value)`` tuples
       startline -- first line number
       endline   -- last line number

    Iteration yields the original tokens, with a
    ``(['ln'], '<the line number>')`` token inserted at the start of
    every code line.  Tokens spanning several lines are split at the
    line breaks."""

    def __init__(self, tokens, startline, endline) -> None:
        self.tokens = tokens
        self.startline = startline
        # Right-align all numbers to the width of the largest one,
        # e.g. endline == 100 -> fmt_str = '%3d '
        width = len(str(endline))
        self.fmt_str = '%' + str(width) + 'd '

    def __iter__(self):
        current = self.startline
        yield ['ln'], self.fmt_str % current
        for classes, text in self.tokens:
            # Everything before the last '\n' consists of complete lines;
            # the remainder continues the current line.
            *complete, remainder = text.split('\n')
            for chunk in complete:
                yield classes, f'{chunk}\n'
                current += 1
                yield ['ln'], self.fmt_str % current
            yield classes, remainder