File: speedups.pyx

package info (click to toggle)
python-tinycss 0.4-8
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 716 kB
  • sloc: python: 2,476; makefile: 7
file content (189 lines) | stat: -rw-r--r-- 7,120 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# coding: utf-8
"""
    tinycss.speedups
    ----------------

    Cython module for speeding up inner loops.

    Right now only :func:`tokenize_flat` has a second implementation.

    :copyright: (c) 2010 by Simon Sapin.
    :license: BSD, see LICENSE for more details.
"""

from __future__ import unicode_literals

from .token_data import (
    COMPILED_TOKEN_REGEXPS, UNICODE_UNESCAPE, NEWLINE_UNESCAPE,
    SIMPLE_UNESCAPE, FIND_NEWLINES, TOKEN_DISPATCH)


# Map each token-type name to its position in COMPILED_TOKEN_REGEXPS so the
# tokenizer can compare cheap integer indexes instead of strings.
COMPILED_TOKEN_INDEXES = {
    name: index
    for index, (name, _regexp) in enumerate(COMPILED_TOKEN_REGEXPS)
}


cdef class CToken:
    """Token produced by the accelerated tokenizer.

    Exposes the same attributes and methods as
    :class:`~.token_data.Token`; only the storage (a Cython extension
    type with typed fields) differs.

    """
    # Tokens are leaves: they never contain other tokens.
    is_container = False

    cdef public object type, _as_css, value, unit
    cdef public Py_ssize_t line, column

    def __init__(self, type_, css_value, value, unit, line, column):
        self.type = type_
        self._as_css = css_value
        self.value = value
        self.unit = unit
        self.line = line
        self.column = column

    def as_css(self):
        """Return the CSS representation of the token, exactly as it
        appeared in the parsed source, as a Unicode string.
        """
        return self._as_css

    def __repr__(self):
        # The unit ('px', '%', ...) is appended only when present.
        unit_suffix = self.unit or ''
        return ('<Token {0.type} at {0.line}:{0.column} {0.value!r}{1}>'
                .format(self, unit_suffix))


def tokenize_flat(css_source, int ignore_comments=1):
    """Tokenize a CSS string into a flat list of tokens.

    Cython fast path for the pure-Python ``tokenize_flat`` in
    :mod:`~.token_data`: same results, but builds :class:`CToken`
    instances and compares integer token-type indexes in the hot loop.

    :param css_source:
        CSS as an unicode string
    :param ignore_comments:
        if true (the default) comments will not be included in the
        return value
    :return:
        A list of :class:`CToken` objects

    """
    # Make these local variables to avoid global lookups in the loop.
    tokens_dispatch = TOKEN_DISPATCH
    compiled_token_indexes = COMPILED_TOKEN_INDEXES
    unicode_unescape = UNICODE_UNESCAPE
    newline_unescape = NEWLINE_UNESCAPE
    simple_unescape = SIMPLE_UNESCAPE
    find_newlines = FIND_NEWLINES

    # Use the integer indexes instead of string markers: comparing C
    # integers below is cheaper than comparing strings.
    cdef Py_ssize_t BAD_COMMENT = compiled_token_indexes['BAD_COMMENT']
    cdef Py_ssize_t BAD_STRING = compiled_token_indexes['BAD_STRING']
    cdef Py_ssize_t PERCENTAGE = compiled_token_indexes['PERCENTAGE']
    cdef Py_ssize_t DIMENSION = compiled_token_indexes['DIMENSION']
    cdef Py_ssize_t ATKEYWORD = compiled_token_indexes['ATKEYWORD']
    cdef Py_ssize_t FUNCTION = compiled_token_indexes['FUNCTION']
    cdef Py_ssize_t COMMENT = compiled_token_indexes['COMMENT']
    cdef Py_ssize_t NUMBER = compiled_token_indexes['NUMBER']
    cdef Py_ssize_t STRING = compiled_token_indexes['STRING']
    cdef Py_ssize_t IDENT = compiled_token_indexes['IDENT']
    cdef Py_ssize_t HASH = compiled_token_indexes['HASH']
    cdef Py_ssize_t URI = compiled_token_indexes['URI']
    # DELIM has no regexp of its own; -1 never collides with a real index.
    cdef Py_ssize_t DELIM = -1

    cdef Py_ssize_t pos = 0
    cdef Py_ssize_t line = 1
    cdef Py_ssize_t column = 1
    cdef Py_ssize_t source_len = len(css_source)
    cdef Py_ssize_t length, next_pos, type_
    cdef CToken token

    tokens = []
    while pos < source_len:
        char = css_source[pos]
        if char in ':;{}()[]':
            # Single-character punctuation: the character itself serves as
            # both the type name and the value.
            type_ = -1  # not parsed further anyway
            type_name = char
            css_value = char
        else:
            # Dispatch on the code point so only the regexps that can
            # start with this character are tried (code points >= 160
            # share the last bucket).
            codepoint = min(ord(char), 160)
            for type_, type_name, regexp in tokens_dispatch[codepoint]:
                match = regexp(css_source, pos)
                if match:
                    # First match is the longest. See comments on TOKENS above.
                    css_value = match.group()
                    break
            else:
                # No match.
                # "Any other character not matched by the above rules,
                #  and neither a single nor a double quote."
                # ... but quotes at the start of a token are always matched
                # by STRING or BAD_STRING. So DELIM is any single character.
                type_ = DELIM
                type_name = 'DELIM'
                css_value = char
        length = len(css_value)
        next_pos = pos + length

        # A BAD_COMMENT is a comment at EOF. Ignore it too.
        if not (ignore_comments and type_ in (COMMENT, BAD_COMMENT)):
            # Parse numbers, extract strings and URIs, unescape
            unit = None
            if type_ == DIMENSION:
                # group(1) is the numeric part, group(2) the unit.
                value = match.group(1)
                value = float(value) if '.' in value else int(value)
                unit = match.group(2)
                unit = simple_unescape(unit)
                unit = unicode_unescape(unit)
                unit = unit.lower()  # normalize
            elif type_ == PERCENTAGE:
                value = css_value[:-1]
                value = float(value) if '.' in value else int(value)
                unit = '%'
            elif type_ == NUMBER:
                value = css_value
                if '.' in value:
                    value = float(value)
                else:
                    value = int(value)
                    type_name = 'INTEGER'
            elif type_ in (IDENT, ATKEYWORD, HASH, FUNCTION):
                value = simple_unescape(css_value)
                value = unicode_unescape(value)
            elif type_ == URI:
                value = match.group(1)
                if value and value[0] in '"\'':
                    value = value[1:-1]  # Remove quotes
                    value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            elif type_ == STRING:
                value = css_value[1:-1]  # Remove quotes
                value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            # BAD_STRING can only be one of:
            # * Unclosed string at the end of the stylesheet:
            #   Close the string, but this is not an error.
            #   Make it a "good" STRING token.
            # * Unclosed string at the (unescaped) end of the line:
            #   Close the string, but this is an error.
            #   Leave it as a BAD_STRING, don't bother parsing it.
            # See http://www.w3.org/TR/CSS21/syndata.html#parsing-errors
            elif type_ == BAD_STRING and next_pos == source_len:
                type_name = 'STRING'
                value = css_value[1:]  # Remove quote
                value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            else:
                value = css_value
            token = CToken(type_name, css_value, value, unit, line, column)
            tokens.append(token)

        # Advance the source position and the line/column counters.
        pos = next_pos
        newlines = list(find_newlines(css_value))
        if newlines:
            line += len(newlines)
            # Add 1 to have lines start at column 1, not 0
            column = length - newlines[-1].end() + 1
        else:
            column += length
    return tokens