File: test_tokenizer.py

package info (click to toggle)
python-tinycss 0.4-8
links: PTS, VCS
area: main
in suites: trixie
size: 716 kB
sloc: python: 2,476; makefile: 7
file content (302 lines) | stat: -rw-r--r-- 10,662 bytes
parent folder | download | duplicates (5)
# coding: utf-8
"""
    Tests for the tokenizer
    -----------------------

    :copyright: (c) 2012 by Simon Sapin.
    :license: BSD, see LICENSE for more details.
"""


from __future__ import unicode_literals

import os
import sys

import pytest
from tinycss.tokenizer import (
    cython_tokenize_flat, python_tokenize_flat, regroup)


def test_speedups():
    """Assert that ``cython_tokenize_flat`` is not ``None``.

    The assertion is bypassed (the test returns without checking anything)
    when running on PyPy, detected through ``sys.pypy_translation_info``,
    or when the ``TINYCSS_SKIP_SPEEDUPS_TESTS`` environment variable is set
    to a non-empty value.
    """
    is_pypy = hasattr(sys, 'pypy_translation_info')
    env_skip_tests = os.environ.get('TINYCSS_SKIP_SPEEDUPS_TESTS')
    # The coverage marker has to be on the ``if`` line itself: placed on a
    # comment-only line it does not exclude the branch below.
    if is_pypy or env_skip_tests:  # pragma: no cover
        return
    assert cython_tokenize_flat is not None, (
        'Cython speedups are not installed, related tests will '
        'be skipped. Set the TINYCSS_SKIP_SPEEDUPS_TESTS environment '
        'variable if this is expected.')


@pytest.mark.parametrize(('tokenize', 'css_source', 'expected_tokens'), [
    (tokenize,) + test_data
    for tokenize in (python_tokenize_flat, cython_tokenize_flat)
    for test_data in [
        ('', []),
        ('red -->', [('IDENT', 'red'), ('S', ' '), ('CDC', '-->')]),
        # Longest match rule: no CDC
        ('red-->', [('IDENT', 'red--'), ('DELIM', '>')]),
        (r'p[example="foo(int x) {    this.x = x;}"]', [
            ('IDENT', 'p'),
            ('[', '['),
            ('IDENT', 'example'),
            ('DELIM', '='),
            ('STRING', 'foo(int x) {    this.x = x;}'),
            (']', ']')]),

        # Numbers are parsed
        ('42 .5 -4pX 1.25em 30%', [
            ('INTEGER', 42), ('S', ' '),
            ('NUMBER', .5), ('S', ' '),
            # units are normalized to lower-case:
            ('DIMENSION', -4, 'px'), ('S', ' '),
            ('DIMENSION', 1.25, 'em'), ('S', ' '),
            ('PERCENTAGE', 30, '%')]),

        # URLs are extracted
        ('url(foo.png)', [('URI', 'foo.png')]),
        ('url("foo.png")', [('URI', 'foo.png')]),

        # Escaping

        # Both literals are raw strings: a bare "\ " in a normal literal is
        # an invalid escape sequence and triggers a warning at compile time.
        (r'/* Comment with a \ backslash */', [
            ('COMMENT', r'/* Comment with a \ backslash */')]),  # Unchanged

        # backslash followed by a newline in a string: ignored
        ('"Lorem\\\nIpsum"', [('STRING', 'LoremIpsum')]),

        # backslash followed by a newline outside a string: stands for itself
        ('Lorem\\\nIpsum', [
            ('IDENT', 'Lorem'), ('DELIM', '\\'),
            ('S', '\n'), ('IDENT', 'Ipsum')]),

        # Cancel the meaning of special characters
        (r'"Lore\m Ipsum"', [('STRING', 'Lorem Ipsum')]),  # or not special
        (r'"Lorem \49psum"', [('STRING', 'Lorem Ipsum')]),
        (r'"Lorem \49 psum"', [('STRING', 'Lorem Ipsum')]),
        (r'"Lorem\"Ipsum"', [('STRING', 'Lorem"Ipsum')]),
        (r'"Lorem\\Ipsum"', [('STRING', r'Lorem\Ipsum')]),
        (r'"Lorem\5c Ipsum"', [('STRING', r'Lorem\Ipsum')]),
        (r'Lorem\+Ipsum', [('IDENT', 'Lorem+Ipsum')]),
        (r'Lorem+Ipsum', [
            ('IDENT', 'Lorem'), ('DELIM', '+'), ('IDENT', 'Ipsum')]),
        (r'url(foo\).png)', [('URI', 'foo).png')]),

        # Unicode and backslash escaping
        ('\\26 B', [('IDENT', '&B')]),
        ('\\&B', [('IDENT', '&B')]),
        ('@\\26\tB', [('ATKEYWORD', '@&B')]),
        ('@\\&B', [('ATKEYWORD', '@&B')]),
        ('#\\26\nB', [('HASH', '#&B')]),
        ('#\\&B', [('HASH', '#&B')]),
        ('\\26\r\nB(', [('FUNCTION', '&B(')]),
        ('\\&B(', [('FUNCTION', '&B(')]),
        (r'12.5\000026B', [('DIMENSION', 12.5, '&b')]),
        (r'12.5\0000263B', [('DIMENSION', 12.5, '&3b')]),  # max 6 digits
        (r'12.5\&B', [('DIMENSION', 12.5, '&b')]),
        (r'"\26 B"', [('STRING', '&B')]),
        (r"'\000026B'", [('STRING', '&B')]),
        (r'"\&B"', [('STRING', '&B')]),
        (r'url("\26 B")', [('URI', '&B')]),
        (r'url(\26 B)', [('URI', '&B')]),
        (r'url("\&B")', [('URI', '&B')]),
        (r'url(\&B)', [('URI', '&B')]),
        (r'Lorem\110000Ipsum', [('IDENT', 'Lorem\uFFFDIpsum')]),

        # Bad strings

        # String ends at EOF without closing: no error, parsed
        ('"Lorem\\26Ipsum', [('STRING', 'Lorem&Ipsum')]),
        # Unescaped newline: ends the string, error, unparsed
        ('"Lorem\\26Ipsum\n', [
            ('BAD_STRING', r'"Lorem\26Ipsum'), ('S', '\n')]),
        # Tokenization restarts after the newline, so the second " starts
        # a new string (which ends at EOF without errors, as above.)
        ('"Lorem\\26Ipsum\ndolor" sit', [
            ('BAD_STRING', r'"Lorem\26Ipsum'), ('S', '\n'),
            ('IDENT', 'dolor'), ('STRING', ' sit')]),

    ]])
def test_tokens(tokenize, css_source, expected_tokens):
    """Compare the flat token stream of ``css_source`` with expectations.

    Each token is reduced to a ``(type, value)`` tuple, extended with the
    unit when ``token.unit`` is not ``None``, and the resulting list must
    equal ``expected_tokens``.  The test is skipped when ``tokenize`` is
    ``None``.
    """
    if tokenize is None:  # pragma: no cover
        pytest.skip('Speedups not available')
    sources = [css_source]
    if sys.version_info[0] < 3:
        # On Python 2.x, ASCII-only bytestrings can be used
        # where Unicode is expected.
        sources.append(css_source.encode('ascii'))
    # A distinct loop name keeps the ``css_source`` parameter intact.
    for source in sources:
        tokens = tokenize(source, ignore_comments=False)
        result = [
            (token.type, token.value) + (
                () if token.unit is None else (token.unit,))
            for token in tokens
        ]
        assert result == expected_tokens


@pytest.mark.parametrize(
    'tokenize',
    [python_tokenize_flat, cython_tokenize_flat],
)
def test_positions(tokenize):
    """Check the (type, line, column) triple reported for every token."""
    if tokenize is None:  # pragma: no cover
        pytest.skip('Speedups not available')

    source = (
        '/* Lorem\nipsum */\fa {\n    color: red;\tcontent: "dolor\\\fsit" }')
    expected = [
        ('COMMENT', 1, 1),
        ('S', 2, 9),
        ('IDENT', 3, 1),
        ('S', 3, 2),
        ('{', 3, 3),
        ('S', 3, 4),
        ('IDENT', 4, 5),
        (':', 4, 10),
        ('S', 4, 11),
        ('IDENT', 4, 12),
        (';', 4, 15),
        ('S', 4, 16),
        ('IDENT', 4, 17),
        (':', 4, 24),
        ('S', 4, 25),
        ('STRING', 4, 26),
        ('S', 5, 5),
        ('}', 5, 6),
    ]

    found = []
    for tok in tokenize(source, ignore_comments=False):
        found.append((tok.type, tok.line, tok.column))
    assert found == expected


@pytest.mark.parametrize(('tokenize', 'css_source', 'expected_tokens'), [
    (flat_tokenizer,) + case
    for flat_tokenizer in (python_tokenize_flat, cython_tokenize_flat)
    for case in [
        ('', []),
        (r'Lorem\26 "i\psum"4px', [
            ('IDENT', 'Lorem&'),
            ('STRING', 'ipsum'),
            ('DIMENSION', 4),
        ]),

        ('not([[lorem]]{ipsum (42)})', [
            ('FUNCTION', 'not', [
                ('[', [('[', [('IDENT', 'lorem')])]),
                ('{', [
                    ('IDENT', 'ipsum'), ('S', ' '), ('(', [('INTEGER', 42)]),
                ]),
            ]),
        ]),

        # Close everything at EOF, no error
        ('a[b{"d', [
            ('IDENT', 'a'),
            ('[', [('IDENT', 'b'), ('{', [('STRING', 'd')])]),
        ]),

        # Any remaining ), ] or } token is a nesting error
        ('a[b{d]e}', [
            ('IDENT', 'a'),
            ('[', [
                ('IDENT', 'b'),
                ('{', [
                    ('IDENT', 'd'),
                    (']', ']'),  # The error is visible here
                    ('IDENT', 'e'),
                ]),
            ]),
        ]),
        # ref: the same input with properly nested brackets
        ('a[b{d}e]', [
            ('IDENT', 'a'),
            ('[', [('IDENT', 'b'), ('{', [('IDENT', 'd')]), ('IDENT', 'e')]),
        ]),
    ]
])
def test_token_grouping(tokenize, css_source, expected_tokens):
    """Regroup the flat tokens and compare their ``jsonify`` form."""
    if tokenize is None:  # pragma: no cover
        pytest.skip('Speedups not available')
    flat = tokenize(css_source, ignore_comments=False)
    nested = regroup(flat)
    assert list(jsonify(nested)) == expected_tokens


def jsonify(tokens):
    """Yield a plain tuple/list rendering of each token in ``tokens``.

    * ``FUNCTION`` tokens give ``(type, function_name, children)``;
    * tokens whose ``is_container`` is true give ``(type, children)``;
    * every other token gives ``(type, value)``.

    ``children`` is the list obtained by applying this function
    recursively to ``token.content``.
    """
    for tok in tokens:
        kind = tok.type
        if kind == 'FUNCTION':
            name = tok.function_name
            yield kind, name, list(jsonify(tok.content))
            continue
        if tok.is_container:
            yield kind, list(jsonify(tok.content))
            continue
        yield kind, tok.value


@pytest.mark.parametrize(('tokenize', 'ignore_comments', 'expected_tokens'), [
    (flat_tokenizer,) + case
    for flat_tokenizer in (python_tokenize_flat, cython_tokenize_flat)
    for case in [
        # Comments kept in the output
        (False, [
            ('COMMENT', '/* lorem */'),
            ('S', ' '),
            ('IDENT', 'ipsum'),
            ('[', [('IDENT', 'dolor'), ('COMMENT', '/* sit */')]),
            ('BAD_COMMENT', '/* amet'),
        ]),
        # Comments dropped from the output
        (True, [
            ('S', ' '),
            ('IDENT', 'ipsum'),
            ('[', [('IDENT', 'dolor')]),
        ]),
    ]
])
def test_comments(tokenize, ignore_comments, expected_tokens):
    """Tokenize a fixed source with ``ignore_comments`` on and off."""
    if tokenize is None:  # pragma: no cover
        pytest.skip('Speedups not available')
    source = '/* lorem */ ipsum[dolor/* sit */]/* amet'
    nested = regroup(tokenize(source, ignore_comments))
    assert list(jsonify(nested)) == expected_tokens


@pytest.mark.parametrize(('tokenize', 'css_source'), [
    (flat_tokenizer, sample)
    for flat_tokenizer in (python_tokenize_flat, cython_tokenize_flat)
    for sample in [
        r'p[example="foo(int x) {    this.x = x;}"]',
        '"Lorem\\26Ipsum\ndolor" sit',
        '/* Lorem\nipsum */\fa {\n    color: red;\tcontent: "dolor\\\fsit" }',
        'not([[lorem]]{ipsum (42)})',
        'a[b{d]e}',
        'a[b{"d',
    ]
])
def test_token_serialize_css(tokenize, css_source):
    """Joining ``as_css()`` of all tokens must give back the source.

    The round trip is checked both on the regrouped tokens and on the
    flat tokens as returned by ``tokenize``.
    """
    if tokenize is None:  # pragma: no cover
        pytest.skip('Speedups not available')

    def keep_flat(tokens):
        return tokens

    for transform in (regroup, keep_flat):
        tokens = transform(tokenize(css_source, ignore_comments=False))
        pieces = [token.as_css() for token in tokens]
        assert ''.join(pieces) == css_source


@pytest.mark.parametrize(('tokenize', 'css_source'), [
    (flat_tokenizer, sample)
    for flat_tokenizer in (python_tokenize_flat, cython_tokenize_flat)
    for sample in [
        '(8, foo, [z])',
        '[8, foo, (z)]',
        '{8, foo, [z]}',
        'func(8, foo, [z])',
    ]
])
def test_token_api(tokenize, css_source):
    """Each sample regroups into one token holding seven content items."""
    if tokenize is None:  # pragma: no cover
        pytest.skip('Speedups not available')
    grouped = list(regroup(tokenize(css_source)))
    assert len(grouped) == 1
    container = grouped[0]
    # 2 spaces, 2 commas, 3 others.
    assert len(container.content) == 7