File: test_tokenizer2.py
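
"""Regression tests: the tokenizer should preserve the source order of
start-tag attributes and keep only the first of any duplicated attribute."""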

import io

from html5rdf._tokenizer import HTMLTokenizer
from html5rdf.constants import tokenTypes


def ignore_parse_errors(toks):
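    """Yield tokens from *toks*, skipping ParseError tokens."""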
    for tok in toks:
        if tok['type'] != tokenTypes['ParseError']:
            yield tok


def test_maintain_attribute_order():
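    """Attributes on a start tag must come back in source order."""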
    # generate many attributes ('a'..'y' with values '0'..'24') so that any
    # hash-based reordering would be very likely to break the order check
    attrs = [(chr(x), str(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
    stream = io.StringIO("<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + ">")

    toks = HTMLTokenizer(stream)
    out = list(ignore_parse_errors(toks))

    assert len(out) == 1
    assert out[0]['type'] == tokenTypes['StartTag']

    attrs_tok = out[0]['data']
    assert len(attrs_tok) == len(attrs)

    for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
        assert in_name == out_name
        assert in_value == out_value


def test_duplicate_attribute():
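    """Only the first occurrence of a duplicated attribute is kept."""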
    stream = io.StringIO("<span a=1 a=2 a=3>")

    toks = HTMLTokenizer(stream)
    out = list(ignore_parse_errors(toks))

    assert len(out) == 1
    assert out[0]['type'] == tokenTypes['StartTag']

    attrs_tok = out[0]['data']
    assert len(attrs_tok) == 1
    assert list(attrs_tok.items()) == [('a', '1')]


def test_maintain_duplicate_attribute_order():
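    """A trailing duplicate must not disturb the order of earlier attributes."""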
    # generate many attributes ('a'..'y' with values '0'..'24') so that any
    # hash-based reordering would be very likely to break the order check
    attrs = [(chr(x), str(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
    stream = io.StringIO("<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + " a=100>")

    toks = HTMLTokenizer(stream)
    out = list(ignore_parse_errors(toks))

    assert len(out) == 1
    assert out[0]['type'] == tokenTypes['StartTag']

    attrs_tok = out[0]['data']
    assert len(attrs_tok) == len(attrs)

    for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
        assert in_name == out_name
        assert in_value == out_value