File: test_dawg.py

package info (click to toggle)
pypy3 7.3.19%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 212,236 kB
  • sloc: python: 2,098,316; ansic: 540,565; sh: 21,462; asm: 14,419; cpp: 4,451; makefile: 4,209; objc: 761; xml: 530; exp: 499; javascript: 314; pascal: 244; lisp: 45; csh: 12; awk: 4
file content (149 lines) | stat: -rw-r--r-- 4,783 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import pytest
from hypothesis import given, strategies

from rpython.rlib.unicodedata.dawg import (Dawg, lookup, inverse_lookup,
        build_compression_dawg, _inverse_lookup,
        encode_varint_unsigned, decode_varint_unsigned,
        number_add_bits, number_split_bits)
from rpython.rlib.unicodedata.codegen import CodeWriter

def test_1():
    """Round-trip a small key set through lookup() and inverse_lookup()."""
    entries = [
        ("a", -4),
        ("c", -2),
        ("cat", -1),
        ("catarr", 0),
        ("catnip", 1),
        ("zcatnip", 5),
    ]
    builder = Dawg()
    for key, value in entries:
        builder.insert(key, value)
    packed, data, inverse = builder.finish()

    # forward direction first, for every key ...
    for key, value in entries:
        assert lookup(packed, data, key) == value
    # ... then the reverse direction, from value back to key
    for key, value in entries:
        assert inverse_lookup(packed, inverse, value) == key

def test_2():
    """Building a DAWG from four a/b keys must finish without raising."""
    builder = Dawg()
    for key, value in (
        ("aaaaaa", -2),
        ("baaaaa", -4),
        ("bbbbbaaaaaaa", 0),
        ("bbbbbbbbb", -1),
    ):
        builder.insert(key, value)
    # no lookups are performed: only finish() itself is exercised here
    packed, data, inverse = builder.finish()

def test_bug_match_past_string_end():
    """Looking up 'b', a strict prefix of the stored key 'ba', raises KeyError."""
    builder = Dawg()
    for key, value in (("a", -2), ("ba", 2)):
        builder.insert(key, value)
    packed, data, inverse = builder.finish()
    # 'b' itself was never inserted, only 'a' and 'ba'
    with pytest.raises(KeyError):
        lookup(packed, data, "b")

def test_bug_1():
    """Looking up 'ba' raises KeyError when only 'a', 'aa' and 'b' are stored."""
    builder = Dawg()
    for key, value in (("a", -2), ("aa", 2), ("b", 56)):
        builder.insert(key, value)
    packed, data, inverse = builder.finish()
    # 'ba' is the stored key 'b' followed by one extra character
    with pytest.raises(KeyError):
        lookup(packed, data, "ba")

def test_missing_key_inverse():
    """_inverse_lookup(packed, 5) raises KeyError for a DAWG holding four keys."""
    builder = Dawg()
    for key, value in (
        ("aaaaaa", -2),
        ("baaaaa", -4),
        ("bbbbbaaaaaaa", 0),
        ("bbbbbbbbb", -1),
    ):
        builder.insert(key, value)
    packed, data, inverse = builder.finish()
    # NOTE(review): 5 is presumably a position beyond the four stored keys;
    # confirm against _inverse_lookup's contract.
    with pytest.raises(KeyError):
        _inverse_lookup(packed, 5)

def test_generate():
    """Generate a DAWG module from 105 names and check both lookup directions.

    The names are the hex-formatted hashes of ``str(0)`` .. ``str(99)`` plus
    five handcrafted strings that share prefixes.  ``build_compression_dawg``
    writes its output through a ``CodeWriter`` into a temporary ``dawg.py``;
    that file is then imported and its ``lookup_charcode`` / ``dawg_lookup``
    functions are checked against the name <-> index mapping.
    """
    import py
    tmpdir = py.test.ensuretemp(__name__)
    # Built as a real list so that .extend() below does not depend on map()
    # returning a list.
    lines = [hex(hash(str(i))) for i in range(100)]
    # some extra handcrafted tests
    lines.extend(['AAA', 'AAAA', 'AAAB', 'AAB', 'AABB'])
    out = tmpdir.join('dawg.py')
    print(out)
    name_to_index = {line: i for i, line in enumerate(lines)}
    # The context manager closes the file even if code generation raises.
    with out.open('w') as o:
        build_compression_dawg(CodeWriter(o), name_to_index)
    print(out.read())
    dmod = out.pyimport()
    for i, line in enumerate(lines):
        assert dmod.lookup_charcode(i) == line
        assert dmod.dawg_lookup(line) == i


@given(strategies.integers(min_value=0), strategies.binary())
def test_varint_hypothesis(i, prefix):
    """Unsigned varints decode back to the encoded value, at any start offset."""
    chunks = []
    encode_varint_unsigned(i, chunks)
    encoded = b"".join(chunks)

    # decoding from the beginning must consume exactly the encoded bytes
    value, end = decode_varint_unsigned(encoded)
    assert (value, end) == (i, len(encoded))

    # same check with arbitrary bytes in front and an explicit start offset
    offset = len(prefix)
    value, end = decode_varint_unsigned(prefix + encoded, offset)
    assert (value, end) == (i, len(encoded) + offset)

@given(strategies.integers())
def test_add_bits(i):
    """number_split_bits(x, 2) undoes number_add_bits for every pair of bits."""
    for bit1 in (0, 1):
        for bit2 in (0, 1):
            combined = number_add_bits(i, bit1, bit2)
            assert number_split_bits(combined, 2) == (i, bit1, bit2)


# Code point bounds ('A' and 'G') of the alphabet used by test_random_dawg.
START, STOP = ord('A'), ord('G')

@given(strategies.lists(strategies.text(strategies.characters(min_codepoint=START, max_codepoint=STOP), min_size=1), min_size=5), strategies.data())
def test_random_dawg(l, data):
    """Property test: a generated DAWG module agrees with its input mapping.

    ``l`` is a list of at least five non-empty strings over the code points
    START..STOP.  Each string is mapped to its index in the list, the mapping
    is passed to ``build_compression_dawg`` and the written source is
    executed.  The resulting ``dawg_lookup`` / ``lookup_charcode`` functions
    must round-trip every entry and raise KeyError for near-miss strings and
    for indexes that are not values of the mapping.

    ``data`` is not used in the body; it only receives the value produced by
    the ``strategies.data()`` argument of ``@given``.
    """
    words = [s.encode('ascii') for s in l]
    print(words)

    # Duplicate words keep the index of their last occurrence.
    d = {word: i for i, word in enumerate(words)}
    tmpdir = pytest.ensuretemp(__name__)
    out = tmpdir.join('%s.py' % hash(str(words)))
    print("&~" * 50)
    print(words)
    # The context manager closes the file even if code generation raises.
    with out.open('w') as o:
        build_compression_dawg(CodeWriter(o), d)
    source = out.read()
    dmod = {}
    # Call form of exec: equivalent to ``exec source in dmod`` on Python 2.7.
    exec(source, dmod)
    dawg_lookup = dmod['dawg_lookup']
    lookup_charcode = dmod['lookup_charcode']

    def near_misses(word):
        """Yield variants of word: one char appended, prepended or replaced."""
        # NOTE(review): range() stops before STOP, so chr(STOP) is never used
        # as a replacement although the strategy can generate it -- confirm
        # whether that is intended.
        for code in range(START, STOP):
            replacement = chr(code)
            yield word + replacement
            yield replacement + word
            for pos in range(len(word)):
                yield word[:pos] + replacement + word[pos + 1:]

    for word, index in d.items():
        assert dawg_lookup(word) == index
        assert lookup_charcode(index) == word

        # check some near misses
        for candidate in near_misses(word):
            if candidate in d:
                continue
            with pytest.raises(KeyError):
                dawg_lookup(candidate)

    # Every index that is not a value of the mapping must be rejected.
    valid_values = set(d.values())
    for index in range(-100, len(words) + 100):
        if index in valid_values:
            continue
        with pytest.raises(KeyError):
            lookup_charcode(index)