File: test_quality.py

package info (click to toggle)
python-whoosh 2.7.4%2Bgit6-g9134ad92-5
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,656 kB
  • sloc: python: 38,517; makefile: 118
file content (172 lines) | stat: -rw-r--r-- 5,527 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
from __future__ import with_statement
import random

from whoosh import fields, matching, scoring
from whoosh.compat import b, u, xrange
from whoosh.filedb.filestore import RamStorage
from whoosh.util.numeric import length_to_byte, byte_to_length


def _discreet(length):
    """Return ``length`` after a ``length_to_byte``/``byte_to_length`` round trip.

    The tests below compare the field lengths reported by a reader against
    this value rather than against the raw word count.
    """
    encoded = length_to_byte(length)
    return byte_to_length(encoded)


def test_max_field_length():
    """Check ``max_field_length`` after each of a series of growing documents.

    Every iteration commits one more document whose ``t`` field holds
    ``word_count`` copies of "word". Because ``word_count`` only increases,
    the newest document is always the longest one in the index.
    """
    index = RamStorage().create_index(fields.Schema(t=fields.TEXT))

    for word_count in xrange(1, 200, 7):
        text = u(" ").join(["word"] * word_count)

        writer = index.writer()
        writer.add_document(t=text)
        writer.commit()

        with index.reader() as reader:
            longest = reader.max_field_length("t")
            # The reader's value is compared with the round-tripped count,
            # not the raw count.
            assert longest == _discreet(word_count)


def test_minmax_field_length():
    """Check ``min_field_length``/``max_field_length`` with random lengths.

    Each iteration commits one document whose ``t`` field holds a random
    number (1..100) of copies of "word", then verifies that the reader
    reports the round-tripped smallest and largest lengths seen so far.

    The lengths come from the module-level ``random`` generator, so they
    differ between runs. Every length drawn is recorded and included in the
    assertion messages, which lets a failing run be diagnosed and replayed.
    """
    st = RamStorage()
    schema = fields.Schema(t=fields.TEXT)
    ix = st.create_index(schema)

    counts = []  # every document length added so far, in insertion order
    for _ in xrange(1, 200, 7):
        w = ix.writer()
        count = random.randint(1, 100)
        counts.append(count)
        w.add_document(t=u(" ").join(["word"] * count))
        w.commit()

        least = min(counts)
        most = max(counts)
        with ix.reader() as r:
            assert r.min_field_length("t") == _discreet(least), \
                "min_field_length mismatch; document lengths: %r" % (counts,)
            assert r.max_field_length("t") == _discreet(most), \
                "max_field_length mismatch; document lengths: %r" % (counts,)


def test_term_stats():
    """Check term and field statistics after one commit and after a second.

    The second batch of documents is committed with ``merge=False``; the
    statistics are then expected to cover the documents of both commits.
    """
    schema = fields.Schema(t=fields.TEXT)
    index = RamStorage().create_index(schema)

    first_batch = (
        "alfa bravo charlie delta echo",
        "bravo charlie delta echo foxtrot",
        "charlie delta echo foxtrot golf",
        "delta echo foxtrot",
        "echo foxtrot golf hotel india juliet",
        "foxtrot alfa alfa alfa",
    )
    writer = index.writer()
    for text in first_batch:
        writer.add_document(t=u(text))
    writer.commit()

    with index.reader() as reader:
        # "alfa" occurs once in the 5-word first document and three times in
        # the 4-word last document.
        alfa = reader.term_info("t", u("alfa"))
        assert alfa.weight() == 4.0
        assert alfa.doc_frequency() == 2
        assert alfa.min_length() == 4
        assert alfa.max_length() == 5
        assert alfa.max_weight() == 3.0

        # The shortest document containing "echo" is "delta echo foxtrot".
        echo = reader.term_info("t", u("echo"))
        assert echo.min_length() == 3

        assert reader.doc_field_length(3, "t") == 3
        assert reader.min_field_length("t") == 3
        assert reader.max_field_length("t") == 6

    second_batch = (
        "alfa",
        "bravo charlie",
        "echo foxtrot tango bravo",
        "golf hotel",
        "india",
        "juliet alfa bravo charlie delta echo foxtrot",
    )
    writer = index.writer()
    for text in second_batch:
        writer.add_document(t=u(text))
    writer.commit(merge=False)

    with index.reader() as reader:
        # The new batch adds a 1-word and a 7-word document containing
        # "alfa" once each.
        alfa = reader.term_info("t", u("alfa"))
        assert alfa.weight() == 6.0
        assert alfa.doc_frequency() == 4
        assert alfa.min_length() == 1
        assert alfa.max_length() == 7
        assert alfa.max_weight() == 3.0

        # No new document containing "echo" is shorter than 3 words.
        echo = reader.term_info("t", u("echo"))
        assert echo.min_length() == 3

        assert reader.min_field_length("t") == 1
        assert reader.max_field_length("t") == 7


def test_min_max_id():
    """Check ``min_id``/``max_id`` of term infos after one and two commits.

    Each document's stored ``id`` equals its position in insertion order,
    and the expected minimum and maximum ids below are the positions of the
    first and last documents that contain the term. The second batch is
    committed with ``merge=False``.
    """
    schema = fields.Schema(id=fields.STORED, t=fields.TEXT)
    index = RamStorage().create_index(schema)

    first_batch = (
        (0, "alfa bravo charlie"),
        (1, "bravo charlie delta"),
        (2, "charlie delta echo"),
        (3, "delta echo foxtrot"),
        (4, "echo foxtrot golf"),
    )
    writer = index.writer()
    for doc_id, text in first_batch:
        writer.add_document(id=doc_id, t=u(text))
    writer.commit()

    # (term, expected min_id, expected max_id)
    expected_after_first = (
        ("delta", 1, 3),
        ("alfa", 0, 0),
        ("foxtrot", 3, 4),
    )
    with index.reader() as reader:
        for term, lowest, highest in expected_after_first:
            info = reader.term_info("t", u(term))
            assert info.min_id() == lowest
            assert info.max_id() == highest

    second_batch = (
        (5, "foxtrot golf hotel"),
        (6, "golf hotel alfa"),
        (7, "hotel alfa bravo"),
        (8, "alfa bravo charlie"),
    )
    writer = index.writer()
    for doc_id, text in second_batch:
        writer.add_document(id=doc_id, t=u(text))
    writer.commit(merge=False)

    # "delta" is absent from the second batch, so its range is unchanged;
    # "alfa" and "foxtrot" extend to the last new document containing them.
    expected_after_second = (
        ("delta", 1, 3),
        ("alfa", 0, 8),
        ("foxtrot", 3, 5),
    )
    with index.reader() as reader:
        for term, lowest, highest in expected_after_second:
            info = reader.term_info("t", u(term))
            assert info.min_id() == lowest
            assert info.max_id() == highest


def test_replacements():
    """Check what ``replace(minquality)`` returns for several matcher types.

    All list matchers cover documents 1, 2 and 3 with a constant weight and
    a ``WeightScorer`` of that same weight.
    """
    def constant_matcher(weight):
        # List matcher over docs 1-3 with its own scorer of the given weight.
        return matching.ListMatcher([1, 2, 3], [weight] * 3,
                                    scorer=scoring.WeightScorer(weight))

    shared_scorer = scoring.WeightScorer(0.25)
    left = matching.ListMatcher([1, 2, 3], [0.25, 0.25, 0.25],
                                scorer=shared_scorer)
    right = matching.ListMatcher([1, 2, 3], [0.25, 0.25, 0.25],
                                 scorer=shared_scorer)
    union = matching.UnionMatcher(left, right)

    # A single 0.25 matcher is replaced by the null matcher at 0.5.
    replaced_leaf = left.replace(0.5)
    assert replaced_leaf.__class__ == matching.NullMatcherClass

    # The union of two 0.25 matchers becomes an intersection at 0.5 and the
    # null matcher at 0.6.
    replaced_union = union.replace(0.5)
    assert replaced_union.__class__ == matching.IntersectionMatcher
    replaced_union = union.replace(0.6)
    assert replaced_union.__class__ == matching.NullMatcherClass

    # A wrapper stays a wrapper with its boost; only its child is replaced.
    wrapper = matching.WrappingMatcher(union, boost=2.0)
    wrapper = wrapper.replace(0.5)
    assert wrapper.__class__ == matching.WrappingMatcher
    assert wrapper.boost == 2.0
    assert wrapper.child.__class__ == matching.IntersectionMatcher

    ls1 = constant_matcher(0.1)
    ls2 = constant_matcher(0.2)
    ls3 = constant_matcher(0.3)

    # With weights 0.1, 0.2 and 0.3 and a minimum of 0.25, the multi matcher
    # is expected to end up on its third sub-matcher (index 2).
    multi = matching.MultiMatcher([ls1, ls2, ls3], [0, 4, 8])
    multi = multi.replace(0.25)
    assert multi.current == 2

    # At 0.15 the disjunction-max of the 0.1 and 0.2 matchers is replaced by
    # the 0.2 matcher itself.
    dismax = matching.DisjunctionMaxMatcher(ls1, ls2)
    dismax = dismax.replace(0.15)
    assert dismax is ls2