1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
|
from __future__ import with_statement
import random
from whoosh import fields, matching, scoring
from whoosh.compat import b, u, xrange
from whoosh.filedb.filestore import RamStorage
from whoosh.util.numeric import length_to_byte, byte_to_length
def _discreet(length):
return byte_to_length(length_to_byte(length))
def test_max_field_length():
st = RamStorage()
schema = fields.Schema(t=fields.TEXT)
ix = st.create_index(schema)
for i in xrange(1, 200, 7):
w = ix.writer()
w.add_document(t=u(" ").join(["word"] * i))
w.commit()
with ix.reader() as r:
assert r.max_field_length("t") == _discreet(i)
def test_minmax_field_length():
st = RamStorage()
schema = fields.Schema(t=fields.TEXT)
ix = st.create_index(schema)
least = 999999
most = 0
for _ in xrange(1, 200, 7):
w = ix.writer()
count = random.randint(1, 100)
least = min(count, least)
most = max(count, most)
w.add_document(t=u(" ").join(["word"] * count))
w.commit()
with ix.reader() as r:
assert r.min_field_length("t") == _discreet(least)
assert r.max_field_length("t") == _discreet(most)
def test_term_stats():
schema = fields.Schema(t=fields.TEXT)
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(t=u("alfa bravo charlie delta echo"))
w.add_document(t=u("bravo charlie delta echo foxtrot"))
w.add_document(t=u("charlie delta echo foxtrot golf"))
w.add_document(t=u("delta echo foxtrot"))
w.add_document(t=u("echo foxtrot golf hotel india juliet"))
w.add_document(t=u("foxtrot alfa alfa alfa"))
w.commit()
with ix.reader() as r:
ti = r.term_info("t", u("alfa"))
assert ti.weight() == 4.0
assert ti.doc_frequency() == 2
assert ti.min_length() == 4
assert ti.max_length() == 5
assert ti.max_weight() == 3.0
assert r.term_info("t", u("echo")).min_length() == 3
assert r.doc_field_length(3, "t") == 3
assert r.min_field_length("t") == 3
assert r.max_field_length("t") == 6
w = ix.writer()
w.add_document(t=u("alfa"))
w.add_document(t=u("bravo charlie"))
w.add_document(t=u("echo foxtrot tango bravo"))
w.add_document(t=u("golf hotel"))
w.add_document(t=u("india"))
w.add_document(t=u("juliet alfa bravo charlie delta echo foxtrot"))
w.commit(merge=False)
with ix.reader() as r:
ti = r.term_info("t", u("alfa"))
assert ti.weight() == 6.0
assert ti.doc_frequency() == 4
assert ti.min_length() == 1
assert ti.max_length() == 7
assert ti.max_weight() == 3.0
assert r.term_info("t", u("echo")).min_length() == 3
assert r.min_field_length("t") == 1
assert r.max_field_length("t") == 7
def test_min_max_id():
schema = fields.Schema(id=fields.STORED, t=fields.TEXT)
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(id=0, t=u("alfa bravo charlie"))
w.add_document(id=1, t=u("bravo charlie delta"))
w.add_document(id=2, t=u("charlie delta echo"))
w.add_document(id=3, t=u("delta echo foxtrot"))
w.add_document(id=4, t=u("echo foxtrot golf"))
w.commit()
with ix.reader() as r:
ti = r.term_info("t", u("delta"))
assert ti.min_id() == 1
assert ti.max_id() == 3
ti = r.term_info("t", u("alfa"))
assert ti.min_id() == 0
assert ti.max_id() == 0
ti = r.term_info("t", u("foxtrot"))
assert ti.min_id() == 3
assert ti.max_id() == 4
w = ix.writer()
w.add_document(id=5, t=u("foxtrot golf hotel"))
w.add_document(id=6, t=u("golf hotel alfa"))
w.add_document(id=7, t=u("hotel alfa bravo"))
w.add_document(id=8, t=u("alfa bravo charlie"))
w.commit(merge=False)
with ix.reader() as r:
ti = r.term_info("t", u("delta"))
assert ti.min_id() == 1
assert ti.max_id() == 3
ti = r.term_info("t", u("alfa"))
assert ti.min_id() == 0
assert ti.max_id() == 8
ti = r.term_info("t", u("foxtrot"))
assert ti.min_id() == 3
assert ti.max_id() == 5
def test_replacements():
sc = scoring.WeightScorer(0.25)
a = matching.ListMatcher([1, 2, 3], [0.25, 0.25, 0.25], scorer=sc)
b = matching.ListMatcher([1, 2, 3], [0.25, 0.25, 0.25], scorer=sc)
um = matching.UnionMatcher(a, b)
a2 = a.replace(0.5)
assert a2.__class__ == matching.NullMatcherClass
um2 = um.replace(0.5)
assert um2.__class__ == matching.IntersectionMatcher
um2 = um.replace(0.6)
assert um2.__class__ == matching.NullMatcherClass
wm = matching.WrappingMatcher(um, boost=2.0)
wm = wm.replace(0.5)
assert wm.__class__ == matching.WrappingMatcher
assert wm.boost == 2.0
assert wm.child.__class__ == matching.IntersectionMatcher
ls1 = matching.ListMatcher([1, 2, 3], [0.1, 0.1, 0.1],
scorer=scoring.WeightScorer(0.1))
ls2 = matching.ListMatcher([1, 2, 3], [0.2, 0.2, 0.2],
scorer=scoring.WeightScorer(0.2))
ls3 = matching.ListMatcher([1, 2, 3], [0.3, 0.3, 0.3],
scorer=scoring.WeightScorer(0.3))
mm = matching.MultiMatcher([ls1, ls2, ls3], [0, 4, 8])
mm = mm.replace(0.25)
assert mm.current == 2
dm = matching.DisjunctionMaxMatcher(ls1, ls2)
dm = dm.replace(0.15)
assert dm is ls2
|