File: lsh_test.go

package info (click to toggle)
golang-github-ekzhu-minhash-lsh 1.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 104 kB
  • sloc: makefile: 3
file content (95 lines) | stat: -rw-r--r-- 2,057 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
package minhashlsh

import (
	"math/rand"
	"testing"
)

// randomSignature builds a signature of the requested size whose values are
// drawn from a pseudo-random generator seeded with seed. Calling it twice
// with the same arguments yields identical signatures, which the tests rely
// on to fabricate duplicate entries.
func randomSignature(size int, seed int64) []uint64 {
	rng := rand.New(rand.NewSource(seed))
	values := make([]uint64, size)
	for idx := 0; idx < len(values); idx++ {
		// Int63 is non-negative, so the conversion to uint64 is lossless.
		values[idx] = uint64(rng.Int63())
	}
	return values
}

func Test_HashKeyFunc16(t *testing.T) {
	sig := randomSignature(2, 1)
	f := hashKeyFuncGen(2)
	hashKey := f(sig)
	if len(hashKey) != 2*2 {
		t.Fatal(len(hashKey))
	}
}

// Test_HashKeyFunc64 checks that the function returned by hashKeyFuncGen(8)
// maps a two-element signature to a hash key of length 8*2.
// NOTE(review): the "64" presumably refers to 8-byte (64-bit) hash values;
// confirm against hashKeyFuncGen.
func Test_HashKeyFunc64(t *testing.T) {
	const (
		valueSize = 8 // argument handed to hashKeyFuncGen
		sigLen    = 2 // number of entries in the test signature
	)
	signature := randomSignature(sigLen, 1)
	key := hashKeyFuncGen(valueSize)(signature)
	if keyLen := len(key); keyLen != valueSize*sigLen {
		t.Fatal(keyLen)
	}
}

// Test_MinhashLSH indexes three signatures, one unique and two identical,
// then verifies the bucket layout of every hash table and that querying with
// the duplicated signature returns both of its keys.
func Test_MinhashLSH(t *testing.T) {
	f := NewMinhashLSH16(256, 0.6)
	// sig1 is different from sig2 and sig3.
	// sig2 and sig3 are identical because they share a seed.
	sig1 := randomSignature(256, 1)
	sig2 := randomSignature(256, 2)
	sig3 := randomSignature(256, 2)

	f.Add("sig1", sig1)
	f.Add("sig2", sig2)
	f.Add("sig3", sig3)
	f.Index()
	// sig1 should be in its own bucket and sig2/sig3 together in another,
	// so every hash table is expected to hold exactly two buckets.
	for i := range f.hashTables {
		if n := len(f.hashTables[i]); n != 2 {
			t.Fatalf("hash table %d has %d buckets, want 2: %v", i, n, f.hashTables[i])
		}
	}

	// Record distinct keys rather than counting matches: a counter would be
	// satisfied by the same key appearing twice in the result.
	found := make(map[string]bool)
	for _, key := range f.Query(sig3) {
		name, ok := key.(string)
		if !ok {
			// Only string keys were added; report instead of panicking.
			t.Fatalf("query returned key of type %T, want string", key)
		}
		found[name] = true
	}
	for _, want := range []string{"sig2", "sig3"} {
		if !found[want] {
			t.Errorf("unable to retrieve inserted key %q; query returned %v", want, found)
		}
	}
}

// Test_MinhashLSH2 indexes the Minhash signature of a 13-word set under the
// key "s1" and checks that querying with the signature of a 10-word subset
// of those words returns that key.
func Test_MinhashLSH2(t *testing.T) {
	const (
		seed    = int64(1)
		numHash = 256
	)
	// NOTE(review): 0.5 is presumably the similarity threshold; the two word
	// sets below share 10 of 13 distinct words.
	minhashLsh := NewMinhashLSH16(numHash, 0.5)

	// signatureOf hashes the given words with a fresh Minhash that always
	// uses the same seed and size, so the signatures are comparable.
	signatureOf := func(words ...string) []uint64 {
		mh := NewMinhash(seed, numHash)
		for _, word := range words {
			mh.Push([]byte(word))
		}
		return mh.Signature()
	}

	sig1 := signatureOf("hello", "world", "minhash", "one", "two", "three", "four",
		"five", "six", "seven", "eight", "nine", "ten")
	minhashLsh.Add("s1", sig1)
	minhashLsh.Index()
	k, l := minhashLsh.Params()
	t.Logf("Minhash LSH params: k = %d, l = %d", k, l)

	sig2 := signatureOf("one", "two", "three", "four",
		"five", "six", "seven", "eight", "nine", "ten")

	results := minhashLsh.Query(sig2)
	t.Log(results)
	if len(results) < 1 {
		t.Fatalf("query returned no candidates, want %q", "s1")
	}
	// "s1" is the only key ever added, so it must be among the candidates.
	for _, r := range results {
		if r == "s1" {
			return
		}
	}
	t.Fatalf("query returned %v, want it to contain %q", results, "s1")
}