File: lsh.go

package info (click to toggle)
golang-github-ekzhu-minhash-lsh 1.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 104 kB
  • sloc: makefile: 3
file content (249 lines) | stat: -rw-r--r-- 6,806 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
package minhashlsh

import (
	"encoding/binary"
	"math"
	"sort"
	"sync"
)

const (
	integrationPrecision = 0.01
)

type hashKeyFunc func([]uint64) string

func hashKeyFuncGen(hashValueSize int) hashKeyFunc {
	return func(sig []uint64) string {
		s := make([]byte, hashValueSize*len(sig))
		buf := make([]byte, 8)
		for i, v := range sig {
			binary.LittleEndian.PutUint64(buf, v)
			copy(s[i*hashValueSize:(i+1)*hashValueSize], buf[:hashValueSize])
		}
		return string(s)
	}
}

// Compute the integral of function f, lower limit a, upper limit l, and
// precision defined as the quantize step
func integral(f func(float64) float64, a, b, precision float64) float64 {
	var area float64
	for x := a; x < b; x += precision {
		area += f(x+0.5*precision) * precision
	}
	return area
}

// Probability density function for false positive
func falsePositive(l, k int) func(float64) float64 {
	return func(j float64) float64 {
		return 1.0 - math.Pow(1.0-math.Pow(j, float64(k)), float64(l))
	}
}

// Probability density function for false negative
func falseNegative(l, k int) func(float64) float64 {
	return func(j float64) float64 {
		return 1.0 - (1.0 - math.Pow(1.0-math.Pow(j, float64(k)), float64(l)))
	}
}

// Compute the cummulative probability of false negative given threshold t
func probFalseNegative(l, k int, t, precision float64) float64 {
	return integral(falseNegative(l, k), t, 1.0, precision)
}

// Compute the cummulative probability of false positive given threshold t
func probFalsePositive(l, k int, t, precision float64) float64 {
	return integral(falsePositive(l, k), 0, t, precision)
}

// optimalKL returns the optimal K and L for Jaccard similarity search,
// and the false positive and negative probabilities.
// t is the Jaccard similarity threshold.
func optimalKL(numHash int, t float64) (optK, optL int, fp, fn float64) {
	minError := math.MaxFloat64
	for l := 1; l <= numHash; l++ {
		for k := 1; k <= numHash; k++ {
			if l*k > numHash {
				continue
			}
			currFp := probFalsePositive(l, k, t, integrationPrecision)
			currFn := probFalseNegative(l, k, t, integrationPrecision)
			currErr := currFn + currFp
			if minError > currErr {
				minError = currErr
				optK = k
				optL = l
				fp = currFp
				fn = currFn
			}
		}
	}
	return
}

// NewMinhashLSH is the default constructor uses 32 bit hash value
var NewMinhashLSH = NewMinhashLSH32

type keys []interface{}

// For initial bootstrapping
type initHashTable map[string]keys

type bucket struct {
	hashKey string
	keys    keys
}

type hashTable []bucket

func (h hashTable) Len() int           { return len(h) }
func (h hashTable) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
func (h hashTable) Less(i, j int) bool { return h[i].hashKey < h[j].hashKey }

// MinhashLSH represents a MinHash LSH implemented using LSH Forest
// (http://ilpubs.stanford.edu:8090/678/1/2005-14.pdf).
// It supports query-time setting of the MinHash LSH parameters
// L (number of bands) and
// K (number of hash functions per band).
type MinhashLSH struct {
	k              int
	l              int
	initHashTables []initHashTable
	hashTables     []hashTable
	hashKeyFunc    hashKeyFunc
	hashValueSize  int
}

func newMinhashLSH(threshold float64, numHash, hashValueSize int) *MinhashLSH {
	k, l, _, _ := optimalKL(numHash, threshold)
	hashTables := make([]hashTable, l)
	for i := range hashTables {
		hashTables[i] = make(hashTable, 0)
	}
	initHashTables := make([]initHashTable, l)
	for i := range initHashTables {
		initHashTables[i] = make(initHashTable)
	}
	return &MinhashLSH{
		k:              k,
		l:              l,
		hashValueSize:  hashValueSize,
		initHashTables: initHashTables,
		hashTables:     hashTables,
		hashKeyFunc:    hashKeyFuncGen(hashValueSize),
	}
}

// NewMinhashLSH64 uses 64-bit hash values.
func NewMinhashLSH64(numHash int, threshold float64) *MinhashLSH {
	return newMinhashLSH(threshold, numHash, 8)
}

// NewMinhashLSH32 uses 32-bit hash values.
// MinHash signatures with 64 bit hash values will have
// their hash values trimed.
func NewMinhashLSH32(numHash int, threshold float64) *MinhashLSH {
	return newMinhashLSH(threshold, numHash, 4)
}

// NewMinhashLSH16 uses 16-bit hash values.
// MinHash signatures with 64 or 32 bit hash values will have
// their hash values trimed.
func NewMinhashLSH16(numHash int, threshold float64) *MinhashLSH {
	return newMinhashLSH(threshold, numHash, 2)
}

// Params returns the LSH parameters k and l
func (f *MinhashLSH) Params() (k, l int) {
	return f.k, f.l
}

// Add a key with MinHash signature into the index.
// The key won't be searchable until Index() is called.
func (f *MinhashLSH) Add(key interface{}, sig []uint64) {
	// Generate hash keys
	Hs := make([]string, f.l)
	for i := 0; i < f.l; i++ {
		Hs[i] = f.hashKeyFunc(sig[i*f.k : (i+1)*f.k])
	}
	// Insert keys into the bootstrapping tables
	for i := range f.initHashTables {
		func(ht initHashTable, hk string, key interface{}) {
			if _, exist := ht[hk]; exist {
				ht[hk] = append(ht[hk], key)
			} else {
				ht[hk] = make(keys, 1)
				ht[hk][0] = key
			}
		}(f.initHashTables[i], Hs[i], key)
	}
}

// Index makes all the keys added searchable.
func (f *MinhashLSH) Index() {
	var wg sync.WaitGroup
	wg.Add(len(f.hashTables))
	for i := range f.hashTables {
		func(htPtr *hashTable, initHtPtr *initHashTable) {
			// Build sorted hash table using buckets from init hash tables
			initHt := *initHtPtr
			ht := *htPtr
			for hashKey := range initHt {
				ks, _ := initHt[hashKey]
				ht = append(ht, bucket{
					hashKey: hashKey,
					keys:    ks,
				})
			}
			sort.Sort(ht)
			*htPtr = ht
			// Reset the init hash tables
			*initHtPtr = make(initHashTable)
			wg.Done()
		}(&(f.hashTables[i]), &(f.initHashTables[i]))
	}
	wg.Wait()
}

// Query returns candidate keys given the query signature.
func (f *MinhashLSH) Query(sig []uint64) []interface{} {
	set := f.query(sig, f.k)
	results := make([]interface{}, 0, len(set))
	for key := range set {
		results = append(results, key)
	}
	return results
}

func (f *MinhashLSH) query(sig []uint64, minK int) map[interface{}]bool {
	results := make(map[interface{}]bool)
	for K := f.k; K >= minK; K-- {
		prefixSize := f.hashValueSize * K
		// Generate hash keys
		Hs := make([]string, f.l)
		for i := 0; i < f.l; i++ {
			Hs[i] = f.hashKeyFunc(sig[i*f.k : i*f.k+K])
		}
		// Query hash tables
		for i := 0; i < f.l; i++ {
			ht := f.hashTables[i]
			hk := Hs[i]
			k := sort.Search(len(ht), func(x int) bool {
				return ht[x].hashKey[:prefixSize] >= hk
			})
			if k < len(ht) && ht[k].hashKey[:prefixSize] == hk {
				for j := k; j < len(ht) && ht[j].hashKey[:prefixSize] == hk; j++ {
					for _, key := range ht[j].keys {
						if _, exist := results[key]; !exist {
							results[key] = true
						}
					}
				}
			}
		}
	}
	return results
}