File: minhash.go

package info (click to toggle)
golang-github-ekzhu-minhash-lsh 1.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 104 kB
  • sloc: makefile: 3
file content (68 lines) | stat: -rw-r--r-- 1,511 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
package minhashlsh

import (
	"encoding/binary"
	"hash/fnv"
	"math/rand"

	minwise "github.com/dgryski/go-minhash"
)

// hashValueSize is the number of bytes in a single hash value for Minhash
// (one uint64).
const hashValueSize = 8

// Minhash represents a MinHash object. It wraps a minwise.MinWise and
// remembers the seed it was created with.
type Minhash struct {
	// mw is the wrapped go-minhash object that Push, Signature and Merge
	// delegate to.
	mw   *minwise.MinWise
	// seed is the seed given to NewMinhash; Merge compares it to refuse
	// combining objects built from different seeds.
	seed int64
}

// NewMinhash initializes a MinHash object with a seed and the number of
// hash functions.
//
// Two 8-byte salts are drawn, in order, from a math/rand generator seeded
// with seed. Each salt prefixes the input of its own FNV-1a 64-bit hasher,
// and the resulting pair of hash functions is handed to minwise.NewMinWise
// together with numHash. The seed is stored on the returned object so that
// Merge can detect objects built from a different seed.
func NewMinhash(seed int64, numHash int) *Minhash {
	rng := rand.New(rand.NewSource(seed))

	// saltedFNV draws the next value from rng, encodes it big-endian as a
	// fixed salt, and returns a hash function that feeds salt+data through a
	// dedicated FNV-1a hasher. The hasher is shared by every call of the
	// returned function (it is Reset first), so that function must not be
	// called concurrently.
	saltedFNV := func() func([]byte) uint64 {
		salt := make([]byte, hashValueSize)
		binary.BigEndian.PutUint64(salt, uint64(rng.Int63()))
		hasher := fnv.New64a()
		return func(data []byte) uint64 {
			hasher.Reset()
			// hash.Hash documents that Write never returns an error.
			hasher.Write(salt)
			hasher.Write(data)
			return hasher.Sum64()
		}
	}

	// Build the two functions in separate statements so the first random
	// draw always salts h1 and the second always salts h2.
	h1 := saltedFNV()
	h2 := saltedFNV()

	return &Minhash{
		mw:   minwise.NewMinWise(h1, h2, numHash),
		seed: seed,
	}
}

// Push adds a new value to the MinHash object.
// The value should be serialized to a byte slice.
func (m *Minhash) Push(b []byte) {
	// Delegate to the wrapped minwise.MinWise.
	m.mw.Push(b)
}

// Signature exports the MinHash as a list of hash values.
// The slice is whatever the wrapped minwise.MinWise returns from its own
// Signature method.
func (m *Minhash) Signature() []uint64 {
	return m.mw.Signature()
}

// Merge folds the signature of o into m, so that m afterwards carries the
// signature of the union. o is only read.
//
// Merge panics if m and o were created with different seeds.
func (m *Minhash) Merge(o *Minhash) {
	seedsDiffer := m.seed != o.seed
	if seedsDiffer {
		panic("Cannot merge Minhash with different seed")
	}
	other := o.mw
	m.mw.Merge(other)
}