File: minhash_test.go

package info (click to toggle)
golang-github-ekzhu-minhash-lsh 1.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 104 kB
  • sloc: makefile: 3
file content (69 lines) | stat: -rw-r--r-- 1,461 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
package minhashlsh

import (
	"fmt"
	"math"
	"testing"
)

func TestMinhash(t *testing.T) {
	m := NewMinhash(1, 256)
	m.Push([]byte("Test some input"))
}

func data(size int) [][]byte {
	d := make([][]byte, size)
	for i := range d {
		d[i] = []byte(fmt.Sprintf("salt%d %d", i, size))
	}
	return d
}

func hashing(mh *Minhash, start, end int, data [][]byte) {
	for i := start; i < end; i++ {
		mh.Push(data[i])
	}
}

func benchmark(minhashSize, dataSize int, t *testing.B) {
	if dataSize < 10 {
		fmt.Printf("\n")
		return
	}
	// Data is a set of unique values
	d := data(dataSize)
	// a and b are two subsets of data with some overlaps
	a_start, a_end := 0, int(float64(dataSize)*0.65)
	b_start, b_end := int(float64(dataSize)*0.35), dataSize

	m1 := NewMinhash(1, minhashSize)
	m2 := NewMinhash(1, minhashSize)

	t.ResetTimer()
	hashing(m1, a_start, a_end, d)
	hashing(m2, b_start, b_end, d)

	est := m1.mw.Similarity(m2.mw)
	act := float64(a_end-b_start) / float64(b_end-a_start)
	err := math.Abs(act - est)
	fmt.Printf("Data size: %8d, ", dataSize)
	fmt.Printf("Real resemblance: %.8f, ", act)
	fmt.Printf("Estimated resemblance: %.8f, ", est)
	fmt.Printf("Absolute Error: %.8f\n", err)
}

func BenchmarkMinWise64(b *testing.B) {
	benchmark(64, b.N, b)
}

func BenchmarkMinWise128(b *testing.B) {
	benchmark(128, b.N, b)
}

func BenchmarkMinWise256(b *testing.B) {
	benchmark(256, b.N, b)
}

func BenchmarkMinWise512(b *testing.B) {
	benchmark(512, b.N, b)
}