File: bindings_test_labels.py

import os
import unittest

import numpy as np

import hnswlib


class RandomSelfTestCase(unittest.TestCase):
    def testRandomSelf(self):
        for idx in range(2):
            print("\n**** Index save-load test ****\n")

            np.random.seed(idx)
            dim = 16
            num_elements = 10000

            # Generating sample data
            data = np.float32(np.random.random((num_elements, dim)))

            # Declaring index
            p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip

            # Initializing the index
            # max_elements - the maximum number of elements, should be known beforehand
            #     (probably will be made optional in the future)
            #
            # ef_construction - controls the index search speed/build speed trade-off
            # M - is tightly connected with the internal dimensionality of the data and
            #     strongly affects the memory consumption

            p.init_index(max_elements=num_elements, ef_construction=100, M=16)
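            # (Per the upstream parameter notes, M in roughly the 12-48 range works
            # well for most datasets; higher ef_construction trades build time for recall.)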

            # Controlling the recall by setting ef:
            # higher ef leads to better accuracy, but slower search
            p.set_ef(100)
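            # Per the upstream docs, ef should be no smaller than the k used in
            # knn_query, otherwise the search may fail to return k results.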

            p.set_num_threads(4)  # by default using all available cores

            # We split the data in two batches:
            data1 = data[:num_elements // 2]
            data2 = data[num_elements // 2:]

            print("Adding first batch of %d elements" % (len(data1)))
            p.add_items(data1)
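            # add_items was called without explicit ids, so hnswlib assigns
            # sequential labels 0..len(data1)-1; the recall checks below rely on this.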

            # Query the elements for themselves and measure recall:
            labels, distances = p.knn_query(data1, k=1)

            items = p.get_items(labels)

            # Check the recall:
            self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)

            # Check that the returned element data is correct:
            diff_with_gt_labels = np.mean(np.abs(data1 - items))
            self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

            # Serializing and deleting the index.
            # We need this step to check that serialization works properly.

            index_path = 'first_half.bin'
            print("Saving index to '%s'" % index_path)
            p.save_index(index_path)
            print("Saved. Deleting...")
            del p
            print("Deleted")

            print("\n**** Mark delete test ****\n")
            # Re-initializing and loading the index
            print("Re-initializing")
            p = hnswlib.Index(space='l2', dim=dim)

            print("\nLoading index from '%s'\n" % index_path)
            p.load_index(index_path)
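            # Note: load_index also accepts a max_elements argument to grow capacity
            # on load; here the saved capacity (num_elements) already leaves room for data2.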
            p.set_ef(100)

            print("Adding the second batch of %d elements" % (len(data2)))
            p.add_items(data2)

            # Query the elements for themselves and measure recall:
            labels, distances = p.knn_query(data, k=1)
            items = p.get_items(labels)

            # Check the recall:
            self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)

            # Check that the returned element data is correct:
            diff_with_gt_labels = np.mean(np.abs(data - items))
            self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

            # Checking that all labels are returned correctly:
            sorted_labels = sorted(p.get_ids_list())
            self.assertEqual(np.sum(np.asarray(sorted_labels) != np.arange(num_elements)), 0)

            # Mark the elements of data1 as deleted
            labels1_deleted, _ = p.knn_query(data1, k=1)
            # deduplicate: the same label can be the nearest neighbor of several queries
            labels1_deleted_no_dup = set(labels1_deleted.flatten())
            for l in labels1_deleted_no_dup:
                p.mark_deleted(l)
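            # mark_deleted only excludes an element from search results; its memory
            # is not freed, and the label can be restored later with unmark_deleted.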
            labels2, _ = p.knn_query(data2, k=1)
            items = p.get_items(labels2)
            diff_with_gt_labels = np.mean(np.abs(data2-items))
            self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3)

            labels1_after, _ = p.knn_query(data1, k=1)
            for la in labels1_after:
                if la[0] in labels1_deleted_no_dup:
                    print(f"Found deleted label {la[0]} during knn search")
                    self.assertTrue(False)
            print("All the data in data1 are removed")

            # Checking saving/loading index with elements marked as deleted
            del_index_path = "with_deleted.bin"
            p.save_index(del_index_path)
            p = hnswlib.Index(space='l2', dim=dim)
            p.load_index(del_index_path)
            p.set_ef(100)

            labels1_after, _ = p.knn_query(data1, k=1)
            for la in labels1_after:
                if la[0] in labels1_deleted_no_dup:
                    print(f"Found deleted label {la[0]} during knn search after index loading")
                    self.assertTrue(False)

            # Unmark deleted data
            for l in labels1_deleted_no_dup:
                p.unmark_deleted(l)
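            # The unmarked elements take part in searches again, which the recall
            # check below verifies.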
            labels_restored, _ = p.knn_query(data1, k=1)
            self.assertAlmostEqual(np.mean(labels_restored.reshape(-1) == np.arange(len(data1))), 1.0, 3)
            print("All the data in data1 are restored")

        os.remove(index_path)
        os.remove(del_index_path)
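

if __name__ == "__main__":
    # Allow running this test module directly.
    unittest.main()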