File: deletion.py

package info (click to toggle)
redis 5%3A8.0.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 22,304 kB
  • sloc: ansic: 216,903; tcl: 51,562; sh: 4,625; perl: 4,214; cpp: 3,568; python: 2,954; makefile: 2,055; ruby: 639; javascript: 30; csh: 7
file content (173 lines) | stat: -rw-r--r-- 7,560 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
from test import TestCase, fill_redis_with_vectors, generate_random_vector
import random

"""
A note about this test:
It was experimentally tried to modify hnsw.c in order to
avoid calling hnsw_reconnect_nodes(). In this case, the test
fails very often with EF set to 250, while it hardly
fails at all with the same parameters if hnsw_reconnect_nodes()
is called.

Note that for the nature of the test (it is very strict) it can
still fail from time to time, without this signaling any
actual bug.
"""

class VREM(TestCase):
    def getname(self):
        return "Deletion and graph state after deletion"

    def estimated_runtime(self):
        return 2.0

    def format_neighbors_with_scores(self, links_result, old_links=None, items_to_remove=None):
        """Format neighbors with their similarity scores and status indicators"""
        if not links_result:
            return "No neighbors"

        output = []
        for level, neighbors in enumerate(links_result):
            level_num = len(links_result) - level - 1
            output.append(f"Level {level_num}:")

            # Get neighbors and scores
            neighbors_with_scores = []
            for i in range(0, len(neighbors), 2):
                neighbor = neighbors[i].decode() if isinstance(neighbors[i], bytes) else neighbors[i]
                score = float(neighbors[i+1]) if i+1 < len(neighbors) else None
                status = ""

                # For old links, mark deleted ones
                if items_to_remove and neighbor in items_to_remove:
                    status = " [lost]"
                # For new links, mark newly added ones
                elif old_links is not None:
                    # Check if this neighbor was in the old links at this level
                    was_present = False
                    if old_links and level < len(old_links):
                        old_neighbors = [n.decode() if isinstance(n, bytes) else n
                                      for n in old_links[level]]
                        was_present = neighbor in old_neighbors
                    if not was_present:
                        status = " [gained]"

                if score is not None:
                    neighbors_with_scores.append(f"{len(neighbors_with_scores)+1}. {neighbor} ({score:.6f}){status}")
                else:
                    neighbors_with_scores.append(f"{len(neighbors_with_scores)+1}. {neighbor}{status}")

            output.extend(["    " + n for n in neighbors_with_scores])
        return "\n".join(output)

    def test(self):
        # 1. Fill server with random elements
        dim = 128
        count = 5000
        data = fill_redis_with_vectors(self.redis, self.test_key, count, dim)

        # 2. Do VSIM to get 200 items
        query_vec = generate_random_vector(dim)
        results = self.redis.execute_command('VSIM', self.test_key, 'VALUES', dim,
                                    *[str(x) for x in query_vec],
                                    'COUNT', 200, 'WITHSCORES')

        # Convert results to list of (item, score) pairs, sorted by score
        items = []
        for i in range(0, len(results), 2):
            item = results[i].decode()
            score = float(results[i+1])
            items.append((item, score))
        items.sort(key=lambda x: x[1], reverse=True)  # Sort by similarity

        # Store the graph structure for all items before deletion
        neighbors_before = {}
        for item, _ in items:
            links = self.redis.execute_command('VLINKS', self.test_key, item, 'WITHSCORES')
            if links:  # Some items might not have links
                neighbors_before[item] = links

        # 3. Remove 100 random items
        items_to_remove = set(item for item, _ in random.sample(items, 100))
        # Keep track of top 10 non-removed items
        top_remaining = []
        for item, score in items:
            if item not in items_to_remove:
                top_remaining.append((item, score))
                if len(top_remaining) == 10:
                    break

        # Remove the items
        for item in items_to_remove:
            result = self.redis.execute_command('VREM', self.test_key, item)
            assert result == 1, f"VREM failed to remove {item}"

        # 4. Do VSIM again with same vector
        new_results = self.redis.execute_command('VSIM', self.test_key, 'VALUES', dim,
                                        *[str(x) for x in query_vec],
                                        'COUNT', 200, 'WITHSCORES',
                                        'EF', 500)

        # Convert new results to dict of item -> score
        new_scores = {}
        for i in range(0, len(new_results), 2):
            item = new_results[i].decode()
            score = float(new_results[i+1])
            new_scores[item] = score

        failure = False
        failed_item = None
        failed_reason = None
        # 5. Verify all top 10 non-removed items are still found with similar scores
        for item, old_score in top_remaining:
            if item not in new_scores:
                failure = True
                failed_item = item
                failed_reason = "missing"
                break
            new_score = new_scores[item]
            if abs(new_score - old_score) >= 0.01:
                failure = True
                failed_item = item
                failed_reason = f"score changed: {old_score:.6f} -> {new_score:.6f}"
                break

        if failure:
            print("\nTest failed!")
            print(f"Problem with item: {failed_item} ({failed_reason})")

            print("\nOriginal neighbors (with similarity scores):")
            if failed_item in neighbors_before:
                print(self.format_neighbors_with_scores(
                    neighbors_before[failed_item], 
                    items_to_remove=items_to_remove))
            else:
                print("No neighbors found in original graph")

            print("\nCurrent neighbors (with similarity scores):")
            current_links = self.redis.execute_command('VLINKS', self.test_key, 
                                                     failed_item, 'WITHSCORES')
            if current_links:
                print(self.format_neighbors_with_scores(
                    current_links,
                    old_links=neighbors_before.get(failed_item)))
            else:
                print("No neighbors in current graph")

            print("\nOriginal results (top 20):")
            for item, score in items[:20]:
                deleted = "[deleted]" if item in items_to_remove else ""
                print(f"{item}: {score:.6f} {deleted}")

            print("\nNew results after removal (top 20):")
            new_items = []
            for i in range(0, len(new_results), 2):
                item = new_results[i].decode()
                score = float(new_results[i+1])
                new_items.append((item, score))
            new_items.sort(key=lambda x: x[1], reverse=True)
            for item, score in new_items[:20]:
                print(f"{item}: {score:.6f}")

            raise AssertionError(f"Test failed: Problem with item {failed_item} ({failed_reason}). *** IMPORTANT *** This test may fail from time to time without indicating that there is a bug. However normally it should pass. The fact is that it's a quite extreme test where we destroy 50% of nodes of top results and still expect perfect recall, with vectors that are very hostile because of the distribution used.")