File: insert.py

package info (click to toggle)
redis 5%3A8.0.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 22,304 kB
  • sloc: ansic: 216,903; tcl: 51,562; sh: 4,625; perl: 4,214; cpp: 3,568; python: 2,954; makefile: 2,055; ruby: 639; javascript: 30; csh: 7
file content (56 lines) | stat: -rw-r--r-- 1,834 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#
# Copyright (c) 2009-Present, Redis Ltd.
# All rights reserved.
#
# Licensed under your choice of (a) the Redis Source Available License 2.0
# (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
# GNU Affero General Public License v3 (AGPLv3).
#

import h5py
import redis
from tqdm import tqdm

# Shared Redis client used by add_to_redis. Replies are decoded to UTF-8
# strings instead of being returned as raw bytes.
redis_client = redis.Redis(
    host='localhost',
    port=6379,
    encoding='utf-8',
    decode_responses=True,
)

def add_to_redis(index, embedding):
    """Add one embedding to the ``glove_embeddings`` vector set using VADD.

    Args:
        index: Identifier of the vector. It is formatted as a string and used
            as the element name, since the dataset provides no words.
        embedding: Iterable of numeric components. Each component is
            converted with ``str`` before being sent.

    Raises:
        Any exception raised by ``redis_client.execute_command`` propagates
        to the caller.
    """
    # Materialize the components first so the count sent after VALUES always
    # matches the number of values that follow, whatever the vector length
    # (previously this was hard-coded to 100).
    components = [str(component) for component in embedding]

    args = ["VADD", "glove_embeddings", "VALUES", str(len(components))]
    args.extend(components)
    args.append(f"{index}")  # Using index as identifier since we don't have words
    args.extend(("EF", "200"))
    # Optional flags left disabled by the original author; append one of
    # them here to experiment: "NOQUANT", "BIN".
    redis_client.execute_command(*args)

def main():
    """Load the training vectors from the HDF5 file and insert them into Redis.

    Reads the ``train`` dataset of ``glove-100-angular.hdf5`` in slices of
    1000 rows and passes every row to ``add_to_redis``, using the row number
    as the identifier. A failure on a single vector is reported and skipped
    so that the remaining vectors are still loaded.
    """
    with h5py.File('glove-100-angular.hdf5', 'r') as f:
        # Get the train dataset
        train_vectors = f['train']
        total_vectors = train_vectors.shape[0]

        print(f"Starting to process {total_vectors} vectors...")

        # Process in batches to avoid memory issues
        batch_size = 1000
        report_every = 10000

        for batch_start in tqdm(range(0, total_vectors, batch_size)):
            batch_end = min(batch_start + batch_size, total_vectors)
            batch = train_vectors[batch_start:batch_end]

            for current_index, vector in enumerate(batch, start=batch_start):
                try:
                    add_to_redis(current_index, vector)
                except Exception as e:
                    # Deliberately best-effort: report the failure and keep
                    # going. tqdm.write keeps the message from garbling the
                    # progress bar.
                    tqdm.write(f"Error processing vector {current_index}: {str(e)}")

            # Report the number of vectors actually handled. batch_end never
            # exceeds the dataset size, unlike batch_start + batch_size on a
            # final partial batch. The last batch is always reported so the
            # final total is visible.
            if batch_end % report_every == 0 or batch_end == total_vectors:
                tqdm.write(f"Processed {batch_end} vectors")

# Run the import only when this file is executed as a script, so importing
# the module does not start loading vectors.
if __name__ == "__main__":
    main()