File: datasets.py

package info (click to toggle)
faiss 1.13.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 9,228 kB
  • sloc: cpp: 91,727; python: 31,865; sh: 874; ansic: 425; makefile: 41
file content (465 lines) | stat: -rw-r--r-- 16,414 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import numpy as np
import faiss
import getpass


from .vecs_io import fvecs_read, ivecs_read, bvecs_mmap, fvecs_mmap, bvecs_iter, bvecs_iter_chunked
from .exhaustive_search import knn


class Dataset:
    """ Generic abstract class for a test dataset """

    def __init__(self):
        """ the constructor should set the following fields: """
        self.d = -1
        self.metric = 'L2'   # or IP
        self.nq = -1
        self.nb = -1
        self.nt = -1

    def get_queries(self):
        """ return the queries as a (nq, d) array """
        raise NotImplementedError()

    def get_train(self, maxtrain=None):
        """ return the queries as a (nt, d) array """
        raise NotImplementedError()

    def get_database(self):
        """ return the queries as a (nb, d) array """
        raise NotImplementedError()

    def database_iterator(self, bs=128, split=(1, 0)):
        """returns an iterator on database vectors.
        bs is the number of vectors per batch
        split = (nsplit, rank) means the dataset is split in nsplit
        shards and we want shard number rank
        The default implementation just iterates over the full matrix
        returned by get_dataset.
        """
        xb = self.get_database()
        nsplit, rank = split
        i0, i1 = self.nb * rank // nsplit, self.nb * (rank + 1) // nsplit
        for j0 in range(i0, i1, bs):
            yield xb[j0: min(j0 + bs, i1)]

    def get_groundtruth(self, k=None):
        """ return the ground truth for k-nearest neighbor search """
        raise NotImplementedError()

    def get_groundtruth_range(self, thresh=None):
        """ return the ground truth for range search """
        raise NotImplementedError()

    def __str__(self):
        return (f"dataset in dimension {self.d}, with metric {self.metric}, "
                f"size: Q {self.nq} B {self.nb} T {self.nt}")

    def check_sizes(self):
        """ runs the previous and checks the sizes of the matrices """
        assert self.get_queries().shape == (self.nq, self.d)
        if self.nt > 0:
            xt = self.get_train(maxtrain=123)
            assert xt.shape == (123, self.d), "shape=%s" % (xt.shape, )
        assert self.get_database().shape == (self.nb, self.d)
        assert self.get_groundtruth(k=13).shape == (self.nq, 13)


class SyntheticDataset(Dataset):
    """A dataset that is not completely random but still challenging to
    index
    """

    def __init__(self, d, nt, nb, nq, metric='L2', seed=1338):
        Dataset.__init__(self)
        self.d, self.nt, self.nb, self.nq = d, nt, nb, nq
        d1 = 10     # intrinsic dimension (more or less)
        n = nb + nt + nq
        rs = np.random.RandomState(seed)
        x = rs.normal(size=(n, d1))
        x = np.dot(x, rs.rand(d1, d))
        # now we have a d1-dim ellipsoid in d-dimensional space
        # higher factor (>4) -> higher frequency -> less linear
        x = x * (rs.rand(d) * 4 + 0.1)
        x = np.sin(x)
        x = x.astype('float32')
        self.metric = metric
        self.xt = x[:nt]
        self.xb = x[nt:nt + nb]
        self.xq = x[nt + nb:]

    def get_queries(self):
        return self.xq

    def get_train(self, maxtrain=None):
        maxtrain = maxtrain if maxtrain is not None else self.nt
        return self.xt[:maxtrain]

    def get_database(self):
        return self.xb

    def get_groundtruth(self, k=100):
        return knn(
            self.xq, self.xb, k,
            faiss.METRIC_L2 if self.metric == 'L2' else faiss.METRIC_INNER_PRODUCT
        )[1]


############################################################################
# The following datasets are a few standard open-source datasets
# they should be stored in a directory, and we start by guessing where
# that directory is
############################################################################

username = getpass.getuser()

for dataset_basedir in (
        '/datasets01/simsearch/041218/',
        '/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/',
        f'/home/{username}/simsearch/data/'):
    if os.path.exists(dataset_basedir):
        break
else:
    # users can link their data directory to `./data`
    dataset_basedir = 'data/'


def set_dataset_basedir(path):
    global dataset_basedir
    dataset_basedir = path


class DatasetSIFT1M(Dataset):
    """
    The original dataset is available at: http://corpus-texmex.irisa.fr/
    (ANN_SIFT1M)
    """

    def __init__(self):
        Dataset.__init__(self)
        self.d, self.nt, self.nb, self.nq = 128, 100000, 1000000, 10000
        self.basedir = dataset_basedir + 'sift1M/'

    def get_queries(self):
        return fvecs_read(self.basedir + "sift_query.fvecs")

    def get_train(self, maxtrain=None):
        maxtrain = maxtrain if maxtrain is not None else self.nt
        return fvecs_read(self.basedir + "sift_learn.fvecs")[:maxtrain]

    def get_database(self):
        return fvecs_read(self.basedir + "sift_base.fvecs")

    def get_groundtruth(self, k=None):
        gt = ivecs_read(self.basedir + "sift_groundtruth.ivecs")
        if k is not None:
            assert k <= 100
            gt = gt[:, :k]
        return gt


def sanitize(x):
    return np.ascontiguousarray(x, dtype='float32')


class DatasetBigANN(Dataset):
    """
    The original dataset is available at: http://corpus-texmex.irisa.fr/
    (ANN_SIFT1B)
    """

    def __init__(self, nb_M=1000):
        Dataset.__init__(self)
        assert nb_M in (1, 2, 5, 10, 20, 50, 100, 200, 500, 1000)
        self.nb_M = nb_M
        nb = nb_M * 10**6
        self.d, self.nt, self.nb, self.nq = 128, 10**8, nb, 10000
        self.basedir = dataset_basedir + 'bigann/'

    def get_queries(self):
        return sanitize(bvecs_mmap(self.basedir + 'bigann_query.bvecs')[:])

    def get_train(self, maxtrain=None):
        maxtrain = maxtrain if maxtrain is not None else self.nt
        return sanitize(bvecs_mmap(self.basedir + 'bigann_learn.bvecs')[:maxtrain])

    def get_groundtruth(self, k=None):
        gt = ivecs_read(self.basedir + 'gnd/idx_%dM.ivecs' % self.nb_M)
        if k is not None:
            assert k <= 100
            gt = gt[:, :k]
        return gt

    def get_database(self):
        assert self.nb_M < 100, "dataset too large, use iterator"
        return sanitize(bvecs_mmap(self.basedir + 'bigann_base.bvecs')[:self.nb])

    def database_iterator(self, bs=128, split=(1, 0)):
        xb = bvecs_mmap(self.basedir + 'bigann_base.bvecs')
        nsplit, rank = split
        i0, i1 = self.nb * rank // nsplit, self.nb * (rank + 1) // nsplit
        for j0 in range(i0, i1, bs):
            yield sanitize(xb[j0: min(j0 + bs, i1)])


class DatasetDeep1B(Dataset):
    """
    See
    https://github.com/facebookresearch/faiss/tree/main/benchs#getting-deep1b
    on how to get the data
    """

    def __init__(self, nb=10**9):
        Dataset.__init__(self)
        nb_to_name = {
            10**5: '100k',
            10**6: '1M',
            10**7: '10M',
            10**8: '100M',
            10**9: '1B'
        }
        assert nb in nb_to_name
        self.d, self.nt, self.nb, self.nq = 96, 358480000, nb, 10000
        self.basedir = dataset_basedir + 'deep1b/'
        self.gt_fname = "%sdeep%s_groundtruth.ivecs" % (
            self.basedir, nb_to_name[self.nb])

    def get_queries(self):
        return sanitize(fvecs_read(self.basedir + "deep1B_queries.fvecs"))

    def get_train(self, maxtrain=None):
        maxtrain = maxtrain if maxtrain is not None else self.nt
        return sanitize(fvecs_mmap(self.basedir + "learn.fvecs")[:maxtrain])

    def get_groundtruth(self, k=None):
        gt = ivecs_read(self.gt_fname)
        if k is not None:
            assert k <= 100
            gt = gt[:, :k]
        return gt

    def get_database(self):
        assert self.nb <= 10**8, "dataset too large, use iterator"
        return sanitize(fvecs_mmap(self.basedir + "base.fvecs")[:self.nb])

    def database_iterator(self, bs=128, split=(1, 0)):
        xb = fvecs_mmap(self.basedir + "base.fvecs")
        nsplit, rank = split
        i0, i1 = self.nb * rank // nsplit, self.nb * (rank + 1) // nsplit
        for j0 in range(i0, i1, bs):
            yield sanitize(xb[j0: min(j0 + bs, i1)])


class DatasetGlove(Dataset):
    """
    Data from http://ann-benchmarks.com/glove-100-angular.hdf5
    """

    def __init__(self, loc=None, download=False):
        import h5py
        assert not download, "not implemented"
        if not loc:
            loc = dataset_basedir + 'glove/glove-100-angular.hdf5'
        self.glove_h5py = h5py.File(loc, 'r')
        # IP and L2 are equivalent in this case, but it is traditionally seen as an IP dataset
        self.metric = 'IP'
        self.d, self.nt = 100, 0
        self.nb = self.glove_h5py['train'].shape[0]
        self.nq = self.glove_h5py['test'].shape[0]

    def get_queries(self):
        xq = np.array(self.glove_h5py['test'])
        faiss.normalize_L2(xq)
        return xq

    def get_database(self):
        xb = np.array(self.glove_h5py['train'])
        faiss.normalize_L2(xb)
        return xb

    def get_groundtruth(self, k=None):
        gt = self.glove_h5py['neighbors']
        if k is not None:
            assert k <= 100
            gt = gt[:, :k]
        return gt


class DatasetMusic100(Dataset):
    """
    get dataset from
    https://github.com/stanis-morozov/ip-nsw#dataset
    """

    def __init__(self):
        Dataset.__init__(self)
        self.d, self.nt, self.nb, self.nq = 100, 0, 10**6, 10000
        self.metric = 'IP'
        self.basedir = dataset_basedir + 'music-100/'

    def get_queries(self):
        xq = np.fromfile(self.basedir + 'query_music100.bin', dtype='float32')
        xq = xq.reshape(-1, 100)
        return xq

    def get_database(self):
        xb = np.fromfile(self.basedir + 'database_music100.bin', dtype='float32')
        xb = xb.reshape(-1, 100)
        return xb

    def get_groundtruth(self, k=None):
        gt = np.load(self.basedir + 'gt.npy')
        if k is not None:
            assert k <= 100
            gt = gt[:, :k]
        return gt


class DatasetGIST1M(Dataset):
    """
    The original dataset is available at: http://corpus-texmex.irisa.fr/
    (ANN_GIST1M)
    """

    def __init__(self):
        Dataset.__init__(self)
        self.d, self.nt, self.nb, self.nq = 960, 100000, 1000000, 10000
        self.basedir = dataset_basedir + 'gist1M/'

    def get_queries(self):
        return fvecs_read(self.basedir + "gist_query.fvecs")

    def get_train(self, maxtrain=None):
        maxtrain = maxtrain if maxtrain is not None else self.nt
        return fvecs_read(self.basedir + "gist_learn.fvecs")[:maxtrain]

    def get_database(self):
        return fvecs_read(self.basedir + "gist_base.fvecs")

    def get_groundtruth(self, k=None):
        gt = ivecs_read(self.basedir + "gist_groundtruth.ivecs")
        if k is not None:
            assert k <= 100
            gt = gt[:, :k]
        return gt

class DatasetDINO10B(Dataset):
    """
    Data from https://dl.fbaipublicfiles.com/large_objects/dino_vitl_10B/
    The dataset contains 10 billion 1024-d vectors extracted from image patches from the YFCC100M dataset, using a Dino-ViT-L 16 model (facebook/dinov3-vitl16-pretrain-lvd1689m).
    The dataset is sharded in multiple chunked .bvecs files. Downloading instructions can be obtained with "wget https://dl.fbaipublicfiles.com/large_objects/dino_vitl_10B/README.md".
    Supported sizes : 100k 200k 500k 1M ... 5B 10B listed in supported_nbs (see __init__).
    """
    def __init__(self, nb, ignore_supported = False):
        Dataset.__init__(self)
        supported_nbs = [100_000, 200_000, 500_000, 1_000_000, 2_000_000, 5_000_000, 10_000_000, 20_000_000, 50_000_000, 100_000_000, 200_000_000, 500_000_000, 1_000_000_000, 2_000_000_000, 5_000_000_000, 10_000_000_000]
        if nb not in supported_nbs and not ignore_supported:
            raise ValueError(f"Unsupported dataset size: {nb}, supported values are: {supported_nbs}")
        if not os.path.exists(dataset_basedir):
            raise ValueError(f"Provided dataset base directory does not exist: {dataset_basedir}")
        self.basedir = dataset_basedir + "dino_vitl_10B/"
        self.indexdir = self.basedir + "chunked_base_10B"
        assert os.path.exists(self.indexdir), f"Index path should exist, check your dataset path: {self.indexdir}"
        self.queriesdir = self.basedir + "queries_clean.bvecs"
        assert os.path.exists(self.queriesdir), f"Queries path should exist as dataset size {nb} is supported: {self.queriesdir}"
        self.gtsdir = self.basedir + "gts/" + "gts_dino_patch_" + str(nb) + "_" + "k10.npy"
        self.train_queriesdir = self.basedir + "train_queries_99M.bvecs"
        self.nb = nb
        self.d = 1024
        self.nq = 100_000
        self.nt = 99_000_000
        self.metric = "L2"


    def get_queries(self):
        """Get all vectors as a single array"""
        queries = bvecs_mmap(self.queriesdir)
        return sanitize(queries)

    def get_train(self, maxtrain = None):
        """Get training query vectors as a single array"""
        if maxtrain is None or maxtrain > 10_000_000:
            raise NotImplementedError("The training set is potentially too large to fit in RAM (400 GB of data). Please use train_iterator or use maxtrain parameter below 10_000_000 to get the first maxtrain training vectors.")
        return sanitize(bvecs_mmap(self.train_queriesdir)[:maxtrain])

    def get_database(self):
        """Get all database vectors as a single array"""
        if self.nb > 10_000_000:
            raise NotImplementedError("The dataset is potentially too large to fit in RAM. Please use database_iterator or use a dataset size equal to or below 10_000_000.")
        else:
            return sanitize(bvecs_iter_chunked(self.indexdir, batch_size=self.nb).__next__())

    def database_iterator(self, bs=10_000):
        """Iterator over the database of size nb, corresponding to the first nb vectors in the .bvecs file"""
        total_read = 0
        for batch in bvecs_iter_chunked(self.indexdir, batch_size=bs):
            if total_read + batch.shape[0] > self.nb:
                batch = batch[:self.nb - total_read]
            yield sanitize(batch)
            total_read += batch.shape[0]
            if total_read >= self.nb:
                break

    def train_iterator(self, bs=10_000):
        """Iterator over all training query vectors in the .bvecs file"""
        for batch in bvecs_iter(self.train_queriesdir, batch_size=bs):
            yield sanitize(batch)

    def get_groundtruth(self, k = 10):
        """Get ground truth from .npy file"""
        if k > 10:
            raise NotImplementedError("Ground truth files only available for k<=10")
        gts = np.load(self.gtsdir)
        gts = gts[:, :k]
        return gts

    def distance(self):
        return "euclidean"

def dataset_from_name(dataset='deep1M', download=False):
    """ converts a string describing a dataset to a Dataset object
    Supports sift1M, bigann1M..bigann1B, deep1M..deep1B, music-100 and glove
    """

    if dataset == 'sift1M':
        return DatasetSIFT1M()

    elif dataset == 'gist1M':
        return DatasetGIST1M()

    elif dataset.startswith('bigann'):
        dbsize = 1000 if dataset == "bigann1B" else int(dataset[6:-1])
        return DatasetBigANN(nb_M=dbsize)

    elif dataset.startswith("deep"):

        szsuf = dataset[4:]
        if szsuf[-1] == 'M':
            dbsize = 10 ** 6 * int(szsuf[:-1])
        elif szsuf == '1B':
            dbsize = 10 ** 9
        elif szsuf[-1] == 'k':
            dbsize = 1000 * int(szsuf[:-1])
        else:
            assert False, "did not recognize suffix " + szsuf
        return DatasetDeep1B(nb=dbsize)

    elif dataset == "music-100":
        return DatasetMusic100()

    elif dataset == "glove":
        return DatasetGlove(download=download)

    elif dataset.startswith("dino"):
        dbsize = 10_000_000_000 if dataset == "dino10B" else int(dataset[4:])
        return DatasetDINO10B(nb=dbsize)

    else:
        raise RuntimeError("unknown dataset " + dataset)