# Benchmark: compare several ways of summing the "energy" column of a
# PyTables table, grouped by the value of its "collection" column.
from time import perf_counter as clock
import numpy as np
import tables as tb
# Number of rows written to the benchmark table.
N = 1_000_000
# 200 collections maximum
NCOLL = 200

# Seed NumPy's global generator so that results are reproducible.
np.random.seed(19)
class Energies(tb.IsDescription):
    """Row description of the benchmark table: a collection id and an energy."""

    # Collection the row belongs to, stored as an unsigned 8-bit integer.
    collection = tb.UInt8Col()
    # Energy value of the row, stored as a 64-bit float.
    energy = tb.Float64Col()
def fill_bucket(lbucket):
    """Build one batch of ``lbucket`` synthetic rows.

    Returns a ``(collections, energies)`` pair of arrays, each of length
    ``lbucket``.  The collection values are drawn from a normal distribution
    centred on ``NCOLL / 2`` with standard deviation ``NCOLL / 100``; the
    energies are simply ``0, 1, ..., lbucket - 1`` as 64-bit floats.
    """
    # A wider spread (standard deviation NCOLL / 10) was used previously.
    collection_values = np.random.normal(
        loc=NCOLL / 2, scale=NCOLL / 100, size=lbucket
    )
    energies = np.arange(lbucket, dtype="f8")
    return collection_values, energies
# Create the HDF5 file and fill the table with exactly N synthetic rows.
t1 = clock()
lbucket = 1000  # Fill in buckets of 1000 rows, for speed
# The context manager closes the file even if filling fails part-way through.
with tb.open_file("data.nobackup/collations.h5", "w") as f:
    table = f.create_table("/", "Energies", Energies, expectedrows=N)
    # Append only the *full* buckets here.  Iterating over
    # range(0, N, lbucket) would also emit a full bucket for a trailing
    # partial one and, together with the remainder below, write more than
    # N rows whenever N is not a multiple of lbucket.
    for _ in range(N // lbucket):
        table.append(fill_bucket(lbucket))
    # Fill the remaining rows, if any.
    remainder = N % lbucket
    if remainder:
        table.append(fill_bucket(remainder))
# Report the elapsed time (not the raw start timestamp), in seconds like the
# other timing messages of this script.
print(f"Time to create the table with {N} entries: {clock() - t1:.3f}s")
# Reopen the file in append mode and fetch the table that is going to be
# read and grouped by collection.
f = tb.open_file("data.nobackup/collations.h5", mode="a")
table = f.root.Energies
#########################################################
# First solution: load the table completely in memory
#########################################################
t1 = clock()
t = table[:]  # the whole table as an in-memory structured array
coll1 = []
unique_collections = np.unique(t["collection"])
for c in unique_collections:
    # Select the energies of the rows belonging to this collection with a
    # boolean mask over the full array, then add them up.
    energy_this_collection = t["energy"][t["collection"] == c]
    sener = energy_this_collection.sum()
    coll1.append(sener)
    print(c, " : ", sener)
# Drop the temporaries before reporting the time.
del unique_collections, energy_this_collection
print(f"Time for first solution: {clock() - t1:.3f}s")
#########################################################
# Second solution: load all the collections in memory
#########################################################
t1 = clock()
energies_by_collection = {}
for row in table:
    # setdefault() creates the list the first time a collection is seen.
    energies_by_collection.setdefault(row["collection"], []).append(
        row["energy"]
    )
# Turn each list into a NumPy array and sum it, visiting the collections
# in sorted order.
coll2 = []
for c in sorted(energies_by_collection):
    energy_this_collection = np.array(energies_by_collection[c])
    sener = energy_this_collection.sum()
    coll2.append(sener)
    print(c, " : ", sener)
del energies_by_collection, energy_this_collection
print(f"Time for second solution: {clock() - t1:.3f}s")
# Index the "collection" column with create_csindex() and time the operation.
t1 = clock()
collection_column = table.cols.collection
collection_column.create_csindex()
# (Alternative kept for reference: table.cols.collection.reindex())
print(f"Time for indexing: {clock() - t1:.3f}s")
#########################################################
# Third solution: load each collection separately
#########################################################
t1 = clock()
coll3 = []
for c in np.unique(table.col("collection")):
    # Query only the energies of this collection.  The loop value is handed
    # to the condition explicitly through ``condvars`` rather than being
    # picked up implicitly from the calling namespace.
    energy_this_collection = table.read_where(
        "collection == c", condvars={"c": c}, field="energy"
    )
    sener = energy_this_collection.sum()
    coll3.append(sener)
    print(c, " : ", sener)
del energy_this_collection
print(f"Time for third solution: {clock() - t1:.3f}s")
# Make a copy of the table sorted by the "collection" column, asking for the
# indexes to be propagated to the copy, and time the operation.
t1 = clock()
table2 = table.copy(
    newparent="/",
    newname="EnergySortedByCollation",
    overwrite=True,
    sortby="collection",
    propindexes=True,
)
print(f"Time for sorting: {clock() - t1:.3f}s")
#####################################################################
# Fourth solution: load each collection separately.  Sorted table.
#####################################################################
t1 = clock()
coll4 = []
for c in np.unique(table2.col("collection")):
    # Same per-collection query as before, but against the sorted copy.
    # The loop value is passed explicitly through ``condvars``.
    energy_this_collection = table2.read_where(
        "collection == c", condvars={"c": c}, field="energy"
    )
    sener = energy_this_collection.sum()
    coll4.append(sener)
    print(c, " : ", sener)
del energy_this_collection
print(f"Time for fourth solution: {clock() - t1:.3f}s")
# Finally, check that all solutions do match.  The file is closed in a
# ``finally`` clause so that a failing check does not leave it open.
try:
    assert coll1 == coll2 == coll3 == coll4
finally:
    f.close()
|