File: collations.py

package info (click to toggle)
pytables 3.11.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,272 kB
  • sloc: ansic: 82,216; python: 65,569; cpp: 753; sh: 394; makefile: 106
file content (133 lines) | stat: -rw-r--r-- 3,832 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from time import perf_counter as clock

import numpy as np

import tables as tb

# Total number of rows written to the benchmark table
N = 1_000_000
# Upper bound on the number of distinct collections
NCOLL = 200

# Seed NumPy's global generator so that every run produces the same data
np.random.seed(19)


class Energies(tb.IsDescription):
    """Row layout of the benchmark table: a collection id and an energy."""

    # Identifier of the collection the row belongs to (unsigned 8-bit column)
    collection = tb.UInt8Col()
    # Energy value of the row (64-bit float column)
    energy = tb.Float64Col()


def fill_bucket(lbucket):
    """Generate one bucket of ``lbucket`` rows as two column arrays.

    Returns a ``(collection, energy)`` tuple.  The collection values are
    drawn from a normal distribution centred on ``NCOLL / 2`` with a
    standard deviation of ``NCOLL / 100``; the energies are the float64
    values ``0.0, 1.0, ..., lbucket - 1``.
    """
    # A wider alternative for the spread is NCOLL / 10
    collection_values = np.random.normal(
        loc=NCOLL / 2, scale=NCOLL / 100, size=lbucket
    )
    energy_values = np.arange(lbucket, dtype=np.float64)
    return collection_values, energy_values


# Create the HDF5 file and fill the table with N rows
t1 = clock()
# The context manager closes the file even if table creation or one of the
# appends fails part-way through.
with tb.open_file("data.nobackup/collations.h5", "w") as f:
    table = f.create_table("/", "Energies", Energies, expectedrows=N)
    lbucket = 1000  # Fill in buckets of 1000 rows, for speed
    # Split N into full buckets plus a remainder so that exactly N rows are
    # written, even when N is not a multiple of lbucket.
    nbuckets, nremaining = divmod(N, lbucket)
    for _ in range(nbuckets):
        table.append(fill_bucket(lbucket))
    # Fill the remaining rows, if there are any
    if nremaining:
        table.append(fill_bucket(nremaining))
# Report the elapsed time (not the raw start timestamp), in seconds
print(f"Time to create the table with {N} entries: {clock() - t1:.3f}s")

# Reopen the file in append mode and fetch the table that the solutions
# below group by collection
f = tb.open_file("data.nobackup/collations.h5", mode="a")
table = f.root.Energies

#########################################################
# First solution: read the whole table into memory at once
#########################################################
t1 = clock()
t = table[:]  # the complete table as a structured array
all_ids = t["collection"]
all_energies = t["energy"]
coll1 = []
distinct_ids = np.unique(all_ids)
for c in distinct_ids:
    # The boolean mask picks the energies of the current collection
    selected = all_energies[all_ids == c]
    total = selected.sum()
    coll1.append(total)
    print(c, " : ", total)
del distinct_ids, selected
print(f"Time for first solution: {clock() - t1:.3f}s")

#########################################################
# Second solution: iterate over the rows and gather the
# energies of every collection in a dictionary of lists
#########################################################
t1 = clock()
energies_by_coll = {}
for row in table:
    # setdefault creates the list the first time a collection shows up
    energies_by_coll.setdefault(row["collection"], []).append(row["energy"])
# Convert each list into a numpy array and sum it, in collection order
coll2 = []
for key in sorted(energies_by_coll):
    coll_energies = np.array(energies_by_coll[key])
    total = coll_energies.sum()
    coll2.append(total)
    print(key, " : ", total)
del energies_by_coll, coll_energies
print(f"Time for second solution: {clock() - t1:.3f}s")

# Index the collection column with create_csindex(); timed on its own so the
# cost is reported separately from the queries that follow
t1 = clock()
collection_column = table.cols.collection
collection_column.create_csindex()
print(f"Time for indexing: {clock() - t1:.3f}s")

#########################################################
# Third solution: query the table once per collection
#########################################################
t1 = clock()
coll3 = []
distinct_ids = np.unique(table.col("collection"))
for c in distinct_ids:
    # NOTE: the condition string names ``c`` and no condvars are passed, so
    # PyTables presumably resolves it from this loop variable; keep the name.
    selected = table.read_where("collection == c", field="energy")
    total = selected.sum()
    coll3.append(total)
    print(c, " : ", total)
del selected
print(f"Time for third solution: {clock() - t1:.3f}s")


# Copy the table into a new node of the same file, sorted by the collection
# column, and time the operation
t1 = clock()
table2 = table.copy(
    newparent="/",
    newname="EnergySortedByCollation",
    sortby="collection",
    propindexes=True,
    overwrite=True,
)
print(f"Time for sorting: {clock() - t1:.3f}s")

#####################################################################
# Fourth solution: query once per collection, on the sorted table.
#####################################################################
t1 = clock()
coll4 = []
distinct_ids = np.unique(table2.col("collection"))
for c in distinct_ids:
    # NOTE: the condition string names ``c`` and no condvars are passed, so
    # PyTables presumably resolves it from this loop variable; keep the name.
    selected = table2.read_where("collection == c", field="energy")
    total = selected.sum()
    coll4.append(total)
    print(c, " : ", total)
    # Drop the per-collection array before the next query
    del selected
print(f"Time for fourth solution: {clock() - t1:.3f}s")


# Finally, check that all solutions do match.  An explicit raise (rather than
# a bare ``assert``) keeps the check alive under ``python -O``, and the
# ``finally`` clause closes the file even when the check fails.
# NOTE(review): exact float equality looks intentional here -- the energies
# written by fill_bucket are small integer-valued floats, so the sums should
# presumably not depend on summation order; confirm before changing the data.
try:
    if not (coll1 == coll2 == coll3 == coll4):
        raise AssertionError(
            "the four solutions produced different per-collection sums"
        )
finally:
    f.close()