File: query_meteo_data.py

package info (click to toggle)
pytables 3.11.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,272 kB
  • sloc: ansic: 82,216; python: 65,569; cpp: 753; sh: 394; makefile: 106
file content (88 lines) | stat: -rw-r--r-- 2,052 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3
# Benchmark the time taken to read large datasets with Blosc and Blosc2 filters.

import sys
from time import time

import numpy as np
import pandas as pd

import tables as tb


def time_inkernel(table_blosc):
    """Time one in-kernel query that collects the ``precip`` field.

    Prints the number of matching rows and returns the elapsed wall-clock
    time in seconds, as measured with ``time()``.
    """
    condition = "(lat > 50) & (20 <= lon) & (lon < 50) & (time < 10)"
    started = time()
    # The clock starts before where() so creating the iterator is timed too.
    matching_rows = table_blosc.where(condition)
    precip_values = [row["precip"] for row in matching_rows]
    print(len(precip_values))
    return time() - started


def time_read(table, n_reads=10_000):
    """Time random single-row reads and a ``"(time < 10)"`` query on *table*.

    Two report lines are printed: the elapsed time and per-read latency of
    ``n_reads`` random row accesses, then the elapsed time of the query
    together with a throughput figure computed as
    ``table.nrows * table.dtype.itemsize`` bytes over the elapsed time.

    Parameters
    ----------
    table
        Object providing ``nrows``, ``dtype.itemsize``, integer indexing and
        ``where(condition)``.
    n_reads : int, optional
        Number of random rows to read; defaults to 10_000.

    Raises
    ------
    ValueError
        If ``n_reads`` is not positive (the per-read latency would divide by
        zero).
    """
    if n_reads < 1:
        raise ValueError(f"n_reads must be positive, got {n_reads}")

    # Row indexes are drawn before the timer starts so that only the reads
    # themselves are measured.
    idxs_to_read = np.random.randint(0, table.nrows, n_reads)

    print(f"Randomly reading {n_reads // 1_000} Krows...", end="")
    t0 = time()
    for i in idxs_to_read:
        _ = table[i]
    t = time() - t0
    print(f"\t{t:.3f}s ({t / n_reads * 1e6:.1f} us/read)")

    print(f"Querying {table.nrows // 1000_000_000} Grows...", end="")
    t0 = time()
    _ = [x["precip"] for x in table.where("(time < 10)")]
    t = time() - t0
    # A coarse clock can report a zero interval for a fast query; report an
    # infinite rate instead of raising ZeroDivisionError.
    nbytes = table.nrows * table.dtype.itemsize
    rate = nbytes / t / 2**30 if t > 0 else float("inf")
    print(f"\t\t{t:.3f}s ({rate:.1f} GB/s)")


def time_pandas(df):
    """Time one ``DataFrame.query`` selection of the ``precip`` column.

    Prints the number of selected rows and returns the elapsed wall-clock
    time in seconds, as measured with ``time()``.
    """
    condition = "(lat > 50) & (20 <= lon) & (lon < 50) & (time < 10)"
    started = time()
    selected = df.query(condition)
    precip = selected["precip"]
    print(len(precip))
    return time() - started


def pandas_create_df(filename="wblosc_table.h5"):
    """Read the ``table_blosc`` node of an HDF5 file into a DataFrame.

    Parameters
    ----------
    filename : str, optional
        Path of the HDF5 file to open read-only; defaults to
        ``"wblosc_table.h5"``.

    Returns
    -------
    pandas.DataFrame
        Built from the full contents of ``f.root.table_blosc``.
    """
    # The context manager closes the file even if reading the table raises,
    # which the previous open/close pair did not guarantee.
    with tb.open_file(filename, "r") as f:
        return pd.DataFrame(f.root.table_blosc[:])


def inkernel_blosc2_blosclz(table):
    """Run ``time_inkernel`` on *table* and print it under the blosclz label.

    The codec name appears only in the printed message; nothing in this
    function selects or checks the compressor of *table*.
    """
    # NOTE(review): the message says "6 inkernel queries", but time_inkernel
    # issues a single where() query -- confirm the intended count.
    elapsed = time_inkernel(table)
    print(
        "Time to read 6 inkernel queries with Blosc2 (blosclz): "
        f"{elapsed:.3f} sec"
    )


def inkernel_blosc2_lz4(table):
    """Run ``time_inkernel`` on *table* and print it under the lz4 label.

    The codec name appears only in the printed message; nothing in this
    function selects or checks the compressor of *table*.
    """
    # NOTE(review): the message says "6 inkernel queries", but time_inkernel
    # issues a single where() query -- confirm the intended count.
    elapsed = time_inkernel(table)
    print(
        "Time to read 6 inkernel queries with Blosc2 (lz4): "
        f"{elapsed:.3f} sec"
    )


def pandas_query_numexpr(df):
    """Run ``time_pandas`` on *df* and print the elapsed time."""
    # NOTE(review): the message says "6 pandas+numexpr queries", but
    # time_pandas runs a single df.query() call -- confirm the intended count.
    elapsed = time_pandas(df)
    print(
        "Time to perform 6 pandas+numexpr queries: "
        f"{elapsed:.3f} sec"
    )


# Script entry: benchmark the ``table_blosc`` node of the HDF5 file named as
# the first command-line argument.
if len(sys.argv) < 2:
    # Fail with a usage hint instead of a bare IndexError on sys.argv[1].
    sys.exit(f"Usage: {sys.argv[0]} <hdf5-file>")

# The context manager closes the file even if the benchmark raises.
with tb.open_file(sys.argv[1]) as f:
    table = f.root.table_blosc
    time_read(table)

# df = pandas_create_df()
# pandas_query_numexpr(df)