File: direct-chunk-shape.py

package info (click to toggle)
pytables 3.11.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,272 kB
  • sloc: ansic: 82,216; python: 65,569; cpp: 753; sh: 394; makefile: 106
file content (121 lines) | stat: -rw-r--r-- 3,895 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# This script uses h5py to write two versions of the following 6x6 array into
# two datasets with 4x4 chunks::
#
#                chunk boundary
#                |
#   [[ 0  1  2  3  0  1]
#    [ 4  5  6  7  4  5]
#    [ 8  9 10 11  8  9]
#    [12 13 14 15 12 13] __ chunk boundary
#    [ 0  1  2  3  0  1]
#    [ 4  5  6  7  4  5]]
#
# So its chunks contain, left-to-right, top-to-bottom, 4x4, 4x2, 2x4, and 2x2
# items worth of data.
#
# - ``/data_full`` has all its chunks with shape 4x4 in storage, even if the
#   actual data does not fill the whole chunk.
# - ``/data_part`` has chunks with the shape of actual data, as listed above.
#
# Direct chunking is used in h5py to control the exact storage format for each
# chunk, using a Blosc2 ND container.  Then PyTables is used to read both
# datasets and compare them against the original array, so as to test whether
# there are problems in loading partial chunks.
#
# Since PyTables has an optimized path for uncompressing Blosc2 ND data that
# does not use the HDF5 filter pipeline, you may force it to only use the
# latter by setting ``BLOSC2_FILTER=1`` in the environment.

import os
import tempfile

import h5py
import numpy
import blosc2
import hdf5plugin

# Import after h5py part is done so that it uses its own `hdf5-blosc2`.
import tables

# Compression settings: Zstandard at level 1 with the shuffle filter.  The
# same configuration is spelled twice below, once per library API.

# python-blosc2 form, handed to ``blosc2.asarray()`` when chunks are
# compressed by hand.
cparams = {
    "codec": blosc2.Codec.ZSTD,
    "clevel": 1,
    "filters": [blosc2.Filter.SHUFFLE],
}
# hdf5plugin form, unpacked into the h5py dataset creation arguments.
fparams = hdf5plugin.Blosc2(
    cname="zstd",
    clevel=1,
    filters=hdf5plugin.Blosc2.SHUFFLE,
)


def np2b2(a):
    """Compress NumPy array `a` into a Blosc2 ND array with a single chunk.

    Both the chunk shape and the block shape are set to the shape of `a`,
    so the whole array ends up in one chunk made of one block.  Compression
    uses the module-level `cparams`.  The data is made C-contiguous first
    because callers may pass non-contiguous slices of a larger array.
    """
    whole_shape = a.shape
    contiguous = numpy.ascontiguousarray(a)
    return blosc2.asarray(
        contiguous,
        chunks=whole_shape,
        blocks=whole_shape,
        cparams=cparams,
    )


# Assemble the array: repeat the 4x4 chunk over a 2x2 grid of chunks (8x8)
# and crop it to the 6x6 dataset shape, so that the chunks on the right and
# bottom margins only hold the leading columns/rows of the 4x4 chunk.
achunk = numpy.arange(16).reshape(4, 4)
# Copy so that the data is an independent, C-contiguous array rather than a
# view into the tiled array.
adata = numpy.tile(achunk, (2, 2))[:6, :6].copy()

# Create an empty temporary file to hold both datasets.  Only its path is
# needed, so the descriptor is closed right away.
h5fdesc, h5fpath = tempfile.mkstemp()
os.close(h5fdesc)

# The ``try``/``finally`` guarantees that the temporary file is removed even
# if writing, reading or comparing raises.  The ``with`` blocks likewise
# make sure that each file handle is closed before the next step touches the
# file.
try:
    # Write both datasets with h5py, storing each chunk directly as a
    # Blosc2 frame (bypassing the HDF5 filter pipeline on write).
    with h5py.File(h5fpath, "w") as h5f:
        print(f"Writing with full chunks to {h5fpath}:/data_full...")
        dataset = h5f.create_dataset(
            "data_full",
            adata.shape,
            dtype=adata.dtype,
            chunks=achunk.shape,
            **fparams,
        )
        # need to keep the ref or cframe becomes bogus
        b2chunk = np2b2(achunk)
        b2frame = b2chunk._schunk.to_cframe()
        # The same complete 4x4 chunk is stored at every chunk offset, even
        # where the dataset only covers part of it.
        for chunk_offset in ((0, 0), (0, 4), (4, 0), (4, 4)):
            dataset.id.write_direct_chunk(chunk_offset, b2frame)

        print(f"Writing with partial chunks to {h5fpath}:/data_part...")
        dataset = h5f.create_dataset(
            "data_part",
            adata.shape,
            dtype=adata.dtype,
            chunks=achunk.shape,
            **fparams,
        )
        # Here each chunk is stored with the shape of the data that it
        # actually holds: 4x4, 4x2, 2x4 and 2x2.
        # need to keep the ref or cframe becomes bogus
        b2chunk = np2b2(achunk[:, :])
        b2frame = b2chunk._schunk.to_cframe()
        dataset.id.write_direct_chunk((0, 0), b2frame)
        b2chunk = np2b2(achunk[:, 0:2])
        # Uncomment to introduce a bogus partial chunk in the middle of
        # data, too small even for a margin chunk.
        # b2chunk = np2b2(achunk[0:2, 0:2])
        b2frame = b2chunk._schunk.to_cframe()
        dataset.id.write_direct_chunk((0, 4), b2frame)
        b2chunk = np2b2(achunk[0:2, :])
        b2frame = b2chunk._schunk.to_cframe()
        dataset.id.write_direct_chunk((4, 0), b2frame)
        b2chunk = np2b2(achunk[0:2, 0:2])
        b2frame = b2chunk._schunk.to_cframe()
        dataset.id.write_direct_chunk((4, 4), b2frame)

    # Read both datasets back in full with PyTables.
    with tables.open_file(h5fpath, "r") as h5f:
        print(f"Reading with full chunks from {h5fpath}:/data_full...")
        a_full = h5f.root.data_full[:]
        print(f"Reading with partial chunks from {h5fpath}:/data_part...")
        a_part = h5f.root.data_part[:]

    # Report any dataset that does not match the original array.
    if not (a_full == adata).all():
        print("FULL CHUNKS FAILED (original, read):")
        print(adata)
        print(a_full)

    if not (a_part == adata).all():
        print("PARTIAL CHUNKS FAILED (original, read):")
        print(adata)
        print(a_part)
finally:
    print(f"Removing {h5fpath}...")
    os.remove(h5fpath)