1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
|
"""
HDF5 support for Bitshuffle.
To read a dataset that uses the Bitshuffle filter using h5py, simply import
this module (unless you have installed the Bitshuffle dynamically loaded
filter, in which case importing this module is unnecessary).
To create a new dataset that includes the Bitshuffle filter, use one of the
convenience functions provided.
Constants
=========
H5FILTER : The Bitshuffle HDF5 filter integer identifier.
H5_COMPRESS_LZ4 : Filter option flag for LZ4 compression.
Functions
=========
create_dataset
create_bitshuffle_lzf_dataset
create_bitshuffle_compressed_dataset
Examples
========
>>> import numpy as np
>>> import h5py
>>> import bitshuffle.h5
>>> shape = (123, 456)
>>> chunks = (10, 456)
>>> dtype = np.float64
>>> f = h5py.File("tmp_test.h5")
>>> bitshuffle.h5.create_bitshuffle_compressed_dataset(
f, "some_data", shape, dtype, chunks)
>>> f["some_data"][:] = 42
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy
import h5py
from h5py import h5d, h5s, h5t, h5p, filters
cimport cython
# C-level declarations from the bitshuffle HDF5 filter plugin.
cdef extern from b"bshuf_h5filter.h":
    # Registers the bitshuffle filter with the HDF5 library; returns
    # a negative value on failure.
    int bshuf_register_h5filter()
    # Bitshuffle's registered HDF5 filter identifier.
    int BSHUF_H5FILTER
    # Filter option flag selecting internal LZ4 compression.
    int BSHUF_H5_COMPRESS_LZ4

# Registered HDF5 filter id for LZF (the compressor shipped with h5py);
# used by create_bitshuffle_lzf_dataset below.
cdef int LZF_FILTER = 32000

# Python-visible re-exports of the C constants (see module docstring).
H5FILTER = BSHUF_H5FILTER
H5_COMPRESS_LZ4 = BSHUF_H5_COMPRESS_LZ4
def register_h5_filter():
    """Register the bitshuffle filter with the HDF5 library.

    Raises
    ------
    RuntimeError
        If the underlying C registration call reports failure.
    """
    status = bshuf_register_h5filter()
    if status < 0:
        raise RuntimeError("Failed to register bitshuffle HDF5 filter.", status)


# Register on import so the filter is available as soon as this module
# is loaded.
register_h5_filter()
def create_dataset(parent, name, shape, dtype, chunks=None, maxshape=None,
                   fillvalue=None, track_times=None,
                   filter_pipeline=(), filter_flags=None, filter_opts=None):
    """Create a dataset with an arbitrary filter pipeline.

    Return a new low-level dataset identifier.

    Much of this code is copied from h5py, but couldn't reuse much code due to
    unstable API.

    Parameters
    ----------
    parent : h5py File or Group
        Location in which to create the dataset.
    name : str or bytes
        Name of the new dataset.
    shape : tuple of int
        Dataset shape; ``()`` creates a scalar dataset.
    dtype : numpy dtype or h5py.Datatype
        Element type; a named h5py Datatype is used as-is.
    chunks : tuple of int, True, or None
        Chunk shape; True to auto-guess. A non-empty filter pipeline
        forces chunked storage even when this is None.
    maxshape : tuple, True, or None
        Maximum shape for resizable datasets; None entries are unlimited.
    fillvalue : scalar or None
        Fill value for unwritten elements.
    track_times : bool or None
        Whether HDF5 records object modification times.
    filter_pipeline : sequence of int, or int
        HDF5 filter identifiers applied in order. A bare id implies bare
        (scalar) filter_flags/filter_opts as well.
    filter_flags, filter_opts : sequences or None
        Per-filter flags and options; lengths must match the pipeline.

    Raises
    ------
    ValueError
        If filters/flags/opts lengths disagree, the chunk shape exceeds the
        data shape, or chunks/maxshape have the wrong rank.
    TypeError
        For invalid scalar-dataset options or malformed arguments.
    """
    # Normalize the pipeline to a list; a bare filter id means the caller
    # also passed bare flags/opts, so wrap those too.
    if hasattr(filter_pipeline, "__getitem__"):
        filter_pipeline = list(filter_pipeline)
    else:
        filter_pipeline = [filter_pipeline]
        filter_flags = [filter_flags]
        filter_opts = [filter_opts]
    nfilters = len(filter_pipeline)
    if filter_flags is None:
        filter_flags = [None] * nfilters
    if filter_opts is None:
        filter_opts = [None] * nfilters
    if not len(filter_flags) == nfilters or not len(filter_opts) == nfilters:
        msg = "Supplied incompatible number of filters, flags, and options."
        raise ValueError(msg)

    shape = tuple(shape)
    tmp_shape = maxshape if maxshape is not None else shape

    # Validate chunk shape. The isinstance() check must come first: the
    # original evaluated zip(tmp_shape, chunks) unconditionally, which raised
    # TypeError for the default chunks=None instead of proceeding.
    if isinstance(chunks, tuple):
        chunks_larger = any(not dim >= chunk
                            for dim, chunk in zip(tmp_shape, chunks)
                            if dim is not None)
        if chunks_larger:
            errmsg = ("Chunk shape must not be greater than data shape in any "
                      "dimension. {} is not compatible with {}".format(chunks, shape))
            raise ValueError(errmsg)

    if isinstance(dtype, h5py.Datatype):
        # Named types are used as-is.
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype.
        dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    if shape == ():
        if any((chunks, filter_pipeline)):
            raise TypeError("Scalar datasets don't support chunk/filter options")
        if maxshape and maxshape != ():
            raise TypeError("Scalar datasets cannot be extended")
        # Create a real scalar dataset. The original returned a bare
        # dataset-creation property list here (an artifact of copying h5py's
        # fill_dcpl), which broke this function's contract of returning a
        # dataset identifier.
        sid = h5s.create(h5s.SCALAR)
        return h5d.create(parent.id, name, tid, sid)

    def rq_tuple(tpl, label):
        """Check if chunks/maxshape match dataset rank."""
        if tpl in (None, True):
            return
        try:
            tpl = tuple(tpl)
        except TypeError:
            raise TypeError('"%s" argument must be None or a sequence object' % label)
        if len(tpl) != len(shape):
            raise ValueError('"%s" must have same rank as dataset shape' % label)

    rq_tuple(chunks, 'chunks')
    rq_tuple(maxshape, 'maxshape')

    # Filters require chunked storage, so guess a chunk shape if the caller
    # asked for one (True) or supplied a pipeline without chunks.
    if (chunks is True) or (chunks is None and filter_pipeline):
        chunks = filters.guess_chunk(shape, maxshape, dtype.itemsize)
    if maxshape is True:
        maxshape = (None,) * len(shape)

    dcpl = h5p.create(h5p.DATASET_CREATE)
    if chunks is not None:
        dcpl.set_chunk(chunks)
        dcpl.set_fill_time(h5d.FILL_TIME_ALLOC)  # prevent resize glitch

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

    # Install the filters in pipeline order, defaulting flags to 0 and
    # options to an empty tuple.
    for ii in range(nfilters):
        this_filter = filter_pipeline[ii]
        this_flags = filter_flags[ii]
        this_opts = filter_opts[ii]
        if this_flags is None:
            this_flags = 0
        if this_opts is None:
            this_opts = ()
        dcpl.set_filter(this_filter, this_flags, this_opts)

    if maxshape is not None:
        # Callers use None for unlimited dimensions; HDF5 wants H5S_UNLIMITED.
        maxshape = tuple(m if m is not None else h5s.UNLIMITED
                         for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl)

    return dset_id
def create_bitshuffle_lzf_dataset(parent, name, shape, dtype, chunks=None,
                                  maxshape=None, fillvalue=None,
                                  track_times=None):
    """Create dataset with a filter pipeline including bitshuffle and LZF"""
    # Bitshuffle first, then h5py's LZF compressor.
    pipeline = [H5FILTER, LZF_FILTER]
    return create_dataset(parent, name, shape, dtype, chunks=chunks,
                          filter_pipeline=pipeline, maxshape=maxshape,
                          fillvalue=fillvalue, track_times=track_times)
def create_bitshuffle_compressed_dataset(parent, name, shape, dtype,
                                         chunks=None, maxshape=None,
                                         fillvalue=None, track_times=None):
    """Create dataset with bitshuffle+internal LZ4 compression."""
    # Single bitshuffle filter; the (0, LZ4) option tuple turns on its
    # internal LZ4 compression.
    pipeline = [H5FILTER]
    opts = [(0, H5_COMPRESS_LZ4)]
    return create_dataset(parent, name, shape, dtype, chunks=chunks,
                          filter_pipeline=pipeline,
                          filter_opts=opts, maxshape=maxshape,
                          fillvalue=fillvalue, track_times=track_times)
|