File: h5.pyx

"""
HDF5 support for Bitshuffle.

To read a dataset that uses the Bitshuffle filter using h5py, simply import
this module (unless you have installed the Bitshuffle dynamically loaded
filter, in which case importing this module is unnecessary).

To create a new dataset that includes the Bitshuffle filter, use one of the
convenience functions provided.


Constants
=========

    H5FILTER : The Bitshuffle HDF5 filter integer identifier.
    H5_COMPRESS_LZ4 : Filter option flag for LZ4 compression.

Functions
=========

    create_dataset
    create_bitshuffle_lzf_dataset
    create_bitshuffle_compressed_dataset

Examples
========

    >>> import numpy as np
    >>> import h5py
    >>> import bitshuffle.h5

    >>> shape = (123, 456)
    >>> chunks = (10, 456)
    >>> dtype = np.float64

    >>> f = h5py.File("tmp_test.h5", "w")
    >>> d = bitshuffle.h5.create_bitshuffle_compressed_dataset(
    ...     f, "some_data", shape, dtype, chunks)
    >>> f["some_data"][:] = 42
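
    Once this module has been imported, the data can be read back with
    plain h5py; the filter is applied transparently on read:

    >>> f.close()
    >>> f = h5py.File("tmp_test.h5", "r")
    >>> data = f["some_data"][:]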

"""

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy
import h5py
from h5py import h5d, h5s, h5t, h5p, filters

cimport cython


cdef extern from b"bshuf_h5filter.h":
    int bshuf_register_h5filter()
    int BSHUF_H5FILTER
    int BSHUF_H5_COMPRESS_LZ4

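# HDF5 filter identifier of h5py's LZF compression filter.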
cdef int LZF_FILTER = 32000

H5FILTER = BSHUF_H5FILTER
H5_COMPRESS_LZ4 = BSHUF_H5_COMPRESS_LZ4


def register_h5_filter():
    ret = bshuf_register_h5filter()
    if ret < 0:
        raise RuntimeError("Failed to register bitshuffle HDF5 filter.", ret)


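# Register the filter with HDF5 at import time, so that merely importing
# this module makes bitshuffle-filtered datasets readable through h5py.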
register_h5_filter()


def create_dataset(parent, name, shape, dtype, chunks=None, maxshape=None,
                   fillvalue=None, track_times=None,
                   filter_pipeline=(), filter_flags=None, filter_opts=None):
    """Create a dataset with an arbitrary filter pipeline.

    Return a new low-level dataset identifier.

    Much of this code is copied from h5py, whose internals could not be
    reused directly because the relevant parts of its API are unstable.
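
    A minimal usage sketch, assuming f is an open, writable h5py.File;
    H5FILTER and H5_COMPRESS_LZ4 are this module's filter constants:

    >>> dset_id = create_dataset(
    ...     f, "data", (100, 100), numpy.float32, chunks=(10, 100),
    ...     filter_pipeline=[H5FILTER],
    ...     filter_opts=[(0, H5_COMPRESS_LZ4)])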

    """

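    # Normalize filter_pipeline, filter_flags and filter_opts into parallel
    # lists with one entry per filter; a bare filter may be passed in place
    # of a sequence.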
    if hasattr(filter_pipeline, "__getitem__"):
        filter_pipeline = list(filter_pipeline)
    else:
        filter_pipeline = [filter_pipeline]
        filter_flags = [filter_flags]
        filter_opts = [filter_opts]
    nfilters = len(filter_pipeline)
    if filter_flags is None:
        filter_flags = [None] * nfilters
    if filter_opts is None:
        filter_opts = [None] * nfilters
    if len(filter_flags) != nfilters or len(filter_opts) != nfilters:
        msg = "Supplied incompatible number of filters, flags, and options."
        raise ValueError(msg)

    shape = tuple(shape)
    if isinstance(name, str):
        # The low-level h5d.create calls below expect a byte-string name
        # under Python 3.
        name = name.encode("utf-8")

    tmp_shape = maxshape if maxshape is not None else shape
    # Validate the chunk shape: no chunk dimension may exceed the
    # corresponding dataset (or maxshape) dimension, except along
    # unlimited axes. Guarding on isinstance also avoids iterating
    # over chunks when it is None or True.
    if isinstance(chunks, tuple):
        chunks_larger = any(dim is not None and chunk > dim
                            for dim, chunk in zip(tmp_shape, chunks))
        if chunks_larger:
            errmsg = ("Chunk shape must not be greater than data shape in any "
                      "dimension. {} is not compatible with {}".format(chunks, shape))
            raise ValueError(errmsg)

    if isinstance(dtype, h5py.Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    if shape == ():
        if any((chunks, filter_pipeline)):
            raise TypeError("Scalar datasets don't support chunk/filter options")
        if maxshape and maxshape != ():
            raise TypeError("Scalar datasets cannot be extended")
        # A scalar dataset takes no chunks or filters; create it on a scalar
        # dataspace and return its identifier, as documented above.
        sid = h5s.create(h5s.SCALAR)
        return h5d.create(parent.id, name, tid, sid,
                          dcpl=h5p.create(h5p.DATASET_CREATE))

    def rq_tuple(tpl, name):
        """Check if chunks/maxshape match dataset rank"""
        if tpl in (None, True):
            return
        try:
            tpl = tuple(tpl)
        except TypeError:
            raise TypeError('"%s" argument must be None or a sequence object' % name)
        if len(tpl) != len(shape):
            raise ValueError('"%s" must have same rank as dataset shape' % name)

    rq_tuple(chunks, 'chunks')
    rq_tuple(maxshape, 'maxshape')

    if (chunks is True) or (chunks is None and filter_pipeline):
        chunks = filters.guess_chunk(shape, maxshape, dtype.itemsize)

    if maxshape is True:
        maxshape = (None,)*len(shape)

    dcpl = h5p.create(h5p.DATASET_CREATE)
    if chunks is not None:
        dcpl.set_chunk(chunks)
        dcpl.set_fill_time(h5d.FILL_TIME_ALLOC)  # prevent resize glitch

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

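    # Install the filters on the dataset creation property list in pipeline
    # order; HDF5 applies them in this order on write and in reverse on read.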
    for ii in range(nfilters):
        this_filter = filter_pipeline[ii]
        this_flags = filter_flags[ii]
        this_opts = filter_opts[ii]
        if this_flags is None:
            this_flags = 0
        if this_opts is None:
            this_opts = ()
        dcpl.set_filter(this_filter, this_flags, this_opts)

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED
                         for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl)

    return dset_id


def create_bitshuffle_lzf_dataset(parent, name, shape, dtype, chunks=None,
                                  maxshape=None, fillvalue=None,
                                  track_times=None):
    """Create dataset with a filter pipeline including bitshuffle and LZF"""

    filter_pipeline = [H5FILTER, LZF_FILTER]
    dset_id = create_dataset(parent, name, shape, dtype, chunks=chunks,
                             filter_pipeline=filter_pipeline, maxshape=maxshape,
                             fillvalue=fillvalue, track_times=track_times)
    return dset_id


def create_bitshuffle_compressed_dataset(parent, name, shape, dtype,
                                         chunks=None, maxshape=None,
                                         fillvalue=None, track_times=None):
    """Create dataset with bitshuffle+internal LZ4 compression."""

    filter_pipeline = [H5FILTER,]
    filter_opts = [(0, H5_COMPRESS_LZ4)]
    dset_id = create_dataset(parent, name, shape, dtype, chunks=chunks,
                             filter_pipeline=filter_pipeline,
                             filter_opts=filter_opts, maxshape=maxshape,
                             fillvalue=fillvalue, track_times=track_times)
    return dset_id