1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
|
import h5py
import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype
def get(grp, lo=0, hi=None, fields=None, convert_enum=True, as_dict=False):
    """
    Query a range of rows from a table as a dataframe.

    A table is an HDF5 group containing equal-length 1D datasets serving as
    columns.

    Parameters
    ----------
    grp : ``h5py.Group`` or any dict-like of array-likes
        Handle to an HDF5 group containing only 1D datasets or any similar
        collection of 1D datasets or arrays
    lo, hi : int, optional
        Range of rows to select from the table; every column is sliced as
        ``column[lo:hi]``. Defaults to all rows.
    fields : str or sequence of str, optional
        Column or list of columns to query. Defaults to all available columns.
        A single string returns a Series instead of a DataFrame.
    convert_enum : bool, optional
        Whether to convert HDF5 enum datasets into ``pandas.Categorical``
        columns instead of plain integer columns. Default is True.
    as_dict : bool, optional
        If True, return the ``{field: array}`` dictionary of selected columns
        without wrapping it in a pandas object. Default is False.

    Returns
    -------
    DataFrame, Series, or dict
        A dict when ``as_dict`` is True; otherwise a Series when ``fields``
        is a single string, else a DataFrame. Pandas results are indexed by
        row number starting at ``lo``.

    Notes
    -----
    HDF5 ASCII datasets are converted to Unicode.
    """
    series = isinstance(fields, str)
    if fields is None:
        fields = list(grp.keys())
    elif series:
        fields = [fields]

    data = {}
    for field in fields:
        dset = grp[field]
        # Mapping of enum name -> integer value, or None for non-enum dtypes.
        enum_map = h5py.check_dtype(enum=dset.dtype) if convert_enum else None
        if enum_map is not None:
            # Category labels are the enum names ordered by their enum value.
            data[field] = pd.Categorical.from_codes(
                dset[lo:hi],
                sorted(enum_map, key=enum_map.__getitem__),
                ordered=True,
            )
        elif dset.dtype.kind == "S":
            # Fixed-width byte strings. Testing the dtype kind avoids the
            # ``np.string_`` alias, which no longer exists in NumPy 2.
            data[field] = dset[lo:hi].astype("U")
        else:
            data[field] = dset[lo:hi]

    if as_dict:
        return data

    if data and lo is not None:
        # Label rows with their position in the table rather than 0..n-1.
        nrows = len(next(iter(data.values())))
        index = np.arange(lo, lo + nrows)
    else:
        index = None

    if series:
        name = fields[0]
        return pd.Series(data[name], index=index, name=name)
    return pd.DataFrame(data, columns=fields, index=index)
def put(grp, df, lo=0, store_categories=True, h5opts=None):
    """
    Store a dataframe into a column-oriented table store.

    A table is an HDF5 group containing equal-length 1D datasets serving as
    columns.

    Parameters
    ----------
    grp : ``h5py.Group``
        Handle to an HDF5 group containing only 1D datasets or any similar
        collection of 1D datasets or arrays
    df : DataFrame or Series
        Data columns to write to the HDF5 group. Any object exposing
        ``items()`` that yields ``(name, column)`` pairs is accepted; scalar
        columns are written as a single row.
    lo : int, optional
        Row offset for data to be stored.
    store_categories : bool, optional
        Whether to convert ``pandas.Categorical`` columns into HDF5 enum
        datasets instead of plain integer datasets. Default is True.
    h5opts : dict, optional
        HDF5 dataset filter options to use (compression, shuffling,
        checksumming, etc.). Default is to use autochunking and GZIP
        compression, level 6.

    Notes
    -----
    Categorical data must be ASCII compatible.

    Columns missing from ``grp`` are created as resizable datasets; existing
    ones are grown when the written range extends past their current length.
    Object and unicode columns are stored as fixed-width byte strings.
    """
    if h5opts is None:
        h5opts = {"compression": "gzip", "compression_opts": 6}
    if isinstance(df, pd.Series):
        df = df.to_frame()

    for field, data in df.items():
        fillvalue = None
        if isinstance(getattr(data, "dtype", None), pd.CategoricalDtype):
            # A Series exposes codes/categories through ``.cat``; a bare
            # Categorical or CategoricalIndex exposes them directly.
            cat = data.cat if isinstance(data, pd.Series) else data
            codes = np.asarray(cat.codes)
            if store_categories:
                cats = cat.categories
                enum = (codes.dtype, dict(zip(cats, range(len(cats)))))
                dtype = h5py.special_dtype(enum=enum)
            else:
                dtype = codes.dtype
            data = codes
            # pandas uses code -1 for missing values; unwritten rows of a
            # newly created dataset are filled with the same value.
            fillvalue = -1
        else:
            data = np.array([data]) if np.isscalar(data) else np.asarray(data)
            if data.dtype.kind in ("O", "U"):
                # Python objects and unicode text are stored as byte strings.
                data = np.array(data, dtype="S")
            # Taken after conversion so string datasets get their real width.
            dtype = data.dtype

        hi = lo + len(data)
        try:
            dset = grp[field]
        except KeyError:
            dset = grp.create_dataset(
                field,
                shape=(hi,),
                dtype=dtype,
                maxshape=(None,),
                fillvalue=fillvalue,
                **h5opts
            )
        if hi > len(dset):
            dset.resize((hi,))
        dset[lo:hi] = data
def delete(grp, fields=None):
    """
    Remove columns from a table.

    A table is an HDF5 group containing equal-length 1D datasets serving as
    columns.

    Parameters
    ----------
    grp : ``h5py.Group``
        Handle to an HDF5 group containing only 1D datasets or any similar
        collection of 1D datasets or arrays
    fields : str or sequence of str, optional
        Column or list of columns to remove. Defaults to all available
        columns. Names that are not present in ``grp`` are skipped.

    Notes
    -----
    Deleting objects leaves "holes" in HDF5 files and doesn't shrink the file.
    You will need to repack or copy the file contents to reclaim space.
    See the h5repack tool.
    """
    if isinstance(fields, str):
        targets = [fields]
    elif fields is None:
        # Snapshot the names first: the group is modified while we loop.
        targets = list(grp.keys())
    else:
        targets = fields

    for name in targets:
        if name not in grp.keys():
            continue
        del grp[name]
|