from pathlib import Path
from typing import List, Optional, Tuple
import h5py
import numpy as np
from .exceptions import TrainIDError, NoDataError
from .file_access import FileAccess
from .read_machinery import (
contiguous_regions, DataChunk, select_train_ids, split_trains, roi_shape,
trains_files_index,
)
class KeyData:
"""Data for one key in one source
Don't create this directly; get it from ``run[source, key]``.
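    For example (a sketch; the source and key names are just placeholders)::
        kd = run['SPB_XTD9_XGM/XGM/DOOCS:output', 'data.intensityTD']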
"""
def __init__(
self, source, key, *, train_ids, files, section, dtype, eshape,
inc_suspect_trains=True,
):
self.source = source
self.key = key
self.train_ids = train_ids
self.files: List[FileAccess] = files
self.section = section
self.dtype = dtype
self.entry_shape = eshape
self.ndim = len(eshape) + 1
self.inc_suspect_trains = inc_suspect_trains
def _find_chunks(self):
"""Find contiguous chunks of data for this key, in any order."""
all_tids_arr = np.array(self.train_ids)
for file in self.files:
if len(file.train_ids) == 0:
continue
firsts, counts = file.get_index(self.source, self.index_group)
# Of trains in this file, which are in selection
include = np.isin(file.train_ids, all_tids_arr)
if not self.inc_suspect_trains:
include &= file.validity_flag
# Assemble contiguous chunks of data from this file
for _from, _to in contiguous_regions(include):
yield DataChunk(
file, self.hdf5_data_path,
first=firsts[_from],
train_ids=file.train_ids[_from:_to],
counts=counts[_from:_to],
)
_cached_chunks = None
@property
def _data_chunks(self) -> List[DataChunk]:
"""An ordered list of chunks containing data"""
if self._cached_chunks is None:
self._cached_chunks = sorted(
self._find_chunks(), key=lambda c: c.train_ids[0]
)
return self._cached_chunks
@property
def _data_chunks_nonempty(self) -> List[DataChunk]:
return [c for c in self._data_chunks if c.total_count]
def __repr__(self):
return f"<extra_data.KeyData source={self.source!r} key={self.key!r} " \
f"for {len(self.train_ids)} trains>"
@property
def is_control(self):
"""Whether this key belongs to a control source."""
return self.section == 'CONTROL'
@property
def is_instrument(self):
"""Whether this key belongs to an instrument source."""
return self.section == 'INSTRUMENT'
@property
def index_group(self):
"""The part of the key needed to look up index data"""
if self.section == 'INSTRUMENT':
return self.key.partition('.')[0]
else:
return ''
@property
def hdf5_data_path(self):
"""The path to the relevant dataset within each HDF5 file"""
return f"/{self.section}/{self.source}/{self.key.replace('.', '/')}"
@property
def shape(self):
"""The shape of this data as a tuple, like for a NumPy array.
        Finding the shape may require getting index data from several files.
"""
return (sum(c.total_count for c in self._data_chunks),) + self.entry_shape
@property
def nbytes(self):
"""The number of bytes this data would take up in memory."""
return self.dtype.itemsize * np.prod(self.shape)
@property
def size_mb(self):
"""The size of the data in memory in megabytes."""
return self.nbytes / 1e6
@property
def size_gb(self):
"""The size of the data in memory in gigabytes."""
return self.nbytes / 1e9
@property
def units(self):
"""The units symbol for this data, e.g. 'μJ', or None if not found"""
attrs = self.attributes()
base_unit = attrs.get('unitSymbol', None)
if base_unit is None:
return None
prefix = attrs.get('metricPrefixSymbol', '')
if prefix == 'u':
prefix = 'μ' # We are not afraid of unicode
return prefix + base_unit
@property
def units_name(self):
"""The units name for this data, e.g. 'microjoule', or None if not found"""
attrs = self.attributes()
base_unit = attrs.get('unitName', None)
if base_unit is None:
return None
prefix = attrs.get('metricPrefixName', '')
return prefix + base_unit
@property
def source_file_paths(self):
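        """The data files behind this key, as pathlib.Path objects.
        If the data is accessed through virtual overview files, this follows
        the virtual dataset mappings back to the files which actually hold
        the selected data; otherwise it lists the files opened directly.
        """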
paths = []
for chunk in self._data_chunks:
if chunk.dataset.is_virtual:
mappings = chunk.dataset.virtual_sources()
for vspace, filename, _, _ in mappings:
if filename in paths:
continue # Already got it
# Does the mapping overlap with this chunk of selected data?
# We can assume that each mapping is a simple, contiguous
# block, and only selection on the first dimension matters.
starts, ends = vspace.get_select_bounds()
map_start, map_stop = starts[0], ends[0]
ck = chunk.slice
if (map_stop > ck.start) and (map_start < ck.stop):
paths.append(filename)
# Include 1 source file even if no trains are selected
if (not paths) and mappings:
paths.append(mappings[0].file_name)
else:
paths.append(chunk.file.filename)
# Fallback for virtual overview files where no data was recorded for
# this source, so there's no mapping to point to.
if not paths:
source_grp = self.files[0].file[f"{self.section}/{self.source}"]
if 'source_files' in source_grp.attrs:
paths.append(source_grp.attrs['source_files'][0])
        return [Path(p) for p in paths]
def _find_attributes(self, dset):
"""Find Karabo attributes belonging to a dataset."""
attrs = dict(dset.attrs)
if self.is_control and self.key.endswith('.value'):
# For CONTROL sources, most of the attributes are saved on
# the parent group rather than the .value dataset. In the
# case of duplicated keys, the parent value appears to be
# the correct one.
attrs.update(dict(dset.parent.attrs))
return attrs
def attributes(self):
"""Get a dict of all attributes stored with this data
This may be awkward to use. See .units and .units_name for more
convenient forms.
"""
dset = self.files[0].file[self.hdf5_data_path]
attrs = self._find_attributes(dset)
if (not attrs) and dset.is_virtual:
# Virtual datasets were initially created without these attributes.
# Find a source file. Not using source_file_paths as it can give [].
_, filename, _, _ = dset.virtual_sources()[0]
# Not using FileAccess: no need for train or source lists.
with h5py.File(filename, 'r') as f:
attrs = self._find_attributes(f[self.hdf5_data_path])
return attrs
def select_trains(self, trains):
"""Select a subset of trains in this data as a new :class:`KeyData` object.
Also available by slicing and indexing the KeyData object::
run[source, key][:10] # Select data for first 10 trains
"""
tids = select_train_ids(self.train_ids, trains)
return self._only_tids(tids)
def __getitem__(self, item):
return self.select_trains(item)
__iter__ = None # Disable iteration
def _only_tids(self, tids, files=None):
tids_arr = np.array(tids)
if files is None:
files = [
f for f in self.files
if f.has_train_ids(tids_arr, self.inc_suspect_trains)
]
if not files:
# Keep 1 file, even if 0 trains selected.
files = [self.files[0]]
return KeyData(
self.source,
self.key,
train_ids=tids,
files=files,
section=self.section,
dtype=self.dtype,
eshape=self.entry_shape,
inc_suspect_trains=self.inc_suspect_trains,
)
def drop_empty_trains(self):
"""Select only trains with data as a new :class:`KeyData` object."""
counts = self.data_counts(labelled=False)
tids = np.array(self.train_ids)[counts > 0]
return self._only_tids(list(tids))
def split_trains(self, parts=None, trains_per_part=None):
"""Split this data into chunks with a fraction of the trains each.
Either *parts* or *trains_per_part* must be specified.
This returns an iterator yielding new :class:`KeyData` objects.
The parts will have similar sizes, e.g. splitting 11 trains
with ``trains_per_part=8`` will produce 5 & 6 trains, not 8 & 3.
Selected trains count even if they are missing data, so different
keys from the same run can be split into matching chunks.
Parameters
----------
parts: int
How many parts to split the data into. If trains_per_part is also
specified, this is a minimum, and it may make more parts.
It may also make fewer if there are fewer trains in the data.
trains_per_part: int
A maximum number of trains in each part. Parts will often have
fewer trains than this.
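        For example, to process the data in manageable pieces (a sketch, with
        ``kd`` being any KeyData object)::
            for part in kd.split_trains(trains_per_part=32):
                data = part.ndarray()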
"""
# tids_files points to the file for each train.
# This avoids checking all files for each chunk, which can be slow.
tids_files = trains_files_index(
self.train_ids, self.files, self.inc_suspect_trains
)
for sl in split_trains(len(self.train_ids), parts, trains_per_part):
tids = self.train_ids[sl]
files = set(tids_files[sl]) - {None}
files = sorted(files, key=lambda f: f.filename)
yield self._only_tids(tids, files=files)
def data_counts(self, labelled=True):
"""Get a count of data entries in each train.
If *labelled* is True, returns a pandas series with an index of train
IDs. Otherwise, returns a NumPy array of counts to match ``.train_ids``.
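        For example (``kd`` being any KeyData object)::
            counts = kd.data_counts()  # pandas Series indexed by train ID
            counts[counts > 0]         # only the trains which have data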
"""
if self._data_chunks:
train_ids = np.concatenate([c.train_ids for c in self._data_chunks])
counts = np.concatenate([c.counts for c in self._data_chunks])
else:
train_ids = counts = np.zeros(0, dtype=np.uint64)
if labelled:
import pandas as pd
return pd.Series(counts, index=train_ids)
else:
all_tids_arr = np.array(self.train_ids)
res = np.zeros(len(all_tids_arr), dtype=np.uint64)
tid_to_ix = np.intersect1d(all_tids_arr, train_ids, return_indices=True)[1]
# We may be missing some train IDs, if they're not in any file
# for this source, and they're sometimes out of order within chunks
# (they shouldn't be, but we try not to fail too badly if they are).
assert len(tid_to_ix) == len(train_ids)
res[tid_to_ix] = counts
return res
def as_single_value(self, rtol=1e-5, atol=0.0, reduce_by='median'):
"""Retrieve a single reduced value if within tolerances.
The relative and absolute tolerances *rtol* and *atol* work the
same way as in ``numpy.allclose``. The default relative tolerance
is 1e-5 with no absolute tolerance. The data for this key is compared
against a reduced value obtained by the method described in *reduce_by*.
        This may be a callable taking the data array, the name of a NumPy
        function such as 'median', or 'first' to use the first value
        encountered. By default, 'median' is used.
If within tolerances, the reduced value is returned.
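        For example, to read a nominally constant motor position while
        allowing a little jitter (source and key names are placeholders)::
            pos = run['MY/MOTOR/X', 'actualPosition.value'].as_single_value(atol=0.01)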
"""
data = self.ndarray()
if len(data) == 0:
raise NoDataError(self.source, self.key)
if callable(reduce_by):
value = reduce_by(data)
elif isinstance(reduce_by, str) and hasattr(np, reduce_by):
value = getattr(np, reduce_by)(data, axis=0)
elif reduce_by == 'first':
value = data[0]
else:
raise ValueError('invalid reduction method (may be callable, '
'global numpy symbol or "first")')
if not np.allclose(data, value, rtol=rtol, atol=atol):
adev = np.max(np.abs(data - value))
rdev = np.max(np.abs(adev / value))
raise ValueError(f'data values are not within tolerance '
f'(absolute: {adev:.3g}, relative: {rdev:.3g})')
return value
# Getting data as different kinds of array: -------------------------------
def ndarray(self, roi=(), out=None):
"""Load this data as a numpy array
*roi* may be a ``numpy.s_[]`` expression to load e.g. only part of each
image from a camera. If *out* is not given, a suitable array will be
allocated.
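        For example, to load only a region of interest from 2D image data
        (``kd`` being a KeyData object holding one image per entry)::
            images = kd.ndarray(roi=np.s_[100:200, 50:150])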
"""
if not isinstance(roi, tuple):
roi = (roi,)
req_shape = self.shape[:1] + roi_shape(self.entry_shape, roi)
if out is None:
out = np.empty(req_shape, dtype=self.dtype)
        elif out.shape != req_shape:
raise ValueError(f'requires output array of shape {req_shape}')
# Read the data from each chunk into the result array
dest_cursor = 0
for chunk in self._data_chunks_nonempty:
dest_chunk_end = dest_cursor + chunk.total_count
slices = (chunk.slice,) + roi
chunk.dataset.read_direct(
out[dest_cursor:dest_chunk_end], source_sel=slices
)
dest_cursor = dest_chunk_end
return out
def train_id_coordinates(self):
"""Make an array of train IDs to use alongside data from ``.ndarray()``.
:attr:`train_ids` includes each selected train ID once, including trains
where data is missing. :meth:`train_id_coordinates` excludes missing
trains, and repeats train IDs if the source has multiple entries
per train. The result will be the same length as the first dimension
of an array from :meth:`ndarray`, and tells you which train each entry
belongs to.
.. seealso::
:meth:`xarray` returns a labelled array including these train IDs.
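        For example, to pair each entry with the train it belongs to
        (``kd`` being any KeyData object)::
            tids = kd.train_id_coordinates()
            data = kd.ndarray()
            assert len(tids) == len(data)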
"""
if not self._data_chunks:
return np.zeros(0, dtype=np.uint64)
chunks_trainids = [
np.repeat(chunk.train_ids, chunk.counts.astype(np.intp))
for chunk in self._data_chunks
]
return np.concatenate(chunks_trainids)
def xarray(self, extra_dims=None, roi=(), name=None):
"""Load this data as a labelled xarray array or dataset.
The first dimension is labelled with train IDs. Other dimensions
may be named by passing a list of names to *extra_dims*.
For scalar datatypes, an xarray.DataArray is returned using either
the supplied *name* or the concatenated source and key name if omitted.
If the data is stored in a structured datatype, an xarray.Dataset
is returned with a variable for each field. The data of these
        variables will be non-contiguous in memory; use
        ``Dataset.copy(deep=True)`` to obtain a contiguous copy.
Parameters
----------
extra_dims: list of str
Name extra dimensions in the array. The first dimension is
automatically called 'train'. The default for extra dimensions
is dim_0, dim_1, ...
roi: numpy.s_[], slice, tuple of slices, or by_index
The region of interest. This expression selects data in all
dimensions apart from the first (trains) dimension. If the data
holds a 1D array for each entry, roi=np.s_[:8] would get the
first 8 values from every train. If the data is 2D or more at
each entry, selection looks like roi=np.s_[:8, 5:10] .
name: str
Name the array itself. The default is the source and key joined
by a dot. Ignored for structured data when a dataset is returned.
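        For example, to load a labelled array and average it over trains
        (``kd`` being a KeyData object with a plain, non-structured dtype)::
            arr = kd.xarray()
            arr.mean(dim='trainId')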
"""
import xarray
ndarr = self.ndarray(roi=roi)
# Dimension labels
if extra_dims is None:
extra_dims = ['dim_%d' % i for i in range(ndarr.ndim - 1)]
dims = ['trainId'] + extra_dims
# Train ID index
coords = {'trainId': self.train_id_coordinates()}
# xarray attributes
attrs = {}
if (units := self.units):
attrs['units'] = units
if ndarr.dtype.names is not None:
# Structured dtype.
return xarray.Dataset(
{field: (dims, ndarr[field]) for field in ndarr.dtype.names},
coords=coords, attrs=attrs)
else:
if name is None:
name = f'{self.source}.{self.key}'
if name.endswith('.value') and self.section == 'CONTROL':
name = name[:-6]
# Primitive dtype.
return xarray.DataArray(
ndarr, dims=dims, coords=coords, name=name, attrs=attrs)
def series(self):
"""Load this data as a pandas Series. Only for 1D data.
"""
import pandas as pd
if self.ndim > 1:
raise TypeError("pandas Series are only available for 1D data")
name = self.source + '/' + self.key
if name.endswith('.value') and self.section == 'CONTROL':
name = name[:-6]
index = pd.Index(self.train_id_coordinates(), name='trainId')
data = self.ndarray()
return pd.Series(data, name=name, index=index)
def dask_array(self, labelled=False):
"""Make a Dask array for this data.
Dask is a system for lazy parallel computation. This method doesn't
actually load the data, but gives you an array-like object which you
can operate on. Dask loads the data and calculates results when you ask
it to, e.g. by calling a ``.compute()`` method.
See the Dask documentation for more details.
If your computation depends on reading lots of data, consider creating
a dask.distributed.Client before calling this.
If you don't do this, Dask uses threads by default, which is not
efficient for reading HDF5 files.
Parameters
----------
labelled: bool
If True, label the train IDs for the data, returning an
xarray.DataArray object wrapping a Dask array.
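        For example, to compute a mean over all entries without loading
        everything into memory at once (``kd`` being any KeyData object)::
            darr = kd.dask_array(labelled=True)
            darr.mean(dim='trainId').compute()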
"""
import dask.array as da
chunks_darrs = []
for chunk in self._data_chunks_nonempty:
chunk_dim0 = chunk.total_count
chunk_shape = (chunk_dim0,) + chunk.dataset.shape[1:]
itemsize = chunk.dataset.dtype.itemsize
# Find chunk size of maximum 2 GB. This is largely arbitrary:
# we want chunks small enough that each worker can have at least
# a couple in memory (Maxwell nodes have 256-768 GB in late 2019).
# But bigger chunks means less overhead.
# Empirically, making chunks 4 times bigger/smaller didn't seem to
# affect speed dramatically - but this could depend on many factors.
# TODO: optional user control of chunking
limit = 2 * 1024 ** 3
while np.prod(chunk_shape) * itemsize > limit and chunk_dim0 > 1:
chunk_dim0 //= 2
chunk_shape = (chunk_dim0,) + chunk.dataset.shape[1:]
chunks_darrs.append(
da.from_array(
chunk.file.dset_proxy(chunk.dataset_path), chunks=chunk_shape
)[chunk.slice]
)
if chunks_darrs:
dask_arr = da.concatenate(chunks_darrs, axis=0)
else:
shape = (0,) + self.entry_shape
dask_arr = da.zeros(shape=shape, dtype=self.dtype, chunks=shape)
if labelled:
# Dimension labels
dims = ['trainId'] + ['dim_%d' % i for i in range(dask_arr.ndim - 1)]
# Train ID index
coords = {'trainId': self.train_id_coordinates()}
import xarray
return xarray.DataArray(dask_arr, dims=dims, coords=coords)
else:
return dask_arr
# Getting data by train: --------------------------------------------------
def _find_tid(self, tid) -> Tuple[Optional[FileAccess], int]:
for fa in self.files:
matches = (fa.train_ids == tid).nonzero()[0]
if self.inc_suspect_trains and matches.size > 0:
return fa, matches[0]
for ix in matches:
if fa.validity_flag[ix]:
return fa, ix
return None, 0
def train_from_id(self, tid, keep_dims=False):
"""Get data for the given train ID as a numpy array.
Returns (train ID, array)
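        For example (``kd`` being any KeyData object; the train ID is only a
        placeholder)::
            tid, arr = kd.train_from_id(1234567890)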
"""
if tid not in self.train_ids:
raise TrainIDError(tid)
fa, ix = self._find_tid(tid)
if fa is None:
            # No file holds data for this train; return an empty array.
            return tid, np.empty((0,) + self.entry_shape, dtype=self.dtype)
firsts, counts = fa.get_index(self.source, self.index_group)
first, count = firsts[ix], counts[ix]
if count == 1 and not keep_dims:
return tid, fa.file[self.hdf5_data_path][first]
else:
return tid, fa.file[self.hdf5_data_path][first: first+count]
def train_from_index(self, i, keep_dims=False):
"""Get data for a train by index (starting at 0) as a numpy array.
Returns (train ID, array)
"""
return self.train_from_id(self.train_ids[i], keep_dims=keep_dims)
def trains(self, keep_dims=False, include_empty=False):
"""Iterate through trains containing data for this key
        Yields pairs of (train ID, array). The train axis is removed for
        single-entry trains unless *keep_dims* is set. Trains with missing
        data are skipped unless *include_empty* is set, in which case None
        (or a zero-length array if *keep_dims* is set) is yielded for them.
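        For example (``kd`` being any KeyData object)::
            for tid, arr in kd.trains():
                ...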
"""
if keep_dims and include_empty:
empty_result = np.zeros(shape=(0,) + self.entry_shape,
dtype=self.dtype)
else:
empty_result = None
for chunk in self._data_chunks_nonempty:
start = chunk.first
ds = chunk.dataset
for tid, count in zip(chunk.train_ids, chunk.counts):
if count > 1 or keep_dims:
yield tid, ds[start: start+count]
elif count == 1:
yield tid, ds[start]
elif include_empty:
yield tid, empty_result
start += count