File: keydata.py

from typing import List, Optional, Tuple

import h5py
import numpy as np

from .exceptions import TrainIDError, NoDataError
from .file_access import FileAccess
from .read_machinery import (
    contiguous_regions, DataChunk, select_train_ids, split_trains, roi_shape,
    trains_files_index,
)

class KeyData:
    """Data for one key in one source

    Don't create this directly; get it from ``run[source, key]``.
    """
    def __init__(
            self, source, key, *, train_ids, files, section, dtype, eshape,
            inc_suspect_trains=True,
    ):
        self.source = source
        self.key = key
        self.train_ids = train_ids
        self.files: List[FileAccess] = files
        self.section = section
        self.dtype = dtype
        self.entry_shape = eshape
        self.ndim = len(eshape) + 1
        self.inc_suspect_trains = inc_suspect_trains

    def _find_chunks(self):
        """Find contiguous chunks of data for this key, in any order."""
        all_tids_arr = np.array(self.train_ids)

        for file in self.files:
            if len(file.train_ids) == 0:
                continue

            firsts, counts = file.get_index(self.source, self.index_group)

            # Of trains in this file, which are in selection
            include = np.isin(file.train_ids, all_tids_arr)
            if not self.inc_suspect_trains:
                include &= file.validity_flag

            # Assemble contiguous chunks of data from this file
            for _from, _to in contiguous_regions(include):
                yield DataChunk(
                    file, self.hdf5_data_path,
                    first=firsts[_from],
                    train_ids=file.train_ids[_from:_to],
                    counts=counts[_from:_to],
                )

    _cached_chunks = None

    @property
    def _data_chunks(self) -> List[DataChunk]:
        """An ordered list of chunks containing data"""
        if self._cached_chunks is None:
            self._cached_chunks = sorted(
                self._find_chunks(), key=lambda c: c.train_ids[0]
            )
        return self._cached_chunks

    @property
    def _data_chunks_nonempty(self) -> List[DataChunk]:
        return [c for c in self._data_chunks if c.total_count]

    def __repr__(self):
        return f"<extra_data.KeyData source={self.source!r} key={self.key!r} " \
               f"for {len(self.train_ids)} trains>"

    @property
    def is_control(self):
        """Whether this key belongs to a control source."""
        return self.section == 'CONTROL'

    @property
    def is_instrument(self):
        """Whether this key belongs to an instrument source."""
        return self.section == 'INSTRUMENT'

    @property
    def index_group(self):
        """The part of the key needed to look up index data"""
        if self.section == 'INSTRUMENT':
            return self.key.partition('.')[0]
        else:
            return ''

    @property
    def hdf5_data_path(self):
        """The path to the relevant dataset within each HDF5 file"""
        return f"/{self.section}/{self.source}/{self.key.replace('.', '/')}"

    @property
    def shape(self):
        """The shape of this data as a tuple, like for a NumPy array.

        Finding the shape may require getting index data from several files.
        """
        return (sum(c.total_count for c in self._data_chunks),) + self.entry_shape

    @property
    def nbytes(self):
        """The number of bytes this data would take up in memory."""
        return self.dtype.itemsize * np.prod(self.shape)

    @property
    def size_mb(self):
        """The size of the data in memory in megabytes."""
        return self.nbytes / 1e6

    @property
    def size_gb(self):
        """The size of the data in memory in gigabytes."""
        return self.nbytes / 1e9

    @property
    def units(self):
        """The units symbol for this data, e.g. 'μJ', or None if not found"""
        attrs = self.attributes()
        base_unit = attrs.get('unitSymbol', None)
        if base_unit is None:
            return None

        prefix = attrs.get('metricPrefixSymbol', '')
        if prefix == 'u':
            prefix = 'μ'  # We are not afraid of unicode
        return prefix + base_unit

    @property
    def units_name(self):
        """The units name for this data, e.g. 'microjoule', or None if not found"""
        attrs = self.attributes()
        base_unit = attrs.get('unitName', None)
        if base_unit is None:
            return None

        prefix = attrs.get('metricPrefixName', '')
        return prefix + base_unit

    @property
    def source_file_paths(self):
        paths = []
        for chunk in self._data_chunks:
            if chunk.dataset.is_virtual:
                mappings = chunk.dataset.virtual_sources()
                for vspace, filename, _, _ in mappings:
                    if filename in paths:
                        continue  # Already got it

                    # Does the mapping overlap with this chunk of selected data?
                    # We can assume that each mapping is a simple, contiguous
                    # block, and only selection on the first dimension matters.
                    starts, ends = vspace.get_select_bounds()
                    map_start, map_stop = starts[0], ends[0]
                    ck = chunk.slice
                    if (map_stop > ck.start) and (map_start < ck.stop):
                        paths.append(filename)

                # Include 1 source file even if no trains are selected
                if (not paths) and mappings:
                    paths.append(mappings[0].file_name)
            else:
                paths.append(chunk.file.filename)

        # Fallback for virtual overview files where no data was recorded for
        # this source, so there's no mapping to point to.
        if not paths:
            source_grp = self.files[0].file[f"{self.section}/{self.source}"]
            if 'source_files' in source_grp.attrs:
                paths.append(source_grp.attrs['source_files'][0])

        from pathlib import Path
        return [Path(p) for p in paths]

    def _find_attributes(self, dset):
        """Find Karabo attributes belonging to a dataset."""
        attrs = dict(dset.attrs)

        if self.is_control and self.key.endswith('.value'):
            # For CONTROL sources, most of the attributes are saved on
            # the parent group rather than the .value dataset. In the
            # case of duplicated keys, the parent value appears to be
            # the correct one.
            attrs.update(dict(dset.parent.attrs))

        return attrs

    def attributes(self):
        """Get a dict of all attributes stored with this data

        This may be awkward to use. See .units and .units_name for more
        convenient forms.
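
        For example, assuming ``kd`` stands for ``run[source, key]``::

            attrs = kd.attributes()
            attrs.get('unitSymbol')  # e.g. 'J', or None if not recorded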
        """
        dset = self.files[0].file[self.hdf5_data_path]
        attrs = self._find_attributes(dset)
        if (not attrs) and dset.is_virtual:
            # Virtual datasets were initially created without these attributes.
            # Find a source file. Not using source_file_paths as it can give [].
            _, filename, _, _ = dset.virtual_sources()[0]
            # Not using FileAccess: no need for train or source lists.
            with h5py.File(filename, 'r') as f:
                attrs = self._find_attributes(f[self.hdf5_data_path])

        return attrs

    def select_trains(self, trains):
        """Select a subset of trains in this data as a new :class:`KeyData` object.

        Also available by slicing and indexing the KeyData object::

            run[source, key][:10]  # Select data for first 10 trains
        """
        tids = select_train_ids(self.train_ids, trains)
        return self._only_tids(tids)

    def __getitem__(self, item):
        return self.select_trains(item)

    __iter__ = None  # Disable iteration

    def _only_tids(self, tids, files=None):
        tids_arr = np.array(tids)
        if files is None:
            files = [
                f for f in self.files
                if f.has_train_ids(tids_arr, self.inc_suspect_trains)
            ]
        if not files:
            # Keep 1 file, even if 0 trains selected.
            files = [self.files[0]]

        return KeyData(
            self.source,
            self.key,
            train_ids=tids,
            files=files,
            section=self.section,
            dtype=self.dtype,
            eshape=self.entry_shape,
            inc_suspect_trains=self.inc_suspect_trains,
        )

    def drop_empty_trains(self):
        """Select only trains with data as a new :class:`KeyData` object."""
        counts = self.data_counts(labelled=False)
        tids = np.array(self.train_ids)[counts > 0]
        return self._only_tids(list(tids))

    def split_trains(self, parts=None, trains_per_part=None):
        """Split this data into chunks with a fraction of the trains each.

        Either *parts* or *trains_per_part* must be specified.

        This returns an iterator yielding new :class:`KeyData` objects.
        The parts will have similar sizes, e.g. splitting 11 trains
        with ``trains_per_part=8`` will produce 5 & 6 trains, not 8 & 3.
        Selected trains count even if they are missing data, so different
        keys from the same run can be split into matching chunks.

        Parameters
        ----------

        parts: int
            How many parts to split the data into. If trains_per_part is also
            specified, this is a minimum, and it may make more parts.
            It may also make fewer if there are fewer trains in the data.
        trains_per_part: int
            A maximum number of trains in each part. Parts will often have
            fewer trains than this.
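
        For instance, assuming ``kd`` stands for ``run[source, key]``, the
        data can be loaded in manageable pieces::

            for part in kd.split_trains(trains_per_part=32):
                arr = part.ndarray()  # Load at most ~32 trains at a time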
        """
        # tids_files points to the file for each train.
        # This avoids checking all files for each chunk, which can be slow.
        tids_files = trains_files_index(
            self.train_ids, self.files, self.inc_suspect_trains
        )
        for sl in split_trains(len(self.train_ids), parts, trains_per_part):
            tids = self.train_ids[sl]
            files = set(tids_files[sl]) - {None}
            files = sorted(files, key=lambda f: f.filename)
            yield self._only_tids(tids, files=files)

    def data_counts(self, labelled=True):
        """Get a count of data entries in each train.

        If *labelled* is True, returns a pandas series with an index of train
        IDs. Otherwise, returns a NumPy array of counts to match ``.train_ids``.
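
        For example, assuming ``kd`` stands for ``run[source, key]``::

            counts = kd.data_counts(labelled=False)
            print(counts.sum(), "entries across", len(kd.train_ids), "trains")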
        """
        if self._data_chunks:
            train_ids = np.concatenate([c.train_ids for c in self._data_chunks])
            counts = np.concatenate([c.counts for c in self._data_chunks])
        else:
            train_ids = counts = np.zeros(0, dtype=np.uint64)

        if labelled:
            import pandas as pd
            return pd.Series(counts, index=train_ids)
        else:
            all_tids_arr = np.array(self.train_ids)
            res = np.zeros(len(all_tids_arr), dtype=np.uint64)
            tid_to_ix = np.intersect1d(all_tids_arr, train_ids, return_indices=True)[1]

            # We may be missing some train IDs, if they're not in any file
            # for this source, and they're sometimes out of order within chunks
            # (they shouldn't be, but we try not to fail too badly if they are).
            assert len(tid_to_ix) == len(train_ids)
            res[tid_to_ix] = counts

            return res

    def as_single_value(self, rtol=1e-5, atol=0.0, reduce_by='median'):
        """Retrieve a single reduced value if within tolerances.

        The relative and absolute tolerances *rtol* and *atol* work the
        same way as in ``numpy.allclose``. The default relative tolerance
        is 1e-5 with no absolute tolerance. The data for this key is compared
        against a reduced value obtained by the method described in *reduce_by*.

        *reduce_by* may be a callable applied to the data, the name of a
        function in the numpy package such as 'median', or 'first' to use
        the first value encountered. By default, 'median' is used.

        If within tolerances, the reduced value is returned.
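
        For example, for a slowly varying control value (``kd`` standing in
        for ``run[source, key]``)::

            value = kd.as_single_value(rtol=1e-2)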
        """

        data = self.ndarray()

        if len(data) == 0:
            raise NoDataError(self.source, self.key)

        if callable(reduce_by):
            value = reduce_by(data)
        elif isinstance(reduce_by, str) and hasattr(np, reduce_by):
            value = getattr(np, reduce_by)(data, axis=0)
        elif reduce_by == 'first':
            value = data[0]
        else:
            raise ValueError('invalid reduction method (may be callable, '
                             'global numpy symbol or "first")')

        if not np.allclose(data, value, rtol=rtol, atol=atol):
            adev = np.max(np.abs(data - value))
            rdev = np.max(np.abs(adev / value))

            raise ValueError(f'data values are not within tolerance '
                             f'(absolute: {adev:.3g}, relative: {rdev:.3g})')

        return value

    # Getting data as different kinds of array: -------------------------------

    def ndarray(self, roi=(), out=None):
        """Load this data as a numpy array

        *roi* may be a ``numpy.s_[]`` expression to load e.g. only part of each
        image from a camera. If *out* is not given, a suitable array will be
        allocated.
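
        For 2D image data, for example (``kd`` standing in for
        ``run[source, key]``, with an arbitrary crop)::

            images = kd.ndarray(roi=np.s_[:256, :256])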
        """
        if not isinstance(roi, tuple):
            roi = (roi,)

        req_shape = self.shape[:1] + roi_shape(self.entry_shape, roi)

        if out is None:
            out = np.empty(req_shape, dtype=self.dtype)
        elif out.shape != req_shape:
            raise ValueError(f'requires output array of shape {req_shape}')

        # Read the data from each chunk into the result array
        dest_cursor = 0
        for chunk in self._data_chunks_nonempty:
            dest_chunk_end = dest_cursor + chunk.total_count

            slices = (chunk.slice,) + roi
            chunk.dataset.read_direct(
                out[dest_cursor:dest_chunk_end], source_sel=slices
            )
            dest_cursor = dest_chunk_end

        return out

    def train_id_coordinates(self):
        """Make an array of train IDs to use alongside data from ``.ndarray()``.

        :attr:`train_ids` includes each selected train ID once, including trains
        where data is missing. :meth:`train_id_coordinates` excludes missing
        trains, and repeats train IDs if the source has multiple entries
        per train. The result will be the same length as the first dimension
        of an array from :meth:`ndarray`, and tells you which train each entry
        belongs to.

        .. seealso::

           :meth:`xarray` returns a labelled array including these train IDs.
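
        A typical pairing, assuming ``kd`` stands for ``run[source, key]``::

            data = kd.ndarray()
            tids = kd.train_id_coordinates()
            assert len(tids) == len(data)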
        """
        if not self._data_chunks:
            return np.zeros(0, dtype=np.uint64)
        chunks_trainids = [
            np.repeat(chunk.train_ids, chunk.counts.astype(np.intp))
            for chunk in self._data_chunks
        ]
        return np.concatenate(chunks_trainids)

    def xarray(self, extra_dims=None, roi=(), name=None):
        """Load this data as a labelled xarray array or dataset.

        The first dimension is labelled with train IDs. Other dimensions
        may be named by passing a list of names to *extra_dims*.

        For scalar datatypes, an xarray.DataArray is returned using either
        the supplied *name* or the concatenated source and key name if omitted.

        If the data is stored in a structured datatype, an xarray.Dataset
        is returned with a variable for each field. The data of these
        variables will be non-contiguous in memory; use
        ``Dataset.copy(deep=True)`` to obtain a contiguous copy.

        Parameters
        ----------

        extra_dims: list of str
            Name extra dimensions in the array. The first dimension is
            automatically called 'train'. The default for extra dimensions
            is dim_0, dim_1, ...
        roi: numpy.s_[], slice, tuple of slices, or by_index
            The region of interest. This expression selects data in all
            dimensions apart from the first (trains) dimension. If the data
            holds a 1D array for each entry, roi=np.s_[:8] would get the
            first 8 values from every train. If the data is 2D or more at
            each entry, selection looks like roi=np.s_[:8, 5:10] .
        name: str
            Name the array itself. The default is the source and key joined
            by a dot. Ignored for structured data when a dataset is returned.
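
        For example, for data with one extra dimension per entry (``kd``
        standing in for ``run[source, key]``, dimension name chosen
        arbitrarily)::

            arr = kd.xarray(extra_dims=['pulse'])
            arr.sel(trainId=arr.trainId[0])  # Entries from the first train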
        """
        import xarray

        ndarr = self.ndarray(roi=roi)

        # Dimension labels
        if extra_dims is None:
            extra_dims = ['dim_%d' % i for i in range(ndarr.ndim - 1)]
        dims = ['trainId'] + extra_dims

        # Train ID index
        coords = {'trainId': self.train_id_coordinates()}
        # xarray attributes
        attrs = {}
        if (units := self.units):
            attrs['units'] = units

        if ndarr.dtype.names is not None:
            # Structured dtype.
            return xarray.Dataset(
                {field: (dims, ndarr[field]) for field in ndarr.dtype.names},
                coords=coords, attrs=attrs)
        else:
            if name is None:
                name = f'{self.source}.{self.key}'

                if name.endswith('.value') and self.section == 'CONTROL':
                    name = name[:-6]

            # Primitive dtype.
            return xarray.DataArray(
                ndarr, dims=dims, coords=coords, name=name, attrs=attrs)

    def series(self):
        """Load this data as a pandas Series. Only for 1D data.
        """
        import pandas as pd

        if self.ndim > 1:
            raise TypeError("pandas Series are only available for 1D data")

        name = self.source + '/' + self.key
        if name.endswith('.value') and self.section == 'CONTROL':
            name = name[:-6]

        index = pd.Index(self.train_id_coordinates(), name='trainId')
        data = self.ndarray()
        return pd.Series(data, name=name, index=index)

    def dask_array(self, labelled=False):
        """Make a Dask array for this data.

        Dask is a system for lazy parallel computation. This method doesn't
        actually load the data, but gives you an array-like object which you
        can operate on. Dask loads the data and calculates results when you ask
        it to, e.g. by calling a ``.compute()`` method.
        See the Dask documentation for more details.

        If your computation depends on reading lots of data, consider creating
        a dask.distributed.Client before calling this.
        If you don't do this, Dask uses threads by default, which is not
        efficient for reading HDF5 files.

        Parameters
        ----------

        labelled: bool
            If True, label the train IDs for the data, returning an
            xarray.DataArray object wrapping a Dask array.
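
        For example, assuming ``kd`` stands for ``run[source, key]``::

            darr = kd.dask_array(labelled=True)
            avg = darr.mean(dim='trainId').compute()  # Average over trains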
        """
        import dask.array as da

        chunks_darrs = []

        for chunk in self._data_chunks_nonempty:
            chunk_dim0 = chunk.total_count
            chunk_shape = (chunk_dim0,) + chunk.dataset.shape[1:]
            itemsize = chunk.dataset.dtype.itemsize

            # Find chunk size of maximum 2 GB. This is largely arbitrary:
            # we want chunks small enough that each worker can have at least
            # a couple in memory (Maxwell nodes have 256-768 GB in late 2019).
            # But bigger chunks means less overhead.
            # Empirically, making chunks 4 times bigger/smaller didn't seem to
            # affect speed dramatically - but this could depend on many factors.
            # TODO: optional user control of chunking
            limit = 2 * 1024 ** 3
            while np.prod(chunk_shape) * itemsize > limit and chunk_dim0 > 1:
                chunk_dim0 //= 2
                chunk_shape = (chunk_dim0,) + chunk.dataset.shape[1:]

            chunks_darrs.append(
                da.from_array(
                    chunk.file.dset_proxy(chunk.dataset_path), chunks=chunk_shape
                )[chunk.slice]
            )

        if chunks_darrs:
            dask_arr = da.concatenate(chunks_darrs, axis=0)
        else:
            shape = (0,) + self.entry_shape
            dask_arr = da.zeros(shape=shape, dtype=self.dtype, chunks=shape)

        if labelled:
            # Dimension labels
            dims = ['trainId'] + ['dim_%d' % i for i in range(dask_arr.ndim - 1)]

            # Train ID index
            coords = {'trainId': self.train_id_coordinates()}

            import xarray
            return xarray.DataArray(dask_arr, dims=dims, coords=coords)
        else:
            return dask_arr

    # Getting data by train: --------------------------------------------------

    def _find_tid(self, tid) -> Tuple[Optional[FileAccess], int]:
        for fa in self.files:
            matches = (fa.train_ids == tid).nonzero()[0]
            if self.inc_suspect_trains and matches.size > 0:
                return fa, matches[0]

            for ix in matches:
                if fa.validity_flag[ix]:
                    return fa, ix

        return None, 0

    def train_from_id(self, tid, keep_dims=False):
        """Get data for the given train ID as a numpy array.

        Returns (train ID, array)
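
        For example, with a hypothetical train ID (``kd`` standing in for
        ``run[source, key]``)::

            tid, arr = kd.train_from_id(1234567890)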
        """
        if tid not in self.train_ids:
            raise TrainIDError(tid)

        fa, ix = self._find_tid(tid)
        if fa is None:
            return tid, np.empty((0,) + self.entry_shape, dtype=self.dtype)

        firsts, counts = fa.get_index(self.source, self.index_group)
        first, count = firsts[ix], counts[ix]
        if count == 1 and not keep_dims:
            return tid, fa.file[self.hdf5_data_path][first]
        else:
            return tid, fa.file[self.hdf5_data_path][first: first+count]

    def train_from_index(self, i, keep_dims=False):
        """Get data for a train by index (starting at 0) as a numpy array.

        Returns (train ID, array)
        """
        return self.train_from_id(self.train_ids[i], keep_dims=keep_dims)

    def trains(self, keep_dims=False, include_empty=False):
        """Iterate through trains containing data for this key

        Yields pairs of (train ID, array). The train axis is removed for
        trains with a single entry unless *keep_dims* is set. Trains with no
        data are skipped unless *include_empty* is set, in which case None
        (or a zero-length array if *keep_dims* is set) is yielded.
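
        For example, assuming ``kd`` stands for ``run[source, key]``::

            for tid, arr in kd.trains():
                print(tid, arr.shape)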
        """
        if keep_dims and include_empty:
            empty_result = np.zeros(shape=(0,) + self.entry_shape,
                                    dtype=self.dtype)
        else:
            empty_result = None

        for chunk in self._data_chunks_nonempty:
            start = chunk.first
            ds = chunk.dataset
            for tid, count in zip(chunk.train_ids, chunk.counts):
                if count > 1 or keep_dims:
                    yield tid, ds[start: start+count]
                elif count == 1:
                    yield tid, ds[start]
                elif include_empty:
                    yield tid, empty_result

                start += count