"""
================================================================================
Utilities for handling data types and transformations
================================================================================
**Suhas Somnath**
4/18/2018
"""
################################################################################
# Introduction
# -------------
# The general nature of the **Universal Spectroscopy and Imaging Data (USID)** model facilitates the representation of
# any kind of measurement data.
# This includes:
#
# #. Conventional data represented using floating point numbers such as ``1.2345``
# #. Integer data (with or without sign) such as ``137``
# #. Complex-valued data such as ``1.23 + 4.5i``
# #. Multi-valued or compound valued data cells such as (``'Frequency'``: ``301.2``, ``'Amplitude'``: ``1.553E-3``, ``'Phase'``: ``2.14``)
# where a single value or measurement is represented by multiple elements, each with their own names, and data types
#
# While HDF5 datasets are capable of storing all of these kinds of data, many conventional data analysis techniques
# such as decomposition, clustering, etc. are either unable to handle complicated data types such as complex-valued
# or compound-valued datasets, or do not produce physically meaningful results on them. For example, most singular
# value decomposition algorithms are capable of processing complex-valued datasets. However, while the eigenvectors
# can have complex values, the resultant complex-valued abundance maps are meaningless. These algorithms would not
# even work if the original data were compound-valued!
#
# To avoid such problems, we need functions that transform the data to and from the necessary type (integer,
# real-valued, etc.).
#
# The ``pyUSID.dtype_utils`` module facilitates comparisons, validations, and, most importantly, transformations from
# one data type to another. We will go over the many useful functions in this module and explain how, when, and why
# one would use them.
#
# Recommended prerequisite reading
# ----------------------------------
# * `Universal Spectroscopy and Imaging Data (USID) model </../../../USID/usid_model.html>`_
# * `Crash course on HDF5 and h5py <../beginner/plot_h5py.html>`_
#
# .. tip::
#     You can download and run this document as a Jupyter notebook using the link at the bottom of this page.
#
# Import all necessary packages
# -------------------------------
# Before we begin demonstrating the numerous functions in ``pyUSID.dtype_utils``, we need to import the necessary
# packages. Here is a list of packages, besides pyUSID, that will be used in this example:
#
# * ``h5py`` - to manipulate HDF5 files
# * ``numpy`` - for numerical operations on arrays in memory
from __future__ import print_function, division, unicode_literals
import os
import subprocess
import sys


def install(package):
    # Install the requested package using the same interpreter that runs this script
    subprocess.call([sys.executable, "-m", "pip", "install", package])


import h5py
import numpy as np

# Finally import pyUSID:
try:
    import pyUSID as usid
except ImportError:
    # Warn the user and install pyUSID on-the-fly if it is not already available
    from warnings import warn
    warn('pyUSID not found. Will install with pip.')
    install('pyUSID')
    import pyUSID as usid
################################################################################
# Utilities for validating data types
# =====================================
# ``pyUSID.dtype_utils`` contains some handy functions that make it easy to write robust and safe code by simplifying
# common data type checking and validation.
#
# contains_integers()
# ---------------------
# The ``contains_integers()`` function checks that each item in a list is indeed an integer. Additionally, it
# can be configured to ensure that all the values are at or above a minimum value. This is particularly useful when
# building matrices of indices based on the sizes of dimensions, which are often specified as a list of integers,
# as sketched after the examples below.
item = [1, 2, -3, 4]
print('{} : contains integers? : {}'.format(item, usid.dtype_utils.contains_integers(item)))
item = [1, 4.5, 2.2, -1]
print('{} : contains integers? : {}'.format(item, usid.dtype_utils.contains_integers(item)))
item = [1, 5, 8, 3]
min_val = 2
print('{} : contains integers >= {} ? : {}'.format(item, min_val,
      usid.dtype_utils.contains_integers(item, min_val=min_val)))
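################################################################################
# As a quick illustration of the use case mentioned above, here is a minimal
# sketch (the list of dimension sizes is hypothetical) that validates the sizes
# before building a matrix of indices with plain numpy:
dim_sizes = [3, 2]  # hypothetical sizes of two dimensions
if usid.dtype_utils.contains_integers(dim_sizes, min_val=1):
    # Build a (6, 2) matrix whose rows are all combinations of the two indices
    indices = np.vstack(np.unravel_index(np.arange(np.prod(dim_sizes)),
                                         dim_sizes)).T
    print(indices)
else:
    print('Invalid dimension sizes: {}'.format(dim_sizes))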
################################################################################
# validate_dtype()
# -----------------
# The ``validate_dtype()`` function ensures that a provided object is indeed a valid h5py or numpy data type. When
# writing a main dataset along with all ancillary datasets, pyUSID meticulously ensures that all inputs are valid
# before writing data to the file. This comes in very handy when we want to follow the 'measure twice, cut once' ethos.
for item in [np.float16, np.complex64, np.uint8, np.int16]:
    print('Is {} a valid dtype? : {}'.format(item, usid.dtype_utils.validate_dtype(item)))
# This function is especially useful on compound or structured data types:
struct_dtype = np.dtype({'names': ['r', 'g', 'b'],
                         'formats': [np.float32, np.uint16, np.float64]})
print('Is {} a valid dtype? : {}'.format(struct_dtype, usid.dtype_utils.validate_dtype(struct_dtype)))
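################################################################################
# Note that for invalid inputs, ``validate_dtype()`` does not simply return
# ``False``. Assuming it raises a ``TypeError`` instead (an assumption worth
# verifying against the installed version), a defensive wrapper might look like
# this sketch:
for item in [np.float32, 'not_a_dtype']:
    try:
        usid.dtype_utils.validate_dtype(item)
        print('{} is a valid dtype'.format(item))
    except TypeError:
        print('{} is NOT a valid dtype'.format(item))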
################################################################################
# get_compound_sub_dtypes()
# --------------------------
# One common hassle when dealing with compound / structured array dtypes is that it can be a little challenging to
# quickly get the individual data type of each field. The ``get_compound_sub_dtypes()`` function makes this a
# lot easier:
sub_dtypes = usid.dtype_utils.get_compound_sub_dtypes(struct_dtype)
for key, val in sub_dtypes.items():
    print('{} : {}'.format(key, val))
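################################################################################
# One situation where this dictionary comes in handy is when allocating a
# separate array for each field of a structured array. A minimal sketch using
# plain numpy:
separated = {name: np.zeros(5, dtype=sub_dtype)
             for name, sub_dtype in sub_dtypes.items()}
for name, arr in separated.items():
    print('{} : {}'.format(name, arr.dtype))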
################################################################################
# is_complex_dtype()
# -------------------
# Quite often, we need to treat complex datasets differently from compound datasets, which themselves need to be
# treated differently from real-valued datasets. ``is_complex_dtype()`` makes it easy to check whether a numpy or HDF5
# dataset has a complex data type:
for dtype in [np.float32, np.float16, np.uint8, np.int16, struct_dtype, bool]:
    print('Is {} a complex dtype?: {}'.format(dtype, usid.dtype_utils.is_complex_dtype(dtype)))
for dtype in [complex, np.complex64, np.complex128, np.complex256]:
    print('Is {} a complex dtype?: {}'.format(dtype, usid.dtype_utils.is_complex_dtype(dtype)))
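################################################################################
# A typical use for such checks is branching within a processing pipeline. The
# function and messages below are only a hypothetical sketch of such a dispatch:
def describe(dtype):
    # Decide how a dataset of this dtype would need to be pre-processed
    if usid.dtype_utils.is_complex_dtype(dtype):
        return 'needs flattening of real / imaginary parts'
    return 'can be passed through as-is'


for dtype in [np.complex64, np.float32]:
    print('{} : {}'.format(dtype, describe(dtype)))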
################################################################################
# Data transformation
# ====================
# Perhaps the biggest benefit of ``dtype_utils`` is the ability to flatten complex and compound datasets to
# real-valued datasets and vice versa. As mentioned in the introduction, this is particularly important when
# attempting to use machine learning algorithms on complex- or compound-valued datasets. In order to enable such
# pipelines, we need functions to transform:
#
# * complex / compound valued datasets to real-valued datasets
# * real-valued datasets back to complex / compound valued datasets
#
# flatten_complex_to_real()
# --------------------------
# As the name suggests, this function stacks the imaginary values of an N-dimensional numpy / HDF5 dataset below its
# real values. Thus, applying this function to a complex-valued dataset of shape ``(a, b, c)`` would result in a
# real-valued dataset of shape ``(a, b, 2 * c)``:
length = 3
complex_array = np.random.randint(-5, high=5, size=length) + 1j * np.random.randint(-5, high=5, size=length)
stacked_real_array = usid.dtype_utils.flatten_complex_to_real(complex_array)
print('Complex value: {} has shape: {}'.format(complex_array, complex_array.shape))
print('Stacked real value: {} has shape: '
      '{}'.format(stacked_real_array, stacked_real_array.shape))
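################################################################################
# The resulting layout is equivalent to stacking ``.real`` ahead of ``.imag``
# along the last axis. We can verify this with plain numpy - this illustrates
# the layout only and is not a claim about the library's implementation:
manually_stacked = np.hstack([complex_array.real, complex_array.imag])
print('Outputs match: {}'.format(np.allclose(stacked_real_array, manually_stacked)))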
################################################################################
# flatten_compound_to_real()
# ----------------------------
# This function flattens a compound-valued dataset of shape ``(a, b, c)`` into a real-valued dataset of shape
# ``(a, b, k * c)``, where ``k`` is the number of fields within the structured array / compound dtype. Here, we will
# demonstrate this on a 1D array of 5 elements, each containing 'r', 'g', and 'b' fields:
num_elems = 5
structured_array = np.zeros(shape=num_elems, dtype=struct_dtype)
structured_array['r'] = np.random.random(size=num_elems) * 1024
structured_array['g'] = np.random.randint(0, high=1024, size=num_elems)
structured_array['b'] = np.random.random(size=num_elems) * 1024
real_array = usid.dtype_utils.flatten_compound_to_real(structured_array)
print('Structured array is of shape {} and has values:'.format(structured_array.shape))
print(structured_array)
print('\nThis array converted to regular scalar matrix has shape: {} and values:'.format(real_array.shape))
print(real_array)
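################################################################################
# The flattened array is simply the fields laid out one after another along the
# last axis. Again, the plain-numpy comparison below only illustrates the layout
# rather than the library's actual implementation:
manual = np.hstack([structured_array[name].astype(np.float64)
                    for name in struct_dtype.names])
print('Outputs match: {}'.format(np.allclose(real_array, manual)))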
################################################################################
# flatten_to_real()
# -----------------
# This function checks the data type of the provided dataset and then uses one of the above functions to flatten the
# dataset into a real-valued matrix, if necessary. By checking the data type itself, it obviates the need to
# explicitly call the aforementioned functions (which still do the actual work). Here is an example of the function
# being applied to the same compound-valued numpy array:
real_array = usid.dtype_utils.flatten_to_real(structured_array)
print('Structured array is of shape {} and has values:'.format(structured_array.shape))
print(structured_array)
print('\nThis array converted to regular scalar matrix has shape: {} and values:'.format(real_array.shape))
print(real_array)
################################################################################
# The next three functions perform the inverse operation of taking real-valued matrices or datasets and converting them
# to complex or compound-valued datasets.
#
# stack_real_to_complex()
# ------------------------
# As the name suggests, this function collapses an N-dimensional real-valued array of shape ``(a, b, 2 * c)`` into a
# complex-valued array of shape ``(a, b, c)``. It assumes that the first ``c`` values along the last axis of the
# real-valued dataset are the real components and the following ``c`` values are the imaginary components of the
# complex values. This will become clearer with an example:
real_val = np.hstack([5 * np.random.rand(6),
                      7 * np.random.rand(6)])
print('Real valued dataset of shape {}:'.format(real_val.shape))
print(real_val)
comp_val = usid.dtype_utils.stack_real_to_complex(real_val)
print('\nComplex-valued array of shape: {}'.format(comp_val.shape))
print(comp_val)
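################################################################################
# Since this function is the inverse of ``flatten_complex_to_real()``, a round
# trip through both functions should recover the original values. A quick sanity
# check:
round_trip = usid.dtype_utils.flatten_complex_to_real(comp_val)
print('Round trip matches original: {}'.format(np.allclose(real_val, round_trip)))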
################################################################################
# stack_real_to_compound()
# --------------------------
# Similar to the above function, this function shrinks the last axis of a real-valued dataset to create the desired
# compound-valued dataset. Here, we will demonstrate it on the same 3-field ``(r, g, b)`` compound data type:
num_elems = 5
real_val = np.concatenate((np.random.random(size=num_elems) * 1024,
                           np.random.randint(0, high=1024, size=num_elems),
                           np.random.random(size=num_elems) * 1024))
print('Real valued dataset of shape {}:'.format(real_val.shape))
print(real_val)
comp_val = usid.dtype_utils.stack_real_to_compound(real_val, struct_dtype)
print('\nStructured array of shape: {}'.format(comp_val.shape))
print(comp_val)
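################################################################################
# As before, a round trip through ``flatten_compound_to_real()`` should recover
# the real-valued array we started from, up to the precision of each field's
# data type:
round_trip = usid.dtype_utils.flatten_compound_to_real(comp_val)
print('Round trip matches original: {}'.format(np.allclose(real_val, round_trip)))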
################################################################################
# stack_real_to_target_dtype()
# -----------------------------
# This function performs the inverse of ``flatten_to_real()`` - it stacks the provided real-valued dataset into a
# complex- or compound-valued dataset using the above two functions. Note that, unlike ``flatten_to_real()``, the
# target data type must be supplied for this to work:
print('Real valued dataset of shape {}:'.format(real_val.shape))
print(real_val)
comp_val = usid.dtype_utils.stack_real_to_target_dtype(real_val, struct_dtype)
print('\nStructured array of shape: {}'.format(comp_val.shape))
print(comp_val)
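################################################################################
# The same function should also handle complex-valued targets - assuming that
# supplying a complex dtype routes the data through ``stack_real_to_complex()``,
# as the function names suggest:
real_val_2 = np.hstack([np.random.rand(4), np.random.rand(4)])
comp_val_2 = usid.dtype_utils.stack_real_to_target_dtype(real_val_2, np.complex64)
print('Shape: {}, dtype: {}'.format(comp_val_2.shape, comp_val_2.dtype))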
################################################################################
# check_dtype()
# --------------
# ``check_dtype()`` is a master function that figures out the data type of an HDF5 dataset, the function necessary to
# transform it into a real-valued array, the expected shape of the transformed data, etc. Before we demonstrate this
# function, we need to quickly create an example HDF5 file.
file_path = 'dtype_utils_example.h5'
if os.path.exists(file_path):
    os.remove(file_path)
with h5py.File(file_path, mode='w') as h5_f:
    num_elems = (5, 7)
    structured_array = np.zeros(shape=num_elems, dtype=struct_dtype)
    structured_array['r'] = 450 * np.random.random(size=num_elems)
    structured_array['g'] = np.random.randint(0, high=1024, size=num_elems)
    structured_array['b'] = 3178 * np.random.random(size=num_elems)
    _ = h5_f.create_dataset('compound', data=structured_array)
    _ = h5_f.create_dataset('real', data=450 * np.random.random(size=num_elems), dtype=np.float16)
    _ = h5_f.create_dataset('complex', data=np.random.random(size=num_elems) + 1j * np.random.random(size=num_elems),
                            dtype=np.complex64)
    h5_f.flush()
################################################################################
# Now, let's test the function on compound-, complex-, and real-valued HDF5 datasets:
def check_dataset(h5_dset):
    print('\tDataset being tested: {}'.format(h5_dset))
    func, is_complex, is_compound, n_features, type_mult = usid.dtype_utils.check_dtype(h5_dset)
    print('\tFunction to transform to real: %s' % func)
    print('\tis_complex? %s' % is_complex)
    print('\tis_compound? %s' % is_compound)
    print('\tShape of dataset in its current form: {}'.format(h5_dset.shape))
    print('\tAfter flattening to real, shape is expected to be: ({}, {})'.format(h5_dset.shape[0], n_features))
    print('\tByte-size of a single element in its current form: {}'.format(type_mult))


with h5py.File(file_path, mode='r') as h5_f:
    print('Checking a compound-valued dataset:')
    check_dataset(h5_f['compound'])
    print('')
    print('Checking a complex-valued dataset:')
    check_dataset(h5_f['complex'])
    print('')
    print('Checking a real-valued dataset:')
    check_dataset(h5_f['real'])
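################################################################################
# The transformation function returned by ``check_dtype()`` can be applied
# directly to the dataset's contents, tying all of the pieces together. A short
# sketch using the complex-valued dataset from above:
with h5py.File(file_path, mode='r') as h5_f:
    h5_dset = h5_f['complex']
    func, is_complex, is_compound, n_features, type_mult = usid.dtype_utils.check_dtype(h5_dset)
    flattened = func(h5_dset[()])
    print('Flattened {} dataset to real-valued array of shape {}'.format(h5_dset.dtype, flattened.shape))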
os.remove(file_path)