"""
================================================================================
Utilities for handling data types and transformations
================================================================================

**Suhas Somnath**

4/18/2018
"""
################################################################################
# Introduction
# -------------
# The general nature of the **Universal Spectroscopy and Imaging Data (USID)** model facilitates the representation of
# any kind of measurement data.
# This includes:
#
#  #. Conventional data represented using floating point numbers such as ``1.2345``
#  #. Integer data (with or without sign) such as ``137``
#  #. Complex-valued data such as ``1.23 + 4.5i``
#  #. Multi-valued or compound-valued data cells such as (``'Frequency'``: ``301.2``, ``'Amplitude'``: ``1.553E-3``, ``'Phase'``: ``2.14``),
#     where a single value or measurement is represented by multiple elements, each with its own name and data type
#
# While HDF5 datasets are capable of storing all of these kinds of data, many conventional data analysis techniques
# such as decomposition, clustering, etc. are either unable to handle complicated data types such as complex-valued
# and compound-valued datasets, or do not produce physically meaningful results on them. For example, most singular
# value decomposition algorithms can process complex-valued datasets. However, while the eigenvectors can have complex
# values, the resulting complex-valued abundance maps are meaningless. These algorithms would not even work if the
# original data were compound-valued!
#
# To avoid such problems, we need functions that transform the data to and from the necessary type (integer,
# real-valued, etc.)
#
# The ``pyUSID.dtype_utils`` module facilitates comparisons, validations, and, most importantly, transformations of one
# data type to another. We will go over the many useful functions in this module and explain how, when, and why
# one would use them.
#
# Recommended prerequisite reading
# -----------------------------------
# * `Universal Spectroscopy and Imaging Data (USID) model </../../../USID/usid_model.html>`_
# * `Crash course on HDF5 and h5py <../beginner/plot_h5py.html>`_
#
# .. tip::
#     You can download and run this document as a Jupyter notebook using the link at the bottom of this page.
#
# Import all necessary packages
# -------------------------------
# Before we begin demonstrating the numerous functions in ``pyUSID.dtype_utils``, we need to import the necessary
# packages. Here is a list of packages besides pyUSID that will be used in this example:
#
# * ``h5py`` - to manipulate HDF5 files
# * ``numpy`` - for numerical operations on arrays in memory

from __future__ import print_function, division, unicode_literals
import os
import subprocess
import sys


def install(package):
    # Install the given package into the current Python environment via pip
    subprocess.call([sys.executable, "-m", "pip", "install", package])

import h5py
import numpy as np
# Finally import pyUSID.
try:
    import pyUSID as usid
except ImportError:
    # Warn the user, then install pyUSID via pip using the helper defined above
    from warnings import warn
    warn('pyUSID not found. Will install with pip.')
    install('pyUSID')
    import pyUSID as usid

################################################################################
# Utilities for validating data types
# =====================================
# ``pyUSID.dtype_utils`` contains some handy functions that make it easy to write robust and safe code by simplifying
# common data type checking and validation.
#
# contains_integers()
# ---------------------
# The ``contains_integers()`` function checks that each item in a list is indeed an integer. Additionally, it
# can be configured to ensure that all the values are at or above a minimum value. This is particularly useful when
# building matrices of indices based on the sizes of dimensions, which are typically specified as a list of integers.

item = [1, 2, -3, 4]
print('{} : contains integers? : {}'.format(item, usid.dtype_utils.contains_integers(item)))
item = [1, 4.5, 2.2, -1]
print('{} : contains integers? : {}'.format(item, usid.dtype_utils.contains_integers(item)))

item = [1, 5, 8, 3]
min_val = 2
print('{} : contains integers >= {} ? : {}'.format(item, min_val,
                                                   usid.dtype_utils.contains_integers(item, min_val=min_val)))

################################################################################
# validate_dtype()
# -----------------
# The ``validate_dtype()`` function ensures that a provided object is indeed a valid h5py or numpy data type. When writing
# a main dataset along with all ancillary datasets, pyUSID meticulously ensures that all inputs are valid before
# writing data to the file. This comes in very handy when we want to follow the 'measure twice, cut once' ethos.

for item in [np.float16, np.complex64, np.uint8, np.int16]:
    print('Is {} a valid dtype? : {}'.format(item, usid.dtype_utils.validate_dtype(item)))


# This function is especially useful on compound or structured data types:

struct_dtype = np.dtype({'names': ['r', 'g', 'b'],
                         'formats': [np.float32, np.uint16, np.float64]})
print('Is {} a valid dtype? : {}'.format(struct_dtype, usid.dtype_utils.validate_dtype(struct_dtype)))
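
################################################################################
# Conversely, an obviously invalid input is expected to be rejected. A minimal sketch, assuming the failure mode is
# an exception (the exact exception type is an assumption here):

try:
    # 'not-a-dtype' is not a recognizable numpy or h5py data type
    usid.dtype_utils.validate_dtype('not-a-dtype')
except (TypeError, ValueError) as exc:
    print('Invalid dtype rejected with: {}'.format(exc))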

################################################################################
# get_compound_sub_dtypes()
# --------------------------
# One common hassle when dealing with compound / structured array dtypes is that it can be a little challenging to
# quickly get the data type of each individual field. The ``get_compound_sub_dtypes()`` function makes this a
# lot easier:

sub_dtypes = usid.dtype_utils.get_compound_sub_dtypes(struct_dtype)
for key, val in sub_dtypes.items():
    print('{} : {}'.format(key, val))

################################################################################
# is_complex_dtype()
# -------------------
# Quite often, we need to treat complex datasets differently from compound datasets, which themselves need to be
# treated differently from real-valued datasets. ``is_complex_dtype()`` makes it easy to check whether a numpy or
# HDF5 dataset has a complex data type:

for dtype in [np.float32, np.float16, np.uint8, np.int16, struct_dtype, bool]:
    print('Is {} a complex dtype?: {}'.format(dtype, (usid.dtype_utils.is_complex_dtype(dtype))))

for dtype in [complex, np.complex64, np.complex128, np.complex256]:
    print('Is {} a complex dtype?: {}'.format(dtype, (usid.dtype_utils.is_complex_dtype(dtype))))

################################################################################
# Data transformation
# ====================
# Perhaps the biggest benefit of ``dtype_utils`` is the ability to flatten complex, compound datasets to real-valued
# datasets and vice versa. As mentioned in the introduction, this is particularly important when attempting to use
# machine learning algorithms on complex or compound-valued datasets. In order to enable such pipelines, we need
# functions to transform:
#
# * complex / compound valued datasets to real-valued datasets
# * real-valued datasets back to complex / compound valued datasets
#
# flatten_complex_to_real()
# --------------------------
# As the name suggests, this function stacks the imaginary values of an N-dimensional numpy / HDF5 dataset after its
# real values along the last axis. Thus, applying this function to a complex-valued dataset of shape ``(a, b, c)``
# results in a real-valued dataset of shape ``(a, b, 2 * c)``:

length = 3
complex_array = np.random.randint(-5, high=5, size=length) + 1j * np.random.randint(-5, high=5, size=length)
stacked_real_array = usid.dtype_utils.flatten_complex_to_real(complex_array)
print('Complex value: {} has shape: {}'.format(complex_array, complex_array.shape))
print('Stacked real value: {} has shape: '
      '{}'.format(stacked_real_array, stacked_real_array.shape))
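
################################################################################
# The same shape rule applies to multidimensional arrays: the last axis doubles in length. A quick sketch of the
# ``(a, b, c)`` to ``(a, b, 2 * c)`` behavior described above (the array contents are arbitrary random values):

complex_3d = np.random.rand(2, 3, 4) + 1j * np.random.rand(2, 3, 4)
flattened_3d = usid.dtype_utils.flatten_complex_to_real(complex_3d)
print('Complex array of shape {} flattened to real array of '
      'shape {}'.format(complex_3d.shape, flattened_3d.shape))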

################################################################################
# flatten_compound_to_real()
# ----------------------------
# This function flattens a compound-valued dataset of shape ``(a, b, c)`` into a real-valued dataset of shape
# ``(a, b, k * c)`` where ``k`` is the number of fields in the structured array / compound dtype. Here we will
# demonstrate this on a 1D array of 5 elements, each containing the fields 'r', 'g', and 'b':

num_elems = 5
structured_array = np.zeros(shape=num_elems, dtype=struct_dtype)
structured_array['r'] = np.random.random(size=num_elems) * 1024
structured_array['g'] = np.random.randint(0, high=1024, size=num_elems)
structured_array['b'] = np.random.random(size=num_elems) * 1024
real_array = usid.dtype_utils.flatten_compound_to_real(structured_array)

print('Structured array of shape {} with values:'.format(structured_array.shape))
print(structured_array)
print('\nThis array converted to a regular scalar matrix has shape: {} and values:'.format(real_array.shape))
print(real_array)

################################################################################
# flatten_to_real()
# -----------------
# This function checks the data type of the provided dataset and, only if necessary, uses one of the above functions
# to flatten the dataset into a real-valued matrix. By checking the data type itself, it obviates the need to
# explicitly call the aforementioned functions (which still do the actual work). Here is the function applied to the
# compound-valued numpy array again:

real_array = usid.dtype_utils.flatten_to_real(structured_array)
print('Structured array of shape {} with values:'.format(structured_array.shape))
print(structured_array)
print('\nThis array converted to a regular scalar matrix has shape: {} and values:'.format(real_array.shape))
print(real_array)
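
################################################################################
# The same call works on the complex-valued array from earlier, dispatching to ``flatten_complex_to_real()``
# internally, while a plain real-valued array is expected to pass through unchanged. A minimal sketch:

print('Complex array flattened to real: {}'.format(usid.dtype_utils.flatten_to_real(complex_array)))
print('Real array passed through as-is: {}'.format(usid.dtype_utils.flatten_to_real(np.arange(4.))))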

################################################################################
# The next three functions perform the inverse operation of taking real-valued matrices or datasets and converting them
# to complex or compound-valued datasets.
#
# stack_real_to_complex()
# ------------------------
# As the name suggests, this function collapses an N-dimensional real-valued array of shape ``(a, b, 2 * c)`` to a
# complex-valued array of shape ``(a, b, c)``. It assumes that the first ``c`` values along the last axis of the
# real-valued dataset are the real components and the remaining ``c`` values are the imaginary components of the
# complex values. This will become clearer with an example:


real_val = np.hstack([5 * np.random.rand(6),
                      7 * np.random.rand(6)])
print('Real valued dataset of shape {}:'.format(real_val.shape))
print(real_val)

comp_val = usid.dtype_utils.stack_real_to_complex(real_val)

print('\nComplex-valued array of shape: {}'.format(comp_val.shape))
print(comp_val)
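
################################################################################
# Since this function is the inverse of ``flatten_complex_to_real()``, a round trip should recover the original
# complex values. A minimal sketch using the complex array from earlier:

round_trip = usid.dtype_utils.stack_real_to_complex(
    usid.dtype_utils.flatten_complex_to_real(complex_array))
print('Round trip recovered the original values? : {}'.format(np.allclose(round_trip, complex_array)))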

################################################################################
# stack_real_to_compound()
# --------------------------
# Similar to the above function, this function shrinks the last axis of a real-valued dataset to create the desired
# compound-valued dataset. Here we will demonstrate it on the same 3-field ``(r, g, b)`` compound datatype:

num_elems = 5
real_val = np.concatenate((np.random.random(size=num_elems) * 1024,
                           np.random.randint(0, high=1024, size=num_elems),
                           np.random.random(size=num_elems) * 1024))
print('Real valued dataset of shape {}:'.format(real_val.shape))
print(real_val)

comp_val = usid.dtype_utils.stack_real_to_compound(real_val, struct_dtype)

print('\nStructured array of shape: {}'.format(comp_val.shape))
print(comp_val)
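
################################################################################
# Here too, flattening a compound-valued array and stacking the result back should recover the original values.
# A small sketch comparing field by field, since ``np.allclose`` does not accept structured arrays directly:

round_trip = usid.dtype_utils.stack_real_to_compound(
    usid.dtype_utils.flatten_compound_to_real(structured_array), struct_dtype)
for field in struct_dtype.names:
    print('Field {} recovered? : {}'.format(field, np.allclose(round_trip[field], structured_array[field])))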

################################################################################
# stack_real_to_target_dtype()
# -----------------------------
# This function performs the inverse of ``flatten_to_real()``: it stacks the provided real-valued dataset into a
# complex- or compound-valued dataset using the two functions above. Note that, unlike ``flatten_to_real()``, the
# target data type must be supplied for this to work:

print('Real valued dataset of shape {}:'.format(real_val.shape))
print(real_val)

comp_val = usid.dtype_utils.stack_real_to_target_dtype(real_val, struct_dtype)

print('\nStructured array of shape: {}'.format(comp_val.shape))
print(comp_val)
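
################################################################################
# The same function handles complex targets as well. A quick sketch reusing the stacked real array from the
# ``flatten_complex_to_real()`` example above, with ``np.complex64`` as the target dtype:

complex_again = usid.dtype_utils.stack_real_to_target_dtype(stacked_real_array, np.complex64)
print('Complex-valued array of shape {}: {}'.format(complex_again.shape, complex_again))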

################################################################################
# check_dtype()
# --------------
# ``check_dtype()`` is a master function that figures out the data type of an HDF5 dataset, the function necessary to
# transform it to a real-valued array, the expected shape of the flattened data, etc. Before we demonstrate this
# function, we need to quickly create a few example HDF5 datasets.

file_path = 'dtype_utils_example.h5'
if os.path.exists(file_path):
    os.remove(file_path)
with h5py.File(file_path, mode='w') as h5_f:
    num_elems = (5, 7)
    structured_array = np.zeros(shape=num_elems, dtype=struct_dtype)
    structured_array['r'] = 450 * np.random.random(size=num_elems)
    structured_array['g'] = np.random.randint(0, high=1024, size=num_elems)
    structured_array['b'] = 3178 * np.random.random(size=num_elems)
    _ = h5_f.create_dataset('compound', data=structured_array)
    _ = h5_f.create_dataset('real', data=450 * np.random.random(size=num_elems), dtype=np.float16)
    _ = h5_f.create_dataset('complex', data=np.random.random(size=num_elems) + 1j * np.random.random(size=num_elems),
                            dtype=np.complex64)
    h5_f.flush()

################################################################################
# Now, let's test the function on compound-, complex-, and real-valued HDF5 datasets:


def check_dataset(h5_dset):
    print('\tDataset being tested: {}'.format(h5_dset))
    func, is_complex, is_compound, n_features, type_mult = usid.dtype_utils.check_dtype(h5_dset)
    print('\tFunction to transform to real: {}'.format(func))
    print('\tis_complex? {}'.format(is_complex))
    print('\tis_compound? {}'.format(is_compound))
    print('\tShape of dataset in its current form: {}'.format(h5_dset.shape))
    print('\tAfter flattening to real, shape is expected to be: ({}, {})'.format(h5_dset.shape[0], n_features))
    print('\tByte-size of a single element in its current form: {}'.format(type_mult))


with h5py.File(file_path, mode='r') as h5_f:
    print('Checking a compound-valued dataset:')
    check_dataset(h5_f['compound'])
    print('')
    print('Checking a complex-valued dataset:')
    check_dataset(h5_f['complex'])
    print('')
    print('Checking a real-valued dataset:')
    check_dataset(h5_f['real'])
os.remove(file_path)