#!/usr/bin/python3
r'''A simple parser for vnlog data

Synopsis:

    import vnlog
    for d in vnlog.vnlog(f):
        print(d['time'],d['height'])

Vnlog is simple, and you don't NEED a parser to read it, but this library makes
it a bit nicer.

This module provides three different ways to parse vnlog

1. slurp the whole thing into a numpy array: the slurp() function. Basic usage:

       import vnlog
       arr,list_keys,dict_key_index = \
            vnlog.slurp(filename_or_fileobject)

   This parses out the legend, and then calls numpy.loadtxt(). Null data values
   ('-') are not supported at this time. A structured dtype can be passed-in to
   read non-numerical data. See the docstring for vnlog.slurp() for details

2. Iterate through the records: vnlog class, used as an iterator. Basic usage:

       import vnlog
       for d in vnlog.vnlog(filename_or_fileobject):
           print(d['time'],d['height'])

   Null data values are represented as None

3. Parse incoming lines individually: vnlog class, using the parse() method.
   Basic usage:

       import vnlog
       parser = vnlog.vnlog()
       for l in file:
           parser.parse(l)
           d = parser.values_dict()
           if not d:
               continue
           print(d['time'],d['height'])

Most of the time you'd use options 1 or 2 above. Option 3 is the most general,
but also the most verbose

'''

from __future__ import print_function
import re

class vnlog:

    r'''Class to facilitate vnlog parsing

    This class provides two different ways to parse vnlog

    1. Iterate through the records: vnlog class, used as an iterator. Basic
       usage:

           import vnlog
           for d in vnlog.vnlog(filename_or_fileobject):
               print(d['time'],d['height'])

       Null data values are represented as None

    2. Parse incoming lines individually: vnlog class, using the parse() method.
       Basic usage:

           import vnlog
           parser = vnlog.vnlog()
           for l in file:
               parser.parse(l)
               d = parser.values_dict()
               if not d:
                   continue
               print(d['time'],d['height'])
    '''

    def __init__(self, f = None):
        r'''Initialize the vnlog parser

        If using this class as an iterator, you MUST pass a filename or file
        object into this constructor
        '''

        self._keys        = None
        self._values      = None
        self._values_dict = None

        if f is None or type(f) is not str:
            self.f = f
            self.f_need_close = False
        else:
            self.f = open(f, 'r')
            self.f_need_close = True

    def __del__(self):
        try:
            if self.f_need_close:
                self.f.close()
        except:
            pass

    def parse(self, l):
        r'''Parse a new line of data

        The user only needs to call this if they're not using this class as an
        iterator. When this function returns, the keys(), values() and
        values_dict() functions return the data from this line. Before the
        legend has been parsed, all of these return None. After the legend has
        been parsed, keys() returns non-None. When a comment is encountered,
        values() and values_dict() return None.
        '''

        # I reset the data first
        self._values      = None
        self._values_dict = None

        if not hasattr(self, 're_hard_comment'):
            self.re_hard_comment = re.compile(r'^\s*(?:#[#!]|#\s*$|$)')
            self.re_soft_comment = re.compile(r'^\s*#\s*(.*?)\s*$')

        if self.re_hard_comment.match(l):
            # empty line or hard comment.
            # no data, no error
            return True

        m = self.re_soft_comment.match(l)
        if m:
            if self._keys is not None:
                # already have legend, so this is just a comment
                # no data, no error
                return True

            # got legend.
            # no data, no error
            self._keys = m.group(1).split()
            return True

        if self._keys is None:
            # Not a comment, not an empty line, but no legend yet. Barf
            raise Exception("Got dataline before legend")

        # strip trailing comments
        i = l.find('#')
        if i >= 0:
            l = l[:i]

        # strip leading, trailing whitespace
        l = l.strip()
        if len(l) == 0:
            return True

        self._values = [ None if x == '-' else x for x in l.split()]
        if len(self._values) != len(self._keys):
            raise Exception('Legend line "{}" has {} elements, but data line "{}" has {} elements. Counts must match!'. \
                            format( "# " + ' '.join(self._keys),
                                    len(self._keys),
                                    l,
                                    len(self._values)))
        return True

    def keys(self):
        r'''Returns the keys of the so-far-parsed data

        Returns None if we haven't seen the legend line yet'''
        return self._keys

    def values(self):
        r'''Returns the values list of the last-parsed line

        Returns None if the last line was a comment. Null fields ('-') are
        represented as None
        '''
        return self._values

    def values_dict(self):
        r'''Returns the values dict of the last-parsed line

        This dict maps field names to values. Returns None if the last line was
        a comment. Null fields ('-') are represented as None.
        '''

        # internally:
        #   self._values_dict == None: not yet computed
        #   self._values_dict == {}:   computed, but no data
        # returning: None if computed, but no data
        if self._values_dict is not None:
            if len(self._values_dict) == 0:
                return None
            return self._values_dict

        self._values_dict = {}
        if self._keys and self._values:
            for i in range(len(self._keys)):
                self._values_dict[self._keys[i]] = self._values[i]
        return self._values_dict

    def __iter__(self):
        if self.f is None:
            raise Exception("Cannot iterate since this vnlog instance was not given a log to iterate on")
        return self

    def __next__(self):
        for l in self.f:
            self.parse(l)
            if self._values is None:
                continue
            return self.values_dict()
        raise StopIteration

    # to support python2 and python3
    next = __next__


def _slurp(f,
           *,
           dtype = None):
    r'''Reads a whole vnlog into memory

    This is an internal function. The argument is a file object, not a filename.
    See the docs for slurp() for details
    '''

    import numpy as np

    # Expands the fields in a dtype into a flat list of names. For vnlog
    # purposes this doesn't support multiple levels of fields and it doesn't
    # support unnamed fields. It DOES support (require!) compound elements with
    # whitespace-separated field names, such as 'x y z' for a shape-(3,) field.
    #
    # This function is an analogue of field_type_grow_recursive() in
    # https://github.com/numpy/numpy/blob/9815c16f449e12915ef35a8255329ba26dacd5c0/numpy/core/src/multiarray/textreading/field_types.c#L95
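    #
    # For illustration (this example is not from the original comments): a
    # dtype such as
    #   np.dtype([ ('image', 'U16'), ('x y z', int, (3,)), ('temperature', float) ])
    # is flattened into the name list ['image', 'x', 'y', 'z', 'temperature'],
    # which is later matched, name by name, against the vnlog legend columns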
    def field_names_in_dtype(dtype,
                             split_name = None,
                             name       = None):
        if dtype.subdtype is not None:
            if split_name is None:
                raise Exception("only structured dtypes with named fields are supported")
            size = np.prod(dtype.shape)
            if size != len(split_name):
                raise Exception(f'Field "{name}" has {len(split_name)} elements, but the dtype has it associated with a field of shape {dtype.shape} with {size} elements. The sizes MUST match')
            yield from split_name
            return

        if dtype.fields is not None:
            if split_name is not None:
                raise Exception("structured dtype with nested fields unsupported")
            for name1 in dtype.names:
                tup = dtype.fields[name1]
                field_descr = tup[0]
                yield from field_names_in_dtype(field_descr,
                                                name       = name1,
                                                split_name = name1.split(),)
            return

        if split_name is None:
            raise Exception("structured dtype with unnamed fields unsupported")
        if len(split_name) != 1:
            raise Exception(f"Field '{name}' is a scalar, so it may not contain whitespace in its name")
        yield split_name[0]

    parser = vnlog()
    keys   = None
    for line in f:
        parser.parse(line)
        keys = parser.keys()
        if keys is not None:
            break
    else:
        raise Exception("vnlog parser did not find a legend line")

    dict_key_index = {}
    for i in range(len(keys)):
        dict_key_index[keys[i]] = i

    if dtype is None or \
       not isinstance(dtype, np.dtype) or \
       ( dtype.fields   is None and \
         dtype.subdtype is None ):
        return \
            ( np.loadtxt(f, ndmin=2, dtype=dtype),
              keys,
              dict_key_index )

    # We have a dtype. We parse out the field names from it, map those to
    # columns in the input (from the vnl legend that we just parsed), and
    # load everything with np.loadtxt()
    names_dtype = list(field_names_in_dtype(dtype))

    # We have input fields in the vnl represented in:
    # - keys
    # - dict_key_index
    #
    # We have output fields represented in:
    # - names_dtype
    #
    # Each element of 'names_dtype' corresponds to one output field, in order.
    # These are the names of the corresponding input fields, so they must match
    # the input names given in 'keys'.
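    #
    # For example (illustrative only): with keys = ['image','x','y','z','temperature']
    # and names_dtype = ['x','y','z'], the usecols computed below would be
    # [1, 2, 3], so np.loadtxt() reads only those three columns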
    Ncols_output = len(names_dtype)
    usecols      = [None] * Ncols_output

    for i_out in range(Ncols_output):
        name_dtype = names_dtype[i_out]
        try:
            i_in = dict_key_index[name_dtype]
        except:
            raise Exception(f"The given dtype contains field {name_dtype=} but this doesn't appear in the vnlog columns {keys=}")
        usecols[i_out] = i_in

    return \
        np.loadtxt(f,
                   ndmin   = 1,
                   dtype   = dtype,
                   usecols = usecols)


def slurp(f,
          *,
          dtype = None):
    r'''Reads a whole vnlog into memory

    SYNOPSIS

        import vnlog

        ### Read numerical data into arr
        arr,list_keys,dict_key_index = \
             vnlog.slurp(filename_or_fileobject)

        ### Read disparate, partly-numerical data using a structured dtype
        # Let's say "data.vnl" contains:
        #    image      x y z temperature
        #    image1.png 1 2 5 34
        #    image2.png 3 4 1 35
        dtype = np.dtype([ ('image',       'U16'),
                           ('x y z',       int, (3,)),
                           ('temperature', float), ])
        arr = vnlog.slurp("data.vnl", dtype=dtype)

        print(arr['image'])
        ---> array(['image1.png', 'image2.png'], dtype='<U16')

        print(arr['x y z'])
        ---> array([[1, 2, 5],
                    [3, 4, 1]])

        print(arr['temperature'])
        ---> array([34., 35.])

    This function is primarily a wrapper around numpy.loadtxt(), which does most
    of the work. Null data values ('-') are not supported at this time.

    A dtype can be given in a keyword argument. If this is a base type (something
    like 'float' or 'np.int8'), the returned array will be composed entirely of
    values of that type.

    If this is a structured dtype (like the one in the SYNOPSIS above), a
    structured array will be returned. Some notes about this behavior:

    - The given structured dtype defines both how to organize the data, and
      which data to extract. So it can be used to read in only a subset of the
      available columns. In the sample above I could have omitted the
      'temperature' column, for instance

    - Sub-arrays are allowed. In the example I could say either

          dtype = np.dtype([ ('image',       'U16'),
                             ('x y z',       int, (3,)),
                             ('temperature', float), ])

      or

          dtype = np.dtype([ ('image',       'U16'),
                             ('x',           int),
                             ('y',           int),
                             ('z',           int),
                             ('temperature', float), ])

      The latter would read x, y, z into separate, individual arrays. Sometimes
      we want this, sometimes not.

    - Nested structured dtypes are not allowed. Fields inside other fields are
      not supported, since it's not clear how to map that to a flat vnlog legend

    - If a structured dtype is given, we return the array only, since the field
      names are already available in the dtype

    ARGUMENTS

    - f: a filename or a readable Python "file" object. We read this until the
      end

    - dtype: an optional dtype for the output array. May be a structured dtype

    RETURN VALUE

    - If no dtype is given or a simple dtype is given:

      Returns a tuple (arr, list_keys, dict_key_index)

    - If a structured dtype is given:

      Returns arr

    '''

    if type(f) is str:
        with open(f, 'r') as fh:
            return _slurp(fh, dtype=dtype)
    else:
        return _slurp(f, dtype=dtype)


# Basic usage. More examples in test_python_parser.py
if __name__ == '__main__':

    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO

    f = StringIO('''#! zxcv
# time height
## qewr
1 2
3 4
# - 10
- 5
6 -
- -
7 8
''')

    for d in vnlog(f):
        print(d['time'],d['height'])
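
    # A hedged sketch, not part of the original demo: the same kind of data
    # read in one shot with slurp(). Assumes a reasonably recent numpy is
    # installed; the input avoids '-' null values, which slurp() does not
    # support.
    import numpy as np

    f2 = StringIO('''# time height
1 2
3 4
7 8
''')
    arr, list_keys, dict_key_index = slurp(f2)
    print(list_keys)                          # ['time', 'height']
    print(arr[:, dict_key_index['height']])   # [2. 4. 8.]

    # Structured-dtype sketch: mixed string/numeric columns. The compound
    # field name 'x y z' maps three vnlog columns to one shape-(3,) field
    f3 = StringIO('''# image x y z temperature
image1.png 1 2 5 34
image2.png 3 4 1 35
''')
    dtype = np.dtype([ ('image',       'U16'),
                       ('x y z',       int, (3,)),
                       ('temperature', float), ])
    arr3 = slurp(f3, dtype=dtype)
    print(arr3['image'])    # ['image1.png' 'image2.png']
    print(arr3['x y z'])    # [[1 2 5]
                            #  [3 4 1]]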