1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
|
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division
import itertools
from petl.compat import string_types, text_type
from petl.util.base import Table, iterpeek
from petl.io.numpy import construct_dtype
def frombcolz(source, expression=None, outcols=None, limit=None, skip=0):
    """Expose a bcolz ctable as a petl table, e.g.::

        >>> import petl as etl
        >>>
        >>> def example_from_bcolz():
        ...     import bcolz
        ...     cols = [
        ...         ['apples', 'oranges', 'pears'],
        ...         [1, 3, 7],
        ...         [2.5, 4.4, .1]
        ...     ]
        ...     names = ('foo', 'bar', 'baz')
        ...     ctbl = bcolz.ctable(cols, names=names)
        ...     return etl.frombcolz(ctbl)
        >>>
        >>> example_from_bcolz()  # doctest: +SKIP
        +-----------+-----+-----+
        | foo       | bar | baz |
        +===========+=====+=====+
        | 'apples'  |   1 | 2.5 |
        +-----------+-----+-----+
        | 'oranges' |   3 | 4.4 |
        +-----------+-----+-----+
        | 'pears'   |   7 | 0.1 |
        +-----------+-----+-----+

    When an `expression` is given, bcolz evaluates it and only the rows
    that match are returned, e.g.::

        >>> tbl2 = etl.frombcolz(ctbl, expression='bar > 1')  # doctest: +SKIP
        >>> tbl2  # doctest: +SKIP
        +-----------+-----+-----+
        | foo       | bar | baz |
        +===========+=====+=====+
        | 'oranges' |   3 | 4.4 |
        +-----------+-----+-----+
        | 'pears'   |   7 | 0.1 |
        +-----------+-----+-----+

    The `source` argument is stored on the returned view unchanged, together
    with `expression`, `outcols`, `limit` and `skip`; nothing is read from
    the ctable until the view is iterated.

    .. versionadded:: 1.1.0

    """

    # collect the query options once and hand them to the view as keywords
    view_options = dict(expression=expression, outcols=outcols,
                        limit=limit, skip=skip)
    return BcolzView(source, **view_options)
class BcolzView(Table):
    """Table view whose rows are read from a bcolz ctable on iteration.

    `source` is either a string, which is opened read-only with
    ``bcolz.open`` each time the view is iterated, or an object that is
    used directly as the ctable. `expression`, `outcols`, `limit` and
    `skip` are forwarded to the ctable's ``where`` (when an expression is
    set) or ``iter`` method.

    """

    def __init__(self, source, expression=None, outcols=None, limit=None,
                 skip=0):
        self.source = source
        self.expression = expression
        self.outcols = outcols
        self.limit = limit
        self.skip = skip

    def __iter__(self):
        """Yield the header tuple, then each row produced by the ctable.

        Raises `AssertionError` if `outcols` names a column that is not in
        the ctable.

        """

        # obtain ctable
        if isinstance(self.source, string_types):
            # import lazily so bcolz is only needed when opening from disk
            import bcolz
            ctbl = bcolz.open(self.source, mode='r')
        else:
            # assume bcolz ctable
            ctbl = self.source

        # obtain header
        if self.outcols is None:
            header = tuple(ctbl.names)
        else:
            header = tuple(self.outcols)
            # Explicit raise rather than an ``assert`` statement so that the
            # check still runs under ``python -O``. AssertionError is kept
            # so callers that already catch it keep working.
            if any(h not in ctbl.names for h in header):
                raise AssertionError('invalid outcols')
        yield header

        # obtain iterator
        if self.expression is None:
            it = ctbl.iter(outcols=self.outcols, skip=self.skip,
                           limit=self.limit)
        else:
            it = ctbl.where(self.expression, outcols=self.outcols,
                            skip=self.skip, limit=self.limit)

        for row in it:
            yield row
def tobcolz(table, dtype=None, sample=1000, **kwargs):
    """Build a new bcolz ctable from the rows of a table, e.g.::

        >>> import petl as etl
        >>>
        >>> def example_to_bcolz():
        ...     table = [('foo', 'bar', 'baz'),
        ...              ('apples', 1, 2.5),
        ...              ('oranges', 3, 4.4),
        ...              ('pears', 7, .1)]
        ...     return etl.tobcolz(table)
        >>>
        >>> ctbl = example_to_bcolz()  # doctest: +SKIP
        >>> ctbl  # doctest: +SKIP
        ctable((3,), [('foo', '<U7'), ('bar', '<i8'), ('baz', '<f8')])
          nbytes: 132; cbytes: 1023.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
        [('apples', 1, 2.5) ('oranges', 3, 4.4) ('pears', 7, 0.1)]
        >>> ctbl.names  # doctest: +SKIP
        ['foo', 'bar', 'baz']
        >>> ctbl['foo']  # doctest: +SKIP
        carray((3,), <U7)
          nbytes := 84; cbytes := 511.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
          chunklen := 18724; chunksize: 524272; blocksize: 0
        ['apples' 'oranges' 'pears']

    The `dtype` argument and a sample obtained via ``iterpeek(..., sample)``
    are handed to ``construct_dtype`` to obtain the record dtype.

    Any further keyword arguments go to the ctable constructor;
    ``expectedlen=1000000`` and ``mode='w'`` are used unless overridden.

    .. versionadded:: 1.1.0

    """

    import bcolz
    import numpy as np

    # take the sample used for dtype construction, then read the header
    # off the front of the iterator that iterpeek hands back
    stream = iter(table)
    sampled, stream = iterpeek(stream, sample)
    header = next(stream)
    fieldnames = [text_type(name) for name in header]
    dtype = construct_dtype(fieldnames, sampled, dtype)

    # numpy is fussy about having tuples, so coerce every remaining row
    records = (tuple(record) for record in stream)

    # start from an empty ctable that carries the target dtype
    kwargs.setdefault('expectedlen', 1000000)
    kwargs.setdefault('mode', 'w')
    empty = np.array([], dtype=dtype)
    ctbl = bcolz.ctable(empty, **kwargs)

    # batch size is the integer mean of the per-column chunk lengths
    total_chunklen = sum(ctbl.cols[name].chunklen for name in ctbl.names)
    chunklen = total_chunklen // len(ctbl.names)

    # append batch by batch; a short (possibly empty) batch marks the end
    exhausted = False
    while not exhausted:
        batch = np.array(list(itertools.islice(records, chunklen)),
                         dtype=dtype)
        ctbl.append(batch)
        exhausted = len(batch) < chunklen

    ctbl.flush()
    return ctbl
def appendbcolz(table, obj, check_names=True):
    """Append data into a bcolz ctable. The `obj` argument can be either an
    existing ctable or the name of a directory where an on-disk ctable is
    stored.

    If `check_names` is true, the table's header (converted to text) must
    equal the ctable's column names, otherwise an `AssertionError` is
    raised. An `AssertionError` is also raised if `obj` is neither a string
    nor an object with ``append`` and ``names`` attributes.

    The ctable is flushed and returned once all rows have been appended.

    .. versionadded:: 1.1.0

    """

    import bcolz
    import numpy as np

    if isinstance(obj, string_types):
        ctbl = bcolz.open(obj, mode='a')
    else:
        # Explicit raise rather than ``assert`` so the check survives
        # ``python -O``; AssertionError is kept for existing callers.
        if not (hasattr(obj, 'append') and hasattr(obj, 'names')):
            # wrap obj in a 1-tuple so a tuple-valued obj formats correctly
            raise AssertionError(
                'expected rootdir or ctable, found %r' % (obj,)
            )
        ctbl = obj

    # setup
    dtype = ctbl.dtype
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))

    # check names match
    if check_names and tuple(flds) != tuple(ctbl.names):
        raise AssertionError('column names do not match')

    # numpy is fussy about having tuples for structured records, so coerce
    # each row (tables may yield lists); this mirrors tobcolz
    it = (tuple(row) for row in it)

    # fill chunk-wise, using the integer mean of the column chunk lengths
    chunklen = sum(ctbl.cols[name].chunklen
                   for name in ctbl.names) // len(ctbl.names)
    while True:
        data = list(itertools.islice(it, chunklen))
        data = np.array(data, dtype=dtype)
        ctbl.append(data)
        # a short (possibly empty) batch means the input is exhausted
        if len(data) < chunklen:
            break

    ctbl.flush()
    return ctbl
|