File: bcolz.py

package info (click to toggle)
python-petl 1.7.17-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 2,224 kB
  • sloc: python: 22,617; makefile: 109; xml: 9
file content (202 lines) | stat: -rw-r--r-- 6,080 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division
import itertools


from petl.compat import string_types, text_type
from petl.util.base import Table, iterpeek
from petl.io.numpy import construct_dtype


def frombcolz(source, expression=None, outcols=None, limit=None, skip=0):
    """Build a lazy table view over a bcolz ctable.

    `source` may be a ctable object or a string; a string is opened with
    ``bcolz.open(source, mode='r')`` each time the view is iterated.
    For example::

        >>> import petl as etl
        >>>
        >>> def example_from_bcolz():
        ...     import bcolz
        ...     cols = [
        ...         ['apples', 'oranges', 'pears'],
        ...         [1, 3, 7],
        ...         [2.5, 4.4, .1]
        ...     ]
        ...     names = ('foo', 'bar', 'baz')
        ...     ctbl = bcolz.ctable(cols, names=names)
        ...     return etl.frombcolz(ctbl)
        >>>
        >>> example_from_bcolz() # doctest: +SKIP
        +-----------+-----+-----+
        | foo       | bar | baz |
        +===========+=====+=====+
        | 'apples'  |   1 | 2.5 |
        +-----------+-----+-----+
        | 'oranges' |   3 | 4.4 |
        +-----------+-----+-----+
        | 'pears'   |   7 | 0.1 |
        +-----------+-----+-----+

    When `expression` is given, rows come from the ctable's ``where()``
    method rather than ``iter()``, so bcolz evaluates the expression and
    only matching rows are returned, e.g.::

        >>> tbl2 = etl.frombcolz(ctbl, expression='bar > 1') # doctest: +SKIP
        >>> tbl2 # doctest: +SKIP
        +-----------+-----+-----+
        | foo       | bar | baz |
        +===========+=====+=====+
        | 'oranges' |   3 | 4.4 |
        +-----------+-----+-----+
        | 'pears'   |   7 | 0.1 |
        +-----------+-----+-----+

    `outcols`, `limit` and `skip` are handed unchanged to the ctable's
    ``iter()``/``where()`` call; when `outcols` is given it also becomes
    the header of the resulting table.

    .. versionadded:: 1.1.0

    """

    # Nothing is read here; the view touches bcolz only when iterated.
    view_options = dict(
        expression=expression,
        outcols=outcols,
        limit=limit,
        skip=skip,
    )
    return BcolzView(source, **view_options)


class BcolzView(Table):
    """Table view that reads its rows from a bcolz ctable on iteration.

    The constructor only records its arguments; the ctable is resolved
    (and, for a string `source`, opened read-only) inside ``__iter__``.
    """

    def __init__(self, source, expression=None, outcols=None, limit=None,
                 skip=0):
        # a ctable, or a string to be passed to bcolz.open()
        self.source = source
        # optional expression for ctable.where()
        self.expression = expression
        # optional column selection; also used as the header
        self.outcols = outcols
        # forwarded as-is to ctable.iter() / ctable.where()
        self.limit = limit
        self.skip = skip

    def __iter__(self):
        """Yield the header tuple, then each row from the ctable."""

        # resolve the ctable: strings are opened, anything else is
        # assumed to already be a bcolz ctable
        src = self.source
        if isinstance(src, string_types):
            import bcolz
            ctbl = bcolz.open(src, mode='r')
        else:
            ctbl = src

        # work out the header, checking requested columns exist
        if self.outcols is not None:
            header = tuple(self.outcols)
            assert not any(h not in ctbl.names for h in header), \
                'invalid outcols'
        else:
            header = tuple(ctbl.names)
        yield header

        # pick the row source: filtered via where() if an expression was
        # supplied, otherwise a plain iter()
        if self.expression is not None:
            rows = ctbl.where(self.expression, outcols=self.outcols,
                              skip=self.skip, limit=self.limit)
        else:
            rows = ctbl.iter(outcols=self.outcols, skip=self.skip,
                             limit=self.limit)

        for row in rows:
            yield row


def tobcolz(table, dtype=None, sample=1000, **kwargs):
    """Create a bcolz ctable and fill it with the rows of `table`, e.g.::

        >>> import petl as etl
        >>>
        >>> def example_to_bcolz():
        ...     table = [('foo', 'bar', 'baz'),
        ...              ('apples', 1, 2.5),
        ...              ('oranges', 3, 4.4),
        ...              ('pears', 7, .1)]
        ...     return etl.tobcolz(table)
        >>>
        >>> ctbl = example_to_bcolz() # doctest: +SKIP
        >>> ctbl # doctest: +SKIP
        ctable((3,), [('foo', '<U7'), ('bar', '<i8'), ('baz', '<f8')])
          nbytes: 132; cbytes: 1023.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
        [('apples', 1, 2.5) ('oranges', 3, 4.4) ('pears', 7, 0.1)]
        >>> ctbl.names # doctest: +SKIP
        ['foo', 'bar', 'baz']
        >>> ctbl['foo'] # doctest: +SKIP
        carray((3,), <U7)
          nbytes := 84; cbytes := 511.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
          chunklen := 18724; chunksize: 524272; blocksize: 0
        ['apples' 'oranges' 'pears']

    The numpy dtype is obtained from ``construct_dtype``, which receives
    the field names, a peek at the first `sample` items of the table and
    the `dtype` argument.

    Any further keyword arguments go straight to the ctable constructor;
    ``expectedlen=1000000`` and ``mode='w'`` are supplied as defaults when
    the caller does not set them.

    The ctable is flushed before being returned.

    .. versionadded:: 1.1.0

    """

    import bcolz
    import numpy as np

    rows = iter(table)
    peek, rows = iterpeek(rows, sample)
    header = next(rows)
    # structured numpy arrays want each record as a tuple
    rows = (tuple(record) for record in rows)
    fieldnames = [text_type(name) for name in header]
    dtype = construct_dtype(fieldnames, peek, dtype)

    # start from an empty ctable with the right dtype
    kwargs.setdefault('expectedlen', 1000000)
    kwargs.setdefault('mode', 'w')
    empty = np.array([], dtype=dtype)
    ctbl = bcolz.ctable(empty, **kwargs)

    # batch size: mean of the per-column chunk lengths (floor division)
    total_chunklen = sum(ctbl.cols[name].chunklen for name in ctbl.names)
    chunklen = total_chunklen // len(ctbl.names)

    # append batch by batch; a short batch means the input is exhausted
    more = True
    while more:
        batch = np.array(list(itertools.islice(rows, chunklen)), dtype=dtype)
        ctbl.append(batch)
        more = len(batch) >= chunklen

    ctbl.flush()
    return ctbl


def appendbcolz(table, obj, check_names=True):
    """Append data into a bcolz ctable.

    The `obj` argument can be either an existing ctable or the name of a
    directory where an on-disk ctable is stored; a directory name is opened
    with ``bcolz.open(obj, mode='a')``.

    If `check_names` is true, the header of `table` (converted to text)
    must equal the ctable's column names, otherwise an ``AssertionError``
    is raised. An ``AssertionError`` is also raised if `obj` is not a
    string and lacks ``append`` or ``names`` attributes.

    Data rows are converted to tuples and appended in batches using the
    ctable's own dtype. The ctable is flushed and then returned.

    .. versionadded:: 1.1.0

    """

    import bcolz
    import numpy as np

    if isinstance(obj, string_types):
        ctbl = bcolz.open(obj, mode='a')
    else:
        # format with a 1-tuple so that a tuple `obj` is reported in the
        # assertion message instead of breaking the %-formatting itself
        assert hasattr(obj, 'append') and hasattr(obj, 'names'), \
            'expected rootdir or ctable, found %r' % (obj,)
        ctbl = obj

    # setup
    dtype = ctbl.dtype
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    # numpy is fussy about having tuples for structured dtypes, so make
    # sure every row is one (same treatment as in tobcolz)
    it = (tuple(row) for row in it)

    # check names match
    if check_names:
        assert tuple(flds) == tuple(ctbl.names), 'column names do not match'

    # fill chunk-wise; the batch size is the mean of the per-column chunk
    # lengths (floor division)
    chunklen = sum(ctbl.cols[name].chunklen
                   for name in ctbl.names) // len(ctbl.names)
    while True:
        data = list(itertools.islice(it, chunklen))
        data = np.array(data, dtype=dtype)
        ctbl.append(data)
        # a short batch means the input iterator is exhausted
        if len(data) < chunklen:
            break

    ctbl.flush()
    return ctbl