File: ext.pyx

package info (click to toggle)
python-feather-format 0.3.1%2Bdfsg1-8
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 948 kB
  • sloc: cpp: 4,558; python: 493; makefile: 8
file content (286 lines) | stat: -rw-r--r-- 8,152 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
# Copyright 2016 Feather Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# distutils: language = c++
# cython: embedsignature = True

from libcpp.string cimport string
from libcpp cimport bool as c_bool

cimport cpython
from cython.operator cimport dereference as deref

from libfeather cimport *

import pandas as pd
from feather.compat import pdapi

from numpy cimport ndarray
cimport numpy as cnp
import numpy as np

from feather.compat import frombytes, tobytes, encode_file_path
import six

# Initialize the NumPy C API at module import, before any of its C-level
# functions are used by this extension.
cnp.import_array()

class FeatherError(Exception):
    """Error raised when a Feather library call reports a non-OK status."""


# C++ helpers declared in interop.h (namespace feather::py).  They move data
# between Python objects and Feather ``PrimitiveArray`` structs.
cdef extern from "interop.h" namespace "feather::py":
    # Fill ``out`` from the Python object ``ao``; the outcome is reported
    # through the returned Status.
    Status pandas_to_primitive(object ao, PrimitiveArray* out)
    # Same as above with an additional ``mask`` object
    # (presumably a boolean array marking nulls -- confirm in interop.h).
    Status pandas_masked_to_primitive(object ao, object mask,
                                      PrimitiveArray* out)
    # Return a Python object describing the nulls of ``arr``; the readers in
    # this module use it as an index into the value array.
    object get_null_mask(const PrimitiveArray& arr)
    # Convert ``arr`` to a Python object.  The "raw" variant is used by the
    # readers below together with get_null_mask, so it presumably leaves
    # null handling to the caller -- confirm in interop.h.
    object raw_primitive_to_pandas(const PrimitiveArray& arr)
    object primitive_to_pandas(const PrimitiveArray& arr)
    # Hand a Python object to the C++ layer; called once at import with np.nan.
    void set_numpy_nan(object nan)


cdef check_status(const Status& status):
    """Raise ``FeatherError`` carrying the status message unless it is OK."""
    cdef string c_message

    if not status.ok():
        c_message = status.ToString()
        raise FeatherError(frombytes(c_message))

cdef update_mask_with_datatype_nulls(mask, values):
    """Combine a caller-supplied null mask with the nulls found in ``values``.

    Parameters
    ----------
    mask : boolean array or None
        Null mask supplied by the caller, if any.
    values : array-like
        Column values; entries for which ``pd.isnull`` is true are nulls.

    Returns
    -------
    boolean array or None
        ``mask`` unchanged when ``values`` contains no nulls (this may be
        ``None``), the nulls of ``values`` when no mask was given, and the
        element-wise OR of both otherwise.
    """
    datatype_nulls = pd.isnull(values)
    if not datatype_nulls.any():
        # Nothing to add: keep the caller's mask instead of dropping it.
        return mask
    if mask is None:
        return datatype_nulls
    return mask | datatype_nulls

# Pass NumPy's NaN object to the C++ interop layer once, at import time
# (presumably kept there for use when producing missing values -- confirm
# in interop.h).
set_numpy_nan(np.nan)

cdef class FeatherWriter:
    """Write columns to a Feather file through the C++ ``TableWriter``.

    Columns are appended one at a time with ``write_array``; each column
    must have the same length as the first one written.  ``close`` records
    the row count on the writer and finalizes it.
    """
    cdef:
        unique_ptr[TableWriter] writer
        # Length shared by all columns; -1 until the first column is written.
        int64_t num_rows

    def __cinit__(self, object name):
        cdef string c_name = encode_file_path(name)

        check_status(TableWriter.OpenFile(c_name, &self.writer))
        self.num_rows = -1

    def close(self):
        """Set the row count on the writer and finalize it."""
        cdef TableWriter* out

        # No column was written: record zero rows.
        if self.num_rows < 0:
            self.num_rows = 0

        out = self.writer.get()
        out.SetNumRows(self.num_rows)
        check_status(out.Finalize())

    def write_array(self, object name, object col, object mask=None):
        """Append ``col`` under ``name``, dispatching on its dtype.

        Raises ``ValueError`` if ``col`` has a different length than the
        columns written before it.  ``mask`` is an optional null mask.
        """
        cdef Py_ssize_t n = len(col)

        if self.num_rows < 0:
            # First column fixes the row count for the whole table.
            self.num_rows = n
        elif n != self.num_rows:
            raise ValueError('prior column had a different number of rows')

        dtype = col.dtype
        if pdapi.is_categorical_dtype(dtype):
            self.write_category(name, col, mask)
        elif pdapi.is_datetime64_any_dtype(dtype):
            self.write_timestamp(name, col, mask)
        else:
            self.write_primitive(name, col, mask)

    cdef write_category(self, name, col, mask):
        # Writes the category codes as the values array and the categories
        # as the levels array, together with the ``ordered`` flag.
        cdef:
            string c_name = tobytes(name)
            PrimitiveArray values
            PrimitiveArray levels

        categorical = _unbox_series(col)
        null_mask = update_mask_with_datatype_nulls(mask, categorical)

        self.write_ndarray(categorical.codes, null_mask, &values)

        level_array = np.asarray(categorical.categories)
        check_status(pandas_to_primitive(level_array, &levels))

        check_status(self.writer.get().AppendCategory(
            c_name, values, levels, categorical.ordered))

    cdef write_primitive(self, name, col, mask):
        # Plain column: values are converted as-is with the given mask.
        cdef:
            string c_name = tobytes(name)
            PrimitiveArray values

        self.write_ndarray(_unbox_series(col), mask, &values)
        check_status(self.writer.get().AppendPlain(c_name, values))

    cdef write_timestamp(self, name, col, mask):
        # Timestamps are written as their 'i8' view with nanosecond unit
        # metadata and the dtype's timezone name (empty when there is none).
        cdef:
            string c_name = tobytes(name)
            PrimitiveArray values
            TimestampMetadata metadata

        stamps = _unbox_series(col)
        null_mask = update_mask_with_datatype_nulls(mask, stamps)

        self.write_ndarray(stamps.view('i8'), null_mask, &values)

        metadata.unit = TimeUnit_NANOSECOND

        tz = getattr(col.dtype, 'tz', None)
        if tz is not None:
            # NOTE(review): relies on the tz object exposing ``.zone``;
            # confirm behaviour for tzinfo implementations without it.
            metadata.timezone = tobytes(tz.zone)
        else:
            metadata.timezone = b''

        check_status(self.writer.get().AppendTimestamp(c_name, values,
                                                       metadata))

    cdef int write_ndarray(self, values, mask, PrimitiveArray* out) except -1:
        # Convert ``values`` into ``out``, using the masked conversion only
        # when a mask is present.  Returns 0; errors propagate as exceptions.
        if mask is not None:
            check_status(pandas_masked_to_primitive(values, mask, out))
        else:
            check_status(pandas_to_primitive(values, out))
        return 0


cdef _unbox_series(col):
    """Return ``col.values`` for a pandas Series, otherwise ``col`` itself."""
    return col.values if isinstance(col, pd.Series) else col


cdef class Column:
    """Handle to one column of a Feather file opened by ``FeatherReader``.

    Instances are normally obtained from ``FeatherReader.get_column``.  A
    ``Column`` constructed directly from Python has no metadata attached;
    its accessors raise ``FeatherError`` rather than dereferencing a NULL
    pointer.
    """
    cdef:
        # Owns the column metadata; ``mp`` is the raw pointer into it.
        shared_ptr[CColumnMetadata] metadata
        CColumnMetadata* mp
        # Reader this column was loaded from, and its index in that reader.
        FeatherReader parent
        int column_index

    def __cinit__(self):
        self.mp = NULL

    cdef init(self, FeatherReader parent, int i):
        # Bind this object to column ``i`` of ``parent`` and load its
        # metadata.  Raises FeatherError if the metadata cannot be read.
        cdef TableReader* tbl = parent.reader.get()

        self.parent = parent
        self.column_index = i

        check_status(tbl.GetColumnMetadata(i, &self.metadata))
        # Assigned last, so a non-NULL ``mp`` implies ``parent`` is set too.
        self.mp = self.metadata.get()

    cdef int _check_initialized(self) except -1:
        # Raise FeatherError when ``init`` has not completed successfully.
        if self.mp == NULL:
            raise FeatherError('Column has not been initialized')
        return 0

    property name:

        def __get__(self):
            self._check_initialized()
            return frombytes(self.mp.name())

    property type:

        def __get__(self):
            self._check_initialized()
            return self.mp.type()

    property user_metadata:

        def __get__(self):
            self._check_initialized()
            return frombytes(self.mp.user_metadata())

    property null_count:

        def __get__(self):
            cdef unique_ptr[CColumn] col

            self._check_initialized()
            check_status(self.parent.reader.get()
                         .GetColumn(self.column_index, &col))

            return col.get().values().null_count

    def read(self):
        """Read the column and convert it to a pandas object.

        Primitive, category and timestamp columns are supported; any other
        column type raises ``NotImplementedError``.  Raises ``FeatherError``
        if the column is uninitialized or cannot be read.
        """
        cdef:
            unique_ptr[CColumn] col
            CColumn* cp

        self._check_initialized()
        check_status(self.parent.reader.get()
                     .GetColumn(self.column_index, &col))

        cp = col.get()

        if cp.type() == ColumnType_PRIMITIVE:
            values = primitive_to_pandas(cp.values())
        elif cp.type() == ColumnType_CATEGORY:
            values = category_to_pandas(cp)
        elif cp.type() == ColumnType_TIMESTAMP:
            values = timestamp_to_pandas(cp)
        else:
            raise NotImplementedError(cp.type())

        return values


cdef class FeatherReader:
    """Read access to a Feather file through the C++ ``TableReader``.

    The file is opened on construction; a failure to open it raises
    ``FeatherError``.
    """
    cdef:
        unique_ptr[TableReader] reader

    def __cinit__(self, object name):
        cdef string c_name = encode_file_path(name)

        check_status(TableReader.OpenFile(c_name, &self.reader))

    property num_rows:

        def __get__(self):
            # Row count as reported by the underlying reader.
            return self.reader.get().num_rows()

    property num_columns:

        def __get__(self):
            # Column count as reported by the underlying reader.
            return self.reader.get().num_columns()

    def get_column(self, int i):
        """Return a ``Column`` for index ``i``.

        Raises ``IndexError`` when ``i`` is negative or not less than
        ``num_columns``.
        """
        cdef Column col

        if not (0 <= i < self.num_columns):
            raise IndexError(i)

        col = Column()
        col.init(self, i)
        return col


cdef category_to_pandas(CColumn* col):
    """Convert a category column into a ``pandas.Categorical``.

    The column's values are used as category codes and its levels as the
    categories.  Positions flagged by the null mask get code -1, which is
    the code ``Categorical.from_codes`` treats as missing.
    """
    cdef CategoryColumn* cat = <CategoryColumn*>(col)

    codes = raw_primitive_to_pandas(cat.values())
    mask = get_null_mask(cat.values())
    codes[mask] = -1
    levels = primitive_to_pandas(cat.levels())

    # from_codes is the supported way to build a Categorical from codes;
    # the constructor's ``fastpath`` keyword is deprecated.
    # NOTE(review): the ``ordered`` flag passed to AppendCategory on write is
    # not restored here -- confirm whether CategoryColumn exposes it.
    return pd.Categorical.from_codes(codes, categories=levels)

cdef timestamp_to_pandas(CColumn* col):
    """Convert a timestamp column into a ``pandas.Series`` of datetimes.

    With a non-empty timezone on the column, the raw values are localized
    to UTC and converted to that zone; otherwise they become a naive
    ``M8[ns]`` series.  Positions flagged by the null mask are set to
    ``pd.NaT``.
    """
    cdef TimestampColumn* ts = <TimestampColumn*>(col)

    raw = raw_primitive_to_pandas(ts.values())
    null_mask = get_null_mask(ts.values())
    zone = frombytes(ts.timezone())

    if not zone:
        series = pd.Series(raw, dtype='M8[ns]')
    else:
        utc_index = pd.DatetimeIndex(raw).tz_localize('utc')
        series = pd.Series(utc_index.tz_convert(zone))

    series.iloc[null_mask] = pd.NaT
    return series