1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
|
from math import log
import numpy as np
import pandas as pd
from ..lowcardinalitycolumn import LowCardinalityColumn
from ...reader import read_binary_uint64
from ...writer import write_binary_int64
from .intcolumn import (
NumpyUInt8Column, NumpyUInt16Column, NumpyUInt32Column, NumpyUInt64Column
)
class NumpyLowCardinalityColumn(LowCardinalityColumn):
    """LowCardinality column that exchanges data as pandas Categoricals.

    Data is written as a dictionary of distinct values (serialized by
    ``nested_column``) followed by an array of unsigned integer keys that
    point into that dictionary. Reading reverses the process and returns
    a ``pandas.Categorical``.
    """

    # Maps the key-type code (stored in the low bits of the serialization
    # type) to the column class used to (de)serialize the keys.
    int_types = {
        0: NumpyUInt8Column,
        1: NumpyUInt16Column,
        2: NumpyUInt32Column,
        3: NumpyUInt64Column
    }

    def __init__(self, nested_column, **kwargs):
        """Create the column; all arguments are forwarded to the base class.

        :param nested_column: column used to (de)serialize the dictionary
            of distinct values.
        :param kwargs: extra options passed through unchanged.
        """
        super(NumpyLowCardinalityColumn, self).__init__(nested_column,
                                                        **kwargs)

    def _write_data(self, items, buf):
        """Serialize ``items`` into ``buf`` as a dictionary plus keys.

        :param items: values to write. Must expose ``dtype`` (it is used to
            convert the dictionary back to a numpy array).
        :param buf: output buffer handed to the binary writers.
        :return: ``None``. Nothing is written for empty ``items``.
        """
        # Do not write anything for empty column.
        # May happen while writing empty arrays.
        if not len(items):
            return

        nested_column = self.nested_column
        nullable = nested_column.nullable

        # Replace NaNs with defaults if not nullable.
        if isinstance(items, np.ndarray) and not nullable:
            nulls = pd.isnull(items)
            items = np.where(nulls, nested_column.null_value, items)

        c = pd.Categorical(items)

        # Key width code is derived from the number of items. Clamp it to
        # the widest available key type so that very large inputs
        # (>= 2**32 items) do not produce a code missing from int_types.
        int_type = min(int(log(len(c.codes), 2) / 8), max(self.int_types))
        int_column = self.int_types[int_type]()

        serialization_type = self.serialization_type | int_type

        index = c.categories
        keys = c.codes

        if nullable:
            # First element represents NULL if column is nullable.
            # pandas uses code -1 for missing values, so shifting all codes
            # by one makes missing values point at that first element.
            index = index.insert(0, nested_column.null_value)
            keys = keys + 1

        write_binary_int64(serialization_type, buf)
        write_binary_int64(len(index), buf)

        # Prevent null map writing for the dictionary: temporarily reset
        # the nested column nullable flag and always restore it, so this
        # column instance stays usable for subsequent writes.
        nested_column.nullable = False
        try:
            nested_column.write_data(index.to_numpy(items.dtype), buf)
        finally:
            nested_column.nullable = nullable

        write_binary_int64(len(items), buf)
        int_column.write_items(keys, buf)

    def _read_data(self, n_items, buf, nulls_map=None):
        """Read ``n_items`` values from ``buf``.

        :param n_items: number of keys (rows) to read.
        :param buf: input buffer handed to the binary readers.
        :param nulls_map: accepted for interface compatibility; not used
            here.
        :return: an empty tuple when ``n_items`` is zero, otherwise a
            ``pandas.Categorical`` built from the keys and the dictionary.
        """
        if not n_items:
            return tuple()

        serialization_type = read_binary_uint64(buf)

        # Low four bits contain info about key type.
        key_type = serialization_type & 0xf
        keys_column = self.int_types[key_type]()

        index_size = read_binary_uint64(buf)

        # Prevent null map reading for the dictionary: temporarily reset
        # the nested column nullable flag and always restore it, so this
        # column instance stays usable for subsequent reads.
        nested_column = self.nested_column
        nullable = nested_column.nullable
        nested_column.nullable = False
        try:
            index = nested_column.read_data(index_size, buf)
        finally:
            nested_column.nullable = nullable

        read_binary_uint64(buf)  # number of keys
        keys = keys_column.read_data(n_items, buf)

        if nullable:
            # Shift all codes by one ("No value" code is -1 for pandas
            # categorical) and drop corresponding first index
            # this is analog of original operation:
            # index = (None, ) + index[1:]
            keys = np.array(keys, dtype='int64')  # deal with possible overflow
            keys = keys - 1
            index = index[1:]

        return pd.Categorical.from_codes(keys, index)
def create_numpy_low_cardinality_column(spec, column_by_spec_getter,
                                        column_options):
    """Build a NumpyLowCardinalityColumn from a column type spec.

    :param spec: type spec string; its first 15 characters and its last
        character are stripped to obtain the nested type spec (this matches
        the length of a ``LowCardinality(`` prefix and a closing ``)``).
    :param column_by_spec_getter: callable that returns a column object
        for the nested type spec.
    :param column_options: mapping of keyword options forwarded to the
        column constructor.
    :return: a ``NumpyLowCardinalityColumn`` wrapping the nested column.
    """
    prefix_length = 15
    nested_spec = spec[prefix_length:-1]
    nested_column = column_by_spec_getter(nested_spec)
    return NumpyLowCardinalityColumn(nested_column, **column_options)
|