1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
|
import warnings
from unittest import mock
import numpy as np
import pandas as pd
import pytest
from numpy import empty as np_empty
from pandas.testing import assert_frame_equal
from fastparquet.dataframe import empty
DatetimeTZDtype = pd.DatetimeTZDtype
def test_empty():
    """Exercise ``empty`` with categorical columns and a mixed-dtype spec."""
    nrows = 100

    # No explicit category information: the view for 'c' is expected to be int16.
    frame, arrays = empty('category', size=nrows, cols=['c'])
    assert frame.shape == (nrows, 1)
    assert frame.dtypes.tolist() == ['category']
    assert arrays['c'].dtype == 'int16'

    # 2**20 categories requested: the view for 'c' is expected to be int32.
    frame, arrays = empty(
        'category', size=nrows, cols=['c'], cats={'c': 2**20}
    )
    assert frame.shape == (nrows, 1)
    assert frame.dtypes.tolist() == ['category']
    assert arrays['c'].dtype == 'int32'

    # Explicit labels: writing code 1 through the view must show up in the
    # frame as the second label, while the untouched row reads back as NaN.
    labels = ['one', 'two']
    frame, arrays = empty(
        'category', size=nrows, cols=['c'], cats={'c': labels}
    )
    arrays['c'][0] = 1
    assert frame.c[:2].tolist() == ['two', np.nan]

    # Comma-separated dtype string: one view per requested column.
    names = ['i4', 'i8', 'f8_1', 'f8_2', 'O']
    frame, arrays = empty('i4,i8,f8,f8,O', size=nrows, cols=names)
    assert frame.shape == (nrows, 5)
    assert len(arrays) == 5
def test_no_cats():
    """With an empty category list, every entry of the view for 'c' is -1."""
    _, arrays = empty('category', size=10, cols=['c'], cats={'c': []})
    codes = arrays["c"]
    assert (codes == -1).all()
def test_empty_tz_utc():
    """Creating a UTC tz-aware column must not raise any warning."""
    with warnings.catch_warnings():
        # Promote every warning to an exception so the test fails on any of them.
        warnings.simplefilter("error")
        utc_dtype = DatetimeTZDtype(unit="ns", tz="UTC")
        empty([utc_dtype], 10, cols=['a'], timezones={'a': 'UTC'})
# non-regression test for https://github.com/dask/fastparquet/issues/532
def np_empty_mock(shape, dtype):
    """Stand-in for ``numpy.empty`` that returns known datetimes for one shape.

    When ``shape`` equals 8784 and the dtype kind is ``"M"``, return an
    initialised array holding every hour from 2020-01-01 up to (but not
    including) 2021-01-01, cast to ``dtype``. Any other request is forwarded
    to the real ``numpy.empty`` (bound as ``np_empty``).

    The objective is to simulate a ``numpy.empty`` that returns an
    uninitialised array with arbitrary content, which can cause issues when
    ``tz_localize`` is applied with a timezone that has DST.
    """
    import numpy

    resolved = numpy.dtype(dtype)
    wants_hourly_2020 = shape == 8784 and resolved.kind == "M"
    if not wants_hourly_2020:
        return np_empty(shape, resolved)

    hours = numpy.arange(start="2020-01-01", stop="2021-01-01", dtype="M8[h]")
    return hours.astype(resolved)
@mock.patch("numpy.empty", np_empty_mock)
def test_empty_tz_nonutc():
    """Build a CET column and CET index while ``numpy.empty`` is patched.

    The patched ``numpy.empty`` is ``np_empty_mock``; the size of 8784
    matches the shape that mock special-cases for datetime dtypes.
    """
    cet_dtype = DatetimeTZDtype(unit="ns", tz="CET")
    frame, _ = empty(
        types=[cet_dtype],
        size=8784,
        cols=['a'],
        timezones={'a': 'CET', 'index': 'CET'},
        index_types=["datetime64[ns]"],
        index_names=["index"],
    )
    assert frame.index.tz.zone == "CET"
    assert frame.a.dtype.tz.zone == "CET"
# non-regression test for https://github.com/dask/fastparquet/issues/778
def test_empty_valid_timestamp():
    """A ``datetime64[ms]`` index type must produce a ``pd.DatetimeIndex``."""
    index_spec = {
        "index_types": ["datetime64[ms]"],
        "index_names": ["timestamp"],
    }
    frame, _ = empty("i4", size=100, cols=["a"], **index_spec)
    assert isinstance(frame.index, pd.DatetimeIndex)
assert isinstance(df.index, pd.DatetimeIndex)
def test_timestamps():
    """Check timezone handling of datetime columns created by ``empty``.

    Covers a single naive column, a single tz-aware column, a datetime
    column alongside an integer column, a ``timezones`` entry on a
    non-datetime column, and two datetime columns with the same, missing,
    or different timezones.

    The dtype-kind checks on ``views`` were previously bare comparisons
    whose result was discarded; they are now real assertions.
    """
    z = 'US/Eastern'

    # single column
    df, views = empty('M8', 100, cols=['t'])
    assert df.t.dt.tz is None
    assert views['t'].dtype.kind == "M"

    df, views = empty('M8', 100, cols=['t'], timezones={'t': z})
    assert df.t.dt.tz.zone == z
    assert views['t'].dtype.kind == "M"

    # one time column, one normal
    df, views = empty('M8,i', 100, cols=['t', 'i'], timezones={'t': z})
    assert df.t.dt.tz.zone == z
    assert views['t'].dtype.kind == "M"
    assert views['i'].dtype.kind == 'i'

    # no effect of timezones= on non-time column
    df, views = empty('M8,i', 100, cols=['t', 'i'],
                      timezones={'t': z, 'i': z})
    assert df.t.dt.tz.zone == z
    assert df.i.dtype.kind == 'i'
    assert views['t'].dtype.kind == "M"
    assert views['i'].dtype.kind == 'i'

    # multi-timezones
    z2 = 'US/Central'

    # both columns share one timezone
    df, views = empty('M8,M8', 100, cols=['t1', 't2'],
                      timezones={'t1': z, 't2': z})
    assert df.t1.dt.tz.zone == z
    assert df.t2.dt.tz.zone == z

    # only the first column is tz-aware
    df, views = empty('M8,M8', 100, cols=['t1', 't2'], timezones={'t1': z})
    assert df.t1.dt.tz.zone == z
    assert df.t2.dt.tz is None

    # second column in UTC; compared via str() rather than the .zone attribute
    df, views = empty('M8,M8', 100, cols=['t1', 't2'],
                      timezones={'t1': z, 't2': 'UTC'})
    assert df.t1.dt.tz.zone == z
    assert str(df.t2.dt.tz) == 'UTC'

    # two distinct non-UTC timezones
    df, views = empty('M8,M8', 100, cols=['t1', 't2'],
                      timezones={'t1': z, 't2': z2})
    assert df.t1.dt.tz.zone == z
    assert df.t2.dt.tz.zone == z2
def test_pandas_hive_serialization(tmpdir):
    """Round-trip a string column through a hive-scheme parquet dataset.

    The frame is written with explicit row-group offsets via the
    fastparquet engine, read back with the same engine, and compared with
    the original.
    """
    column = "data"
    values = ["42", "", "0", "1", "0.0"]
    expected = pd.DataFrame(columns=[column], data=[(v,) for v in values])

    target = tmpdir.join("test.par")
    expected.to_parquet(
        target,
        file_scheme="hive",
        row_group_offsets=[0, 2, 4],
        engine='fastparquet',
    )

    restored = pd.read_parquet(target, engine='fastparquet')
    assert_frame_equal(expected, restored)
|