File: test_dataframe.py

package info (click to toggle)
python-fastparquet 2024.2.0-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 120,180 kB
  • sloc: python: 8,181; makefile: 187
file content (141 lines) | stat: -rw-r--r-- 4,701 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import warnings
from unittest import mock

import numpy as np
import pandas as pd
import pytest
from numpy import empty as np_empty
from pandas.testing import assert_frame_equal

from fastparquet.dataframe import empty

DatetimeTZDtype = pd.DatetimeTZDtype


def test_empty():
    """Exercise ``empty`` for categorical columns and a mixed-dtype spec."""
    n_rows = 100

    # Categorical column with no ``cats`` given: the codes view is int16.
    frame, col_views = empty("category", size=n_rows, cols=["c"])
    assert frame.shape == (n_rows, 1)
    assert frame.dtypes.tolist() == ["category"]
    assert col_views["c"].dtype == "int16"

    # An integer ``cats`` entry of 2**20: the codes view becomes int32.
    frame, col_views = empty(
        "category", size=n_rows, cols=["c"], cats={"c": 2**20}
    )
    assert frame.shape == (n_rows, 1)
    assert frame.dtypes.tolist() == ["category"]
    assert col_views["c"].dtype == "int32"

    # Explicit labels: writing code 1 through the view shows up in the frame
    # as the second label, while the untouched second row reads as NaN.
    labels = ["one", "two"]
    frame, col_views = empty(
        "category", size=n_rows, cols=["c"], cats={"c": labels}
    )
    col_views["c"][0] = 1
    assert frame.c[:2].tolist() == ["two", np.nan]

    # Comma-separated dtype string: one column and one view per entry.
    frame, col_views = empty(
        "i4,i8,f8,f8,O",
        size=n_rows,
        cols=["i4", "i8", "f8_1", "f8_2", "O"],
    )
    assert frame.shape == (n_rows, 5)
    assert len(col_views) == 5


def test_no_cats():
    """With an empty category list, every code in the view must equal -1."""
    _, col_views = empty("category", size=10, cols=["c"], cats={"c": []})
    codes = col_views["c"]
    assert (codes == -1).all()


def test_empty_tz_utc():
    """Creating a UTC-zoned column via ``empty`` must not emit any warning."""
    with warnings.catch_warnings():
        # Escalate every warning to an exception so the test fails on one.
        warnings.simplefilter("error")
        utc_dtype = DatetimeTZDtype(unit="ns", tz="UTC")
        empty([utc_dtype], 10, cols=["a"], timezones={"a": "UTC"})


# non-regression test for https://github.com/dask/fastparquet/issues/532
def np_empty_mock(shape, dtype=float, **kwargs):
    """Stand-in for ``numpy.empty`` that returns known datetime content.

    When ``shape`` equals 8784 (the number of hours in the leap year 2020)
    and ``dtype`` is a datetime64 type, the result holds every hour from
    2020-01-01 up to, but excluding, 2021-01-01, cast to ``dtype``.  Any
    other request is forwarded to the real ``numpy.empty``.

    The objective is to simulate a ``numpy.empty`` that returns an
    uninitialised array with arbitrary content that can cause issues when
    ``tz_localize`` is applied with a timezone with DST.

    Parameters
    ----------
    shape : int or tuple of int
        Requested shape.  Only the bare integer 8784 selects the pre-filled
        array; a tuple such as ``(8784,)`` does not compare equal to it and
        falls through to the real ``numpy.empty``.
    dtype : data-type, optional
        Requested dtype.  Defaults to ``float``, mirroring ``numpy.empty``,
        so callers that omit it keep working while the patch is active.
    **kwargs
        Extra keyword arguments (for example ``order``), passed unchanged to
        the real ``numpy.empty`` on the fall-through path and ignored on the
        pre-filled path.

    Returns
    -------
    numpy.ndarray
        Either the hourly 2020 timestamps or a genuinely empty array.
    """
    import numpy
    dtype = numpy.dtype(dtype)
    if shape == 8784 and dtype.kind == "M":
        # Build at hourly resolution first, then cast to the requested unit.
        hourly = numpy.arange(start="2020-01-01", stop="2021-01-01", dtype="M8[h]")
        return hourly.astype(dtype)
    # np_empty is the original numpy.empty captured at import time, so this
    # does not recurse into the mock even while numpy.empty is patched.
    return np_empty(shape, dtype, **kwargs)


@mock.patch("numpy.empty", np_empty_mock)
def test_empty_tz_nonutc():
    """Build a CET-zoned column and index while ``numpy.empty`` is patched.

    The size of 8784 matches the value ``np_empty_mock`` keys on; the test
    then checks that both the index and column ``a`` carry the CET zone.
    """
    n_hours = 8784
    cet_dtype = DatetimeTZDtype(unit="ns", tz="CET")
    frame, _ = empty(
        types=[cet_dtype],
        size=n_hours,
        cols=["a"],
        timezones={"a": "CET", "index": "CET"},
        index_types=["datetime64[ns]"],
        index_names=["index"],
    )
    assert frame.index.tz.zone == "CET"
    assert frame.a.dtype.tz.zone == "CET"


# non-regression test for https://github.com/dask/fastparquet/issues/778
def test_empty_valid_timestamp():
    """A millisecond-resolution index type must yield a ``DatetimeIndex``."""
    index_spec = {
        "index_types": ["datetime64[ms]"],
        "index_names": ["timestamp"],
    }
    frame, _ = empty("i4", size=100, cols=["a"], **index_spec)
    assert isinstance(frame.index, pd.DatetimeIndex)


def test_timestamps():
    """Check timezone handling of datetime columns created by ``empty``.

    Covers a single naive column, a single zoned column, a zoned column next
    to an integer column, a ``timezones`` entry for a non-time column, and
    several two-column combinations of zones.  The dtype-kind checks on the
    views are real assertions; as bare comparison expressions they were
    evaluated and discarded, so they verified nothing.
    """
    z = 'US/Eastern'

    # single column
    df, views = empty('M8', 100, cols=['t'])
    assert df.t.dt.tz is None
    assert views['t'].dtype.kind == "M"

    df, views = empty('M8', 100, cols=['t'], timezones={'t': z})
    assert df.t.dt.tz.zone == z
    assert views['t'].dtype.kind == "M"

    # one time column, one normal
    df, views = empty('M8,i', 100, cols=['t', 'i'], timezones={'t': z})
    assert df.t.dt.tz.zone == z
    assert views['t'].dtype.kind == "M"
    assert views['i'].dtype.kind == 'i'

    # no effect of timezones= on non-time column
    df, views = empty('M8,i', 100, cols=['t', 'i'], timezones={'t': z, 'i': z})
    assert df.t.dt.tz.zone == z
    assert df.i.dtype.kind == 'i'
    assert views['t'].dtype.kind == "M"
    assert views['i'].dtype.kind == 'i'

    # multi-timezones
    z2 = 'US/Central'
    # both columns in the same zone
    df, views = empty('M8,M8', 100, cols=['t1', 't2'], timezones={'t1': z,
                                                                  't2': z})
    assert df.t1.dt.tz.zone == z
    assert df.t2.dt.tz.zone == z

    # only the first column zoned; the second stays naive
    df, views = empty('M8,M8', 100, cols=['t1', 't2'], timezones={'t1': z})
    assert df.t1.dt.tz.zone == z
    assert df.t2.dt.tz is None

    # second column in UTC; compared via str() rather than a .zone attribute
    df, views = empty('M8,M8', 100, cols=['t1', 't2'], timezones={'t1': z,
                                                                  't2': 'UTC'})
    assert df.t1.dt.tz.zone == z
    assert str(df.t2.dt.tz) == 'UTC'

    # two different non-UTC zones
    df, views = empty('M8,M8', 100, cols=['t1', 't2'], timezones={'t1': z,
                                                                  't2': z2})
    assert df.t1.dt.tz.zone == z
    assert df.t2.dt.tz.zone == z2


def test_pandas_hive_serialization(tmpdir):
    """Round-trip a string column through a hive-scheme parquet dataset.

    The frame is written with the fastparquet engine using explicit row-group
    offsets, read back with the same engine, and compared with the original.
    """
    target = tmpdir.join("test.par")
    column = "data"
    rows = [(value,) for value in ("42", "", "0", "1", "0.0")]
    expected = pd.DataFrame(columns=[column], data=rows)

    expected.to_parquet(
        target,
        file_scheme="hive",
        row_group_offsets=[0, 2, 4],
        engine="fastparquet",
    )
    restored = pd.read_parquet(target, engine="fastparquet")

    assert_frame_equal(expected, restored)