File: test_helpers.py

package info (click to toggle)
azure-kusto-python 5.0.5-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 6,704 kB
  • sloc: python: 10,633; sh: 13; makefile: 3
file content (130 lines) | stat: -rw-r--r-- 5,717 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License
import datetime
import json
import os

import pytest

from azure.kusto.data._models import KustoResultTable
from azure.kusto.data.helpers import dataframe_from_result_table
from azure.kusto.data.response import KustoResponseDataSetV2
import pandas
import numpy


def test_dataframe_from_result_table():
    """Test conversion of KustoResultTable to pandas.DataFrame, including fixes for certain column types.

    Loads the canned V2 response stored in ``input/dataframe.json`` next to this file, converts its
    first primary result table with custom converters, and checks the resulting value types and
    contents row by row.
    """
    with open(os.path.join(os.path.dirname(__file__), "input", "dataframe.json"), "r") as response_file:
        data = response_file.read()

    response = KustoResponseDataSetV2(json.loads(data))
    # Test when given both types of dictionary parameters that type conversion doesn't override column name conversion
    test_dict_by_name = {
        "RecordName": lambda col, frame: frame[col].astype("str"),
        "RecordInt64": lambda col, frame: frame[col].astype("int64"),
        "MissingType": lambda col, frame: frame[col].astype("str"),
    }
    test_dict_by_type = {"int": lambda col, frame: frame[col].astype("int32")}
    df = dataframe_from_result_table(response.primary_results[0], converters_by_type=test_dict_by_type, converters_by_column_name=test_dict_by_name)

    # NOTE(review): pandas spells this attribute "StringDtype"; with "StringDType" this check looks
    # like it is always False, so only the else branch runs. Confirm against the helper under test
    # before changing the spelling, since that would start enforcing the dtype assertions below.
    if hasattr(pandas, "StringDType"):
        assert df["RecordName"].dtype == pandas.StringDtype()
        assert str(df.iloc[0].RecordName) == "now"
        assert df["RecordGUID"].dtype == pandas.StringDtype()
        assert str(df.iloc[0].RecordGUID) == "6f3c1072-2739-461c-8aa7-3cfc8ff528a8"
        assert df["RecordDynamic"].dtype == pandas.StringDtype()
        assert (
            str(df.iloc[0].RecordDynamic)
            == '{"Visualization":null,"Title":null,"XColumn":null,"Series":null,"YColumns":null,"XTitle":null,"YTitle":null,"XAxis":null,"YAxis":null,"Legend":null,"YSplit":null,"Accumulate":false,"IsQuerySorted":false,"Kind":null}'
        )
    else:
        assert df.iloc[0].RecordName == "now"
        assert df.iloc[0].RecordGUID == "6f3c1072-2739-461c-8aa7-3cfc8ff528a8"

    assert type(df.iloc[0].RecordTime) is pandas.Timestamp

    for k, v in {"year": 2021, "month": 12, "day": 22, "hour": 11, "minute": 43, "second": 0}.items():
        assert getattr(df.iloc[0].RecordTime, k) == v
    assert type(df.iloc[0].RecordBool) is numpy.bool_
    # numpy.bool_ is not the builtin True singleton, so compare by value rather than identity.
    assert df.iloc[0].RecordBool == True
    # "int" columns go through the by-type converter (int32) ...
    assert type(df.iloc[0].RecordInt) is numpy.int32
    assert df.iloc[0].RecordInt == 5678
    # ... while RecordInt64 has a by-name converter (int64) that must take precedence.
    assert type(df.iloc[0].RecordInt64) is numpy.int64
    assert df.iloc[0].RecordInt64 == 222
    assert type(df.iloc[0].RecordLong) is numpy.int64
    assert df.iloc[0].RecordLong == 92233720368
    assert type(df.iloc[0].RecordReal) is numpy.float64
    assert df.iloc[0].RecordReal == 3.14159
    assert type(df.iloc[0].RecordDouble) is numpy.float64
    assert df.iloc[0].RecordDouble == 7.89
    assert type(df.iloc[0].RecordDecimal) is numpy.float64
    assert df.iloc[0].RecordDecimal == 1.2

    # Kusto datetime(0000-01-01T00:00:00Z), which Pandas can't represent.
    assert df.iloc[1].RecordName == "earliest datetime"
    assert type(df.iloc[1].RecordTime) is type(pandas.NaT)
    assert pandas.isnull(df.iloc[1].RecordReal)

    # Kusto datetime(9999-12-31T23:59:59Z), which Pandas can't represent.
    assert df.iloc[2].RecordName == "latest datetime"
    assert type(df.iloc[2].RecordTime) is type(pandas.NaT)
    assert type(df.iloc[2].RecordReal) is numpy.float64
    assert df.iloc[2].RecordReal == numpy.inf

    # Pandas earliest datetime
    assert df.iloc[3].RecordName == "earliest pandas datetime"
    assert type(df.iloc[3].RecordTime) is pandas.Timestamp
    assert type(df.iloc[3].RecordReal) is numpy.float64
    assert df.iloc[3].RecordReal == -numpy.inf

    # Pandas latest datetime
    assert df.iloc[4].RecordName == "latest pandas datetime"
    assert type(df.iloc[4].RecordTime) is pandas.Timestamp

    # Kusto 600000000 ticks timedelta
    assert df.iloc[5].RecordName == "timedelta ticks"
    assert type(df.iloc[5].RecordTime) is pandas.Timestamp
    assert type(df.iloc[5].RecordOffset) is pandas.Timedelta
    assert df.iloc[5].RecordOffset == pandas.to_timedelta("00:01:00")

    # Kusto timedelta(1.01:01:01.0) == 1 day, 1 hour, 1 minute, 1 second
    assert df.iloc[6].RecordName == "timedelta string"
    assert type(df.iloc[6].RecordTime) is pandas.Timestamp
    assert type(df.iloc[6].RecordOffset) is pandas.Timedelta
    assert df.iloc[6].RecordOffset == pandas.to_timedelta("1 days 01:01:01")

    # Testing int to float conversion
    test_int_to_float = {"int": "float64"}
    ignore_missing_type = {
        "MissingType": lambda col, frame: frame[col].astype("str"),
    }
    df_int_to_float = dataframe_from_result_table(response.primary_results[0], converters_by_type=test_int_to_float, converters_by_column_name=ignore_missing_type)
    assert type(df_int_to_float.iloc[0].RecordInt) is numpy.float64
    # Check the value on the converted frame, not on the earlier `df`.
    assert df_int_to_float.iloc[0].RecordInt == 5678

    # Testing missing type conversion: with no converter supplied for the "MissingType" column,
    # the conversion is expected to raise.
    with pytest.raises(Exception):
        dataframe_from_result_table(response.primary_results[0])


def test_pandas_mixed_date():
    """Check that datetime rows with and without fractional seconds both convert to the expected UTC timestamps."""
    raw_table = {
        "TableName": "Table_0",
        "Columns": [
            {"ColumnName": "Date", "ColumnType": "datetime"},
        ],
        "Rows": [
            ["2023-12-12T01:59:59.352Z"],
            ["2023-12-12T01:54:44Z"],
        ],
    }
    frame = dataframe_from_result_table(KustoResultTable(raw_table))

    utc = datetime.timezone.utc
    # First row carries milliseconds, second row does not; both must land in the same column.
    with_fraction = pandas.Timestamp(year=2023, month=12, day=12, hour=1, minute=59, second=59, microsecond=352000, tzinfo=utc)
    without_fraction = pandas.Timestamp(year=2023, month=12, day=12, hour=1, minute=54, second=44, tzinfo=utc)

    dates = frame["Date"]
    assert dates[0] == with_fraction
    assert dates[1] == without_fraction