1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
|
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License
import datetime
import json
import os
import pytest
from azure.kusto.data._models import KustoResultTable
from azure.kusto.data.helpers import dataframe_from_result_table
from azure.kusto.data.response import KustoResponseDataSetV2
import pandas
import numpy
def test_dataframe_from_result_table():
"""Test conversion of KustoResultTable to pandas.DataFrame, including fixes for certain column types"""
with open(os.path.join(os.path.dirname(__file__), "input", "dataframe.json"), "r") as response_file:
data = response_file.read()
response = KustoResponseDataSetV2(json.loads(data))
# Test when given both types of dictionary parameters that type conversion doesn't override column name conversion
test_dict_by_name = {
"RecordName": lambda col, frame: frame[col].astype("str"),
"RecordInt64": lambda col, frame: frame[col].astype("int64"),
"MissingType": lambda col, frame: frame[col].astype("str"),
}
test_dict_by_type = {"int": lambda col, frame: frame[col].astype("int32")}
df = dataframe_from_result_table(response.primary_results[0], converters_by_type=test_dict_by_type, converters_by_column_name=test_dict_by_name)
if hasattr(pandas, "StringDType"):
assert df["RecordName"].dtype == pandas.StringDtype()
assert str(df.iloc[0].RecordName) == "now"
assert df["RecordGUID"].dtype == pandas.StringDtype()
assert str(df.iloc[0].RecordGUID) == "6f3c1072-2739-461c-8aa7-3cfc8ff528a8"
assert df["RecordDynamic"].dtype == pandas.StringDtype()
assert (
str(df.iloc[0].RecordDynamic)
== '{"Visualization":null,"Title":null,"XColumn":null,"Series":null,"YColumns":null,"XTitle":null,"YTitle":null,"XAxis":null,"YAxis":null,"Legend":null,"YSplit":null,"Accumulate":false,"IsQuerySorted":false,"Kind":null}'
)
else:
assert df.iloc[0].RecordName == "now"
assert df.iloc[0].RecordGUID == "6f3c1072-2739-461c-8aa7-3cfc8ff528a8"
assert type(df.iloc[0].RecordTime) is pandas._libs.tslibs.timestamps.Timestamp
for k, v in {"year": 2021, "month": 12, "day": 22, "hour": 11, "minute": 43, "second": 00}.items():
assert getattr(df.iloc[0].RecordTime, k) == v
assert type(df.iloc[0].RecordBool) is numpy.bool_
assert df.iloc[0].RecordBool == True
assert type(df.iloc[0].RecordInt) is numpy.int32
assert df.iloc[0].RecordInt == 5678
assert type(df.iloc[0].RecordInt64) is numpy.int64
assert df.iloc[0].RecordInt64 == 222
assert type(df.iloc[0].RecordLong) is numpy.int64
assert df.iloc[0].RecordLong == 92233720368
assert type(df.iloc[0].RecordReal) is numpy.float64
assert df.iloc[0].RecordReal == 3.14159
assert type(df.iloc[0].RecordDouble) is numpy.float64
assert df.iloc[0].RecordDouble == 7.89
assert type(df.iloc[0].RecordDecimal) is numpy.float64
assert df.iloc[0].RecordDecimal == 1.2
# Kusto datetime(0000-01-01T00:00:00Z), which Pandas can't represent.
assert df.iloc[1].RecordName == "earliest datetime"
assert type(df.iloc[1].RecordTime) is pandas._libs.tslibs.nattype.NaTType
assert pandas.isnull(df.iloc[1].RecordReal)
# Kusto datetime(9999-12-31T23:59:59Z), which Pandas can't represent.
assert df.iloc[2].RecordName == "latest datetime"
assert type(df.iloc[2].RecordTime) is pandas._libs.tslibs.nattype.NaTType
assert type(df.iloc[2].RecordReal) is numpy.float64
assert df.iloc[2].RecordReal == numpy.inf
# Pandas earliest datetime
assert df.iloc[3].RecordName == "earliest pandas datetime"
assert type(df.iloc[3].RecordTime) is pandas._libs.tslibs.timestamps.Timestamp
assert type(df.iloc[3].RecordReal) is numpy.float64
assert df.iloc[3].RecordReal == -numpy.inf
# Pandas latest datetime
assert df.iloc[4].RecordName == "latest pandas datetime"
assert type(df.iloc[4].RecordTime) is pandas._libs.tslibs.timestamps.Timestamp
# Kusto 600000000 ticks timedelta
assert df.iloc[5].RecordName == "timedelta ticks"
assert type(df.iloc[5].RecordTime) is pandas._libs.tslibs.timestamps.Timestamp
assert type(df.iloc[5].RecordOffset) is pandas._libs.tslibs.timestamps.Timedelta
assert df.iloc[5].RecordOffset == pandas.to_timedelta("00:01:00")
# Kusto timedelta(1.01:01:01.0) ==
assert df.iloc[6].RecordName == "timedelta string"
assert type(df.iloc[6].RecordTime) is pandas._libs.tslibs.timestamps.Timestamp
assert type(df.iloc[6].RecordOffset) is pandas._libs.tslibs.timestamps.Timedelta
assert df.iloc[6].RecordOffset == pandas.to_timedelta("1 days 01:01:01")
# Testing int to float conversion
test_int_to_float = {"int": "float64"}
ignore_missing_type = {
"MissingType": lambda col, frame: frame[col].astype("str"),
}
df_int_to_float = dataframe_from_result_table(response.primary_results[0], converters_by_type=test_int_to_float, converters_by_column_name=ignore_missing_type)
assert type(df_int_to_float.iloc[0].RecordInt) is numpy.float64
assert df.iloc[0].RecordInt == 5678
# Testing missing type conversion
with pytest.raises(Exception):
df_missing_type = dataframe_from_result_table(response.primary_results[0])
def test_pandas_mixed_date():
df = dataframe_from_result_table(
KustoResultTable(
{
"TableName": "Table_0",
"Columns": [
{"ColumnName": "Date", "ColumnType": "datetime"},
],
"Rows": [
["2023-12-12T01:59:59.352Z"],
["2023-12-12T01:54:44Z"],
],
}
)
)
assert df["Date"][0] == pandas.Timestamp(year=2023, month=12, day=12, hour=1, minute=59, second=59, microsecond=352000, tzinfo=datetime.timezone.utc)
assert df["Date"][1] == pandas.Timestamp(year=2023, month=12, day=12, hour=1, minute=54, second=44, tzinfo=datetime.timezone.utc)
|