File: test_0674_categorical_validation.py

package info (click to toggle)
python-awkward 2.6.5-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 23,088 kB
  • sloc: python: 148,689; cpp: 33,562; sh: 432; makefile: 21; javascript: 8
file content (65 lines) | stat: -rw-r--r-- 2,342 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE

from __future__ import annotations

import numpy as np
import pytest

import awkward as ak

# Bind pyarrow through pytest.importorskip so that this module is skipped,
# rather than erroring at import time, when pyarrow is not available.
pyarrow = pytest.importorskip("pyarrow")


def test_categorical_is_valid():
    # A categorical built from a plain integer array must pass validation.
    years = ak.Array([2019, 2020, 2021, 2020, 2019])
    assert ak.operations.is_valid(ak.str.to_categorical(years))


def test_optional_categorical_from_arrow():
    # Arrow dictionary arrays, with or without a null among their indices,
    # should both arrive in Awkward carrying the "categorical" parameter.
    dense_codes = pyarrow.array([0, 1, 0, 1, 2, 0, 2])
    codes_with_null = pyarrow.array([0, 1, 0, 1, 2, None, 0, 2])
    categories = pyarrow.array([2019, 2020, 2021])

    # Indices without any missing entry.
    dense = ak.operations.from_arrow(
        pyarrow.DictionaryArray.from_arrays(dense_codes, categories)
    )
    assert dense.layout.parameter("__array__") == "categorical"

    # Indices containing a None entry.
    nullable = ak.operations.from_arrow(
        pyarrow.DictionaryArray.from_arrays(codes_with_null, categories)
    )
    assert nullable.layout.parameter("__array__") == "categorical"


def test_categorical_from_arrow_ChunkedArray():
    # Dictionary-encoded "year" columns spread over several record batches
    # must keep their index values, batch after batch, once read into Awkward.
    # Two cases: the same batch repeated three times, and two batches whose
    # dictionaries differ ([2019, 2020, 2021] versus [2019, 2020]).
    codes = [0, 1, 0, 1, 2, 0, 2]
    codes_short = [0, 1, 0, 1, 0]

    def year_batch(batch_codes, categories):
        # One record batch holding a single dictionary-encoded "year" column.
        encoded = pyarrow.DictionaryArray.from_arrays(
            pyarrow.array(batch_codes), categories
        )
        return pyarrow.RecordBatch.from_arrays([encoded], ["year"])

    def first_column_index(array):
        # Index buffer of the first field of the converted layout, as a list.
        return np.asarray(array.layout.contents[0].index).tolist()

    full_batch = year_batch(codes, pyarrow.array([2019, 2020, 2021]))
    short_batch = year_batch(codes_short, pyarrow.array([2019, 2020]))

    repeated_table = pyarrow.Table.from_batches([full_batch] * 3)
    mixed_table = pyarrow.Table.from_batches([full_batch, short_batch])

    repeated = ak.operations.from_arrow(repeated_table)
    mixed = ak.operations.from_arrow(mixed_table)

    assert first_column_index(repeated) == codes * 3
    assert first_column_index(mixed) == codes + codes_short