File: test_pyodc_codc_interop.py

package info (click to toggle)
pyodc 1.6.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 700 kB
  • sloc: python: 2,369; ansic: 86; makefile: 32
file content (119 lines) | stat: -rw-r--r-- 5,380 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
from tempfile import NamedTemporaryFile

import numpy as np
import numpy.testing
import pandas
import pandas as pd
import pytest
from conftest import odc_modules

from pyodc import Reader, codec
from pyodc.constants import INTERNAL_REAL_MISSING

# Fixture table consumed by test_codec_choice via pytest.mark.parametrize.
# Each case is [data_column, expected_codec]:
#   * data_column is wrapped in a single-column pandas DataFrame and encoded.
#   * expected_codec is the codec class that the encoder is expected to select
#     for that column (compared against the type of the first column codec read
#     back from the encoded file).
# The concrete Python type of each data_column (plain list vs numpy array,
# None vs np.nan, explicit dtype) is deliberate, so do not normalise the literals.
testcases = [
    # Anything constant that fits in less than 8 bytes goes into codec.Constant
    [[0, 0, 0, 0, 0, 0, 0], codec.Constant],
    [[73] * 7, codec.Constant],
    [[1.432] * 7, codec.Constant],
    # Like the above but with missing values
    [[1, 1, 1, None, 1, 1, 1], codec.ConstantOrMissing],
    [[0.1, 0.1, 0.1, None, 0.1, 0.1, 0.1], codec.RealConstantOrMissing],
    # Constant columns of strings of less than 8 bytes go into ConstantString
    [["abcd"] * 7, codec.ConstantString],
    # Constant columns of strings of more than 8 bytes must be handled differently
    # (see test_codec_choice_long_string for the environment-variable-controlled alternative)
    [["abcdefghi"] * 7, codec.Int8String],
    # Columns of strings with fewer than 2^n unique values go into Int8String or Int16String
    [["aoeu", "aoeu", "aaaaaaaooooooo", "None", "boo", "squiggle", "a"], codec.Int8String],
    # 257 unique strings: expected to be encoded with Int16String rather than Int8String
    [["longconstant"] + [str(num) for num in range(256)], codec.Int16String],
    # Integers
    [[1, 2, 3, 4, 5, 6, 7], codec.Int8],
    [[1, None, 3, 4, 5, None, 7], codec.Int8Missing],
    [[-512, None, 3, 7623, -22000, None, 7], codec.Int16Missing],
    # Integers supplied as int32, int16 or int8 need to be internally cast to int64 if using the codc encoder
    [np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.uint8), codec.Int8],
    [np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.int8), codec.Int8],
    [np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.uint16), codec.Int8],
    [np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.int16), codec.Int8],
    [np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.uint32), codec.Int8],
    [np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.int32), codec.Int8],
    [np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.int64), codec.Int8],
    # Integers encoded as floats
    [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], codec.Int8],
    [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, None], codec.Int8Missing],
    [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, np.nan], codec.Int8Missing],
    [np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, np.nan]), codec.Int8Missing],
    [np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], dtype=np.float32), codec.Int8],
    [np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, np.nan], dtype=np.float32), codec.Int8Missing],
    # uint64 is not supported
    # [np.array([1, 2, 3, 4, 5, 6, 7, 2**64 - 1], dtype = np.uint64), codec.Int8],
    # Breaking the pattern, codec.Int32 accepts missing values.
    [[-1234567, 8765432, None, 22, 22222222, -81222323, None], codec.Int32],
    # Reals
    [[999.99, 888.88, 777.77, 666.66, 555.55, 444.44, 333.33], codec.LongReal],
    # ShortReal2 is the default codec for float32 which uses INTERNAL_REAL_MISSING[1] to represent missing data
    [np.array([999.99, 888.88, 777.77, 666.66, 555.55, 444.44, 333.33], dtype=np.float32), codec.ShortReal2],
    # The presence of INTERNAL_REAL_MISSING[0] in the data is expected to leave the choice at ShortReal2
    [
        np.array([INTERNAL_REAL_MISSING[0], 888.88, 777.77, 666.66, 555.55, 444.44, 333.33], dtype=np.float32),
        codec.ShortReal2,
    ],
    # When INTERNAL_REAL_MISSING[1] is present,
    # the codec switches to ShortReal which uses a different value to represent missing data
    [
        np.array([INTERNAL_REAL_MISSING[1], 888.88, 777.77, 666.66, 555.55, 444.44, 333.33], dtype=np.float32),
        codec.ShortReal,
    ],
]


def first_codec(file):
    """Return the codec object for the first column of the first frame in *file*.

    *file* is passed straight to ``Reader``. The codec is looked up through the
    frame's private ``_column_codecs`` sequence.
    """
    reader = Reader(file)
    leading_frame = reader.frames[0]
    column_codecs = leading_frame._column_codecs
    return column_codecs[0]


@pytest.mark.parametrize("decoder", odc_modules)
@pytest.mark.parametrize("encoder", odc_modules)
@pytest.mark.parametrize("testcase", testcases)
def test_codec_choice(testcase, encoder, decoder):
    """Check that codc and pyodc choose the same codec for all the test data.

    Runs for every combination of ``testcase`` (from ``testcases``) and
    ``encoder`` / ``decoder`` module (from ``odc_modules``). The column is
    encoded with ``encoder``, read back with ``decoder``, and two things are
    verified:

    * the codec recorded for the first column of the encoded file is exactly
      the expected codec class, and
    * the decoded values equal the values that were encoded.
    """
    testdata, expected_codec = testcase
    df = pd.DataFrame(dict(column=testdata))

    # Everything that touches the file must happen while the temporary file
    # still exists; it is removed when the ``with`` block exits.
    with NamedTemporaryFile() as fencode:
        encoder.encode_odb(df, fencode.name)
        round_tripped_data = decoder.read_odb(fencode.name, single=True)
        chosen_codec = type(first_codec(fencode.name))

    # Note the trailing space in the first fragment: adjacent string literals
    # are concatenated verbatim, so without it the message runs together.
    assert chosen_codec is expected_codec, (
        f"{encoder.__name__} chose codec '{chosen_codec.__name__}' "
        f"but we expected '{expected_codec.__name__}' for {testdata!r}"
    )

    # Check the data round tripped
    numpy.testing.assert_array_equal(df.column.values, round_tripped_data.column.values)


@pytest.mark.parametrize("encoder", odc_modules)
@pytest.mark.parametrize("decoder", odc_modules)
def test_codec_choice_long_string(encoder, decoder):
    """
    Check that codc and pyodc choose the same codec for long constant strings
    in the presence of the ODC_ENABLE_WRITING_LONG_STRING_CODEC environment variable.

    The environment variable is set only for the duration of the encode/decode
    and is restored to its previous state afterwards, even if encoding or
    decoding raises, so that the setting cannot leak into other tests.
    """
    testdata, expected_codec = [["abcdefghi"] * 7, codec.LongConstantString]
    df = pd.DataFrame(dict(column=testdata))

    env_name = "ODC_ENABLE_WRITING_LONG_STRING_CODEC"
    # Remember any pre-existing value so it can be put back rather than dropped.
    previous_value = os.environ.get(env_name)
    os.environ[env_name] = "true"

    try:
        with NamedTemporaryFile() as fencode:
            encoder.encode_odb(df, fencode.name)
            round_tripped_data = decoder.read_odb(fencode.name, single=True)
            chosen_codec = first_codec(fencode.name)
    finally:
        # Always undo the environment change, including on failure.
        if previous_value is None:
            os.environ.pop(env_name, None)
        else:
            os.environ[env_name] = previous_value

    assert type(chosen_codec) is expected_codec

    # Check the data round tripped
    numpy.testing.assert_array_equal(df.column.values, round_tripped_data.column.values)