File: test_3697_parquet_string_int32_offsets.py

package info (click to toggle)
python-awkward 2.9.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 25,524 kB
  • sloc: python: 187,940; cpp: 33,928; sh: 432; makefile: 21; javascript: 8
file content (131 lines) | stat: -rw-r--r-- 4,969 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE

from __future__ import annotations

import numpy as np
import pytest

import awkward as ak

# Every test in this module writes and reads parquet files, which requires
# pyarrow; skip the whole module when pyarrow or pyarrow.parquet cannot be
# imported.
pyarrow = pytest.importorskip("pyarrow")
pyarrow_parquet = pytest.importorskip("pyarrow.parquet")


def test_parquet_string_roundtrip_with_string_to32_default(tmp_path):
    """Round-trip a string field through parquet with the default ``string_to32``.

    With ``string_to32=True`` (the parquet default) strings are written as
    Arrow's 32-bit string type. Reading them back and converting to NumPy must
    succeed without a kernel lookup error.

    Regression test: int32 offsets coming from ``pyarrow.string()`` used to
    raise ``KeyError`` on conversion to NumPy because the UTF8 kernels were
    only specialized for int64 offsets.
    """
    expected = ["bar", "baz"]
    parquet_file = tmp_path / "test_string_to32.parquet"
    original = ak.Array([{"foo": "bar"}, {"foo": "baz"}])

    # Baseline: the in-memory array (int64 offsets) converts to NumPy cleanly.
    before = np.asarray(original["foo"])
    assert before.dtype.kind == "U"
    assert list(before) == expected

    # Write with the default options, then read back; the strings now carry
    # int32 offsets.
    ak.to_parquet(original, parquet_file)
    restored = ak.from_parquet(parquet_file)

    # The conversion that used to fail for int32 offsets.
    after = np.asarray(restored["foo"])
    assert after.dtype.kind == "U"
    assert list(after) == expected


def test_parquet_string_roundtrip_explicit_string_to32_true(tmp_path):
    """Round-trip strings through parquet when ``string_to32=True`` is passed explicitly."""
    parquet_file = tmp_path / "test_explicit_string_to32.parquet"
    records = ak.Array([{"name": "Alice"}, {"name": "Bob"}, {"name": "Charlie"}])

    # Request 32-bit string offsets explicitly instead of relying on the default.
    ak.to_parquet(records, parquet_file, string_to32=True)
    restored = ak.from_parquet(parquet_file)

    # NumPy conversion goes through the int32-offset kernels here.
    names = np.asarray(restored["name"])
    assert names.dtype.kind == "U"
    assert list(names) == ["Alice", "Bob", "Charlie"]


def test_parquet_string_roundtrip_string_to32_false(tmp_path):
    """Round-trip strings through parquet with ``string_to32=False``."""
    parquet_file = tmp_path / "test_string_to32_false.parquet"
    records = ak.Array([{"text": "hello"}, {"text": "world"}])

    # Opt out of the 32-bit string type so that int64 offsets are written.
    ak.to_parquet(records, parquet_file, string_to32=False)
    restored = ak.from_parquet(parquet_file)

    # NumPy conversion may go through the int64-offset kernels here.
    texts = np.asarray(restored["text"])
    assert texts.dtype.kind == "U"
    assert list(texts) == ["hello", "world"]


def test_parquet_bytestring_roundtrip_with_bytestring_to32(tmp_path):
    """Round-trip bytestrings through parquet with ``bytestring_to32=True``."""
    parquet_file = tmp_path / "test_bytestring_to32.parquet"
    records = ak.Array([{"data": b"foo"}, {"data": b"bar"}])

    # bytestring_to32=True is the default; it is spelled out to make the
    # intent of the test explicit.
    ak.to_parquet(records, parquet_file, bytestring_to32=True)
    restored = ak.from_parquet(parquet_file)

    # Bytestrings must come back as a NumPy bytes ("S") array.
    payload = np.asarray(restored["data"])
    assert payload.dtype.kind == "S"
    assert list(payload) == [b"foo", b"bar"]


def test_direct_string_array_with_int32_offsets(tmp_path):
    """Convert a top-level (non-record) string array to NumPy after a parquet round trip."""
    parquet_file = tmp_path / "test_direct_strings.parquet"
    words = ak.Array(["hello", "world", "test"])

    # The parquet round trip is what is meant to produce int32 string offsets.
    ak.to_parquet(words, parquet_file, string_to32=True)
    restored = ak.from_parquet(parquet_file)

    # Sanity-check the offsets dtype. int32 is the expected outcome, but the
    # check deliberately also accepts uint32 and int64.
    accepted_dtypes = (
        np.dtype("int32"),
        np.dtype("uint32"),
        np.dtype("int64"),
    )
    offsets_dtype = restored.layout.offsets.dtype
    assert offsets_dtype in accepted_dtypes

    # The whole array, not a field of it, is converted to NumPy.
    as_numpy = np.asarray(restored)
    assert as_numpy.dtype.kind == "U"
    assert list(as_numpy) == ["hello", "world", "test"]


def test_empty_strings_with_int32_offsets(tmp_path):
    """Check that zero-length strings survive a parquet round trip with int32 offsets."""
    parquet_file = tmp_path / "test_empty_strings.parquet"
    # Empty strings in first and last position, with a non-empty one between.
    records = ak.Array([{"msg": ""}, {"msg": "x"}, {"msg": ""}])

    ak.to_parquet(records, parquet_file, string_to32=True)
    restored = ak.from_parquet(parquet_file)

    messages = np.asarray(restored["msg"])
    assert messages.dtype.kind == "U"
    assert list(messages) == ["", "x", ""]


def test_unicode_strings_with_int32_offsets(tmp_path):
    """Check that non-ASCII strings survive a parquet round trip with int32 offsets.

    Each string ends in an emoji outside the Basic Multilingual Plane, so it
    encodes to four UTF-8 bytes while counting as a single character.
    """
    # The emoji are written as \U escapes so the test data cannot be silently
    # corrupted when the file is re-encoded or copied through a tool that
    # mishandles UTF-8.
    expected = [
        "Hello \U0001F44B",  # waving hand
        "World \U0001F30D",  # globe
        "Test \U0001F9EA",  # test tube
    ]
    parquet_file = tmp_path / "test_unicode_strings.parquet"
    a = ak.Array([{"text": text} for text in expected])

    ak.to_parquet(a, parquet_file, string_to32=True)
    x = ak.from_parquet(parquet_file)

    result = np.asarray(x["text"])
    assert result.dtype.kind == "U"
    assert list(result) == expected