1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
|
import pandas as pd
import pytest
fastparquet = pytest.importorskip("fastparquet")
import fsspec
import ujson
from kerchunk.df import refs_to_dataframe
@pytest.mark.parametrize("url", [True, False])
def test_1(m, url):
refs = {
"a/0": ["memory://url1.file"],
"a/1": ["memory://url1.file", 10, 100],
"a/2": ["memory://url2.file"],
"a/3": ["memory://url3.file"],
"a/4": ["memory://url4.file"],
"a/5": ["memory://url5.file"],
"a/6": b"data",
"a/.zarray": b"""{"shape": [7], "chunks":[1], "filters": [], "compressor": null}""",
".zgroup": b'{"zarr_format": 2}',
}
u = "memory://myrefs.json"
if url:
with fsspec.open(u, "wt") as f:
ujson.dump(
{
k: (v.decode() if isinstance(v, bytes) else v)
for k, v in refs.items()
},
f,
)
refs = u
refs_to_dataframe(refs, "memory://outdir", record_size=4)
with fsspec.open("memory:///outdir/.zmetadata") as f:
meta = ujson.load(f)
assert list(meta) == ["metadata", "record_size"]
assert meta["record_size"] == 4
df0 = pd.read_parquet("memory:///outdir/a/refs.0.parq")
# no raw
assert df0.to_dict() == {
"offset": {0: 0, 1: 10, 2: 0, 3: 0},
"path": {
0: "memory://url1.file",
1: "memory://url1.file",
2: "memory://url2.file",
3: "memory://url3.file",
},
"size": {0: 0, 1: 100, 2: 0, 3: 0},
"raw": {0: None, 1: None, 2: None, 3: None},
}
# with raw column
df1 = pd.read_parquet("memory:///outdir/a/refs.1.parq")
assert df1.to_dict() == {
"offset": {0: 0, 1: 0, 2: 0, 3: 0},
"path": {0: "memory://url4.file", 1: "memory://url5.file", 2: None, 3: None},
"raw": {0: None, 1: None, 2: b"data", 3: None},
"size": {0: 0, 1: 0, 2: 0, 3: 0},
}
|