File: make_jagged1_parquet.py

package info (click to toggle)
python-awkward 2.6.5-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 23,088 kB
  • sloc: python: 148,689; cpp: 33,562; sh: 432; makefile: 21; javascript: 8
file content (78 lines) | stat: -rw-r--r-- 2,481 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import numpy as np
import awkward as ak

# Memory-map the raw input buffers instead of loading them into memory.
# They are opened read-only (mode="r"): this script never writes to them, and
# np.memmap's default mode ("r+") would require write permission on the files
# and would let an accidental in-place operation modify the source data.
content = np.memmap(
    "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/sample-content.float32",
    dtype=np.float32,
    mode="r",
)
offsets1 = np.memmap(
    "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/sample-offsets1.int64",
    dtype=np.int64,
    mode="r",
)

# jagged1: cut the single-level jagged array (offsets1 over content) into
# partitions of at most `events_per_basket` lists each.

events_per_basket = 1342176

partitions = []
# offsets1 is sliced as [first : last + 1] below, so N lists need N + 1 offsets.
num_lists = len(offsets1) - 1
for first in range(0, num_lists, events_per_basket):
    # The final partition may be shorter than events_per_basket.
    last = min(first + events_per_basket, num_lists)

    # Offsets bounding the lists of this partition, and the span of content
    # values that those offsets point into.
    global_offsets = offsets1[first : last + 1]
    values = content[global_offsets[0] : global_offsets[-1]]

    # Rebase the offsets so they start at zero relative to `values`, and
    # narrow them to 32 bits for the 32-bit list layout built below.
    local_offsets = (global_offsets - global_offsets[0]).astype(np.int32)

    layout = ak.layout.ListOffsetArray32(
        ak.layout.Index32(local_offsets),
        ak.layout.NumpyArray(values),
    )
    partitions.append(ak.Array(layout, check_valid=True))

# For each compression level, write the partitioned array to Parquet twice:
# once as-is and once with use_byte_stream_split enabled ("split" file).
for level in [None]:  # [9, 1]:
    # Writer options common to both output files.
    shared_options = dict(
        list_to32=True,
        compression="LZ4",
        compression_level=level,
        use_dictionary=False,
        write_statistics=False,
        data_page_size=100*1024**2,  # 100 * 1024**2 = 104857600
    )

    print("level", level)
    ak.to_parquet(
        ak.partitioned(partitions),
        "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/lzfour" + str(level) + "-jagged1.parquet",
        **shared_options,
    )

    print("level", level, "split")
    ak.to_parquet(
        ak.partitioned(partitions),
        "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/lzfour" + str(level) + "-split-jagged1.parquet",
        use_byte_stream_split=True,
        **shared_options,
    )

# for level in [0]:
#     print("level", level)
#     ak.to_parquet(
#         ak.partitioned(partitions),
#         "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/zlib" + str(level) + "-jagged1.parquet",
#         list_to32=True,
#         compression="NONE",
#         compression_level=None,
#         use_dictionary=False,
#         write_statistics=False,
#         data_page_size=100*1024**2,
#     )
#     print("level", level, "split")
#     ak.to_parquet(
#         ak.partitioned(partitions),
#         "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/zlib" + str(level) + "-split-jagged1.parquet",
#         list_to32=True,
#         compression="NONE",
#         compression_level=None,
#         use_dictionary=False,
#         write_statistics=False,
#         data_page_size=100*1024**2,
#         use_byte_stream_split=True,
#     )