File: forth-read-jagged0-avro.py

package info (click to toggle)
python-awkward 2.6.5-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 23,088 kB
  • sloc: python: 148,689; cpp: 33,562; sh: 432; makefile: 21; javascript: 8
file content (69 lines) | stat: -rw-r--r-- 1,447 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import zlib
import struct
import time

import numpy as np
import awkward as ak
import awkward.forth

# Map the benchmark file as raw bytes without loading it into memory.
# mode="r" is explicit because np.memmap defaults to "r+", which needs write
# permission on the file even though this script only ever reads from it.
data = np.memmap(
    "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/zlib0-jagged0.avro",
    dtype=np.uint8,
    mode="r",
)

def decompress(data, pos, length=None):
    """Inflate a raw DEFLATE payload embedded in ``data``.

    Args:
        data: Sliceable bytes-like buffer containing the compressed payload.
        pos: Offset in ``data`` of the first compressed byte.
        length: Number of compressed bytes to read starting at ``pos``.
            When omitted, the module-level ``num_bytes`` is used, which keeps
            existing two-argument calls working; new callers should pass the
            size explicitly instead of relying on that global.

    Returns:
        The decompressed payload as ``bytes``.

    Raises:
        NameError: If ``length`` is omitted and no module-level ``num_bytes``
            has been assigned yet.
        zlib.error: If the slice is not valid DEFLATE data.
    """
    if length is None:
        # Legacy call style: the size lives in a module-level variable.
        length = num_bytes

    # A negative wbits value tells zlib to expect a raw DEFLATE stream,
    # i.e. one without the zlib header and checksum trailer.
    inflater = zlib.decompressobj(-zlib.MAX_WBITS)
    return inflater.decompress(data[pos : pos + length]) + inflater.flush()

def decode_varint(pos, data):
    """Decode one little-endian base-128 varint from ``data``.

    Each byte contributes its low 7 bits, least-significant group first; a set
    high bit (0x80) means another byte follows.

    Args:
        pos: Index in ``data`` of the first byte of the varint.
        data: Indexable sequence of byte values (``bytes``, ``bytearray`` or a
            NumPy uint8 array / memmap).

    Returns:
        A ``(new_pos, value)`` tuple, where ``new_pos`` is the index just past
        the last byte consumed and ``value`` is a non-negative Python ``int``.

    Raises:
        IndexError: If ``data`` ends before a terminating byte is found.
    """
    shift = 0
    result = 0
    while True:
        # Coerce to a Python int: a NumPy uint8 scalar would otherwise keep
        # the shift below in 8-bit arithmetic (NumPy 2 promotion rules) and
        # silently drop the high bits of any multi-byte varint.
        byte = int(data[pos])
        pos += 1
        result |= (byte & 0x7F) << shift
        shift += 7
        if not byte & 0x80:
            return pos, result

def decode_zigzag(n):
    """Undo zigzag encoding, turning a non-negative integer into a signed one.

    The lowest bit of ``n`` carries the sign and the remaining bits carry the
    magnitude, so 0, 1, 2, 3, 4, ... decode to 0, -1, 1, -2, 2, ...
    """
    # 0 for even inputs, -1 (all bits set) for odd inputs.
    sign_mask = -(n & 1)
    magnitude = n >> 1
    # XOR with an all-ones mask flips every bit, giving the negative value.
    return magnitude ^ sign_mask

# AwkwardForth program for one block of values: it declares one input
# buffer named "stream" and one float32 output named "content", then copies
# float32 values from the input to the output.
# NOTE(review): per the AwkwardForth docs the "#" prefix on "#f->" takes the
# repeat count from the stack (the caller pushes it before resuming) - confirm.
_JAGGED0_PROGRAM = """
input stream
output content float32

stream #f-> content
"""

jagged0 = awkward.forth.ForthMachine32(_JAGGED0_PROGRAM)

# Time the whole decode (header skip plus every block) with a monotonic,
# high-resolution clock; time.time() is wall-clock time and can jump if the
# system clock is adjusted mid-run.
begintime = time.perf_counter()

# Skip the file header by scanning forward to just past the first zero byte.
# NOTE(review): presumably that zero is the terminator of the Avro header's
# metadata map - confirm against the Avro object container file spec.
pos = 0
while data[pos] != 0:
    pos += 1
pos += 1

data_length = len(data)  # invariant: the mapped file does not change size

while pos + 16 < data_length:
    # Step over the 16 bytes that precede every block.
    # NOTE(review): presumably the Avro sync marker - confirm.
    pos += 16

    # Each block starts with two zigzag varints: the number of items it
    # holds and the size in bytes of its compressed payload.
    pos, varint = decode_varint(pos, data)
    num_items = decode_zigzag(varint)

    pos, varint = decode_varint(pos, data)
    num_bytes = decode_zigzag(varint)

    # decompress() is called with two arguments, so it takes the payload size
    # from the module-level num_bytes assigned just above.
    uncompressed = decompress(data, pos)
    pos += num_bytes

    # Run the Forth machine over this block: load the decompressed bytes as
    # the "stream" input, push the item count on the stack, then execute.
    jagged0.begin({"stream": uncompressed})
    jagged0.stack_push(num_items)
    jagged0.resume()

    # Wrap the machine's output as an Awkward Array; only the most recent
    # block's array is kept, since each iteration rebinds the name.
    array = ak.Array(jagged0.output_NumpyArray("content"))

endtime = time.perf_counter()

print("AwkwardForth zlib0-jagged0", endtime - begintime, "seconds")