#!/usr/bin/env python
import collections
import pathlib
import sys
import time
import zlib
# Number of passes over the corpus for each (wbits, memLevel) combination;
# reported sizes and durations are divided by this value.
REPEAT = 10

# Window bits and memory level that the "vs." tables compare against.
WB = 12  # defaults used as a reference
ML = 5
def _print_table(title, wbits_range, mem_levels, cell):
    """Print one tab-separated table: a row per wbits, a column per memLevel.

    ``cell(wbits, memLevel)`` must return the already formatted text of a cell.
    """
    rule = "=" * 79
    print(rule)
    print(title)
    print(rule)
    print("\t".join(["wb \\ ml"] + [str(memLevel) for memLevel in mem_levels]))
    for wbits in wbits_range:
        row = [str(wbits)] + [cell(wbits, memLevel) for memLevel in mem_levels]
        print("\t".join(row))
    print(rule)
    print()


def benchmark(data):
    """Compress *data* with every wbits / memLevel combination and print results.

    *data* is a re-iterable collection of bytes objects. For each wbits in
    9..15 and memLevel in 1..9, a single raw-deflate encoder (negative wbits)
    compresses every item REPEAT times, sync-flushing after each item and
    dropping the trailing ``00 00 ff ff`` marker. The compressed size and the
    elapsed time (measured with ``time.perf_counter``) are averaged per pass.

    Four tables are printed: compression ratio, time per pass, and size and
    time relative to the reference setting ``WB`` / ``ML``, which must lie
    within the benchmarked ranges.

    Raises:
        ValueError: if *data* holds no bytes at all. Ratios would otherwise
            fail with a division by zero after all the work has been done.
    """
    raw_size = sum(len(item) for item in data)
    if raw_size == 0:
        raise ValueError("nothing to benchmark: the corpus contains no data")

    wbits_range = range(9, 16)
    mem_levels = range(1, 10)
    size = collections.defaultdict(dict)
    duration = collections.defaultdict(dict)

    # The loop variables keep their names because the f-string below prints
    # them through the ``{name=}`` syntax.
    for wbits in wbits_range:
        for memLevel in mem_levels:
            encoder = zlib.compressobj(wbits=-wbits, memLevel=memLevel)
            encoded = []

            print(f"Compressing {REPEAT} times with {wbits=} and {memLevel=}")

            t0 = time.perf_counter()

            for _ in range(REPEAT):
                for item in data:
                    # Taken from PerMessageDeflate.encode
                    item = encoder.compress(item) + encoder.flush(zlib.Z_SYNC_FLUSH)
                    if item.endswith(b"\x00\x00\xff\xff"):
                        item = item[:-4]
                    encoded.append(item)

            t1 = time.perf_counter()

            size[wbits][memLevel] = sum(len(item) for item in encoded) / REPEAT
            duration[wbits][memLevel] = (t1 - t0) / REPEAT

    _print_table(
        "Compression ratio",
        wbits_range,
        mem_levels,
        lambda wb, ml: f"{100 * (1 - size[wb][ml] / raw_size):.1f}%",
    )
    _print_table(
        "CPU time",
        wbits_range,
        mem_levels,
        lambda wb, ml: f"{1000 * duration[wb][ml]:.1f}ms",
    )
    _print_table(
        f"Size vs. {WB} \\ {ML}",
        wbits_range,
        mem_levels,
        lambda wb, ml: f"{100 * (size[wb][ml] / size[WB][ML] - 1):.1f}%",
    )
    _print_table(
        f"Time vs. {WB} \\ {ML}",
        wbits_range,
        mem_levels,
        lambda wb, ml: f"{100 * (duration[wb][ml] / duration[WB][ML] - 1):.1f}%",
    )
def main(corpus):
    """Run the benchmark on every regular file directly inside *corpus*.

    *corpus* is a ``pathlib.Path`` pointing to a directory. Each file is read
    fully into memory as bytes and the resulting list is passed to
    ``benchmark``.
    """
    # Sub-directories are skipped because read_bytes() raises on them.
    # Entries are sorted because iterdir() yields them in arbitrary order and
    # the benchmark feeds all items through one encoder, so the order of the
    # items influences the measured sizes.
    data = [
        file.read_bytes() for file in sorted(corpus.iterdir()) if file.is_file()
    ]
    benchmark(data)
if __name__ == "__main__":
    # The corpus directory is the first command-line argument; without it,
    # show the usage line and exit with status 2.
    try:
        corpus_dir = sys.argv[1]
    except IndexError:
        print(f"Usage: {sys.argv[0]} [directory]")
        sys.exit(2)

    main(pathlib.Path(corpus_dir))
|