1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
|
import subprocess
import sys
import tempfile
import requests
# Source of the script that is run in a fresh interpreter for each library.
# It is filled in with ``str.format``: ``{path!r}`` becomes the repr of the
# JSON file's path and ``{setup}`` becomes a snippet that must bind ``decode``.
#
# The script prints a list literal ``[mem_mib, time_ms]``:
#   * mem_mib - growth of the process's peak RSS between loading the file and
#     finishing the decode loop (so it includes importing the library);
#   * time_ms - mean wall-clock time of five ``decode(data)`` calls.
TEMPLATE = """
import resource
import sys
import time

with open({path!r}, "rb") as f:
    data = f.read()

initial_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

{setup}

start = time.perf_counter()
for _ in range(5):
    decode(data)
stop = time.perf_counter()

max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
# ru_maxrss is reported in bytes on macOS and in kilobytes on Linux.
rss_units_per_mib = 1024 * 1024 if sys.platform == "darwin" else 1024
mem_mib = (max_rss - initial_rss) / rss_units_per_mib
time_ms = ((stop - start) / 5) * 1000
print([mem_mib, time_ms])
"""
# Setup snippets substituted for ``{setup}`` in TEMPLATE.  Each one imports a
# JSON library and binds that library's decoding callable to ``decode``.

# Standard-library ``json``.
JSON = "\nimport json\ndecode = json.loads\n"

# ``ujson``.
UJSON = "\nimport ujson\ndecode = ujson.loads\n"

# ``orjson``.
ORJSON = "\nimport orjson\ndecode = orjson.loads\n"

# ``rapidjson`` (distributed as ``python-rapidjson``).
RAPIDJSON = "\nimport rapidjson\ndecode = rapidjson.loads\n"

# ``simdjson`` (distributed as ``pysimdjson``).
SIMDJSON = "\nimport simdjson\ndecode = simdjson.loads\n"

# ``msgspec`` decoding without a declared schema.
MSGSPEC = "\nimport msgspec\ndecode = msgspec.json.decode\n"
# Setup snippet for the typed msgspec variant: it declares ``msgspec.Struct``
# schemas (``Package`` and ``RepoData``) and binds ``decode`` to the ``decode``
# method of a ``msgspec.json.Decoder`` built for ``RepoData``.
MSGSPEC_STRUCTS = """
import msgspec
from typing import Union

class Package(msgspec.Struct, gc=False):
    build: str
    build_number: int
    depends: tuple[str, ...]
    md5: str
    name: str
    sha256: str
    subdir: str
    version: str
    license: str = ""
    noarch: Union[str, bool, None] = None
    size: int = 0
    timestamp: int = 0

class RepoData(msgspec.Struct, gc=False):
    repodata_version: int
    info: dict
    packages: dict[str, Package]
    removed: tuple[str, ...]

decode = msgspec.json.Decoder(RepoData).decode
"""
def main():
    """Benchmark JSON decoding across libraries and print a comparison table.

    Downloads conda-forge's ``noarch/repodata.json`` into a temporary file,
    runs ``TEMPLATE`` once per library in a separate interpreter process, and
    prints a table of memory and time figures.  Each figure is also shown
    relative to the "msgspec structs" result, and rows are ordered by memory.

    With ``--versions`` it only prints the installed version of each
    third-party library and exits with status 0.

    Raises:
        requests.RequestException: if the download fails, times out, or
            returns an error status.
        subprocess.CalledProcessError: if a benchmark script exits non-zero.
    """
    import argparse
    import ast

    parser = argparse.ArgumentParser(
        description="Benchmark decoding a large JSON message using various JSON libraries"
    )
    parser.add_argument(
        "--versions",
        action="store_true",
        help="Output library version info, and exit immediately",
    )
    args = parser.parse_args()

    # (label used in the table, distribution name for --versions or None,
    #  setup snippet that binds ``decode``)
    benchmarks = [
        ("json", None, JSON),
        ("ujson", "ujson", UJSON),
        ("orjson", "orjson", ORJSON),
        ("rapidjson", "python-rapidjson", RAPIDJSON),
        ("simdjson", "pysimdjson", SIMDJSON),
        ("msgspec", "msgspec", MSGSPEC),
        ("msgspec structs", None, MSGSPEC_STRUCTS),
    ]

    if args.versions:
        import importlib.metadata

        for _, dist, _ in benchmarks:
            if dist is not None:
                print(f"- {dist}: {importlib.metadata.version(dist)}")
        sys.exit(0)

    with tempfile.NamedTemporaryFile() as f:
        # Download the repodata.json; the timeout keeps a stalled connection
        # from hanging the benchmark forever.
        resp = requests.get(
            "https://conda.anaconda.org/conda-forge/noarch/repodata.json",
            timeout=(10, 120),
        )
        resp.raise_for_status()
        f.write(resp.content)
        # The child processes reopen the file by name, so push any bytes still
        # held in this handle's buffer out before they read it.
        f.flush()

        # Run the benchmark for each library
        results = {}
        for label, _, setup in benchmarks:
            script = TEMPLATE.format(path=f.name, setup=setup)
            # We execute each script in a subprocess to isolate their memory usage
            output = subprocess.check_output([sys.executable, "-c", script])
            results[label] = ast.literal_eval(output.decode())

    # Compose the results table
    best_mem, best_time = results["msgspec structs"]
    columns = ("", "memory (MiB)", "vs.", "time (ms)", "vs.")
    rows = [
        (
            f"**{label}**",
            f"{mem:.1f}",
            f"{mem / best_mem:.1f}x",
            f"{elapsed:.1f}",
            f"{elapsed / best_time:.1f}x",
        )
        for label, (mem, elapsed) in results.items()
    ]
    # Lowest memory usage first.
    rows.sort(key=lambda row: float(row[1]))

    # Each column is as wide as its longest cell, header included.
    widths = [
        max(len(title), *(len(cell) for cell in cells))
        for title, cells in zip(columns, zip(*rows))
    ]

    def render(cells):
        """Return one table line with every cell left-aligned in its column."""
        padded = (f" {cell:<{width}} |" for cell, width in zip(cells, widths))
        return "|" + "".join(padded)

    def rule(char):
        """Return a horizontal separator line drawn with ``char``."""
        return "+" + "+".join(char * (width + 2) for width in widths) + "+"

    lines = [rule("-"), render(columns), rule("=")]
    for row in rows:
        lines.append(render(row))
        lines.append(rule("-"))
    print("\n".join(lines))


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|