1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
|
#!/usr/bin/env python
import collections
import getpass
import json
import pathlib
import subprocess
import sys
import time
def github_commits():
    """Download every commit reachable from ``origin/main`` via the GitHub API.

    Prompts for an OAuth token, resolves ``origin/main`` with
    ``git rev-parse``, then follows the ``parents`` links breadth-first,
    fetching each commit object with ``curl``.

    Returns:
        A list of the raw response bodies (``bytes``), one per commit, in the
        order they were fetched (the head commit first).

    Raises:
        subprocess.CalledProcessError: if ``git`` or ``curl`` exits non-zero.
        json.JSONDecodeError: if a response body is not valid JSON.
        KeyError: if a response has no ``"parents"`` entry (for example an
            API error payload).
    """
    oauth_token = getpass.getpass("OAuth Token? ")
    # Commands are passed as argument lists, without a shell, so the token is
    # handed to curl verbatim: quotes, ``$`` or backticks in it can neither
    # break the command line nor be interpreted by a shell.
    curl_command = ["curl", "-H", f"Authorization: token {oauth_token}"]
    commit_url = (
        "https://api.github.com/repos/python-websockets/websockets/git/commits/"
    )

    head = subprocess.check_output(
        ["git", "rev-parse", "origin/main"],
        text=True,
    ).strip()

    commits = []
    todo = collections.deque([head])
    # Every SHA that has ever been queued; membership here replaces the
    # separate "already fetched" and "still waiting" checks.
    discovered = {head}

    while todo:
        sha = todo.popleft()
        commit = subprocess.check_output([*curl_command, commit_url + sha])
        commits.append(commit)
        for parent in json.loads(commit)["parents"]:
            parent_sha = parent["sha"]
            if parent_sha not in discovered:
                discovered.add(parent_sha)
                todo.append(parent_sha)
        time.sleep(1)  # rate throttling

    return commits
def main(corpus):
    """Fetch the commit history and store it as numbered JSON files.

    Args:
        corpus: ``pathlib.Path`` of the output directory. It is created,
            including missing parents, when it does not exist yet.

    The files are named ``0000.json``, ``0001.json``, ... in the reverse of
    the order in which the commits were fetched, so the commit fetched last
    gets number 0.

    Raises:
        FileExistsError: if ``corpus`` exists but is not a directory.
    """
    # Prepare the directory before the slow, rate-limited download so that an
    # unusable path fails right away instead of after every commit has been
    # fetched.
    corpus.mkdir(parents=True, exist_ok=True)
    data = github_commits()
    for num, content in enumerate(reversed(data)):
        (corpus / f"{num:04d}.json").write_bytes(content)
if __name__ == "__main__":
    # The only argument is the directory that receives the corpus files.
    try:
        target_dir = sys.argv[1]
    except IndexError:
        # No directory given: show the usage line and exit with status 2.
        print(f"Usage: {sys.argv[0]} <directory>")
        sys.exit(2)
    main(pathlib.Path(target_dir))
|