File: corpus.py

package info (click to toggle)
python-websockets 15.0.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,948 kB
  • sloc: python: 25,105; javascript: 350; ansic: 148; makefile: 43
file content (52 lines) | stat: -rw-r--r-- 1,261 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python

import getpass
import json
import pathlib
import subprocess
import sys
import time


def github_commits():
    OAUTH_TOKEN = getpass.getpass("OAuth Token? ")
    COMMIT_API = (
        f'curl -H "Authorization: token {OAUTH_TOKEN}" '
        f"https://api.github.com/repos/python-websockets/websockets/git/commits/:sha"
    )

    commits = []

    head = subprocess.check_output(
        "git rev-parse origin/main",
        shell=True,
        text=True,
    ).strip()
    todo = [head]
    seen = set()

    while todo:
        sha = todo.pop(0)
        commit = subprocess.check_output(COMMIT_API.replace(":sha", sha), shell=True)
        commits.append(commit)
        seen.add(sha)
        for parent in json.loads(commit)["parents"]:
            sha = parent["sha"]
            if sha not in seen and sha not in todo:
                todo.append(sha)
        time.sleep(1)  # rate throttling

    return commits


def main(corpus):
    data = github_commits()
    for num, content in enumerate(reversed(data)):
        (corpus / f"{num:04d}.json").write_bytes(content)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <directory>")
        sys.exit(2)
    main(pathlib.Path(sys.argv[1]))