"""
Dump the GitHub issues of the current project to a file (.json.gz).
Usage: python3 Tools/dump_github_issues.py
"""
import configparser
import gzip
import json
import os.path
from datetime import datetime
from urllib.request import urlopen
# Path of the local git repository's config file, used to discover the
# "origin" remote URL of the current project.
GIT_CONFIG_FILE = ".git/config"
class RateLimitReached(Exception):
    """Raised when a GitHub API response signals that the request rate limit was reached."""
def gen_urls(repo):
    """Generate the paged GitHub API URLs listing all issues of *repo*.

    GitHub page numbering starts at 1 (``page=0`` is treated as ``page=1``),
    so the counter must start at 1 — starting at 0 would fetch the first
    page twice and duplicate its issues in the result.
    """
    page = 1
    while True:
        yield f"https://api.github.com/repos/{repo}/issues?state=all&per_page=100&page={page}"
        page += 1
def read_rate_limit():
    """Query the GitHub API for the current rate-limit status and return the parsed JSON."""
    with urlopen("https://api.github.com/rate_limit") as response:
        return json.load(response)
def parse_rate_limit(limits):
    """Extract (limit, remaining, reset_datetime) from a rate-limit API response dict."""
    core = limits['resources']['core']
    reset_at = datetime.fromtimestamp(core['reset'])
    return core['limit'], core['remaining'], reset_at
def load_url(url):
    """Fetch and parse one page of issues from *url*.

    Returns the parsed list of issues, or None for an empty (i.e. last) page.
    Raises RateLimitReached if the API response signals rate limiting.
    """
    with urlopen(url) as response:
        payload = json.load(response)
    if isinstance(payload, dict) and 'rate limit' in payload.get('message', ''):
        raise RateLimitReached()
    assert isinstance(payload, list), type(payload)
    # An empty page marks the end of pagination; map it to None for the caller.
    return payload if payload else None
def join_list_data(lists):
    """Concatenate page lists into one list, stopping at the first empty/None page."""
    joined = []
    for page in lists:
        if not page:
            break
        joined += page
    return joined
def output_filename(repo):
    """Build a timestamped ``.json.gz`` file name for *repo*'s issue dump."""
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    safe_repo = repo.replace('/', '_')
    return f"github_issues_{safe_repo}_{stamp}.json.gz"
def write_gzjson(file_name, data, indent=2):
    """Serialize *data* as indented JSON into a gzip-compressed text file."""
    serialized = json.dumps(data, indent=indent)
    with gzip.open(file_name, "wt", encoding='utf-8') as out:
        out.write(serialized)
def find_origin_url(git_config=GIT_CONFIG_FILE):
    """Read the URL of the 'origin' remote from a git config file.

    Raises FileNotFoundError if the config file does not exist.  (A plain
    ``assert`` would silently disappear under ``python -O``.)  Missing
    section/option errors from configparser propagate to the caller.
    """
    if not os.path.exists(git_config):
        raise FileNotFoundError(f"Git config file not found: {git_config}")
    parser = configparser.ConfigParser()
    parser.read(git_config)
    return parser.get('remote "origin"', 'url')
def parse_repo_name(git_url):
    """Extract the "owner/repository" name from a git remote URL."""
    # Drop a trailing '.git' suffix, then keep the last two path components.
    url = git_url[:-4] if git_url.endswith('.git') else git_url
    path_parts = url.split('/')
    return '/'.join(path_parts[-2:])
def dump_issues(repo):
    """Main entry point: fetch all issues of *repo* and write them to a gzipped JSON file."""
    print(f"Reading issues from repo '{repo}'")
    urls = gen_urls(repo)
    try:
        # Lazily fetch pages; join_list_data() stops at the first empty page.
        paged_data = map(load_url, urls)
        issues = join_list_data(paged_data)
    except RateLimitReached:
        limit, remaining, reset_time = parse_rate_limit(read_rate_limit())
        print(f"FAILURE: Rate limits ({limit}) reached, remaining: {remaining}, reset at {reset_time}")
        return
    filename = output_filename(repo)
    # Report the target file name (the original message lost it to corruption).
    print(f"Writing {len(issues)} issues to file '{filename}'")
    write_gzjson(filename, issues)
### TESTS
def test_join_list_data():
    # Table-driven check: (pages, expected concatenation).
    cases = [
        ([], []),
        ([[1, 2]], [1, 2]),
        ([[1, 2], [3]], [1, 2, 3]),
        ([[0], [1, 2], [3]], [0, 1, 2, 3]),
        ([[0], [1, 2], [[[]], []]], [0, 1, 2, [[]], []]),
    ]
    for pages, expected in cases:
        assert join_list_data(pages) == expected
def test_output_filename():
    import re
    # The '/' in the repo name must be flattened and a timestamp appended.
    name = output_filename("re/po")
    assert re.match(r"github_issues_re_po_[0-9]{8}_[0-9]{6}\.json", name)
def test_find_origin_url():
    # Smoke test: the current checkout must have a non-empty origin URL.
    url = find_origin_url()
    assert url
def test_parse_repo_name():
    # Both https and ssh URL forms resolve to the same repo name.
    for url in (
        "https://github.com/cython/cython",
        "git+ssh://git@github.com/cython/cython.git",
    ):
        assert parse_repo_name(url) == "cython/cython"
    assert parse_repo_name("git+ssh://git@github.com/fork/cython.git") == "fork/cython"
def test_write_gzjson():
    import tempfile
    with tempfile.NamedTemporaryFile() as tmp:
        write_gzjson(tmp.name, [{}])
        # test JSON format
        with gzip.open(tmp.name) as f:
            assert json.load(f) == [{}]
        # test indentation: json.dump(..., indent=2) emits TWO spaces,
        # so the expected bytes are b'[\n  {}\n]' (the original single-space
        # expectation could never match).
        with gzip.open(tmp.name) as f:
            assert f.read() == b'[\n  {}\n]'
### MAIN
if __name__ == '__main__':
    # Discover the repo from the local git checkout, then dump its issues.
    origin_url = find_origin_url()
    dump_issues(parse_repo_name(origin_url))