#!/usr/bin/env python3
import matplotlib.pyplot as plt
import xdg
import re
import pycurl
import certifi
import os
import sys
from bs4 import BeautifulSoup
from io import BytesIO
# where we get the bench list from
# sadly message edits don't appear in the public archive so we have to
# download the gitlab raw logs too
# (maybe we should be getting the messages from the zulip API instead??)
archive = 'https://coq.gitlab.io/zulip-archive/stream/240008-GitHub-notifications/topic/Bench.20Notifications.html'
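# An untested sketch of the zulip API alternative mentioned above, using the
# `zulip` package: assumes a ~/.zuliprc with credentials for the server, and
# stream/topic names taken from the archive URL. Not used by this script.
def fetch_bench_posts_via_api():
    import zulip  # imported here so the script still runs without the package
    client = zulip.Client(config_file='~/.zuliprc')
    return client.get_messages({
        'anchor': 'newest', 'num_before': 1000, 'num_after': 0,
        'narrow': [{'operator': 'stream', 'operand': 'GitHub notifications'},
                   {'operator': 'topic', 'operand': 'Bench Notifications'}],
    })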
print('Getting bench list.')
buffer = BytesIO()
c = pycurl.Curl()
c.setopt(c.URL, archive)
c.setopt(c.WRITEFUNCTION, buffer.write)
c.setopt(c.CAINFO, certifi.where())
c.perform()
c.close()
html = buffer.getvalue().decode('utf8')
print('Parsing bench list.')
# parse the HTML and strip the tags
# NB: since coq was made the default language on zulip, the tables are full
# of <span> tags, so we really don't want to keep them
text = BeautifulSoup(html, "html.parser").text
# NB: the line with the date starts with a space
# (in the original there is the avatar broken image)
datere = re.compile(r' Bench bot \((.*)\):')
jobre = re.compile(r'Bench at https://gitlab.com/coq/coq/-/jobs/(.*)$')
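# i.e. in the stripped archive text each post renders roughly as
#   " Bench bot (<date>):" on one line, followed by the message body, whose
#   relevant line is "Bench at https://gitlab.com/coq/coq/-/jobs/<job id>"
# (shape inferred from the two regexes above, not from a saved page)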
benches = []
curdate = None
for line in text.splitlines():
    match = re.match(datere, line)
    if match:
        # Next post
        curdate = match[1]
    else:
        match = re.match(jobre, line)
        if match:
            benches += [(curdate, match[1])]
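# cache the raw logs under $XDG_CACHE_HOME/coq-bench-plotter
# (typically ~/.cache/coq-bench-plotter) so reruns only fetch new jobs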
cachedir = xdg.xdg_cache_home() / "coq-bench-plotter"
os.makedirs(cachedir, exist_ok=True)
## download job logs
print('Downloading job logs.')
m = pycurl.CurlMulti()
files = []
for _, job in benches:
    fname = cachedir / (job + '.log')
    if not os.path.exists(fname):
        f = open(fname, 'xb')  # x: will error if already exists
        files += [f]
        c = pycurl.Curl()
        c.setopt(c.URL, 'https://gitlab.com/coq/coq/-/jobs/' + job + '/raw')
        c.setopt(c.WRITEFUNCTION, f.write)
        c.setopt(c.CAINFO, certifi.where())
        # gitlab serves the /raw log via a redirect, so follow it
        c.setopt(c.FOLLOWLOCATION, True)
        m.add_handle(c)
# kick the transfers off first: select() has no descriptors to wait on until
# perform() has been called at least once (standard pycurl CurlMulti pattern)
while 1:
    ret, num_handles = m.perform()
    if ret != pycurl.E_CALL_MULTI_PERFORM:
        break
while num_handles:
    print(f'Downloading {num_handles} log files.')
    ret = m.select(5.0)
    if ret == -1:
        continue
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
for f in files:
    f.flush()
    f.close()
## parse job logs
print('Parsing job logs.')
# captures package name and OLD time
# the numerals are to avoid matching the table header
packre = re.compile('│[ ]+([^ ]+) │[ ]+[^ ]+[ ]+([0-9][^ ]*) ')
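# so a matching row looks like "│ <name> │ <token> <digit-initial time> ",
# with the second capture being the OLD time per the comment above
# (shape read off the regex itself; not checked against an actual log)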
parsed = []
for date, job in benches:
    cur = {}
    fname = cachedir / (job + '.log')
    with open(fname) as f:
        for l in f:
            match = re.match(packre, l)
            if match:
                # the table appears multiple times in the log, so this may
                # overwrite an earlier value; that is OK (it's not worth
                # bothering to find the last table)
                cur[match[1]] = match[2]
    if cur:
        parsed += [(date, job, cur)]
def filter_to(packname):
    dates = []
    hotbenches = []
    for date, _, packs in parsed:
        if packname in packs:
            dates += [date]
            hotbenches += [float(packs[packname])]
    return dates, hotbenches
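# e.g. filter_to('coq-core') would give the post dates and that package's
# OLD times as floats, one pair per bench that measured it
# ('coq-core' is just an illustrative name here)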
def plot(packname):
    dates, hotbenches = filter_to(packname)
    plt.plot(hotbenches)
    plt.ylabel(packname)
    plt.show()
def list_packs():
    known = {}
    for _, _, packs in parsed:
        for pack in packs.keys():
            if pack in known:
                known[pack] += 1
            else:
                known[pack] = 1
    return known
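# guard against a missing argument: print a usage line and the known
# package names (via list_packs) instead of dying on an IndexError
if len(sys.argv) < 2:
    print('usage: ' + sys.argv[0] + ' PACKAGE')
    print('known packages: ' + ' '.join(sorted(list_packs())))
    sys.exit(1)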
plot(sys.argv[1])
# dates is only useful if you want to zoom, there are too many to read otherwise
# (to try it, replace the plt.plot call inside plot() with:)
# plt.plot(dates, hotbenches)
# plt.show()