File: retrieve_prs_data.py

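"""Retrieve PR data for the commits between two pytorch/vision releases.

For each commit between a base ref and a new ref, this script pulls the
commit title, body, and changed files from git, parses the PR number out of
the title, fetches the PR's labels through the GitHub GraphQL API
(authenticated with the token from ~/.ghstackrc), and caches everything as
JSON for classify_prs.py to turn into release notes.
"""
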
import json
import locale
import os
import re
import subprocess
from collections import namedtuple
from os.path import expanduser

import requests


Features = namedtuple(
    "Features",
    [
        "title",
        "body",
        "pr_number",
        "files_changed",
        "labels",
    ],
)


def dict_to_features(dct):
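    """Rebuild a Features tuple from its dict form (inverse of features_to_dict)."""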
    return Features(
        title=dct["title"],
        body=dct["body"],
        pr_number=dct["pr_number"],
        files_changed=dct["files_changed"],
        labels=dct["labels"],
    )


def features_to_dict(features):
    return features._asdict()


def run(command):
    """Returns (return-code, stdout, stderr)"""
    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    output, err = p.communicate()
    rc = p.returncode
    enc = locale.getpreferredencoding()
    output = output.decode(enc)
    err = err.decode(enc)
    return rc, output.strip(), err.strip()


def commit_body(commit_hash):
    cmd = f"git log -n 1 --pretty=format:%b {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_title(commit_hash):
    cmd = f"git log -n 1 --pretty=format:%s {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_files_changed(commit_hash):
    cmd = f"git diff-tree --no-commit-id --name-only -r {commit_hash}"
    ret, out, err = run(cmd)
    return out.split("\n") if ret == 0 else None


def parse_pr_number(body, commit_hash, title):
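    """Extract the PR number from a "#1234"-style reference in the commit title.

    Returns the number as a string, or None if no reference is found; the
    body argument is currently unused.
    """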
    regex = r"(#[0-9]+)"
    matches = re.findall(regex, title)
    if len(matches) == 0:
        if "revert" not in title.lower() and "updating submodules" not in title.lower():
            print(f"[{commit_hash}: {title}] Could not parse PR number, ignoring PR")
        return None
    if len(matches) > 1:
        print(f"[{commit_hash}: {title}] Got two PR numbers, using the last one")
        return matches[-1][1:]
    return matches[0][1:]


def get_ghstack_token():
    """Read the GitHub OAuth token from the ghstack config file."""
    pattern = "github_oauth = (.*)"
    with open(expanduser("~/.ghstackrc")) as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a GitHub OAuth token in ~/.ghstackrc")
    return matches[0]


# Read the token once at module import so all GraphQL requests below share the
# same authenticated headers.
token = get_ghstack_token()
headers = {"Authorization": f"token {token}"}


def run_query(query):
    """POST a GraphQL query to the GitHub API and return the decoded JSON."""
    response = requests.post("https://api.github.com/graphql", json={"query": query}, headers=headers)
    if response.status_code == 200:
        return response.json()
    raise Exception(f"Query failed with return code {response.status_code}. {query}")


def gh_labels(pr_number):
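    """Fetch up to the first 10 label names of a pytorch/vision PR via GraphQL."""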
    query = f"""
    {{
      repository(owner: "pytorch", name: "vision") {{
        pullRequest(number: {pr_number}) {{
          labels(first: 10) {{
            edges {{
              node {{
                name
              }}
            }}
          }}
        }}
      }}
    }}
    """
    result = run_query(query)
    edges = result["data"]["repository"]["pullRequest"]["labels"]["edges"]
    return [edge["node"]["name"] for edge in edges]


def get_features(commit_hash, return_dict=False):
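    """Assemble a Features record (title, body, PR number, files, labels) for one commit."""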
    title, body, files_changed = (
        commit_title(commit_hash),
        commit_body(commit_hash),
        commit_files_changed(commit_hash),
    )
    pr_number = parse_pr_number(body, commit_hash, title)
    labels = []
    if pr_number is not None:
        labels = gh_labels(pr_number)
    result = Features(title, body, pr_number, files_changed, labels)
    if return_dict:
        return features_to_dict(result)
    return result


class CommitDataCache:
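    """Disk-backed cache mapping commit hashes to Features records, stored as JSON."""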
    def __init__(self, path="results/data.json"):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            self.data = self.read_from_disk()

    def get(self, commit):
        if commit not in self.data:
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        with open(self.path) as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct) for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        data = {commit: features._asdict() for commit, features in self.data.items()}
        with open(self.path, "w") as f:
            json.dump(data, f)


def get_commits_between(base_version, new_version):
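    """Return parallel (hashes, titles) tuples for the commits after the merge
    base of base_version up to new_version, oldest first."""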
    cmd = f"git merge-base {base_version} {new_version}"
    rc, merge_base, _ = run(cmd)
    assert rc == 0

    # Each --oneline log line looks like:
    # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
    cmd = f"git log --reverse --oneline {merge_base}..{new_version}"
    rc, commits, _ = run(cmd)
    assert rc == 0

    log_lines = commits.split("\n")
    hashes, titles = zip(*[log_line.split(" ", 1) for log_line in log_lines])
    return hashes, titles


def convert_to_dataframes(feature_list):
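    """Build a pandas DataFrame with one row per Features record."""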
    import pandas as pd

    df = pd.DataFrame.from_records(feature_list, columns=Features._fields)
    return df


def main(base_version, new_version):
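    """Fetch and cache PR data for every commit between base_version and new_version."""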
    hashes, titles = get_commits_between(base_version, new_version)

    cdc = CommitDataCache("data.json")
    for idx, commit in enumerate(hashes):
        if idx % 10 == 0:
            print(f"{idx} / {len(hashes)}")
        cdc.get(commit)

    return cdc


if __name__ == "__main__":
    # d = get_features('2ab93592529243862ce8ad5b6acf2628ef8d0dc8')
    # print(d)
    # hashes, titles = get_commits_between("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")

    # Usage: change the refs below to match the current release, then save the
    # JSON with cdc.write_to_disk(). You can then use classify_prs.py (as a
    # notebook) to open the JSON and generate the release notes
    # semi-automatically.
    cdc = main("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
    from IPython import embed

    embed()