File: classify_prs.py

package info (click to toggle)
pytorch-vision 0.14.1-2
links: PTS, VCS
area: main
in suites: bookworm
size: 15,188 kB
sloc: python: 49,008; cpp: 10,019; sh: 610; java: 550; xml: 79; objc: 56; makefile: 32
file content (138 lines) | stat: -rw-r--r-- 3,465 bytes
# In[1]:

# imports and set configuration
import pandas as pd
from retrieve_prs_data import run

# When True, format_prs skips the PRs whose "prototype" flag is set
exclude_prototype = True
# JSON file holding the PR data; loaded below with pd.read_json
data_filename = "10.0_to_11.0-rc2.json"
# Git revisions delimiting the `previous..current` range used to list contributors
previous_release = "v10.0"
current_release = "v11.0-rc2"

# In[2]:


# Load the PR dump and transpose it.
# NOTE(review): presumably the JSON is keyed by PR at the top level, so that after the
# transpose each row describes one PR -- confirm against the data file.
df = pd.read_json(data_filename).transpose()
df.tail()


# In[3]:


# Every distinct label that appears in the "labels" column
all_labels = set().union(*df["labels"])
all_labels


# In[4]:


# One boolean column per label: True for the rows whose label list contains it
for label in all_labels:
    # The label is bound as a default argument so the lambda does not rely on the loop variable
    df[label] = df["labels"].apply(lambda pr_labels, wanted=label: wanted in pr_labels)
df.head()


# In[5]:


# Add a clean "module" column. It contains tuples since PRs can have more than one module.
# Maybe we should include "topics" in that column as well?

all_modules = {  # mapping: full label name -> clean name (first word dropped, spaces removed)
    label: "".join(label.split(" ")[1:]) for label in all_labels if label.startswith("module")
}

# Build each PR's tuple of clean module names straight from its label list. This avoids
# filling the column by mutating lists through iterrows() rows, which only works as long as
# the row copies happen to share the list objects with the frame.
# Names keep the iteration order of all_modules.
df["module"] = df["labels"].apply(
    lambda pr_labels: tuple(
        clean_name for full_name, clean_name in all_modules.items() if full_name in pr_labels
    )
)
df.head()


# In[6]:


# Re-index by the module tuples, then sort on that index so that rows sharing the same
# modules end up next to each other
mod_df = df.set_index("module")
mod_df = mod_df.sort_index()
mod_df.tail()


# In[7]:


# All improvement PRs (rows whose "enhancement" flag is set)
mod_df.loc[mod_df["enhancement"]].head()


# In[8]:


# Improvements for one module.
# Note: don't filter the module name on the index, as the index contains tuples with
# non-exclusive values. Use the boolean label column instead.
mod_df.loc[mod_df["enhancement"] & mod_df["module: transforms"]]


# In[9]:


def format_prs(mod_df, *, skip_prototype=None):
    """Render one release-notes line per PR, formatted as ``[module, ...] title``.

    Args:
        mod_df: DataFrame whose index entries are iterables of clean module names
            (tuples in this script) and which has a ``title`` column. A boolean
            ``prototype`` column is honoured when present.
        skip_prototype: Whether to leave out rows flagged as prototype. ``None``
            (the default) defers to the module-level ``exclude_prototype`` setting.

    Returns:
        The formatted lines joined with newlines; an empty string when no row qualifies.
    """
    if skip_prototype is None:
        skip_prototype = exclude_prototype

    lines = []
    for modules, row in mod_df.iterrows():
        # The "prototype" column only exists when at least one PR carries that label,
        # so look it up with a default instead of indexing (which would raise KeyError).
        if skip_prototype and row.get("prototype", False):
            continue
        # Move "documentation" and "tests" to the end so the bracketed prefix leads with
        # the other modules, which makes the output sort more sensibly.
        for last_module in ("documentation", "tests"):
            if last_module in modules:
                modules = [m for m in modules if m != last_module] + [last_module]

        module = f"[{', '.join(modules)}]"
        # Restore the space that was stripped when the clean module names were built.
        module = module.replace("referencescripts", "reference scripts")
        # NOTE(review): this rewrites every occurrence of "code" in the prefix -- confirm
        # that no other module name contains that substring.
        module = module.replace("code", "reference scripts")
        lines.append(f"{module} {row['title']}")

    return "\n".join(lines)


# In[10]:


included_prs = pd.DataFrame()

# If labels are accurate, this should generate most of the release notes already.
# We keep track of the included PRs to figure out which ones are missing.
for section_title, module_idx in (
    ("Backward-incompatible changes", "bc-breaking"),
    ("Deprecations", "deprecation"),
    ("New Features", "new feature"),
    ("Improvements", "enhancement"),
    ("Bug Fixes", "bug"),
    ("Code Quality", "code quality"),
):
    print(f"## {section_title}")
    print()
    # Label columns are only created for labels that occur in the data, so a label that
    # no PR carries has no column: emit an empty section instead of raising KeyError.
    if module_idx in mod_df.columns:
        tmp_df = mod_df[mod_df[module_idx]]
    else:
        tmp_df = mod_df.iloc[0:0]
    included_prs = pd.concat([included_prs, tmp_df])
    print(format_prs(tmp_df))
    print()


# In[11]:


# Missing PRs are these ones... classify them manually.
# Stacking the full table on the already-included rows and dropping *every* row whose
# pr_number is duplicated (keep=False) leaves only the PRs that occur exactly once,
# i.e. the ones that were never put into a section above.
missing_prs = (
    pd.concat([mod_df, included_prs])
    .drop_duplicates(subset="pr_number", keep=False)
)
print(format_prs(missing_prs))

# In[12]:

# Generate list of contributors
print()
print("## Contributors")

# Shell pipeline over the previous_release..current_release range:
#   * `git shortlog -s ... | cut -f2-` keeps the author names (drops the leading count field)
#   * `git log -s ... | grep Co-authored | cut ...` keeps the text between the first ":" and
#     the "<" on Co-authored lines, and `sed` trims the surrounding spaces
#   * the combined names are sorted case-insensitively, de-duplicated, joined with ", "
#     (trailing separator removed) and wrapped with `fold -s`
command_to_run = f"{{ git shortlog -s {previous_release}..{current_release} | cut -f2- & git log -s {previous_release}..{current_release} | grep Co-authored | cut -f2- -d: | cut -f1 -d\\< | sed 's/^ *//;s/ *$//' ; }} | sort --ignore-case | uniq | tr '\\n' ';' | sed 's/;/, /g;s/, $//' | fold -s"
# NOTE(review): `run` is imported from retrieve_prs_data; presumably it executes the command
# in a shell and returns (return code, stdout, stderr) -- confirm against that module.
rc, output, err = run(command_to_run)
print(output)