1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
|
import argparse
import os
import textwrap
from pathlib import Path
import common
# Imports for working with the commit classifier
from classifier import (
CategoryConfig,
CommitClassifier,
CommitClassifierInputs,
get_author_map,
get_file_map,
XLMR_BASE,
)
from commitlist import CommitList
from common import get_commit_data_cache, topics
import torch
class Categorizer:
    """Interactive terminal helper for assigning a category and topic to commits.

    The commit list is loaded with ``CommitList.from_existing(path)``; only the
    commits returned by ``self.commits.filter(category=category)`` are shown to
    the user. Optionally a ``CommitClassifier`` suggests the default category.
    """

    # Commits whose title contains one of these markers are never checked for
    # reverts.
    _SUBMODULE_UPDATE_MARKERS = (
        "Update TensorPipe submodule",
        "Updating submodules",
        "Automated submodule update",
    )

    def __init__(self, path, category="Uncategorized", use_classifier: bool = False):
        """Load the commit list and, if requested, the category classifier.

        Args:
            path: Location handed to ``CommitList.from_existing``.
            category: Category used to filter the commits that get reviewed.
            use_classifier: When true, load the classifier weights from
                ``results/classifier`` and use its prediction as the default
                category in the prompt.
        """
        self.cache = get_commit_data_cache()
        self.commits = CommitList.from_existing(path)
        if use_classifier:
            print("Using a classifier to aid with categorization.")
            device = "cuda" if torch.cuda.is_available() else "cpu"
            classifier_config = CategoryConfig(common.categories)
            author_map = get_author_map(
                Path("results/classifier"), regen_data=False, assert_stored=True
            )
            file_map = get_file_map(
                Path("results/classifier"), regen_data=False, assert_stored=True
            )
            self.classifier = CommitClassifier(
                XLMR_BASE, author_map, file_map, classifier_config
            ).to(device)
            # map_location lets a checkpoint that was saved on a GPU be loaded
            # on a CPU-only machine (and vice versa).
            self.classifier.load_state_dict(
                torch.load(
                    Path("results/classifier/commit_classifier.pt"),
                    map_location=device,
                )
            )
            self.classifier.eval()
        else:
            self.classifier = None
        # Special categories: 'Uncategorized'
        # All other categories must be real
        self.category = category

    def categorize(self) -> None:
        """Prompt for every commit that matches ``self.category``, in order."""
        commits = self.commits.filter(category=self.category)
        total_commits = len(self.commits.commits)
        # Commits that did not match the filter are counted as already handled
        # so the progress counter is relative to the whole list.
        already_done = total_commits - len(commits)
        i = 0
        while i < len(commits):
            cur_commit = commits[i]
            # Remember the successor before handling the current commit so the
            # position can be re-derived from it afterwards.
            next_commit = commits[i + 1] if i + 1 < len(commits) else None
            jump_to = self.handle_commit(
                cur_commit, already_done + i + 1, total_commits, commits
            )
            # Advance: an explicit jump target wins, otherwise move on to the
            # remembered successor (or finish when there is none).
            if jump_to is not None:
                i = jump_to
            elif next_commit is None:
                i = len(commits)
            else:
                i = commits.index(next_commit)

    def features(self, commit):
        """Return the cached data stored under ``commit.commit_hash``."""
        return self.cache.get(commit.commit_hash)

    @staticmethod
    def _strip_pr_suffix(title: str) -> str:
        """Return *title* without a trailing `` (#<digits>)`` pull-request marker.

        Titles that do not end in such a marker are returned unchanged (apart
        from trailing whitespace).
        """
        title = title.rstrip()
        head, sep, tail = title.rpartition(" (#")
        if sep and tail.endswith(")") and tail[:-1].isdigit():
            return head.rstrip()
        return title

    def potential_reverts_of(self, commit, commits):
        """Collect hints that *commit* may have been reverted or re-landed.

        Args:
            commit: The commit being reviewed; must be an element of *commits*.
            commits: The list of commits currently being walked.

        Returns:
            An empty list for submodule-update commits. Otherwise a dict that
            contains ``"GithubBot": "Reverted"`` when the cached labels include
            ``"Reverted"``, plus one ``position: candidate`` entry for every
            later commit whose title contains this commit's title (without its
            PR-number suffix).

        Raises:
            ValueError: If *commit* is not in *commits*.
        """
        if any(marker in commit.title for marker in self._SUBMODULE_UPDATE_MARKERS):
            return []

        features = self.features(commit)
        reasons = {"GithubBot": "Reverted"} if "Reverted" in features.labels else {}

        index = commits.index(commit)
        # Drop the trailing " (#35011)"-style suffix so that reverts/re-lands,
        # which carry a different PR number, still match on the title text.
        cleaned_title = self._strip_pr_suffix(commit.title)
        if not cleaned_title:
            # An empty string is a substring of every title and would flag
            # every later commit as a candidate.
            return reasons

        # NB: the index + 2 is sketch (1-based position within `commits`; it
        # does not include the offset of commits skipped by the filter).
        reasons.update(
            {
                (index + 2 + delta): cand
                for delta, cand in enumerate(commits[index + 1 :])
                if cleaned_title in cand.title
                and commit.commit_hash != cand.commit_hash
            }
        )
        return reasons

    @staticmethod
    def _prompt_choice(prompt, options, default, header=None):
        """Read a choice from stdin until it resolves to exactly one option.

        Empty input selects *default* (when there is one). An exact option name
        always wins; otherwise the input must be a prefix of exactly one option.
        *header*, when given, is printed before every prompt.
        """
        while True:
            if header is not None:
                print(header)
            value = input(prompt).strip()
            if not value:
                # The user just pressed enter and likes the default value.
                if default is not None:
                    return default
                continue
            if value in options:
                # Exact match, even if it is also a prefix of a longer option.
                return value
            matches = [option for option in options if option.startswith(value)]
            if len(matches) == 1:
                return matches[0]
            print(f"Possible matches: {matches}, try again")

    def handle_commit(self, commit, i, total, commits):
        """Show one commit, ask for its category and topic, and store them.

        Args:
            commit: The commit to categorize.
            i: Progress counter shown to the user.
            total: Total number of commits shown next to the counter.
            commits: The list being walked (used for revert detection).

        Returns:
            Always ``None``; ``categorize`` then moves on to the next commit.
        """
        potential_reverts = self.potential_reverts_of(commit, commits)
        if potential_reverts:
            potential_reverts = f"!!!POTENTIAL REVERTS!!!: {potential_reverts}"
        else:
            potential_reverts = ""

        features = self.features(commit)
        if self.classifier is not None:
            # Some commits don't have authors:
            author = features.author if features.author else "Unknown"
            files = " ".join(features.files_changed)
            classifier_input = CommitClassifierInputs(
                title=[features.title], files=[files], author=[author]
            )
            classifier_category = self.classifier.get_most_likely_category_name(
                classifier_input
            )[0]
        else:
            # Without a classifier the commit's current category is the default.
            classifier_category = commit.category

        breaking_alarm = ""
        if "module: bc-breaking" in features.labels:
            breaking_alarm += "\n!!!!!! BC BREAKING !!!!!!"
        if "module: deprecation" in features.labels:
            breaking_alarm += "\n!!!!!! DEPRECATION !!!!!!"

        os.system("clear")
        # The template lines start at column 0 on purpose: the interpolated
        # commit body is not indented, so dedent has no common margin to strip.
        view = textwrap.dedent(
            f"""\
[{i}/{total}]
================================================================================
{features.title}
{potential_reverts} {breaking_alarm}
{features.body}
Files changed: {features.files_changed}
Labels: {features.labels}
Current category: {commit.category}
Select from: {', '.join(common.categories)}
"""
        )
        print(view)

        cat_choice = self._prompt_choice(
            f"{classifier_category} ",
            common.categories,
            classifier_category,
            header="Enter category: ",
        )
        print(f"\nSelected: {cat_choice}")

        print(f"\nCurrent topic: {commit.topic}")
        print(f"""Select from: {', '.join(topics)}""")
        topic_choice = self._prompt_choice("topic> ", topics, commit.topic)
        print(f"\nSelected: {topic_choice}")

        self.update_commit(commit, cat_choice, topic_choice)
        return None

    def update_commit(self, commit, category, topic) -> None:
        """Set *category* and *topic* on *commit* and call ``write_result``.

        Raises:
            AssertionError: If *category* is not in ``common.categories`` or
                *topic* is not in ``topics``.
        """
        assert category in common.categories, f"Unknown category: {category!r}"
        assert topic in topics, f"Unknown topic: {topic!r}"
        commit.category = category
        commit.topic = topic
        # Called after every commit; presumably persists the list so progress
        # survives an interrupted session — confirm in CommitList.
        self.commits.write_result()
def main():
    """Parse the command-line options and start an interactive categorization run.

    Options:
        --category: category to filter by (defaults to "Uncategorized").
        --file: location of the commits CSV (defaults to results/commitlist.csv).
        --use_classifier: flag; when present a classifier suggests categories.
    """
    cli = argparse.ArgumentParser(description="Tool to help categorize commits")
    cli.add_argument(
        "--category",
        default="Uncategorized",
        type=str,
        help='Which category to filter by. "Uncategorized", None, or a category name',
    )
    cli.add_argument(
        "--file",
        default="results/commitlist.csv",
        help="The location of the commits CSV",
    )
    cli.add_argument(
        "--use_classifier",
        help="Whether or not to use a classifier to aid in categorization.",
        action="store_true",
    )
    opts = cli.parse_args()

    # Build the categorizer from the parsed options and walk the commits.
    Categorizer(opts.file, opts.category, opts.use_classifier).categorize()


# Only start the interactive session when executed as a script, not on import.
if __name__ == "__main__":
    main()
|