File: get_git_authors.py

package info (click to toggle)
paperwork 2.2.5-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 166,660 kB
  • sloc: python: 44,775; makefile: 992; sh: 625; xml: 135
file content (220 lines) | stat: -rwxr-xr-x 6,215 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/env python3

import collections
import fnmatch
import json
import os
import os.path
import re
import subprocess
import sys


# Maps a path component (a directory name in the repository) to the
# human-readable category name credited in the output.
CATEGORIES = [
    # order of evaluation matters
    ("doc", "Documentation"),
    ("openpaperwork-core", "OpenPaperwork Core"),
    ("openpaperwork-gtk", "OpenPaperwork GTK"),
    ("paperwork-backend", "Paperwork Backend"),
    ("paperwork-gtk", "GTK Frontend"),
    ("paperwork-shell", "CLI Frontend"),
    ("flatpak", "Flatpak Integration"),
    (None, "Others"),  # default
]


# Author-name fixups applied to `git blame` output.
# Mapping a name to None means "drop lines from this author entirely".
REPLACEMENT_RULES = {
    # Because I'm a dimw** who doesn't always configure his Git correctly.
    "jflesch": "Jerome Flesch",

    # Those are translations commits from Weblate. Weblate credits are
    # downloaded from Weblate manually.
    "Weblate Admin": None,

    # Shouldn't happen
    "Not Committed Yet": None,
}


# Patterns ignored in addition to those loaded from .gitignore.
EXTRA_IGNORES = [
    "sub",
    ".git",
    "AUTHORS*",
]


# Extracts the author name from a `git blame` line: captures everything
# between the opening parenthesis and the "YYYY-MM-DD HH:MM:SS" timestamp.
REGEX_EMAIL_AUTHOR = re.compile(
    r"[^(]*\((.+)\s\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} .*"
)


def split_path(path):
    """Normalize *path* and return its individual components as a list."""
    return os.path.normpath(path).split(os.sep)


class IgnoreList(object):
    """List of fnmatch() patterns for files that must not be blamed.

    Combines the hardcoded EXTRA_IGNORES with patterns read from the
    project's .gitignore.
    """

    def __init__(self, ignore_list):
        # The hardcoded extra ignores always apply, on top of the
        # caller-provided patterns.
        self.ignore_list = EXTRA_IGNORES + ignore_list

    @staticmethod
    def find_gitignore():
        """Walk up from this script's location until a .gitignore is found.

        Raises an Exception if the filesystem root is reached first.
        """
        location = os.path.abspath(__file__)
        while True:
            candidate = os.path.join(location, ".gitignore")
            if os.path.exists(candidate):
                return candidate
            location = os.path.dirname(location)
            if location == "/":
                raise Exception(".gitignore not found")

    @staticmethod
    def load():
        """Parse the nearest .gitignore and build an IgnoreList from it."""
        gitignore_path = IgnoreList.find_gitignore()
        sys.stderr.write("Loading {} ... ".format(gitignore_path))
        sys.stderr.flush()
        with open(gitignore_path, 'r') as fd:
            patterns = [line.strip() for line in fd]
        # Drop empty lines; strip '/' so directory patterns match
        # individual path components too.
        patterns = [p.replace("/", "") for p in patterns if p != ""]
        sys.stderr.write("{} ignores loaded\n".format(len(patterns)))
        return IgnoreList(patterns)

    def match(self, file_path):
        """Return True if the path, or any component of it, matches a pattern."""
        if any(fnmatch.fnmatch(file_path, pattern)
                for pattern in self.ignore_list):
            return True
        components = split_path(file_path)
        for pattern in self.ignore_list:
            if any(fnmatch.fnmatch(component, pattern)
                    for component in components):
                return True
        return False


def walk(directory, ignore_list):
    """Yield every file path under *directory* not matched by *ignore_list*."""
    for (parent_dir, _dir_names, names) in os.walk(directory):
        for name in names:
            path = os.path.join(parent_dir, name)
            if not ignore_list.match(path):
                yield path


def get_category_name(file_path):
    """Map *file_path* to one of the CATEGORIES display names.

    The last CATEGORIES entry (pattern None) is a catch-all, so this
    normally always returns a name.
    """
    components = split_path(file_path)
    for (category_pattern, category_name) in CATEGORIES:
        if category_pattern is None:
            # catch-all / default category
            return category_name
        if category_pattern in components:
            return category_name
    # Unreachable as long as CATEGORIES keeps its catch-all entry.
    # A bare `assert False` would be stripped under `python -O` and make
    # this function silently return None, so raise explicitly instead.
    raise AssertionError("no category matched {!r}".format(file_path))


def count_lines(line_counts, file_path):
    """Run `git blame` on *file_path* and credit each line to its author.

    *line_counts* maps author name -> number of lines and is updated in
    place. Files for which `git blame` fails, or whose blame output is not
    valid UTF-8 (assumed binary), are skipped with a warning on stderr.
    """
    result = subprocess.run(
        ['git', 'blame', file_path],
        capture_output=True
    )
    if result.returncode != os.EX_OK:
        sys.stderr.write(
            "WARNING: git blame {} failed ! (returncode={})\n".format(
                file_path, result.returncode
            )
        )
        return

    try:
        blame_output = result.stdout.decode("utf-8")
    except UnicodeDecodeError:
        sys.stderr.write(
            "WARNING: Unicode on {}. Assuming it's a binary file\n".format(
                file_path
            )
        )
        return

    for line in (raw.strip() for raw in blame_output.split("\n")):
        if not line:
            continue

        found = REGEX_EMAIL_AUTHOR.match(line)
        if found is None:
            sys.stderr.write(
                "WARNING: Failed to find author email in the following line:\n"
            )
            sys.stderr.write(line + "\n")
            continue

        author = found[1].strip()
        # REPLACEMENT_RULES may rename the author, or map it to None
        # meaning "don't credit this author at all".
        author = REPLACEMENT_RULES.get(author, author)
        if author is None:
            continue

        line_counts[author] += 1


def dump_json(line_counts):
    """Print *line_counts* as JSON on stdout.

    The layout imitates the weird JSON output of Weblate, so both files
    can be merged later: a list of {category: [[email, author, count]]}
    one-key dicts.
    """
    out = []
    for (category_name, authors) in line_counts.items():
        entries = [
            ['', author, line_count]  # we don't care about emails
            for (author, line_count) in authors.items()
        ]
        # biggest contributors first
        entries.sort(key=lambda entry: entry[2], reverse=True)
        out.append({category_name: entries})

    # Sort categories alphabetically (case-insensitive)
    out.sort(key=lambda category: next(iter(category)).lower())

    print(json.dumps(
        out,
        indent=4,
        separators=(',', ': '),
        sort_keys=True
    ))


def main():
    """Entry point: blame every file under sys.argv[1] and dump author credits."""
    args = sys.argv
    if len(args) < 2 or args[1] in ('-h', '--help'):
        sys.stderr.write("Syntax:\n")
        sys.stderr.write("  {} <directory to examine>\n".format(args[0]))
        return

    ignore_list = IgnoreList.load()
    # category name -> (author name -> line count)
    line_counts = collections.defaultdict(
        lambda: collections.defaultdict(lambda: 0)
    )

    for file_path in walk(args[1], ignore_list):
        category = get_category_name(file_path)
        sys.stderr.write("Examining {} (category={}) ...\n".format(
            file_path, category
        ))
        count_lines(line_counts[category], file_path)

    # Progress report on stderr; the JSON itself goes to stdout below.
    for (category, authors) in line_counts.items():
        sys.stderr.write("  - {}\n".format(category))
        for (author, count) in authors.items():
            sys.stderr.write("{}: {}\n".format(author, count))
        sys.stderr.write("\n")

    dump_json(line_counts)


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()