File: compare_license.py

package info (click to toggle)
xournalpp 1.3.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 30,036 kB
  • sloc: cpp: 64,137; xml: 939; sh: 752; ansic: 362; python: 338; php: 74; makefile: 15
file content (224 lines) | stat: -rw-r--r-- 8,988 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
"""Script for copyright/license reports

Dependencies:
 - python3
 - ripgrep

Assumptions:
 - copyright.txt file is in CWD
 - copyright.txt is in https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ format
 - all relevant files are within CWD and not ignored by git (ripgrep default behaviour)

For xournalpp execute using:

    python3 scripts/compare_license.py


Workflow:

 1. Run the script
 2. If the script exits with status 1, adapt copyright.txt or this script (see comments I, II, III in the code below)
 3. Rerun the script; it should now exit with status 0

Note: This script cannot automatically detect whether you added a file that should be licensed differently
but does not indicate this in any way. Please refer to comment II in the code below and add it to the whitelist.
"""

from typing import Set
import re
import os
import subprocess

def get_files_from_copyright_format(file: str) -> Set[str]:
    """Get all Files listed in a copyright file

    A line contributes an entry when it either starts with ``Files: `` or
    consists of seven spaces followed by a single path/glob pattern (i.e. a
    continuation line aligned with the text after ``Files: ``). In both cases
    the first seven characters are dropped and the rest is stripped of
    surrounding whitespace.

    Args:
     - file: path of a file formatted according to
       https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/

    Returns:
        The set of entries collected from the matching lines.
    """
    # Raw string: `\-` is meant as a regex escape, not a Python string escape.
    entry_pattern = re.compile(r"^Files: |^ {7}[a-zA-Z0-9/_\-.*]* *$")

    # Open the file that was actually requested by the caller; the
    # copyright-format specification mandates UTF-8 encoding.
    with open(file, 'r', encoding='utf-8') as f:
        return {line[7:].strip() for line in f if entry_pattern.match(line)}

def get_all_files():
    """Return the set of paths printed by ``rg --files`` (one per output line)."""
    listing = os.popen('rg --files')
    return {entry.strip() for entry in listing.readlines()}

def get_files_containing_copyright_or_license():
    """Find all files containing either
     - copyright
     - license
    (case insensitive)
    Excluding paths matching ``\\.po`` (translation files), as they create
    only false positives.

    Only files that do NOT contain the marker ``@author Xournal++`` are kept:
    the result is the intersection of the files matching the search terms and
    the files reported by ``rg --files-without-match`` for that marker.

    Returns:
        Set of paths as printed by ripgrep.
    """
    # Raw strings: the backslashes belong to ripgrep's regexes and must not be
    # treated as (invalid) Python escape sequences.
    matching = os.popen(r'rg -i -e "copyright" -e "license" -l | rg -v "\.po"').readlines()
    lc_files = {f.strip() for f in matching}

    unmarked = os.popen(r'rg --files-without-match "@author Xournal\+\+"').readlines()
    files_without_xpp_author = {f.strip() for f in unmarked}

    return lc_files & files_without_xpp_author

def get_changed_files_since(git_hash:str):
    """Return the set of paths printed by ``git diff <git_hash> HEAD --name-only``.

    Args:
        git_hash (str): revision that is compared against ``HEAD``
    """
    diff_output = os.popen(f'git diff {git_hash} HEAD --name-only')
    return {path.strip() for path in diff_output.readlines()}

def get_source_files_missing_license_of_header(scanned_files:Set[str], all_files:Set[str]) -> Set[str]:
    """Find `.cpp` files which do not have a license but their corresponding `.h` file has.

    Args:
        scanned_files (Set[str]): Files which have a license header
        all_files (Set[str]): all Files in the project (used for existence check)

    Returns:
        Set[str]: extension-less paths (``dir/Name`` stands for
        ``dir/Name.cpp``) whose header is in ``scanned_files``, whose source
        is not, and whose source exists in ``all_files``.
    """
    header_suffix = ".h"
    source_suffix = ".cpp"

    # Cut the suffix off by slicing. str.strip() must not be used here: its
    # argument is a set of characters removed from BOTH ends, which mangles
    # names such as "hash.h" (-> "has") and breaks the header/source pairing.
    licensed_headers = {
        f[:-len(header_suffix)] for f in scanned_files if f.endswith(header_suffix)
    }
    licensed_sources = {
        f[:-len(source_suffix)] for f in scanned_files if f.endswith(source_suffix)
    }

    candidates = licensed_headers - licensed_sources
    # Header-only components have no source file that could lack a license.
    return {stem for stem in candidates if stem + source_suffix in all_files}

# I: Add an entry if a file is detected automatically as a file with special
# license/copyright, but which is actually licensed/copyrighted under the same
# license/copyright as xournalpp.
# Please add a short comment explaining why it's whitelisted
def get_whitelist_not_listed():
    """Return the paths that contain the searched-for substrings but do not
    need an entry in copyright.txt."""
    return {
        "ABOUT-NLS",  # false positive
        "copyright.txt",  # copyright/license summary file
        "scripts/compare_license.py",  # this very script
        "CMakeLists.txt",  # false positive
        "LICENSE",  # main license file
        "rpm/fedora/xournalpp.spec",  # false positive
        "windows-setup/xournalpp.nsi",  # false positive
        "ui/about.glade",  # false positive
        "src/exe/win32/xpp.rc.in",  # false positive
        "mac-setup/Info.plist",  # false positive
        "src/core/gui/dialog/AboutDialog.cpp",  # false positive
    }

# II: Add an entry to the whitelist if you added a file which has special
# licensing/copyright but does not contain any of the substrings used to
# automatically identify such files
# The rationale should be explained in the copyright.txt file itself.
# Do not use comments in this file to explain the rationale.
def get_whitelist_not_found():
    """Return the entries that are listed in copyright.txt although the
    corresponding files do not include the searched-for substrings."""
    return {
        "*",
        # Debian packaging files
        "debian/changelog",
        "debian/compat",
        "debian/control",
        "debian/docs",
        "debian/package_description",
        "debian/rules",
        "debian/source/format",
        # Pixmaps
        "ui/pixmaps/application-x-xojpp.svg",
        "ui/pixmaps/application-x-xopp.svg",
        "ui/pixmaps/application-x-xopt.svg",
        "ui/pixmaps/com.github.xournalpp.xournalpp.png",
        "ui/pixmaps/com.github.xournalpp.xournalpp.svg",
        "ui/pixmaps/gnome-mime-application-x-xopp.svg",
        "ui/pixmaps/gnome-mime-application-x-xopt.svg",
        "ui/pixmaps/xopt.svg",
        # Icon theme directories
        "ui/iconsColor-dark/*",
        "ui/iconsColor-light/*",
        "ui/iconsLucide-dark/*",
        "ui/iconsLucide-light/*",
        # Individual icons
        "ui/iconsColor-dark/hicolor/scalable/actions/xopp-compass.svg",
        "ui/iconsColor-dark/hicolor/scalable/actions/xopp-setsquare.svg",
        "ui/iconsColor-light/hicolor/scalable/actions/xopp-Tselect-pdf-text-area.svg",
        "ui/iconsColor-light/hicolor/scalable/actions/xopp-Tselect-pdf-text-hd.svg",
        "ui/iconsLucide-dark/hicolor/scalable/actions/xopp-compass.svg",
        "ui/iconsLucide-dark/hicolor/scalable/actions/xopp-draw-spline.svg",
        "ui/iconsLucide-dark/hicolor/scalable/actions/xopp-floating-toolbox.svg",
        "ui/iconsLucide-dark/hicolor/scalable/actions/xopp-setsquare.svg",
        "ui/iconsLucide-light/hicolor/scalable/actions/xopp-compass.svg",
        "ui/iconsLucide-light/hicolor/scalable/actions/xopp-draw-spline.svg",
        "ui/iconsLucide-light/hicolor/scalable/actions/xopp-floating-toolbox.svg",
        "ui/iconsLucide-light/hicolor/scalable/actions/xopp-setsquare.svg",
    }

# III: Update git commit hash to current commit once you checked
# that the changes do not affect the licensing information in copyright.txt
last_checked_git_commit_hash = "c00f7b74009716c488bd666fa8ba7587ea0fed2f"

changed_files = get_changed_files_since(last_checked_git_commit_hash)

summary_files = get_files_from_copyright_format("copyright.txt")
scanned_files = get_files_containing_copyright_or_license()

# Build each whitelist once; both are needed for several checks below.
whitelist_not_found = get_whitelist_not_found()
whitelist_not_listed = get_whitelist_not_listed()

found = summary_files & scanned_files
not_found = summary_files - scanned_files - whitelist_not_found
not_listed = scanned_files - summary_files - whitelist_not_listed

# copyright.txt itself may legitimately change in the same commit.
# Hence, it needs to be excluded.
all_whitelisted = (whitelist_not_found | whitelist_not_listed) - {"copyright.txt"}
# Files inside copyright.txt or mentioned in whitelist should be checked for
# diffs affecting the license/copyright
out_of_date = (all_whitelisted | summary_files) & changed_files

missing_source_license = get_source_files_missing_license_of_header(scanned_files, get_all_files())

# --- Report -----------------------------------------------------------------
print("Found License/Copyright both in copyright.txt and repo: ",len(found))
if not_listed:
    print()
    print("No License/Copyright listed in copyright.txt (but found in repo):")
    for f in sorted(not_listed):
        print(" ", f)
else:
    print("- All automatically detected files listed or whitelisted")


if not_found:
    print()
    print("No License/Copyright found in repo (but listed in copyright.txt):")
    for f in sorted(not_found):
        print(" ", f)
else:
    print("- All listed files automatically detected or whitelisted")


if out_of_date:
    print()
    print("Following items are whitelisted or listed in copyright.txt but changed since last check:")
    for f in sorted(out_of_date):
        print(" ", f)
else:
    print("- No listed file got changed since the last check.")

if missing_source_license:
    print()
    print("Following `.cpp` files do NOT contain a license even though their accompanying `.h` file does.")
    for f in sorted(missing_source_license):
        print(" ", f)

# --- Exit status ------------------------------------------------------------
# Status 1 signals that manual action is needed; the first applicable reason
# is printed. SystemExit is raised directly so the script does not depend on
# the `exit` helper injected by the `site` module.
if not_found or not_listed:
    print("⚠️ Update required")
    raise SystemExit(1)

if out_of_date:
    # Previously a bare string expression, i.e. the message was never shown.
    print("⚠️ Recheck required")
    raise SystemExit(1)

if missing_source_license:
    # Previously a bare string expression, i.e. the message was never shown.
    print("⚠️ Adding license header required")
    raise SystemExit(1)

print("🎉 Success")
raise SystemExit(0)