1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400
|
#!/usr/bin/python3
# A trivial script for updating copyright headers within a project from its git
# commit history
#
# Copyright (c) 2020-2021 Dave Jones <dave@waveform.org.uk>
#
# SPDX-License-Identifier: BSD-3-Clause
"""
A script for updating the copyright headers on all files project-wide. It
derives the authorship and copyright years information from the git history
of the project; hence, this script must be run within a git repository.
"""
from __future__ import annotations
import os
import sys
assert sys.version_info >= (3, 6), 'Script requires Python 3.6+'
import tempfile
import typing as t
from argparse import ArgumentParser, Namespace
from configparser import ConfigParser
from operator import attrgetter
from itertools import groupby
from datetime import datetime
from subprocess import Popen, PIPE, DEVNULL
from pathlib import Path
from fnmatch import fnmatch
# Default prefix identifying the SPDX license line; used as the default for
# the "spdx_prefix" setting and for the spdx_prefix parameters below.
SPDX_PREFIX = 'SPDX-License-Identifier:'
# Default prefix written before each copyright attribution; used as the
# default for the "copy_prefix" setting.
COPYRIGHT_PREFIX = 'Copyright (c)'
def main(args: t.Optional[t.List[str]] = None):
    """
    Script entry point: re-write the copyright header of every selected file.

    :param args:
        The command line arguments (without the program name). If ``None``
        (the default), ``sys.argv[1:]`` is used.
    """
    if args is None:
        args = sys.argv[1:]
    config = get_config(args)
    writer = CopyWriter.from_config(config)
    for path, copyrights in get_copyrights(config.include, config.exclude):
        print(f'Re-writing {path}...')
        # Most recent contributors first; ties are broken by author name
        # (also in descending order because of reverse=True)
        copyrights = sorted(
            copyrights, reverse=True, key=lambda c: (max(c.years), c.author))
        # Read the source with the same encoding the replacement is written
        # in; relying on the locale default for the read could corrupt
        # non-ASCII content on platforms where that default is not UTF-8
        with AtomicReplaceFile(path, encoding='utf-8') as target, \
                path.open('r', encoding='utf-8') as source:
            for chunk in writer.transform(source, copyrights):
                target.write(chunk)
def get_config(args: t.List[str]) -> Namespace:
    """
    Build the script's configuration from ``setup.cfg`` and the command line.

    Defaults are read from the ``[copyrights:settings]`` section of
    ``setup.cfg`` in the current directory (a missing file is ignored by
    :meth:`configparser.ConfigParser.read`); command line options then
    override them.

    :param args:
        The command line arguments to parse (without the program name).

    :returns:
        The parsed namespace with ``include`` and ``exclude`` converted to
        sets, plus ``license``, ``preamble``, ``spdx_prefix``, ``copy_prefix``
        and ``strip_preamble``.
    """
    config = ConfigParser(
        defaults={
            'include': '**/*',
            'exclude': '',
            'license': 'LICENSE.txt',
            'preamble': '',
            'strip_preamble': 'false',
            'spdx_prefix': SPDX_PREFIX,
            'copy_prefix': COPYRIGHT_PREFIX,
        },
        delimiters=('=',), default_section='copyrights:settings',
        empty_lines_in_values=False, interpolation=None,
        converters={'list': lambda s: s.strip().splitlines()})
    config.read('setup.cfg')
    sect = config[config.default_section]

    # The option is spelled "strip_preamble" in the defaults above; looking
    # up "strip-preamble" would silently yield None instead of the setting
    strip_default = sect.getboolean('strip_preamble')

    parser = ArgumentParser(description=__doc__)
    # NOTE: with action='append', values given on the command line are added
    # to the configured defaults rather than replacing them
    parser.add_argument(
        '-i', '--include', action='append', metavar='GLOB',
        default=sect.getlist('include'),
        help="The set of patterns that a file must match to be included in "
        "the set of files to re-write. Can be specified multiple times to "
        "add several patterns. Default: %(default)r")
    parser.add_argument(
        '-e', '--exclude', action='append', metavar='GLOB',
        default=sect.getlist('exclude'),
        help="The set of patterns that a file must *not* match to be included "
        "in the set of files to re-write. Can be specified multiple times to "
        "add several patterns. Default: %(default)r")
    parser.add_argument(
        '-l', '--license', action='store', type=Path, metavar='PATH',
        default=sect['license'],
        help="The file containing the project's license text. If this file "
        "contains a SPDX-License-Identifier line (in addition to the license "
        "text itself), then matching license text found in source files will "
        "be replaced by the SPDX-License-Identifier line (appropriately "
        "commented). Default: %(default)s")
    parser.add_argument(
        '-p', '--preamble', action='append', metavar='STR',
        default=sect.getlist('preamble'),
        help="The line(s) of text to insert before the copyright attributions "
        "in source files. This is typically a brief description of the "
        "project. Can be specified multiple times to add several lines. "
        "Default: %(default)r")
    parser.add_argument(
        '-S', '--spdx-prefix', action='store', metavar='STR',
        default=sect['spdx_prefix'],
        help="The prefix on the line in the license file, and within comments "
        "of source files that identifies the appropriate license from the "
        "SPDX list. Default: %(default)r")
    parser.add_argument(
        '-C', '--copy-prefix', action='store', metavar='STR',
        default=sect['copy_prefix'],
        help="The prefix before copyright attributions in source files. "
        "Default: %(default)r")
    # Both switches share one destination. argparse takes the default of the
    # *first* action registered for a destination, and store_false implies a
    # default of True, so the configured default must be given to both
    parser.add_argument(
        '--no-strip-preamble', action='store_false', dest='strip_preamble',
        default=strip_default,
        help="Disable --strip-preamble")
    parser.add_argument(
        '--strip-preamble', action='store_true',
        default=strip_default,
        help="If enabled, any existing preamble matching that specified "
        "by --preamble will be removed. This can be used to change the "
        "preamble text in files by first specifying the old preamble with "
        "this option, then running a second time with the new preamble")
    ns = parser.parse_args(args)
    ns.include = set(ns.include)
    ns.exclude = set(ns.exclude)
    return ns
class Copyright(t.NamedTuple):
    """
    One author's copyright claim on a file: who they are and the set of years
    in which they contributed.
    """
    author: str
    email: str
    years: t.Set[int]

    def __str__(self):
        # A single year is shown on its own; several collapse to "first-last"
        first, last = min(self.years), max(self.years)
        span = f'{first}-{last}' if len(self.years) > 1 else f'{first}'
        return f'{span} {self.author} <{self.email}>'
def get_copyrights(include: t.Set[str], exclude: t.Set[str])\
        -> t.Iterator[t.Tuple[Path, t.Container[Copyright]]]:
    """
    Yield ``(path, copyrights)`` for each file reported by
    :func:`get_contributions`, where *copyrights* is a list holding one
    :class:`Copyright` per distinct (author, email) pair, carrying the set of
    years in which that person contributed to the file.
    """
    # groupby() only merges adjacent items, so order everything by file and
    # then by person before grouping. sorted() consumes every contribution
    # before the first result is yielded.
    ordered = sorted(
        get_contributions(include, exclude),
        key=attrgetter('path', 'author', 'email'))
    for path, per_file in groupby(ordered, key=attrgetter('path')):
        yield path, [
            Copyright(author, email, {entry.year for entry in per_person})
            for (author, email), per_person in groupby(
                per_file, key=attrgetter('author', 'email'))
        ]
class Contribution(t.NamedTuple):
    """
    A single attribution record produced by :func:`get_contributions`: one
    author, the year of their change, and the file it applies to.
    """
    # Author name as reported on the "author" line of git blame
    author: str
    # Author e-mail address with the surrounding angle brackets removed
    email: str
    # Year derived from the blame entry's "author-time" timestamp
    year: int
    # The file the blamed line belongs to
    path: Path
def get_contributions(include: t.Set[str], exclude: t.Set[str])\
        -> t.Iterator[Contribution]:
    """
    Yield a :class:`Contribution` for every blamed line of every file
    selected by :func:`get_source_paths`.

    Runs ``git blame --line-porcelain HEAD`` on each file and parses the
    ``author``, ``author-mail`` and ``author-time`` header lines; a record is
    emitted when the ``filename`` line of each entry is reached.

    :raises RuntimeError:
        If ``git blame`` exits with a non-zero status for a file.
    """
    for path in get_source_paths(include, exclude):
        # stderr is discarded rather than piped: a pipe that is never read
        # can fill up and stall git. The context manager closes the pipe and
        # reaps the process even if the consumer abandons this generator.
        # Undecodable bytes (e.g. from non-UTF-8 file content echoed by
        # blame) are replaced instead of aborting the whole run.
        with Popen(
            ['git', 'blame', '--line-porcelain', 'HEAD', '--', str(path)],
            stdout=PIPE,
            stderr=DEVNULL,
            encoding='utf-8',
            errors='replace',
        ) as blame:
            author = email = year = None
            if blame.stdout is not None:
                for line in blame.stdout:
                    if line.startswith('author '):
                        author = line.split(' ', 1)[1].rstrip()
                    elif line.startswith('author-mail '):
                        email = line.split(' ', 1)[1].rstrip()
                        email = email.lstrip('<').rstrip('>')
                    elif line.startswith('author-time '):
                        # Forget the timezone; we only want the year anyway
                        timestamp = int(line.split(' ', 1)[1].strip())
                        year = datetime.fromtimestamp(timestamp).year
                    elif line.startswith('filename '):
                        # Internal invariant: the three header lines above
                        # must have been seen before "filename"
                        assert author is not None
                        assert email is not None
                        assert year is not None
                        yield Contribution(
                            author=author, email=email, year=year, path=path)
                        author = email = year = None
        # Leaving the with-block has waited for git; report failure with a
        # real exception (an assert would vanish under "python -O")
        if blame.returncode != 0:
            raise RuntimeError(
                f'git blame failed for {path} '
                f'(exit status {blame.returncode})')
def get_source_paths(include: t.Set[str], exclude: t.Set[str])\
        -> t.Iterator[Path]:
    """
    Yield the path of every file tracked at ``HEAD`` that matches at least
    one *include* pattern and no *exclude* pattern.

    Patterns are tested with :func:`fnmatch.fnmatch` against the path as
    printed by ``git ls-tree -r --name-only HEAD``. An empty *include* set is
    treated as ``{'*'}``.

    :raises RuntimeError:
        If ``git ls-tree`` exits with a non-zero status.
    """
    # NOTE(review): fnmatch does not treat "/" specially, so a pattern such
    # as "**/*" only matches paths containing a "/" -- confirm that skipping
    # top-level files with the default include pattern is intended
    if not include:
        include = {'*'}
    # The context manager closes the pipe and reaps git even when the
    # consumer stops iterating early
    with Popen(
        ['git', 'ls-tree', '-r', '--name-only', 'HEAD'],
        stdout=PIPE, stderr=DEVNULL, universal_newlines=True
    ) as ls_tree:
        if ls_tree.stdout is not None:
            for filename in ls_tree.stdout:
                filename = filename.strip()
                if any(fnmatch(filename, pattern) for pattern in exclude):
                    continue
                if any(fnmatch(filename, pattern) for pattern in include):
                    yield Path(filename)
    # Leaving the with-block has waited for git; report failure with a real
    # exception (an assert would vanish under "python -O")
    if ls_tree.returncode != 0:
        raise RuntimeError(
            f'git ls-tree failed (exit status {ls_tree.returncode})')
class License(t.NamedTuple):
    """
    The parsed content of the project's license file, as produced by
    :func:`get_license`.
    """
    # The complete SPDX identifier line (prefix included), or None when the
    # license file does not contain one
    ident: t.Optional[str]
    # The remaining lines of the license file, right-stripped
    text: t.List[str]
def get_license(path: Path, *, spdx_prefix: str = SPDX_PREFIX) -> License:
    """
    Read the license file at *path* and split it into its SPDX identifier
    line and its body text.

    :param path:
        The license file to read (decoded as UTF-8, matching the encoding
        used when source files are re-written).

    :param spdx_prefix:
        Lines starting with this prefix are treated as the SPDX identifier
        and are excluded from the body.

    :returns:
        A :class:`License` whose ``ident`` is the whole identifier line (or
        ``None`` if the file has none) and whose ``text`` is the list of
        right-stripped body lines with leading and trailing blank lines
        removed. ``text`` is empty if the file holds nothing but blank lines
        and/or the identifier.

    :raises RuntimeError:
        If more than one line starts with *spdx_prefix*.
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()
    idents = [
        line.rstrip() for line in lines
        if line.startswith(spdx_prefix)
    ]
    if len(idents) > 1:
        raise RuntimeError(f'More than one {spdx_prefix} line in {path}!')
    ident = idents[0] if idents else None
    body = [
        line.rstrip() for line in lines
        if not line.startswith(spdx_prefix)
    ]
    # Trim blank lines from both ends; the "body and" guards stop an
    # IndexError when nothing but blank lines remains
    while body and not body[0]:
        del body[0]
    while body and not body[-1]:
        del body[-1]
    return License(ident, body)
class CopyWriter:
    """
    Transformer for the copyright header in source files. The :meth:`transform`
    method can be called with a file-like object as the *source* and will
    yield chunks of replacement data to be written to the replacement.
    """

    # The script's kinda dumb at this point - only handles straight-forward
    # line-based comments, not multi-line delimited styles like /*..*/
    COMMENTS = {
        '': '#',
        '.c': '//',
        '.cpp': '//',
        '.js': '//',
        '.py': '#',
        '.rst': '..',
        '.sh': '#',
        '.sql': '--',
    }

    def __init__(self, license: Path = Path('LICENSE.txt'),
                 preamble: t.Optional[t.List[str]] = None,
                 spdx_prefix: str = SPDX_PREFIX,
                 copy_prefix: str = COPYRIGHT_PREFIX):
        """
        :param license:
            Path of the license file; it is parsed immediately with
            :func:`get_license`.

        :param preamble:
            Lines written before the copyright attributions (``None`` means
            no preamble).

        :param spdx_prefix:
            Prefix identifying the SPDX identifier line.

        :param copy_prefix:
            Prefix written before each copyright attribution.
        """
        self.license = get_license(license, spdx_prefix=spdx_prefix)
        self.preamble = [] if preamble is None else preamble
        self.spdx_prefix = spdx_prefix
        self.copy_prefix = copy_prefix

    @classmethod
    def from_config(cls, config: Namespace) -> 'CopyWriter':
        """
        Construct an instance from the ``license``, ``preamble``,
        ``spdx_prefix`` and ``copy_prefix`` attributes of *config*.
        """
        return cls(
            config.license, config.preamble,
            config.spdx_prefix, config.copy_prefix)

    def transform(self, source: t.TextIO,
                  copyrights: t.List[Copyright], *,
                  comment_prefix: t.Optional[str] = None) -> t.Iterator[str]:
        """
        Yield the content of *source* with its leading copyright header
        replaced by a freshly generated one.

        At the top of the file a ``#!`` first line and an encoding marker on
        the first two lines are passed through; blank comment lines and
        existing SPDX, copyright, preamble and license comment lines are
        dropped. The new header is emitted at the first line that is none of
        these, followed by a blank line if that line is not itself blank, and
        the rest of the file is passed through unchanged.

        :param source:
            The open text file to read. Its ``name`` is only used to select
            the comment prefix when *comment_prefix* is not given.

        :param copyrights:
            The attributions to write, in output order.

        :param comment_prefix:
            The line-comment marker to use. If ``None``, it is looked up in
            :attr:`COMMENTS` by the suffix of ``source.name``.

        :raises KeyError:
            If *comment_prefix* is ``None`` and the suffix of ``source.name``
            is not in :attr:`COMMENTS`.
        """
        if comment_prefix is None:
            comment_prefix = self.COMMENTS[Path(source.name).suffix]
        text = self.license.text
        # With no license text there is nothing to recognise; the markers
        # stay None and license detection is skipped (avoids an IndexError)
        license_start = f'{comment_prefix} {text[0]}' if text else None
        license_end = f'{comment_prefix} {text[-1]}' if text else None
        # Existing SPDX, copyright and preamble lines are all dropped
        droppable = tuple(
            f'{comment_prefix} {item}'
            for item in (self.spdx_prefix, self.copy_prefix, *self.preamble))

        state = 'header'
        empty = True
        # Lines swallowed while looking for the end of an existing license
        # block; restored if that end never turns up
        skipped: t.List[str] = []
        for linenum, line in enumerate(source, start=1):
            if state == 'header':
                if linenum == 1 and line.startswith('#!'):
                    yield line
                    empty = False
                    continue
                if linenum < 3 and (
                        'fileencoding=' in line or '-*- coding:' in line):
                    yield line
                    empty = False
                    continue
                if line.rstrip() == comment_prefix:
                    continue  # skip blank comment lines
                if line.startswith(droppable):
                    continue  # skip existing SPDX / copyright / preamble
                if license_start is not None and line.startswith(license_start):
                    if len(text) > 1:
                        # skip existing license lines up to the end marker
                        skipped = [line]
                        state = 'license'
                    else:
                        # A one-line license starts and ends on this line;
                        # waiting for a separate end line would swallow the
                        # rest of the file
                        yield from self._generate_header(
                            copyrights, comment_prefix, empty)
                        state = 'blank'
                    continue
                # First line of the real body: emit the header, then fall
                # through so this line is handled by the 'blank' state
                yield from self._generate_header(
                    copyrights, comment_prefix, empty)
                state = 'blank'
            elif state == 'license':
                skipped.append(line)
                if line.startswith(license_end):
                    yield from self._generate_header(
                        copyrights, comment_prefix, empty)
                    state = 'blank'
                continue
            if state == 'blank':
                # Ensure there's a blank line between license and start of
                # the source body
                if line.strip():
                    yield '\n'
                yield line
                state = 'body'
            else:
                yield line

        if state == 'header':
            # Input ended before any body line was seen; without this the
            # old header would be stripped and never replaced
            yield from self._generate_header(
                copyrights, comment_prefix, empty)
        elif state == 'license':
            # The license start matched but its end never did: treat the
            # swallowed lines as body rather than silently discarding them
            yield from self._generate_header(
                copyrights, comment_prefix, empty)
            yield '\n'
            yield from skipped

    def _generate_header(self, copyrights: t.Iterable[Copyright],
                         comment_prefix: str, empty: bool) -> t.Iterator[str]:
        """
        Yield the lines of a new header: preamble (if any), one line per
        copyright, then the SPDX identifier line or, when the license has no
        identifier, the full license text. Every line is prefixed with
        *comment_prefix*.

        :param empty:
            ``False`` if something (shebang or encoding line) has already
            been written, in which case a blank comment line is emitted first
            as a separator.
        """
        blank = comment_prefix + '\n'
        if not empty:
            yield blank
        for line in self.preamble:
            yield f'{comment_prefix} {line}\n'
        if self.preamble:
            yield blank
        for entry in copyrights:
            yield f'{comment_prefix} {self.copy_prefix} {entry!s}\n'
        yield blank
        if self.license.ident:
            yield f'{comment_prefix} {self.license.ident}\n'
        else:
            for line in self.license.text:
                # Avoid trailing whitespace on blank license lines
                yield f'{comment_prefix} {line}\n' if line else blank
class AtomicReplaceFile:
    """
    A context manager for atomically replacing a target file.

    Uses :class:`tempfile.NamedTemporaryFile` to construct a temporary file in
    the same directory as the target file. The associated file-like object is
    returned as the context manager's variable; you should write the content
    you wish to this object.

    When the context manager exits, if no exception has occurred, the
    temporary file is closed, given the permissions of the target file, and
    moved over the target file with :func:`os.replace`. If an exception occurs
    during the context manager's block (or while finalizing the replacement,
    e.g. because the target file does not exist), the temporary file is
    deleted leaving the original target file unaffected and the exception
    propagates.

    :param pathlib.Path path:
        The full path and filename of the target file. This is expected to be
        an absolute path.

    :param str encoding:
        If ``None`` (the default), the temporary file will be opened in binary
        mode. Otherwise, this specifies the encoding to use with text mode.
    """

    def __init__(self, path: t.Union[str, Path],
                 encoding: t.Optional[str] = None):
        if isinstance(path, str):
            path = Path(path)
        self._path = path
        # delete=False: the file must survive being closed so that it can be
        # moved over the target afterwards
        self._tempfile = tempfile.NamedTemporaryFile(
            mode='wb' if encoding is None else 'w',
            dir=str(self._path.parent), encoding=encoding, delete=False)
        self._withfile = None

    def __enter__(self):
        self._withfile = self._tempfile.__enter__()
        return self._withfile

    def __exit__(self, exc_type, exc_value, exc_tb):
        temp_name = self._tempfile.name
        replaced = False
        try:
            # Closes the temporary file, flushing everything written to it
            result = self._tempfile.__exit__(exc_type, exc_value, exc_tb)
            if exc_type is None:
                # Copy only the permission bits of the target, by name, so
                # that this also works where os.fchmod is unavailable
                os.chmod(temp_name, self._path.stat().st_mode & 0o7777)
                # Unlike os.rename, os.replace overwrites an existing target
                # on every platform
                os.replace(temp_name, str(self._path))
                replaced = True
        finally:
            if not replaced:
                # Never leave the temporary file behind, whether the block
                # failed or finalizing the replacement did
                self._tempfile.close()
                try:
                    os.unlink(temp_name)
                except FileNotFoundError:
                    pass
        return result
# Only run the command line interface when executed as a script, not when
# the module is imported
if __name__ == '__main__':
    main()
|