# File: Copyright.pm
# Source package: libstring-copyright-perl 0.003014-1
# (area: main; suites: bookworm, forky, sid, trixie)
use 5.008001;
use strict;
use warnings;
use utf8;
use re (qw/eval/);

# Whether the optional RE2 regexp engine can be loaded:
# 1 when it loads, empty string otherwise.
my $CAN_RE2;

BEGIN {
	my $loaded = eval { require re::engine::RE2; 1 };
	$CAN_RE2 = $loaded ? 1 : '';
}

package String::Copyright;

=encoding UTF-8

=head1 NAME

String::Copyright - Representation of text-based copyright statements

=head1 VERSION

Version 0.003014

=cut

our $VERSION = '0.003014';

# Dependencies
use parent 'Exporter::Tiny';
use Carp ();
use Set::IntSpan;

# copyright() is the only function exported by default
our @EXPORT = qw/copyright/;

# Slots of the array reference that backs each object
use constant {
	PLAINTEXT => 0,    # original input string; undef for a normalized object
	BLOCKS    => 1,    # array ref of [ years, owners ] pairs
	FORMAT    => 2,    # code ref rendering one block (block is passed in $_)
};

# Stringification serializes the parsed blocks through _compose()
use overload (
	q{""}    => '_compose',
	fallback => 1,
);

=head1 SYNOPSIS

    use String::Copyright;

    my $copyright = copyright(<<'END');
    copr. © 1999,2000 Foo Barbaz <fb@acme.corp> and Acme Corp.
    Copyright (c) 2001,2004 Foo (work address) <foo@zorg.corp>
    Copyright 2003, Foo B. and friends
    © 2000, 2002 Foo Barbaz <foo@bar.baz>
    END

    print $copyright;

    # Copyright 1999-2000 Foo Barbaz <fb@acme.corp> and Acme Corp.
    # Copyright 2000, 2002 Foo Barbaz and Acme Corp.
    # Copyright 2001, 2004 Foo (work address) <foo@zorg.corp>
    # Copyright 2003 Foo B. and friends

=head1 DESCRIPTION

L<String::Copyright> identifies copyright statements in a string
and serializes them in a normalized format.

=head1 OPTIONS

Options can be set as an argument to the 'use' statement.

=head2 threshold, threshold_before, threshold_after

    use String::Copyright { threshold_after => 5 };

Stop parsing after this many lines without copyright information,
before or after having found any copyright information at all.
C<threshold> sets both C<threshold_before> and C<threshold_after>.

By default unset: All lines are parsed.

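=head2 format( \&sub )

    use String::Copyright { format => \&GNU_style };

    sub GNU_style {
        my ( $years, $owners ) = @_;

        return 'Copyright (C) ' . join '  ', $years || '', $owners || '';
    }

Serialize each copyright statement with the given subroutine
instead of the default format.
The subroutine is called once per statement
with the years and the owners as arguments;
either of them may be undefined.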

=head1 FUNCTIONS

Exports one function: C<copyright>.
This module uses L<Exporter::Tiny> to export functions,
which allows for flexible import options;
see the L<Exporter::Tiny> documentation for details.

=cut

# OR'ed strings have regular variable name and are already grouped
# AND'ed strings have name ending in underscore: must be grouped if repeated
# Regexp building blocks, kept as plain strings so they can be
# interpolated into larger patterns before compilation.
#
# OR'ed strings have regular variable name and are already grouped
# AND'ed strings have name ending in underscore: must be grouped if repeated
my $blank           = '[ \t]';
my $blank_or_break_ = "$blank*\\n?$blank*";
my $dash            = '[-˗‐‑‒–—―⁃−﹣-]';
my $colons_         = "$blank?:{1,2}";
my $strictlabel     = 'SPDX-FileCopyrightText:';
my $label           = '(?i:copyright(?:-holders?)?\b|copr\.)';
my $sign            = '[©⒞Ⓒⓒ🄒🄫🅒]';
my $nroff_sign_     = '\\\\[(]co';
my $pseudo_sign_    = '[({][Cc][})]';
my $vague_sign_     = '-[Cc]-';
my $broken_sign_    = "\\?$blank*";

# high-bit © noise, caused by misparsing UTF-8 as latin1:
# any high-bit character followed by \xA9,
# except when that character is \xAC, \xC2 or \xE2
# (\xC2\xA9 is UTF-8 © read as latin1; \xE2 is \xC2 lowercased after misparse)
# NOTE(review): an earlier comment named \xAE (MacRoman ©) where the class
# skips \xAC - confirm which of the two is intended
my $nonsign_ = '[\x80-\xAB\xAD-\xC1\xC3-\xE1\xE3-\xFF]\xA9';

# phrases containing the word "copyright" that are not a copyright label
my $nonidentifier_
	= "(?:no |_|$dash)copyright|copyright-[^h]|(?:Digital Millennium|U.S.|US|United States) Copyright Act|\\b(?:for|we) copyright\\b";

# Words following "copyright" which show that the text talks *about*
# copyright rather than stating one.
#
# this should cause *no* false positives, and stop-chars therefore
# exclude e.g. email address building blocks; tested against the code
# corpus at https://codesearch.debian.net/ (tricky: its RE2 engine lacks
# support for negative groups) using searches like these:
# (?i)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)@\w
# (?i)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)@\b[-_@]
# (?im)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)[^ $]
my $identifier_action
	= '(?i:apply|applied|applies|assigned|generated|transfer|transferred)';
my $identifier_thing_
	= '(?i:block|claim|date|disclaimer|holder|info|information|interest|law|license|notice|owner|ownership|permission|sign|statement|string|symbol|tag|text)s?';
my $identifier_misc
	= "(?i:and|are|at|eq|for|if|in|is|of|on|or|,${blank}patent|this|to|the (?:library|software),|treaty)";
my $identifier_chatter
	= "(?:$identifier_action|$identifier_thing_|$identifier_misc)";

# text following a pseudo-sign like "(c)" which shows that the sign
# is part of ordinary prose or of an enumeration
my $the_notname
	= '(?i:concrete|fault|first|immediately|least|min\/max|one|outer|previous|ratio|sum|user)';
my $the_sentence_
	= "(?:\\w+$blank+){1,10}(?i:are|can(?:not)?|in|is|must|was)";
my $pseudosign_chatter_
	= "(?:(?:the$blank+(?:$the_notname|$the_sentence_)|all begin|there|you must)\\b|,? \\(?\\w\\))";

# everything that resembles a copyright statement but is none;
# used as the first, capture-less alternative of the main pattern
my $chatter
	= "(?im:$nonsign_|$nonidentifier_|copyright$blank_or_break_$identifier_chatter(?:\\z|@\\W|[^a-zA-Z0-9@_-])|$blank*$pseudo_sign_(?:$blank_or_break_)+$pseudosign_chatter_)";

# placeholders in place of real years, e.g. "year", "19xx" or "yyyy"
my $nonyears_ = '\W?(?i:year|19[xy]{2}|[xy]{4})\W?';

# a year is exactly four digits, delimited by word boundaries
my $year_       = '\b[0-9]{4}\b';
my $comma_spacy = "(?:$blank*,$blank_or_break_|$blank_or_break_,?$blank*)";
my $dash_spacy_ = "$blank*$dash(?:$blank_or_break_)*";

# separators accepted between signs, years and owners
my $colon_or_dash = "(?:$colons_$blank_or_break_|$blank?$dash\{1,2}$blank)";
my $delimiter     = "(?:$colon_or_dash|$comma_spacy)";

# owners may begin with something number-like of up to five digits,
# with an introducing word or sign, or with a prefix character
my $vague_year_ = "(?:$dash$blank?)?[0-9]{1,5}";
my $owner_intro_
	= "(?:$colon_or_dash|$pseudo_sign_$blank?|\\bby$blank_or_break_)";
my $owner_prefix  = '[(*<@\[{]';
my $owner_initial = '[^\s!"#$%&\'()*+,./:;<=>?@[\\\\\]^_`{|}~-]';

# one or more labels/signs introducing a copyright statement
my $signs
	= "(?m:$strictlabel$blank*|(?:$label|$sign|$nroff_sign_|(?:^|$blank)$pseudo_sign_)(?:$colon_or_dash?$blank*(?:$label|$sign|$pseudo_sign_))*)";

# a single year or a dash-separated range of years
my $yearspan_ = "$year_(?:$dash_spacy_$year_)?";

# a list of year spans
my $years_    = "$yearspan_(?:$comma_spacy$yearspan_)*";

# owners run to the end of the line: the first token is followed by
# further non-space tokens separated only by blanks
my $owners_
	= "(?:$vague_year_|$owner_prefix*$owner_initial\\S*)(?:$blank*\\S+)*";

# compile regexps in isolation to limit use of RE2 engine
# (the engine is imported only inside the bare block below,
# and only when it could be loaded)
my ($dash_spacy_re, $owner_intro_A_re, $boilerplate_X_re,
	$signs_and_more_re
);
{
	BEGIN { re::engine::RE2->import( -strict => 1 ) if ($CAN_RE2) }

	# separator between the two years of a range
	$dash_spacy_re    = qr/$dash_spacy_/;

	# owner introduction, anchored at the start
	$owner_intro_A_re = qr/^$owner_intro_/;

	# "All Rights Reserved" boilerplate and everything following it
	$boilerplate_X_re
		= qr/(?i)${comma_spacy}All$blank+Rights$blank+Reserved[.!]?.*/;

	# main scanner: alternatives are chatter, a copyright statement, or
	# a newline; only a statement with real years and/or owners fills
	# capture group 1
	$signs_and_more_re
		= qr/$chatter|$signs(?:$blank$vague_sign_)?$delimiter(?:$broken_sign_)?(?:$nonyears_|((?:$years_$delimiter)?(?:(?:$owner_intro_)?$owners_)?))|\n/;
}

# Generator for the exported copyright() function
# (named after the Exporter::Tiny convention for generators).
#
# Receives ( $class, $name, $args, $globals ); only $globals is consulted,
# for the options threshold, threshold_before, threshold_after and format.
#
# Returns a closure which takes the text to parse and returns
# a String::Copyright object.  The closure croaks if any of its
# arguments is undefined.
sub _generate_copyright
{
	my ( $class, $name, $args, $globals ) = @_;

	return sub {
		my $copyright = shift;

		# every argument, including the first, must be defined
		Carp::croak("String::Copyright strings require defined parts")
			unless 1 + @_ == grep {defined} $copyright, @_;

		# String::Copyright objects are effectively immutable and can be reused
		if ( !@_ && ref($copyright) eq __PACKAGE__ ) {
			return $copyright;
		}

		# stringify objects
		$copyright = "$copyright";

		# TODO: also parse @_ - but each separately!
		my @block;

		# matches without years/owners seen since the last statement;
		# only counted while a threshold option is set
		my $skipped = 0;
		while ( $copyright =~ /$signs_and_more_re/g ) {

			# group 1 is unset when chatter, placeholder years
			# or a bare newline matched
			my $owners = $1;
			if ( $globals->{threshold_before} || $globals->{threshold} ) {
				last
					if (
						!@block
					and !length $owners
					and ++$skipped >= (
						$globals->{threshold_before} || $globals->{threshold}
					)
					);
			}
			if ( $globals->{threshold_after} || $globals->{threshold} ) {

				# "after" detects end of _current_ line so is skewed by one
				last
					if (
						@block
					and !length $owners
					and ++$skipped >= 1 + (
						$globals->{threshold_after} || $globals->{threshold}
					)
					);
			}
			next if ( !length $owners );
			$skipped = 0;

			# peel year spans off the front of the captured text;
			# whatever follows the last span is kept as owners
			my $years;
			my @span = $owners =~ /\G($yearspan_)(?:$comma_spacy|\Z)/gm;
			if (@span) {
				$owners = $';

				# deduplicate
				# (single years stay scalars, ranges become [ low, high ]
				# pairs with reversed ranges swapped)
				my @ranges;
				for (@span) {
					my ( $y1, $y2 ) = split /$dash_spacy_re/;
					if ( !$y2 ) {
						push @ranges, $y1;
					}
					elsif ( $y1 > $y2 ) {
						push @ranges, [ $y2, $y1 ];
					}
					else {
						push @ranges, [ $y1, $y2 ];
					}
				}

				# normalize
				# (Set::IntSpan yields [ low, high ] spans; a span of equal
				# ends is printed as a single year)
				$years = join ', ',
					map { $_->[0] == $_->[1] ? $_->[0] : "$_->[0]-$_->[1]" }
					Set::IntSpan->new( \@ranges )->spans;
			}
			if ($owners) {

				# strip a leading introduction, squeeze runs of whitespace
				# into one space, strip an introduction again, and finally
				# drop "All Rights Reserved" boilerplate with its trailing text
				$owners =~ s/$owner_intro_A_re//;
				$owners =~ s/\s{2,}/ /g;
				$owners =~ s/$owner_intro_A_re//;
				$owners =~ s/$boilerplate_X_re//g;
			}

			# TODO: split owner into owner_id and owner

			# false values (including empty strings) are stored as undef
			push @block, [ $years || undef, $owners || undef ];
		}

		# TODO: save $skipped_lines to indicate how dirty parsing was

		# formatters receive the current block in $_: a custom one is
		# called with ( years, owners ), the default joins the defined
		# parts after a copyright sign
		my $ext_format = $globals->{format};
		my $format
			= $globals->{format}
			? sub { $ext_format->( $_->[0], $_->[1] ) }
			: sub { join ' ', '©', $_->[0] || (), $_->[1] || () };

		# slots: PLAINTEXT, BLOCKS, FORMAT
		bless [ $copyright, \@block, $format ], __PACKAGE__;
	}
}

# Construct a normalized object, i.e. one without an original string.
#
# Called on an existing object without arguments, returns a copy of that
# object stripped of its string part, keeping its blocks and formatter.
# Otherwise each argument is one block, given as an array reference
# (presumably [ years, owners ]; the contents are not validated yet).
# Objects built from blocks use the default "© years owners" format.
#
# Croaks if any argument is undefined or empty,
# or if any argument is not an array reference.
sub new
{
	my ( $self, @data ) = @_;

	# Count against @data, not @_: the list assignment above leaves the
	# invocant in @_, so comparisons based on @_ could never succeed
	# and the constructor croaked on every call.
	Carp::croak("String::Copyright require defined, positive-length parts")
		unless @data == grep { defined($_) && length($_) } @data;

	# String::Copyright objects are simply stripped of their string part
	if ( !@data && ref($self) eq __PACKAGE__ ) {
		return bless [ undef, $self->[BLOCKS], $self->[FORMAT] ],
			__PACKAGE__;
	}

	# FIXME: properly validate data
	Carp::croak("String::Copyright blocks must be an array of strings")
		unless @data == grep { ref($_) eq 'ARRAY' } @data;

	# without a formatter the object could not be stringified;
	# this mirrors the default used by copyright()
	my $format = sub { join ' ', '©', $_->[0] || (), $_->[1] || () };

	return bless [ undef, \@data, $format ], __PACKAGE__;
}

# Serialize the object: render every block with the object's formatter
# and join the results with newlines.  Used for stringification.
sub _compose
{
	my ($self) = @_;

	my $render = $self->[FORMAT];

	# the formatter reads the current block from $_
	my @lines;
	for ( @{ $self->[BLOCKS] } ) {
		push @lines, $render->();
	}

	return join "\n", @lines;
}

# True when the object carries no original string.
sub is_normalized
{
	my ($self) = @_;
	return !defined $self->[PLAINTEXT];
}

=head1 SEE ALSO

=over 4

=item *

L<Encode>

=item *

L<Exporter::Tiny>

=back

=head1 BUGS/CAVEATS/etc

L<String::Copyright> operates on strings, not bytes.
Data encoded as UTF-8, Latin1 or other formats
need to be decoded to strings before use.

Only ASCII characters and B<©> (copyright sign) are directly processed.

If copyright sign is not detected
or accents or multi-byte characters display wrong,
then most likely the data was not decoded into a string.

If ranges or lists of years are not tidied,
then maybe it contained non-ASCII whitespace or digits.

=head1 AUTHOR

Jonas Smedegaard C<< <dr@jones.dk> >>

=head1 COPYRIGHT AND LICENSE

This program is based on the script "licensecheck" from the KDE SDK,
originally introduced by Stefan Westerfeld C<< <stefan@space.twc.de> >>.

  Copyright © 2007, 2008 Adam D. Barratt

  Copyright © 2005-2012, 2016, 2018, 2020-2021 Jonas Smedegaard

  Copyright © 2018, 2020-2021 Purism SPC

This program is free software:
you can redistribute it and/or modify it
under the terms of the GNU Affero General Public License
as published by the Free Software Foundation,
either version 3, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY;
without even the implied warranty
of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Affero General Public License for more details.

You should have received a copy
of the GNU Affero General Public License along with this program.
If not, see <https://www.gnu.org/licenses/>.

=cut

1;