File: normalize.go

package info (click to toggle)
golang-github-go-enry-go-license-detector 4.3.0%2Bgit20221007.a3a1cc6-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 13,068 kB
  • sloc: makefile: 25
file content (195 lines) | stat: -rw-r--r-- 6,575 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
package normalize

import (
	"bytes"
	"regexp"
	"strings"
	"unicode"

	"golang.org/x/text/runes"
	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"
)

var (
	// lineEndingsRe matches a carriage return optionally followed by a line feed, i.e. "\r\n"
	// and a lone "\r". LicenseText replaces every match with "\n".
	lineEndingsRe = regexp.MustCompile(`\r\n?`)
	// 3.1.1 All whitespace should be treated as a single blank space.
	//
	// whitespaceRe matches a run of spaces, tabs, form feeds, carriage returns and the
	// additional characters listed literally in the class (presumably Unicode space variants -
	// confirm with a hex dump before editing). "\n" is not in the class, so lines are kept.
	// trailingWhitespaceRe matches one character of the same class at the end of a line.
	// licenseHeaderRe matches "licence" or "license", optionally followed by a dot, directly
	// followed by an empty line; LicenseText writes a placeholder line into that gap.
	// leadingWhitespaceRe matches, at the start of a line, either a space (together with the
	// newline right after it, if any) or a bare newline.
	whitespaceRe         = regexp.MustCompile(`[ \t\f\r              ​]+`)
	trailingWhitespaceRe = regexp.MustCompile(`(?m)[ \t\f\r              ​]$`)
	licenseHeaderRe      = regexp.MustCompile(`(licen[cs]e)\.?\n\n`)
	leadingWhitespaceRe  = regexp.MustCompile(`(?m)^(( \n?)|\n)`)
	// 5.1.2 Hyphens, Dashes: any hyphen, dash, en dash, em dash, or other variation should be
	// considered equivalent.
	//
	// punctuationRe matches a run of the dash-like characters listed in the class (the class
	// also contains "~"). LicenseText replaces each run with a single "-".
	punctuationRe = regexp.MustCompile(`[-‒–—―⁓⸺⸻~˗‐‑⁃⁻₋−∼⎯⏤─➖𐆑֊﹘﹣-]+`)
	// 5.1.3 Quotes: any variation of quotations (single, double, curly, etc.) should be
	// considered equivalent.
	//
	// quotesRe matches a run of quote characters; \x60 is the backtick, which cannot be written
	// literally inside a raw string. LicenseText replaces each run with a single double quote.
	quotesRe = regexp.MustCompile(`["'“”‘’„‚«»‹›❛❜❝❞\x60]+`)
	// 7.1.1 Where a line starts with a bullet, number, letter, or some form of a list item
	// (determined where list item is followed by a space, then the text of the sentence), ignore
	// the list item for matching purposes.
	//
	// bulletRe matches, at the start of a line, one of:
	//   - a single bullet symbol from the first class;
	//   - digits, a single letter a-z, or a run of "i", each with an optional opening bracket
	//     and a mandatory closing ".", ")", "]" or "}" (the "i" variant also accepts a space as
	//     the closing character), optionally followed by one space.
	// LicenseText deletes the match.
	bulletRe = regexp.MustCompile(`(?m)^(([-*✱﹡•●⚫⏺🞄∙⋅])|([(\[{]?\d+[.)\]}] ?)|([(\[{]?[a-z][.)\]}] ?)|([(\[{]?i+[.)\]} ] ?))`)
	// 8.1.1 The words in the following columns are considered equivalent and interchangeable.
	//
	// The arguments are (old, new) pairs. LicenseText lower-cases the text before applying the
	// replacer, which is why all entries are lower case. strings.Replacer compares the old
	// strings in argument order and does not re-scan text it has already replaced.
	// NOTE(review): "sub license" is rewritten to "sub-license", not "sublicense", so the two
	// spellings do not converge in a single pass - confirm that this is intended.
	wordReplacer = strings.NewReplacer(
		"acknowledgment", "acknowledgement",
		"analogue", "analog",
		"analyse", "analyze",
		"artefact", "artifact",
		"authorisation", "authorization",
		"authorised", "authorized",
		"calibre", "caliber",
		"cancelled", "canceled",
		"capitalisations", "capitalizations",
		"catalogue", "catalog",
		"categorise", "categorize",
		"centre", "center",
		"emphasised", "emphasized",
		"favour", "favor",
		"favourite", "favorite",
		"fulfil", "fulfill",
		"fulfilment", "fulfillment",
		"initialise", "initialize",
		"judgment", "judgement",
		"labelling", "labeling",
		"labour", "labor",
		"licence", "license",
		"maximise", "maximize",
		"modelled", "modeled",
		"modelling", "modeling",
		"offence", "offense",
		"optimise", "optimize",
		"organisation", "organization",
		"organise", "organize",
		"practise", "practice",
		"programme", "program",
		"realise", "realize",
		"recognise", "recognize",
		"signalling", "signaling",
		"sub-license", "sublicense",
		"sub license", "sub-license",
		"utilisation", "utilization",
		"whilst", "while",
		"wilful", "wilfull",
		"non-commercial", "noncommercial",
		"per cent", "percent",
		"copyright owner", "copyright",
	)

	// 9.1.1 "©", "(c)", or "Copyright" should be considered equivalent and interchangeable.
	//
	// copyrightRe matches "copyright" or "(c)"; LicenseText replaces it with "©".
	// trademarkRe matches "trademark", "trademarks" or "(tm)"; LicenseText replaces it with "™".
	copyrightRe = regexp.MustCompile(`copyright|\(c\)`)
	trademarkRe = regexp.MustCompile(`trademark(s?)|\(tm\)`)

	// Extra cleanup that goes beyond the guidelines.
	//
	// brokenLinkRe matches the literal text "http s ://", which LicenseText turns into "https://".
	// urlCleanupRe matches an http(s) URL wrapped in "<" or "(" and ")" or ">"; only the URL
	// itself (group 1) is kept.
	// copyrightLineRe matches whole lines that start with "©", or that consist of
	// "all rights reserved" (with an optional dot) or of a word matching li[cs]en[cs]e; these
	// lines are removed for every strictness above Enforced.
	// nonAlphaNumRe matches every character other than "-", space, newline, a-z and 0-9; Relax
	// removes all of them.
	brokenLinkRe    = regexp.MustCompile(`http s ://`)
	urlCleanupRe    = regexp.MustCompile(`[<(](http(s?)://[^\s]+)[)>]`)
	copyrightLineRe = regexp.MustCompile(`(?m)^((©.*)|(all rights reserved(\.)?)|(li[cs]en[cs]e))\n`)
	nonAlphaNumRe   = regexp.MustCompile(`[^- \na-z0-9]`)

	// Used in Split(): a separator is a newline, optional whitespace, three or more characters
	// that are not letters, digits, "_", ",", "(" or ")", optional whitespace and a newline.
	splitRe = regexp.MustCompile(`\n\s*[^a-zA-Z0-9_,()]{3,}\s*\n`)
)

// Strictness selects how aggressively a text is normalized. Levels are ordered: a bigger
// number means more of the text is rewritten or removed. See `Enforced`, `Moderate` and
// `Relaxed`.
type Strictness int

// The numeric values (0, 1, 2) are compared with ">" by LicenseText, so the order matters.
const (
	// Enforced is the least aggressive level - only the official SPDX guidelines are applied.
	Enforced Strictness = iota
	// Moderate is Enforced plus some additional normalization: dots and copyright lines are
	// removed.
	Moderate
	// Relaxed is the most aggressive level: Moderate plus the rules applied by Relax (Unicode
	// normalization and removal of all characters except a-z, 0-9, "-", space and newline).
	Relaxed
)

// LicenseText makes a license text ready for analysis.
// It follows SPDX guidelines at
// https://spdx.org/spdx-license-list/matching-guidelines
// and adds some cleanup of its own. strictness controls what happens on top of that:
// above Enforced all dots and the lines matched by copyrightLineRe are removed; above Moderate
// the text is additionally passed through Relax and that result is returned directly.
func LicenseText(text string, strictness Strictness) string {
	text = applySPDXGuidelines(text)
	text = squeezeRepeatedSymbols(text)

	if strictness > Enforced {
		// there are common mismatches because of trailing dots
		text = strings.Replace(text, ".", "", -1)
		// usually copyright lines are custom and occur multiple times
		text = copyrightLineRe.ReplaceAllString(text, "")
	}

	if strictness > Moderate {
		// The final cleanup below is skipped on this path, so the license header placeholder
		// (plain letters, which Relax keeps) stays in the returned text.
		return Relax(text)
	}

	text = leadingWhitespaceRe.ReplaceAllString(text, "")
	return strings.Replace(text, "thisislikelyalicenseheaderplaceholder", "", -1)
}

// applySPDXGuidelines runs the strictness-independent rewriting steps of LicenseText, in the
// order in which they depend on each other.
func applySPDXGuidelines(text string) string {
	// Line endings: "\r\n" and "\r" become "\n" so that the line-based patterns work.
	text = lineEndingsRe.ReplaceAllString(text, "\n")

	// 4. Capitalization: every later pattern and the word list assume lower case.
	text = strings.ToLower(text)

	// 3. Whitespace: one blank per run, nothing at line ends.
	text = whitespaceRe.ReplaceAllString(text, " ")
	text = trailingWhitespaceRe.ReplaceAllString(text, "")
	// Fill the empty line after a "license" heading with a placeholder line before the empty
	// lines are stripped.
	text = licenseHeaderRe.ReplaceAllString(text, "$1\nthisislikelyalicenseheaderplaceholder\n")
	text = leadingWhitespaceRe.ReplaceAllString(text, "")

	// 5. Punctuation: one spelling for dashes, one for quotes.
	text = punctuationRe.ReplaceAllString(text, "-")
	text = quotesRe.ReplaceAllString(text, "\"")

	// 7. Bullets and Numbering
	text = bulletRe.ReplaceAllString(text, "")

	// 8. Varietal Word Spelling
	text = wordReplacer.Replace(text)

	// 9. Copyright Symbol
	text = copyrightRe.ReplaceAllString(text, "©")
	text = trademarkRe.ReplaceAllString(text, "™")

	// fix broken URLs in SPDX source texts
	text = brokenLinkRe.ReplaceAllString(text, "https://")

	// fix URLs in <> - erase the decoration
	return urlCleanupRe.ReplaceAllString(text, "$1")
}

// squeezeRepeatedSymbols drops every rune that repeats the rune right before it, unless the
// rune is a letter or a digit: "a--b,,  c" becomes "a-b, c" while "aa11" is left alone.
// Newlines are squeezed like any other symbol.
func squeezeRepeatedSymbols(text string) string {
	var squeezed strings.Builder
	squeezed.Grow(len(text))
	// The comparison starts against NUL, so a NUL rune at the very beginning is dropped.
	prev := '\x00'
	for _, r := range text {
		if r == prev && !(unicode.IsLetter(r) || unicode.IsDigit(r)) {
			continue
		}
		prev = r
		squeezed.WriteRune(r)
	}
	return squeezed.String()
}

// Relax applies very aggressive normalization rules to text: it removes the runes of the
// unicode.Mn category, deletes everything matched by nonAlphaNumRe, strips what
// leadingWhitespaceRe matches and finally shortens double spaces.
func Relax(text string) string {
	// Decompose (NFD), drop the runes in unicode.Mn, then compose again (NFC).
	stripMarks := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)

	var out bytes.Buffer
	w := transform.NewWriter(&out, stripMarks)
	// Errors are ignored on purpose: whatever reached the buffer is what gets used.
	_, _ = w.Write([]byte(text))
	_ = w.Close()

	relaxed := nonAlphaNumRe.ReplaceAllString(out.String(), "")
	relaxed = leadingWhitespaceRe.ReplaceAllString(relaxed, "")
	// A single, non-overlapping pass: each pair of spaces becomes one space, so a run of
	// three or more spaces gets shorter but is not necessarily reduced to a single space.
	return strings.Replace(relaxed, "  ", " ", -1)
}

// Split applies heuristics to split the text into several parts. The first element of the
// result is always the full text. When splitRe finds at least one separator, the pieces
// between the separators follow it.
func Split(text string) []string {
	parts := splitRe.Split(text, -1)
	if len(parts) < 2 {
		// No separator found: the full text is the only candidate.
		return []string{text}
	}

	candidates := make([]string, 0, 1+len(parts))
	candidates = append(candidates, text)
	return append(candidates, parts...)
}