1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
|
package normalize
import (
"bytes"
"regexp"
"strings"
"unicode"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)
// Patterns and replacement tables used by LicenseText, Relax and Split.
// Section numbers in the comments refer to the SPDX matching guidelines.
var (
	// lineEndingsRe matches "\r\n" or a lone "\r".
	lineEndingsRe = regexp.MustCompile(`\r\n?`)

	// 3.1.1 All whitespace should be treated as a single blank space.
	whitespaceRe = regexp.MustCompile(`[ \t\f\r ]+`)
	// trailingWhitespaceRe matches one whitespace character at the end of a line.
	trailingWhitespaceRe = regexp.MustCompile(`(?m)[ \t\f\r ]$`)
	// licenseHeaderRe matches "license"/"licence", an optional dot and the blank
	// line that follows; the word itself is capture group 1.
	licenseHeaderRe = regexp.MustCompile(`(licen[cs]e)\.?\n\n`)
	// leadingWhitespaceRe matches, at the start of a line, either a single space
	// (optionally followed by a newline) or a bare newline.
	leadingWhitespaceRe = regexp.MustCompile(`(?m)^(( \n?)|\n)`)

	// 5.1.2 Hyphens, Dashes Any hyphen, dash, en dash, em dash, or other variation should be
	// considered equivalent.
	punctuationRe = regexp.MustCompile(`[-‒–—―⁓⸺⸻~˗‐‑⁃⁻₋−∼⎯⏤─➖𐆑֊﹘﹣-]+`)

	// 5.1.3 Quotes Any variation of quotations (single, double, curly, etc.) should be considered
	// equivalent.
	quotesRe = regexp.MustCompile(`["'“”‘’„‚«»‹›❛❜❝❞\x60]+`)

	// 7.1.1 Where a line starts with a bullet, number, letter, or some form of a list item
	// (determined where list item is followed by a space, then the text of the sentence), ignore
	// the list item for matching purposes.
	bulletRe = regexp.MustCompile(`(?m)^(([-*✱﹡•●⚫⏺🞄∙⋅])|([(\[{]?\d+[.)\]}] ?)|([(\[{]?[a-z][.)\]}] ?)|([(\[{]?i+[.)\]} ] ?))`)

	// 8.1.1 The words in the following columns are considered equivalent and interchangeable.
	//
	// strings.Replacer makes a single pass over the text and, when several keys
	// match at the same position, the pair listed first wins. Two consequences
	// are handled explicitly below:
	//   - a replacement is never re-scanned, so every spelling has to map
	//     directly to its final form ("sub license" -> "sublicense");
	//   - when the final form starts with its own key ("fulfil" -> "fulfill"),
	//     an identity pair for the final form is listed first so that text which
	//     is already normalized is not extended again ("fulfill" -> "fulfilll").
	wordReplacer = strings.NewReplacer(
		"acknowledgment", "acknowledgement",
		"analogue", "analog",
		"analyse", "analyze",
		"artefact", "artifact",
		"authorisation", "authorization",
		"authorised", "authorized",
		"calibre", "caliber",
		"cancelled", "canceled",
		"capitalisations", "capitalizations",
		"catalogue", "catalog",
		"categorise", "categorize",
		"centre", "center",
		"emphasised", "emphasized",
		"favour", "favor",
		"favourite", "favorite",
		"fulfill", "fulfill", // guard: keep the already-normalized spelling intact
		"fulfil", "fulfill",
		"fulfilment", "fulfillment",
		"initialise", "initialize",
		"judgment", "judgement",
		"labelling", "labeling",
		"labour", "labor",
		"licence", "license",
		"maximise", "maximize",
		"modelled", "modeled",
		"modelling", "modeling",
		"offence", "offense",
		"optimise", "optimize",
		"organisation", "organization",
		"organise", "organize",
		"practise", "practice",
		"programme", "program",
		"realise", "realize",
		"recognise", "recognize",
		"signalling", "signaling",
		"sub-license", "sublicense",
		"sub license", "sublicense",
		"utilisation", "utilization",
		"whilst", "while",
		"wilfull", "wilfull", // guard: keep the already-normalized spelling intact
		"wilful", "wilfull",
		"non-commercial", "noncommercial",
		"per cent", "percent",
		"copyright owner", "copyright",
	)

	// 9.1.1 "©", "(c)", or "Copyright" should be considered equivalent and interchangeable.
	copyrightRe = regexp.MustCompile(`copyright|\(c\)`)
	// trademarkRe matches "trademark", "trademarks" or "(tm)".
	trademarkRe = regexp.MustCompile(`trademark(s?)|\(tm\)`)

	// extra cleanup

	// brokenLinkRe matches the literal broken scheme "http s ://".
	brokenLinkRe = regexp.MustCompile(`http s ://`)
	// urlCleanupRe matches an http(s) URL wrapped in <...> or (...); the bare URL
	// is capture group 1.
	urlCleanupRe = regexp.MustCompile(`[<(](http(s?)://[^\s]+)[)>]`)
	// copyrightLineRe matches whole lines that start with "©", or that consist of
	// "all rights reserved" (optional trailing dot) or a license/licence spelling.
	copyrightLineRe = regexp.MustCompile(`(?m)^((©.*)|(all rights reserved(\.)?)|(li[cs]en[cs]e))\n`)
	// nonAlphaNumRe matches every character except hyphen, space, newline,
	// a-z and 0-9.
	nonAlphaNumRe = regexp.MustCompile(`[^- \na-z0-9]`)

	// used in Split(): a run of at least three characters outside
	// [a-zA-Z0-9_,()] enclosed between newlines, optionally padded by whitespace.
	splitRe = regexp.MustCompile(`\n\s*[^a-zA-Z0-9_,()]{3,}\s*\n`)
)
// Strictness selects how aggressively a text is normalized: a higher value
// means more normalization. The defined levels are `Enforced`, `Moderate`
// and `Relaxed`.
type Strictness int

// Normalization levels, ordered from the least to the most aggressive.
const (
	// Enforced is the strictest mode - nothing beyond the official SPDX
	// guidelines is applied.
	Enforced Strictness = iota
	// Moderate does everything Enforced does and additionally removes dots
	// and copyright lines.
	Moderate
	// Relaxed is the most powerful normalization: Moderate plus Unicode
	// normalization and removal of all non-alphanumeric characters.
	Relaxed
)
// LicenseText prepares a license text for analysis by normalizing it
// according to the SPDX matching guidelines, see
// https://spdx.org/spdx-license-list/matching-guidelines
//
// strictness controls how far the normalization goes: from Moderate upwards
// dots and copyright lines are stripped too, and from Relaxed upwards the
// result is handed to Relax and returned as is.
func LicenseText(text string, strictness Strictness) string {
	// Marker inserted after a probable license header; it is stripped again
	// at the end unless the text is returned through Relax.
	const headerMark = "thisislikelyalicenseheaderplaceholder"

	// Unified line endings, then guideline 4 (capitalization).
	normalized := strings.ToLower(lineEndingsRe.ReplaceAllString(text, "\n"))

	// Guideline 3 (whitespace), including the header marker.
	normalized = whitespaceRe.ReplaceAllString(normalized, " ")
	normalized = trailingWhitespaceRe.ReplaceAllString(normalized, "")
	normalized = licenseHeaderRe.ReplaceAllString(normalized, "$1\n"+headerMark+"\n")
	normalized = leadingWhitespaceRe.ReplaceAllString(normalized, "")

	// Guideline 5 (punctuation): one spelling for dashes and for quotes.
	normalized = punctuationRe.ReplaceAllString(normalized, "-")
	normalized = quotesRe.ReplaceAllString(normalized, "\"")

	// Guideline 7 (bullets and numbering).
	normalized = bulletRe.ReplaceAllString(normalized, "")

	// Guideline 8 (varietal word spelling).
	normalized = wordReplacer.Replace(normalized)

	// Guideline 9 (copyright symbol), plus the trademark counterpart.
	normalized = copyrightRe.ReplaceAllString(normalized, "©")
	normalized = trademarkRe.ReplaceAllString(normalized, "™")

	// Repair the broken scheme found in SPDX source texts and drop the
	// <> / () decoration around URLs.
	normalized = brokenLinkRe.ReplaceAllString(normalized, "https://")
	normalized = urlCleanupRe.ReplaceAllString(normalized, "$1")

	normalized = dedupeAdjacentSymbols(normalized)

	if strictness >= Moderate {
		// Trailing dots are a common source of mismatches.
		normalized = strings.Replace(normalized, ".", "", -1)
		// Copyright lines are usually custom and may occur several times.
		normalized = copyrightLineRe.ReplaceAllString(normalized, "")

		if strictness >= Relaxed {
			return Relax(normalized)
		}
	}

	normalized = leadingWhitespaceRe.ReplaceAllString(normalized, "")
	return strings.Replace(normalized, headerMark, "", -1)
}

// dedupeAdjacentSymbols drops every rune that is neither a letter nor a digit
// and equals the rune kept immediately before it, so runs of one repeated
// symbol shrink to a single occurrence.
func dedupeAdjacentSymbols(s string) string {
	var out bytes.Buffer
	var prev rune // zero value, so a leading NUL rune is dropped
	for _, r := range s {
		if unicode.IsLetter(r) || unicode.IsDigit(r) || r != prev {
			prev = r
			out.WriteRune(r)
		}
	}
	return out.String()
}
// Relax applies very aggressive normalization rules to text and returns the
// result:
//   - the text is decomposed (NFD), runes of the Unicode category Mn
//     (nonspacing marks) are removed, and the text is recomposed (NFC);
//   - every character other than hyphen, space, newline, a-z and 0-9 is
//     deleted;
//   - leading spaces and empty lines are removed;
//   - pairs of spaces left behind by the deletions are replaced by one space.
func Relax(text string) string {
	buffer := &bytes.Buffer{}
	writer := transform.NewWriter(
		buffer, transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC))
	// Best effort: the function has no error result and the sink is an
	// in-memory buffer, so write/close errors are deliberately discarded.
	_, _ = writer.Write([]byte(text))
	_ = writer.Close()
	text = buffer.String()
	text = nonAlphaNumRe.ReplaceAllString(text, "")
	text = leadingWhitespaceRe.ReplaceAllString(text, "")
	// Deleting a symbol that stood between two spaces leaves a double space.
	// This is a single pass: each non-overlapping pair becomes one space.
	text = strings.Replace(text, "  ", " ", -1)
	return text
}
// Split heuristically breaks text into parts at the separators matched by
// splitRe. The first element of the result is always the full, unsplit text;
// when at least one separator is found, the individual pieces follow it.
func Split(text string) []string {
	pieces := splitRe.Split(text, -1)
	if len(pieces) <= 1 {
		// No separator: the full text is the only candidate.
		return []string{text}
	}
	parts := make([]string, 0, len(pieces)+1)
	parts = append(parts, text)
	return append(parts, pieces...)
}
|