1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254
|
/*
Package fuzzy provides fuzzy string matching optimized
for filenames and code symbols in the style of Sublime Text,
VSCode, IntelliJ IDEA et al.
*/
package fuzzy
import (
"sort"
"unicode"
"unicode/utf8"
)
// Match represents a matched string.
type Match struct {
// The matched string.
Str string
// The index of the matched string in the supplied slice.
Index int
// The indexes of matched characters. Useful for highlighting matches.
MatchedIndexes []int
// Score used to rank matches
Score int
}
const (
firstCharMatchBonus = 10
matchFollowingSeparatorBonus = 20
camelCaseMatchBonus = 20
adjacentMatchBonus = 5
unmatchedLeadingCharPenalty = -5
maxUnmatchedLeadingCharPenalty = -15
)
var separators = []rune("/-_ .\\")
// Matches is a slice of Match structs
type Matches []Match
func (a Matches) Len() int { return len(a) }
func (a Matches) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a Matches) Less(i, j int) bool { return a[i].Score >= a[j].Score }
// Source represents an abstract source of a list of strings. Source must be iterable type such as a slice.
// The source will be iterated over till Len() with String(i) being called for each element where i is the
// index of the element. You can find a working example in the README.
type Source interface {
// The string to be matched at position i.
String(i int) string
// The length of the source. Typically is the length of the slice of things that you want to match.
Len() int
}
type stringSource []string
func (ss stringSource) String(i int) string {
return ss[i]
}
func (ss stringSource) Len() int { return len(ss) }
/*
Find looks up pattern in data and returns matches
in descending order of match quality. Match quality
is determined by a set of bonus and penalty rules.
The following types of matches apply a bonus:
* The first character in the pattern matches the first character in the match string.
* The matched character is camel cased.
* The matched character follows a separator such as an underscore character.
* The matched character is adjacent to a previous match.
Penalties are applied for every character in the search string that wasn't matched and all leading
characters upto the first match.
Results are sorted by best match.
*/
func Find(pattern string, data []string) Matches {
return FindFrom(pattern, stringSource(data))
}
/*
FindNoSort is an alternative Find implementation that does not sort
the results in the end.
*/
func FindNoSort(pattern string, data []string) Matches {
return FindFromNoSort(pattern, stringSource(data))
}
/*
FindFrom is an alternative implementation of Find using a Source
instead of a list of strings.
*/
func FindFrom(pattern string, data Source) Matches {
matches := FindFromNoSort(pattern, data)
sort.Stable(matches)
return matches
}
/*
FindFromNoSort is an alternative FindFrom implementation that does
not sort results in the end.
*/
func FindFromNoSort(pattern string, data Source) Matches {
if len(pattern) == 0 {
return nil
}
runes := []rune(pattern)
var matches Matches
var matchedIndexes []int
for i := 0; i < data.Len(); i++ {
var match Match
match.Str = data.String(i)
match.Index = i
if matchedIndexes != nil {
match.MatchedIndexes = matchedIndexes
} else {
match.MatchedIndexes = make([]int, 0, len(runes))
}
var score int
patternIndex := 0
bestScore := -1
matchedIndex := -1
currAdjacentMatchBonus := 0
var last rune
var lastIndex int
nextc, nextSize := utf8.DecodeRuneInString(data.String(i))
var candidate rune
var candidateSize int
for j := 0; j < len(data.String(i)); j += candidateSize {
candidate, candidateSize = nextc, nextSize
if equalFold(candidate, runes[patternIndex]) {
score = 0
if j == 0 {
score += firstCharMatchBonus
}
if unicode.IsLower(last) && unicode.IsUpper(candidate) {
score += camelCaseMatchBonus
}
if j != 0 && isSeparator(last) {
score += matchFollowingSeparatorBonus
}
if len(match.MatchedIndexes) > 0 {
lastMatch := match.MatchedIndexes[len(match.MatchedIndexes)-1]
bonus := adjacentCharBonus(lastIndex, lastMatch, currAdjacentMatchBonus)
score += bonus
// adjacent matches are incremental and keep increasing based on previous adjacent matches
// thus we need to maintain the current match bonus
currAdjacentMatchBonus += bonus
}
if score > bestScore {
bestScore = score
matchedIndex = j
}
}
var nextp rune
if patternIndex < len(runes)-1 {
nextp = runes[patternIndex+1]
}
if j+candidateSize < len(data.String(i)) {
if data.String(i)[j+candidateSize] < utf8.RuneSelf { // Fast path for ASCII
nextc, nextSize = rune(data.String(i)[j+candidateSize]), 1
} else {
nextc, nextSize = utf8.DecodeRuneInString(data.String(i)[j+candidateSize:])
}
} else {
nextc, nextSize = 0, 0
}
// We apply the best score when we have the next match coming up or when the search string has ended.
// Tracking when the next match is coming up allows us to exhaustively find the best match and not necessarily
// the first match.
// For example given the pattern "tk" and search string "The Black Knight", exhaustively matching allows us
// to match the second k thus giving this string a higher score.
if equalFold(nextp, nextc) || nextc == 0 {
if matchedIndex > -1 {
if len(match.MatchedIndexes) == 0 {
penalty := matchedIndex * unmatchedLeadingCharPenalty
bestScore += max(penalty, maxUnmatchedLeadingCharPenalty)
}
match.Score += bestScore
match.MatchedIndexes = append(match.MatchedIndexes, matchedIndex)
score = 0
bestScore = -1
patternIndex++
}
}
lastIndex = j
last = candidate
}
// apply penalty for each unmatched character
penalty := len(match.MatchedIndexes) - len(data.String(i))
match.Score += penalty
if len(match.MatchedIndexes) == len(runes) {
matches = append(matches, match)
matchedIndexes = nil
} else {
matchedIndexes = match.MatchedIndexes[:0] // Recycle match index slice
}
}
return matches
}
// Taken from strings.EqualFold
func equalFold(tr, sr rune) bool {
if tr == sr {
return true
}
if tr < sr {
tr, sr = sr, tr
}
// Fast check for ASCII.
if tr < utf8.RuneSelf {
// ASCII, and sr is upper case. tr must be lower case.
if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
return true
}
return false
}
// General case. SimpleFold(x) returns the next equivalent rune > x
// or wraps around to smaller values.
r := unicode.SimpleFold(sr)
for r != sr && r < tr {
r = unicode.SimpleFold(r)
}
return r == tr
}
func adjacentCharBonus(i int, lastMatch int, currentBonus int) int {
if lastMatch == i {
return currentBonus*2 + adjacentMatchBonus
}
return 0
}
func isSeparator(s rune) bool {
for _, sep := range separators {
if s == sep {
return true
}
}
return false
}
func max(x int, y int) int {
if x > y {
return x
}
return y
}
|