1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
|
// Fuzzy searching allows for flexibly matching a string with partial input,
// useful for filtering data very quickly based on lightweight user input.
package fuzzy
import (
"bytes"
"unicode"
"unicode/utf8"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)
func noopTransformer() transform.Transformer {
return transform.Nop
}
func foldTransformer() transform.Transformer {
return unicodeFoldTransformer{}
}
func normalizeTransformer() transform.Transformer {
return transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
}
func normalizedFoldTransformer() transform.Transformer {
return transform.Chain(normalizeTransformer(), foldTransformer())
}
// Match returns true if source matches target using a fuzzy-searching
// algorithm. Note that it doesn't implement Levenshtein distance (see
// RankMatch instead), but rather a simplified version where there's no
// approximation. The method will return true only if each character in the
// source can be found in the target and occurs after the preceding matches.
func Match(source, target string) bool {
return match(source, target, noopTransformer())
}
// MatchFold is a case-insensitive version of Match.
func MatchFold(source, target string) bool {
return match(source, target, foldTransformer())
}
// MatchNormalized is a unicode-normalized version of Match.
func MatchNormalized(source, target string) bool {
return match(source, target, normalizeTransformer())
}
// MatchNormalizedFold is a unicode-normalized and case-insensitive version of Match.
func MatchNormalizedFold(source, target string) bool {
return match(source, target, normalizedFoldTransformer())
}
func match(source, target string, transformer transform.Transformer) bool {
source = stringTransform(source, transformer)
target = stringTransform(target, transformer)
lenDiff := len(target) - len(source)
if lenDiff < 0 {
return false
}
if lenDiff == 0 && source == target {
return true
}
Outer:
for _, r1 := range source {
for i, r2 := range target {
if r1 == r2 {
target = target[i+utf8.RuneLen(r2):]
continue Outer
}
}
return false
}
return true
}
// Find will return a list of strings in targets that fuzzy matches source.
func Find(source string, targets []string) []string {
return find(source, targets, noopTransformer())
}
// FindFold is a case-insensitive version of Find.
func FindFold(source string, targets []string) []string {
return find(source, targets, foldTransformer())
}
// FindNormalized is a unicode-normalized version of Find.
func FindNormalized(source string, targets []string) []string {
return find(source, targets, normalizeTransformer())
}
// FindNormalizedFold is a unicode-normalized and case-insensitive version of Find.
func FindNormalizedFold(source string, targets []string) []string {
return find(source, targets, normalizedFoldTransformer())
}
func find(source string, targets []string, transformer transform.Transformer) []string {
var matches []string
for _, target := range targets {
if match(source, target, transformer) {
matches = append(matches, target)
}
}
return matches
}
// RankMatch is similar to Match except it will measure the Levenshtein
// distance between the source and the target and return its result. If there
// was no match, it will return -1.
// Given the requirements of match, RankMatch only needs to perform a subset of
// the Levenshtein calculation, only deletions need be considered, required
// additions and substitutions would fail the match test.
func RankMatch(source, target string) int {
return rank(source, target, noopTransformer())
}
// RankMatchFold is a case-insensitive version of RankMatch.
func RankMatchFold(source, target string) int {
return rank(source, target, foldTransformer())
}
// RankMatchNormalized is a unicode-normalized version of RankMatch.
func RankMatchNormalized(source, target string) int {
return rank(source, target, normalizeTransformer())
}
// RankMatchNormalizedFold is a unicode-normalized and case-insensitive version of RankMatch.
func RankMatchNormalizedFold(source, target string) int {
return rank(source, target, normalizedFoldTransformer())
}
func rank(source, target string, transformer transform.Transformer) int {
lenDiff := len(target) - len(source)
if lenDiff < 0 {
return -1
}
source = stringTransform(source, transformer)
target = stringTransform(target, transformer)
if lenDiff == 0 && source == target {
return 0
}
runeDiff := 0
Outer:
for _, r1 := range source {
for i, r2 := range target {
if r1 == r2 {
target = target[i+utf8.RuneLen(r2):]
continue Outer
} else {
runeDiff++
}
}
return -1
}
// Count up remaining char
runeDiff += utf8.RuneCountInString(target)
return runeDiff
}
// RankFind is similar to Find, except it will also rank all matches using
// Levenshtein distance.
func RankFind(source string, targets []string) Ranks {
return rankFind(source, targets, noopTransformer())
}
// RankFindFold is a case-insensitive version of RankFind.
func RankFindFold(source string, targets []string) Ranks {
return rankFind(source, targets, foldTransformer())
}
// RankFindNormalized is a unicode-normalized version of RankFind.
func RankFindNormalized(source string, targets []string) Ranks {
return rankFind(source, targets, normalizeTransformer())
}
// RankFindNormalizedFold is a unicode-normalized and case-insensitive version of RankFind.
func RankFindNormalizedFold(source string, targets []string) Ranks {
return rankFind(source, targets, normalizedFoldTransformer())
}
func rankFind(source string, targets []string, transformer transform.Transformer) Ranks {
var r Ranks
for index, target := range targets {
if match(source, target, transformer) {
distance := LevenshteinDistance(source, target)
r = append(r, Rank{source, target, distance, index})
}
}
return r
}
type Rank struct {
// Source is used as the source for matching.
Source string
// Target is the word matched against.
Target string
// Distance is the Levenshtein distance between Source and Target.
Distance int
// Location of Target in original list
OriginalIndex int
}
type Ranks []Rank
func (r Ranks) Len() int {
return len(r)
}
func (r Ranks) Swap(i, j int) {
r[i], r[j] = r[j], r[i]
}
func (r Ranks) Less(i, j int) bool {
return r[i].Distance < r[j].Distance
}
func stringTransform(s string, t transform.Transformer) (transformed string) {
var err error
transformed, _, err = transform.String(t, s)
if err != nil {
transformed = s
}
return
}
type unicodeFoldTransformer struct{}
func (unicodeFoldTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
runes := bytes.Runes(src)
var lowerRunes []rune
for _, r := range runes {
lowerRunes = append(lowerRunes, unicode.ToLower(r))
}
srcBytes := []byte(string(lowerRunes))
n := copy(dst, srcBytes)
if n < len(srcBytes) {
err = transform.ErrShortDst
}
return n, n, err
}
func (unicodeFoldTransformer) Reset() {}
|