1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309
|
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package fuzzy
import (
"bytes"
"fmt"
"log"
"unicode"
)
// SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols
// of the form:
//
// example.com/path/to/package.object.field
//
// Knowing that we are matching symbols like this allows us to make the
// following optimizations:
// - We can incorporate right-to-left relevance directly into the score
// calculation.
// - We can match from right to left, discarding leading bytes if the input is
// too long.
// - We just take the right-most match without losing too much precision. This
// allows us to use an O(n) algorithm.
// - We can operate directly on chunked strings; in many cases we will
// be storing the package path and/or package name separately from the
// symbol or identifiers, so doing this avoids allocating strings.
// - We can return the index of the right-most match, allowing us to trim
// irrelevant qualification.
type SymbolMatcher struct {
// Using buffers of length 256 is both a reasonable size for most qualified
// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
pattern [256]rune
patternLen uint8
inputBuffer [256]rune // avoid allocating when considering chunks
roles [256]uint32 // which roles does a rune play (word start, etc.)
segments [256]uint8 // how many segments from the right is each rune
}
// Rune roles.
const (
segmentStart uint32 = 1 << iota // input rune starts a segment (i.e. follows '/' or '.')
wordStart // input rune starts a word, per camel-case naming rules
separator // input rune is a separator ('/' or '.')
upper // input rune is an upper case letter
)
// NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
// search pattern.
//
// Currently this matcher only accepts case-insensitive fuzzy patterns.
//
// An empty pattern matches no input.
func NewSymbolMatcher(pattern string) *SymbolMatcher {
m := &SymbolMatcher{}
for _, p := range pattern {
m.pattern[m.patternLen] = unicode.ToLower(p)
m.patternLen++
if m.patternLen == 255 || int(m.patternLen) == len(pattern) {
// break at 255 so that we can represent patternLen with a uint8.
break
}
}
return m
}
// Match searches for the right-most match of the search pattern within the
// symbol represented by concatenating the given chunks.
//
// If a match is found, the first result holds the absolute byte offset within
// all chunks for the start of the symbol. In other words, the index of the
// match within strings.Join(chunks, "").
//
// The second return value will be the score of the match, which is always
// between 0 and 1, inclusive. A score of 0 indicates no match.
//
// If no match is found, Match returns (-1, 0).
func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
// Explicit behavior for an empty pattern.
//
// As a minor optimization, this also avoids nilness checks later on, since
// the compiler can prove that m != nil.
if m.patternLen == 0 {
return -1, 0
}
// Matching implements a heavily optimized linear scoring algorithm on the
// input. This is not guaranteed to produce the highest score, but works well
// enough, particularly due to the right-to-left significance of qualified
// symbols.
//
// Matching proceeds in three passes through the input:
// - The first pass populates the input buffer and collects rune roles.
// - The second pass proceeds right-to-left to find the right-most match.
// - The third pass proceeds left-to-right from the start of the right-most
// match, to find the most *compact* match, and computes the score of this
// match.
//
// See below for more details of each pass, as well as the scoring algorithm.
// First pass: populate the input buffer out of the provided chunks
// (lower-casing in the process), and collect rune roles.
//
// We could also check for a forward match here, but since we'd have to write
// the entire input anyway this has negligible impact on performance.
var (
inputLen = uint8(0)
modifiers = wordStart | segmentStart
)
input:
for _, chunk := range chunks {
for _, r := range chunk {
if r == '.' || r == '/' {
modifiers |= separator
}
// optimization: avoid calls to unicode.ToLower, which can't be inlined.
l := r
if r <= unicode.MaxASCII {
if 'A' <= r && r <= 'Z' {
l = r + 'a' - 'A'
}
} else {
l = unicode.ToLower(r)
}
if l != r {
modifiers |= upper
// If the current rune is capitalized *and the preceding rune was not*,
// mark this as a word start. This avoids spuriously high ranking of
// non-camelcase naming schemas, such as the
// yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE example of
// golang/go#60201.
if inputLen == 0 || m.roles[inputLen-1]&upper == 0 {
modifiers |= wordStart
}
}
m.inputBuffer[inputLen] = l
m.roles[inputLen] = modifiers
inputLen++
if m.roles[inputLen-1]&separator != 0 {
modifiers = wordStart | segmentStart
} else {
modifiers = 0
}
// TODO: we should prefer the right-most input if it overflows, rather
// than the left-most as we're doing here.
if inputLen == 255 {
break input
}
}
}
// Second pass: find the right-most match, and count segments from the
// right.
var (
pi = uint8(m.patternLen - 1) // pattern index
p = m.pattern[pi] // pattern rune
start = -1 // start offset of match
rseg = uint8(0) // effective "depth" from the right of the current rune in consideration
)
const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.
for ii := inputLen - 1; ; ii-- {
r := m.inputBuffer[ii]
if rseg < maxSeg && m.roles[ii]&separator != 0 {
rseg++
}
m.segments[ii] = rseg
if p == r {
if pi == 0 {
// TODO(rfindley): BUG: the docstring for Match says that it returns an
// absolute byte offset, but clearly it is returning a rune offset here.
start = int(ii)
break
}
pi--
p = m.pattern[pi]
}
// Don't check ii >= 0 in the loop condition: ii is a uint8.
if ii == 0 {
break
}
}
if start < 0 {
// no match: skip scoring
return -1, 0
}
// Third pass: find the shortest match and compute the score.
// Score is the average score for each rune.
//
// A rune score is the multiple of:
// 1. The base score, which is 1.0 if the rune starts a segment, 0.9 if the
// rune starts a mid-segment word, else 0.6.
//
// Runes preceded by a matching rune are treated the same as the start
// of a mid-segment word (with a 0.9 score), so that sequential or exact
// matches are preferred. We call this a sequential bonus.
//
// For the final rune match, this sequential bonus is reduced to 0.8 if
// the next rune in the input is a mid-segment word, or 0.7 if the next
// rune in the input is not a word or segment start. This ensures that
// we favor whole-word or whole-segment matches over prefix matches.
//
// 2. 1.0 if the rune is part of the last segment, otherwise
// 1.0-0.1*<segments from the right>, with a max segment count of 3.
// Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very
// early in a qualified symbol name) still scores higher than _f_o_o_ (a
// completely split match).
//
// This is a naive algorithm, but it is fast. There's lots of prior art here
// that could be leveraged. For example, we could explicitly consider
// rune distance, and exact matches of words or segments.
//
// Also note that this might not actually find the highest scoring match, as
// doing so could require a non-linear algorithm, depending on how the score
// is calculated.
// debugging support
const debug = false // enable to log debugging information
var (
runeScores []float64
runeIdxs []int
)
pi = 0
p = m.pattern[pi]
const (
segStartScore = 1.0 // base score of runes starting a segment
wordScore = 0.9 // base score of runes starting or continuing a word
noStreak = 0.6
perSegment = 0.1 // we count at most 3 segments above
)
totScore := 0.0
lastMatch := uint8(255)
for ii := uint8(start); ii < inputLen; ii++ {
r := m.inputBuffer[ii]
if r == p {
pi++
finalRune := pi >= m.patternLen
p = m.pattern[pi]
baseScore := noStreak
// Calculate the sequence bonus based on preceding matches.
//
// We do this first as it is overridden by role scoring below.
if lastMatch == ii-1 {
baseScore = wordScore
// Reduce the sequence bonus for the final rune of the pattern based on
// whether it borders a new segment or word.
if finalRune {
switch {
case ii == inputLen-1 || m.roles[ii+1]&separator != 0:
// Full segment: no reduction
case m.roles[ii+1]&wordStart != 0:
baseScore = wordScore - 0.1
default:
baseScore = wordScore - 0.2
}
}
}
lastMatch = ii
// Calculate the rune's role score. If the rune starts a segment or word,
// this overrides the sequence score, as the rune starts a new sequence.
switch {
case m.roles[ii]&segmentStart != 0:
baseScore = segStartScore
case m.roles[ii]&wordStart != 0:
baseScore = wordScore
}
// Apply the segment-depth penalty (segments from the right).
runeScore := baseScore * (1.0 - float64(m.segments[ii])*perSegment)
if debug {
runeScores = append(runeScores, runeScore)
runeIdxs = append(runeIdxs, int(ii))
}
totScore += runeScore
if finalRune {
break
}
}
}
if debug {
// Format rune roles and scores in line:
// fo[o:.52].[b:1]a[r:.6]
var summary bytes.Buffer
last := 0
for i, idx := range runeIdxs {
summary.WriteString(string(m.inputBuffer[last:idx])) // encode runes
fmt.Fprintf(&summary, "[%s:%.2g]", string(m.inputBuffer[idx]), runeScores[i])
last = idx + 1
}
summary.WriteString(string(m.inputBuffer[last:inputLen])) // encode runes
log.Println(summary.String())
}
return start, totScore / float64(m.patternLen)
}
|