File: symbol.go

package info (click to toggle)
golang-github-cue-lang-cue 0.14.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 19,644 kB
  • sloc: makefile: 20; sh: 15
file content (309 lines) | stat: -rw-r--r-- 10,341 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package fuzzy

import (
	"bytes"
	"fmt"
	"log"
	"unicode"
)

// SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols
// of the form:
//
//	example.com/path/to/package.object.field
//
// Knowing that we are matching symbols like this allows us to make the
// following optimizations:
//   - We can incorporate right-to-left relevance directly into the score
//     calculation.
//   - We can match from right to left, discarding leading bytes if the input is
//     too long.
//   - We just take the right-most match without losing too much precision. This
//     allows us to use an O(n) algorithm.
//   - We can operate directly on chunked strings; in many cases we will
//     be storing the package path and/or package name separately from the
//     symbol or identifiers, so doing this avoids allocating strings.
//   - We can return the index of the right-most match, allowing us to trim
//     irrelevant qualification.
type SymbolMatcher struct {
	// Using buffers of length 256 is both a reasonable size for most qualified
	// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
	pattern     [256]rune
	patternLen  uint8
	inputBuffer [256]rune   // avoid allocating when considering chunks
	roles       [256]uint32 // which roles does a rune play (word start, etc.)
	segments    [256]uint8  // how many segments from the right is each rune
}

// Rune roles.
const (
	segmentStart uint32 = 1 << iota // input rune starts a segment (i.e. follows '/' or '.')
	wordStart                       // input rune starts a word, per camel-case naming rules
	separator                       // input rune is a separator ('/' or '.')
	upper                           // input rune is an upper case letter
)

// NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
// search pattern.
//
// Currently this matcher only accepts case-insensitive fuzzy patterns.
//
// An empty pattern matches no input.
func NewSymbolMatcher(pattern string) *SymbolMatcher {
	m := &SymbolMatcher{}
	for _, p := range pattern {
		m.pattern[m.patternLen] = unicode.ToLower(p)
		m.patternLen++
		if m.patternLen == 255 || int(m.patternLen) == len(pattern) {
			// break at 255 so that we can represent patternLen with a uint8.
			break
		}
	}
	return m
}

// Match searches for the right-most match of the search pattern within the
// symbol represented by concatenating the given chunks.
//
// If a match is found, the first result holds the absolute byte offset within
// all chunks for the start of the symbol. In other words, the index of the
// match within strings.Join(chunks, "").
//
// The second return value will be the score of the match, which is always
// between 0 and 1, inclusive. A score of 0 indicates no match.
//
// If no match is found, Match returns (-1, 0).
func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
	// Explicit behavior for an empty pattern.
	//
	// As a minor optimization, this also avoids nilness checks later on, since
	// the compiler can prove that m != nil.
	if m.patternLen == 0 {
		return -1, 0
	}

	// Matching implements a heavily optimized linear scoring algorithm on the
	// input. This is not guaranteed to produce the highest score, but works well
	// enough, particularly due to the right-to-left significance of qualified
	// symbols.
	//
	// Matching proceeds in three passes through the input:
	//  - The first pass populates the input buffer and collects rune roles.
	//  - The second pass proceeds right-to-left to find the right-most match.
	//  - The third pass proceeds left-to-right from the start of the right-most
	//    match, to find the most *compact* match, and computes the score of this
	//    match.
	//
	// See below for more details of each pass, as well as the scoring algorithm.

	// First pass: populate the input buffer out of the provided chunks
	// (lower-casing in the process), and collect rune roles.
	//
	// We could also check for a forward match here, but since we'd have to write
	// the entire input anyway this has negligible impact on performance.
	var (
		inputLen  = uint8(0)
		modifiers = wordStart | segmentStart
	)

input:
	for _, chunk := range chunks {
		for _, r := range chunk {
			if r == '.' || r == '/' {
				modifiers |= separator
			}
			// optimization: avoid calls to unicode.ToLower, which can't be inlined.
			l := r
			if r <= unicode.MaxASCII {
				if 'A' <= r && r <= 'Z' {
					l = r + 'a' - 'A'
				}
			} else {
				l = unicode.ToLower(r)
			}
			if l != r {
				modifiers |= upper

				// If the current rune is capitalized *and the preceding rune was not*,
				// mark this as a word start. This avoids spuriously high ranking of
				// non-camelcase naming schemas, such as the
				// yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE example of
				// golang/go#60201.
				if inputLen == 0 || m.roles[inputLen-1]&upper == 0 {
					modifiers |= wordStart
				}
			}
			m.inputBuffer[inputLen] = l
			m.roles[inputLen] = modifiers
			inputLen++
			if m.roles[inputLen-1]&separator != 0 {
				modifiers = wordStart | segmentStart
			} else {
				modifiers = 0
			}
			// TODO: we should prefer the right-most input if it overflows, rather
			//       than the left-most as we're doing here.
			if inputLen == 255 {
				break input
			}
		}
	}

	// Second pass: find the right-most match, and count segments from the
	// right.
	var (
		pi    = uint8(m.patternLen - 1) // pattern index
		p     = m.pattern[pi]           // pattern rune
		start = -1                      // start offset of match
		rseg  = uint8(0)                // effective "depth" from the right of the current rune in consideration
	)
	const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.

	for ii := inputLen - 1; ; ii-- {
		r := m.inputBuffer[ii]
		if rseg < maxSeg && m.roles[ii]&separator != 0 {
			rseg++
		}
		m.segments[ii] = rseg
		if p == r {
			if pi == 0 {
				// TODO(rfindley): BUG: the docstring for Match says that it returns an
				// absolute byte offset, but clearly it is returning a rune offset here.
				start = int(ii)
				break
			}
			pi--
			p = m.pattern[pi]
		}
		// Don't check ii >= 0 in the loop condition: ii is a uint8.
		if ii == 0 {
			break
		}
	}

	if start < 0 {
		// no match: skip scoring
		return -1, 0
	}

	// Third pass: find the shortest match and compute the score.

	// Score is the average score for each rune.
	//
	// A rune score is the multiple of:
	//   1. The base score, which is 1.0 if the rune starts a segment, 0.9 if the
	//      rune starts a mid-segment word, else 0.6.
	//
	//      Runes preceded by a matching rune are treated the same as the start
	//      of a mid-segment word (with a 0.9 score), so that sequential or exact
	//      matches are preferred. We call this a sequential bonus.
	//
	//      For the final rune match, this sequential bonus is reduced to 0.8 if
	//      the next rune in the input is a mid-segment word, or 0.7 if the next
	//      rune in the input is not a word or segment start. This ensures that
	//      we favor whole-word or whole-segment matches over prefix matches.
	//
	//   2. 1.0 if the rune is part of the last segment, otherwise
	//      1.0-0.1*<segments from the right>, with a max segment count of 3.
	//      Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very
	//      early in a qualified symbol name) still scores higher than _f_o_o_ (a
	//      completely split match).
	//
	// This is a naive algorithm, but it is fast. There's lots of prior art here
	// that could be leveraged. For example, we could explicitly consider
	// rune distance, and exact matches of words or segments.
	//
	// Also note that this might not actually find the highest scoring match, as
	// doing so could require a non-linear algorithm, depending on how the score
	// is calculated.

	// debugging support
	const debug = false // enable to log debugging information
	var (
		runeScores []float64
		runeIdxs   []int
	)

	pi = 0
	p = m.pattern[pi]

	const (
		segStartScore = 1.0 // base score of runes starting a segment
		wordScore     = 0.9 // base score of runes starting or continuing a word
		noStreak      = 0.6
		perSegment    = 0.1 // we count at most 3 segments above
	)

	totScore := 0.0
	lastMatch := uint8(255)
	for ii := uint8(start); ii < inputLen; ii++ {
		r := m.inputBuffer[ii]
		if r == p {
			pi++
			finalRune := pi >= m.patternLen
			p = m.pattern[pi]

			baseScore := noStreak

			// Calculate the sequence bonus based on preceding matches.
			//
			// We do this first as it is overridden by role scoring below.
			if lastMatch == ii-1 {
				baseScore = wordScore
				// Reduce the sequence bonus for the final rune of the pattern based on
				// whether it borders a new segment or word.
				if finalRune {
					switch {
					case ii == inputLen-1 || m.roles[ii+1]&separator != 0:
						// Full segment: no reduction
					case m.roles[ii+1]&wordStart != 0:
						baseScore = wordScore - 0.1
					default:
						baseScore = wordScore - 0.2
					}
				}
			}
			lastMatch = ii

			// Calculate the rune's role score. If the rune starts a segment or word,
			// this overrides the sequence score, as the rune starts a new sequence.
			switch {
			case m.roles[ii]&segmentStart != 0:
				baseScore = segStartScore
			case m.roles[ii]&wordStart != 0:
				baseScore = wordScore
			}

			// Apply the segment-depth penalty (segments from the right).
			runeScore := baseScore * (1.0 - float64(m.segments[ii])*perSegment)
			if debug {
				runeScores = append(runeScores, runeScore)
				runeIdxs = append(runeIdxs, int(ii))
			}
			totScore += runeScore
			if finalRune {
				break
			}
		}
	}

	if debug {
		// Format rune roles and scores in line:
		// fo[o:.52].[b:1]a[r:.6]
		var summary bytes.Buffer
		last := 0
		for i, idx := range runeIdxs {
			summary.WriteString(string(m.inputBuffer[last:idx])) // encode runes
			fmt.Fprintf(&summary, "[%s:%.2g]", string(m.inputBuffer[idx]), runeScores[i])
			last = idx + 1
		}
		summary.WriteString(string(m.inputBuffer[last:inputLen])) // encode runes
		log.Println(summary.String())
	}

	return start, totScore / float64(m.patternLen)
}