File: levenshtein.go

package info (click to toggle)
golang-github-texttheater-golang-levenshtein 1.0.1-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 84 kB
  • sloc: makefile: 2
file content (262 lines) | stat: -rw-r--r-- 8,170 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
// This package implements the Levenshtein algorithm for computing the
// similarity between two strings. The central function is MatrixForStrings,
// which computes the Levenshtein matrix. The functions DistanceForMatrix,
// EditScriptForMatrix and RatioForMatrix read various interesting properties
// off the matrix. The package also provides the convenience functions
// DistanceForStrings, EditScriptForStrings and RatioForStrings for going
// directly from two strings to the property of interest.
package levenshtein

import (
	"fmt"
	"io"
	"os"
)

type EditOperation int

const (
	Ins = iota
	Del
	Sub
	Match
)

type EditScript []EditOperation

type MatchFunction func(rune, rune) bool

// IdenticalRunes is the default MatchFunction: it checks whether two runes are
// identical.
func IdenticalRunes(a rune, b rune) bool {
	return a == b
}

type Options struct {
	InsCost int
	DelCost int
	SubCost int
	Matches MatchFunction
}

// DefaultOptions is the default options without substitution: insertion cost
// is 1, deletion cost is 1, substitution cost is 2 (meaning insert and delete
// will be used instead), and two runes match iff they are identical.
var DefaultOptions Options = Options{
	InsCost: 1,
	DelCost: 1,
	SubCost: 2,
	Matches: IdenticalRunes,
}

// DefaultOptionsWithSub is the default options with substitution: insertion
// cost is 1, deletion cost is 1, substitution cost is 1, and two runes match
// iff they are identical.
var DefaultOptionsWithSub Options = Options{
	InsCost: 1,
	DelCost: 1,
	SubCost: 1,
	Matches: IdenticalRunes,
}

func (operation EditOperation) String() string {
	if operation == Match {
		return "match"
	} else if operation == Ins {
		return "ins"
	} else if operation == Sub {
		return "sub"
	}
	return "del"
}

// DistanceForStrings returns the edit distance between source and target.
//
// It has a runtime proportional to len(source) * len(target) and memory use
// proportional to len(target).
func DistanceForStrings(source []rune, target []rune, op Options) int {
	// Note: This algorithm is a specialization of MatrixForStrings.
	// MatrixForStrings returns the full edit matrix. However, we only need a
	// single value (see DistanceForMatrix) and the main loop of the algorithm
	// only uses the current and previous row. As such we create a 2D matrix,
	// but with height 2 (enough to store current and previous row).
	height := len(source) + 1
	width := len(target) + 1
	matrix := make([][]int, 2)

	// Initialize trivial distances (from/to empty string). That is, fill
	// the left column and the top row with row/column indices multiplied
	// by deletion/insertion cost.
	for i := 0; i < 2; i++ {
		matrix[i] = make([]int, width)
		matrix[i][0] = i * op.DelCost
	}
	for j := 1; j < width; j++ {
		matrix[0][j] = j * op.InsCost
	}

	// Fill in the remaining cells: for each prefix pair, choose the
	// (edit history, operation) pair with the lowest cost.
	for i := 1; i < height; i++ {
		cur := matrix[i%2]
		prev := matrix[(i-1)%2]
		cur[0] = i * op.DelCost
		for j := 1; j < width; j++ {
			delCost := prev[j] + op.DelCost
			matchSubCost := prev[j-1]
			if !op.Matches(source[i-1], target[j-1]) {
				matchSubCost += op.SubCost
			}
			insCost := cur[j-1] + op.InsCost
			cur[j] = min(delCost, min(matchSubCost, insCost))
		}
	}
	return matrix[(height-1)%2][width-1]
}

// DistanceForMatrix reads the edit distance off the given Levenshtein matrix.
func DistanceForMatrix(matrix [][]int) int {
	return matrix[len(matrix)-1][len(matrix[0])-1]
}

// RatioForStrings returns the Levenshtein ratio for the given strings. The
// ratio is computed as follows:
//
//     (sourceLength + targetLength - distance) / (sourceLength + targetLength)
func RatioForStrings(source []rune, target []rune, op Options) float64 {
	matrix := MatrixForStrings(source, target, op)
	return RatioForMatrix(matrix)
}

// RatioForMatrix returns the Levenshtein ratio for the given matrix. The ratio
// is computed as follows:
//
//     (sourceLength + targetLength - distance) / (sourceLength + targetLength)
func RatioForMatrix(matrix [][]int) float64 {
	sourcelength := len(matrix) - 1
	targetlength := len(matrix[0]) - 1
	sum := sourcelength + targetlength

	if sum == 0 {
		return 0
	}

	dist := DistanceForMatrix(matrix)
	return float64(sum-dist) / float64(sum)
}

// MatrixForStrings generates a 2-D array representing the dynamic programming
// table used by the Levenshtein algorithm, as described e.g. here:
// http://www.let.rug.nl/kleiweg/lev/
// The reason for putting the creation of the table into a separate function is
// that it cannot only be used for reading of the edit distance between two
// strings, but also e.g. to backtrace an edit script that provides an
// alignment between the characters of both strings.
func MatrixForStrings(source []rune, target []rune, op Options) [][]int {
	// Make a 2-D matrix. Rows correspond to prefixes of source, columns to
	// prefixes of target. Cells will contain edit distances.
	// Cf. http://www.let.rug.nl/~kleiweg/lev/levenshtein.html
	height := len(source) + 1
	width := len(target) + 1
	matrix := make([][]int, height)

	// Initialize trivial distances (from/to empty string). That is, fill
	// the left column and the top row with row/column indices multiplied
	// by deletion/insertion cost.
	for i := 0; i < height; i++ {
		matrix[i] = make([]int, width)
		matrix[i][0] = i * op.DelCost
	}
	for j := 1; j < width; j++ {
		matrix[0][j] = j * op.InsCost
	}

	// Fill in the remaining cells: for each prefix pair, choose the
	// (edit history, operation) pair with the lowest cost.
	for i := 1; i < height; i++ {
		for j := 1; j < width; j++ {
			delCost := matrix[i-1][j] + op.DelCost
			matchSubCost := matrix[i-1][j-1]
			if !op.Matches(source[i-1], target[j-1]) {
				matchSubCost += op.SubCost
			}
			insCost := matrix[i][j-1] + op.InsCost
			matrix[i][j] = min(delCost, min(matchSubCost,
				insCost))
		}
	}
	//LogMatrix(source, target, matrix)
	return matrix
}

// EditScriptForStrings returns an optimal edit script to turn source into
// target.
func EditScriptForStrings(source []rune, target []rune, op Options) EditScript {
	return backtrace(len(source), len(target),
		MatrixForStrings(source, target, op), op)
}

// EditScriptForMatrix returns an optimal edit script based on the given
// Levenshtein matrix.
func EditScriptForMatrix(matrix [][]int, op Options) EditScript {
	return backtrace(len(matrix)-1, len(matrix[0])-1, matrix, op)
}

// WriteMatrix writes a visual representation of the given matrix for the given
// strings to the given writer.
func WriteMatrix(source []rune, target []rune, matrix [][]int, writer io.Writer) {
	fmt.Fprintf(writer, "    ")
	for _, targetRune := range target {
		fmt.Fprintf(writer, "  %c", targetRune)
	}
	fmt.Fprintf(writer, "\n")
	fmt.Fprintf(writer, "  %2d", matrix[0][0])
	for j, _ := range target {
		fmt.Fprintf(writer, " %2d", matrix[0][j+1])
	}
	fmt.Fprintf(writer, "\n")
	for i, sourceRune := range source {
		fmt.Fprintf(writer, "%c %2d", sourceRune, matrix[i+1][0])
		for j, _ := range target {
			fmt.Fprintf(writer, " %2d", matrix[i+1][j+1])
		}
		fmt.Fprintf(writer, "\n")
	}
}

// LogMatrix writes a visual representation of the given matrix for the given
// strings to os.Stderr. This function is deprecated, use
// WriteMatrix(source, target, matrix, os.Stderr) instead.
func LogMatrix(source []rune, target []rune, matrix [][]int) {
	WriteMatrix(source, target, matrix, os.Stderr)
}

func backtrace(i int, j int, matrix [][]int, op Options) EditScript {
	if i > 0 && matrix[i-1][j]+op.DelCost == matrix[i][j] {
		return append(backtrace(i-1, j, matrix, op), Del)
	}
	if j > 0 && matrix[i][j-1]+op.InsCost == matrix[i][j] {
		return append(backtrace(i, j-1, matrix, op), Ins)
	}
	if i > 0 && j > 0 && matrix[i-1][j-1]+op.SubCost == matrix[i][j] {
		return append(backtrace(i-1, j-1, matrix, op), Sub)
	}
	if i > 0 && j > 0 && matrix[i-1][j-1] == matrix[i][j] {
		return append(backtrace(i-1, j-1, matrix, op), Match)
	}
	return []EditOperation{}
}

func min(a int, b int) int {
	if b < a {
		return b
	}
	return a
}

func max(a int, b int) int {
	if b > a {
		return b
	}
	return a
}