1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
|
# encoding: UTF-8
require "levenshtein/version"
module Levenshtein
# Returns the Levenshtein distance as a number between 0.0 and
# 1.0. It's basically the Levenshtein distance divided by the
# size of the longest sequence.
def self.normalized_distance(a1, a2, threshold=nil, options={})
size = [a1.size, a2.size].max
if a1.size == 0 and a2.size == 0
0.0
elsif a1.size == 0
a2.size.to_f/size
elsif a2.size == 0
a1.size.to_f/size
else
if threshold
if d = self.distance(a1, a2, (threshold*size).to_i+1)
d.to_f/size
else
nil
end
else
self.distance(a1, a2).to_f/size
end
end
end
# Returns the Levenshtein distance between two sequences.
#
# The two sequences can be two strings, two arrays, or two other
# objects responding to :each. All sequences are by generic
# (fast) C code.
#
# All objects in the sequences should respond to :hash and :eql?.
def self.distance(a1, a2, threshold=nil, options={})
a1, a2 = a1.scan(/./), a2.scan(/./) if String === a1 and String === a2
a1, a2 = Util.pool(a1, a2)
# Handle some basic circumstances.
return 0 if a1 == a2
return a2.size if a1.empty?
return a1.size if a2.empty?
if threshold
return nil if (a1.size-a2.size) >= threshold
return nil if (a2.size-a1.size) >= threshold
return nil if (a1-a2).size >= threshold
return nil if (a2-a1).size >= threshold
end
# Remove the common prefix and the common postfix.
l1 = a1.size
l2 = a2.size
offset = 0
no_more_optimizations = true
while offset < l1 and offset < l2 and a1[offset].equal?(a2[offset])
offset += 1
no_more_optimizations = false
end
while offset < l1 and offset < l2 and a1[l1-1].equal?(a2[l2-1])
l1 -= 1
l2 -= 1
no_more_optimizations = false
end
if no_more_optimizations
distance_fast_or_slow(a1, a2, threshold, options)
else
l1 -= offset
l2 -= offset
a1 = a1[offset, l1]
a2 = a2[offset, l2]
distance(a1, a2, threshold, options)
end
end
def self.distance_fast_or_slow(a1, a2, threshold, options) # :nodoc:
if respond_to?(:distance_fast) and options[:force_slow]
distance_fast(a1, a2, threshold) # Implemented in C.
else
distance_slow(a1, a2, threshold) # Implemented in Ruby.
end
end
def self.distance_slow(a1, a2, threshold) # :nodoc:
crow = (0..a1.size).to_a
1.upto(a2.size) do |y|
prow = crow
crow = [y]
1.upto(a1.size) do |x|
crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[x-1].equal?(a2[y-1]) ? 0 : 1)].min
end
# Stop analysing this sequence as soon as the best possible
# result for this sequence is bigger than the best result so far.
# (The minimum value in the next row will be equal to or greater
# than the minimum value in this row.)
return nil if threshold and crow.min >= threshold
end
crow[-1]
end
module Util # :nodoc:
def self.pool(*args)
# So we can compare pointers instead of objects (equal?() instead of ==()).
pool = {}
args.collect do |arg|
a = []
arg.each do |o|
a << pool[o] ||= o
end
a
end
end
end
end
begin
require "levenshtein/levenshtein_fast" # Compiled by RubyGems.
rescue LoadError
begin
require "levenshtein_fast" # Compiled by the build script.
rescue LoadError
$stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead."
end
end
|