File: levenshtein.rb

package info (click to toggle)
ruby-levenshtein 0.2.2-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 104 kB
  • sloc: ruby: 231; ansic: 71; makefile: 2
file content (148 lines) | stat: -rwxr-xr-x 3,678 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# encoding: UTF-8

require "levenshtein/version"

module Levenshtein
  # Returns the Levenshtein distance as a number between 0.0 and
  # 1.0. It's basically the Levenshtein distance divided by the
  # size of the longest sequence.

  def self.normalized_distance(a1, a2, threshold=nil, options={})
    size	= [a1.size, a2.size].max

    if a1.size == 0 and a2.size == 0
      0.0
    elsif a1.size == 0
      a2.size.to_f/size
    elsif a2.size == 0
      a1.size.to_f/size
    else
      if threshold
        if d = self.distance(a1, a2, (threshold*size).to_i+1)
          d.to_f/size
        else
          nil
        end
      else
        self.distance(a1, a2).to_f/size
      end
    end
  end

  # Returns the Levenshtein distance between two sequences.
  #
  # The two sequences can be two strings, two arrays, or two other
  # objects responding to :each. All sequences are by generic
  # (fast) C code.
  #
  # All objects in the sequences should respond to :hash and :eql?.

  def self.distance(a1, a2, threshold=nil, options={})
    a1, a2	= a1.scan(/./), a2.scan(/./)	if String === a1 and String === a2
    a1, a2	= Util.pool(a1, a2)

    # Handle some basic circumstances.

    return 0		if a1 == a2
    return a2.size	if a1.empty?
    return a1.size	if a2.empty?

    if threshold
      return nil	if (a1.size-a2.size) >= threshold
      return nil	if (a2.size-a1.size) >= threshold
      return nil	if (a1-a2).size >= threshold
      return nil	if (a2-a1).size >= threshold
    end

    # Remove the common prefix and the common postfix.

    l1	= a1.size
    l2	= a2.size

    offset			= 0
    no_more_optimizations	= true
 
    while offset < l1 and offset < l2 and a1[offset].equal?(a2[offset])
      offset += 1

      no_more_optimizations	= false
    end
 
    while offset < l1 and offset < l2 and a1[l1-1].equal?(a2[l2-1])
      l1 -= 1
      l2 -= 1

      no_more_optimizations	= false
    end
 
    if no_more_optimizations
      distance_fast_or_slow(a1, a2, threshold, options)
    else
      l1 -= offset
      l2 -= offset

      a1	= a1[offset, l1]
      a2	= a2[offset, l2]

      distance(a1, a2, threshold, options)
    end
  end

  def self.distance_fast_or_slow(a1, a2, threshold, options)	# :nodoc:
    if respond_to?(:distance_fast) and options[:force_slow]
      distance_fast(a1, a2, threshold)	# Implemented in C.
    else
      distance_slow(a1, a2, threshold)	# Implemented in Ruby.
    end
  end

  def self.distance_slow(a1, a2, threshold)	# :nodoc:
    crow	= (0..a1.size).to_a

    1.upto(a2.size) do |y|
      prow	= crow
      crow	= [y]

      1.upto(a1.size) do |x|
        crow[x]	= [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[x-1].equal?(a2[y-1]) ? 0 : 1)].min
      end

      # Stop analysing this sequence as soon as the best possible
      # result for this sequence is bigger than the best result so far.
      # (The minimum value in the next row will be equal to or greater
      # than the minimum value in this row.)

      return nil	if threshold and crow.min >= threshold
    end

    crow[-1]
  end

  module Util	# :nodoc:
    def self.pool(*args)
      # So we can compare pointers instead of objects (equal?() instead of ==()).

      pool	= {}

      args.collect do |arg|
        a	= []

        arg.each do |o|
          a << pool[o] ||= o
        end

        a
      end
    end
  end
end

begin
  require "levenshtein/levenshtein_fast"	# Compiled by RubyGems.
rescue LoadError
  begin
    require "levenshtein_fast"			# Compiled by the build script.
  rescue LoadError
    $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead."
  end
end