File: levenshtein.rb

package info (click to toggle)
ruby-levenshtein 0.2.2-3
links: PTS, VCS
area: main
in suites: bookworm, forky, sid, trixie
size: 104 kB
sloc: ruby: 231; ansic: 71; makefile: 2
file content (148 lines) | stat: -rwxr-xr-x 3,678 bytes
parent folder | download | duplicates (2)
# encoding: UTF-8

require "levenshtein/version"

module Levenshtein
  # Returns the Levenshtein distance as a number between 0.0 and
  # 1.0. It's basically the Levenshtein distance divided by the
  # size of the longest sequence.

  def self.normalized_distance(a1, a2, threshold=nil, options={})
    size	= [a1.size, a2.size].max

    if a1.size == 0 and a2.size == 0
      0.0
    elsif a1.size == 0
      a2.size.to_f/size
    elsif a2.size == 0
      a1.size.to_f/size
    else
      if threshold
        if d = self.distance(a1, a2, (threshold*size).to_i+1)
          d.to_f/size
        else
          nil
        end
      else
        self.distance(a1, a2).to_f/size
      end
    end
  end

  # Returns the Levenshtein distance between two sequences.
  #
  # The two sequences can be two strings, two arrays, or two other
  # objects responding to :each. All sequences are by generic
  # (fast) C code.
  #
  # All objects in the sequences should respond to :hash and :eql?.

  def self.distance(a1, a2, threshold=nil, options={})
    a1, a2	= a1.scan(/./), a2.scan(/./)	if String === a1 and String === a2
    a1, a2	= Util.pool(a1, a2)

    # Handle some basic circumstances.

    return 0		if a1 == a2
    return a2.size	if a1.empty?
    return a1.size	if a2.empty?

    if threshold
      return nil	if (a1.size-a2.size) >= threshold
      return nil	if (a2.size-a1.size) >= threshold
      return nil	if (a1-a2).size >= threshold
      return nil	if (a2-a1).size >= threshold
    end

    # Remove the common prefix and the common postfix.

    l1	= a1.size
    l2	= a2.size

    offset			= 0
    no_more_optimizations	= true
 
    while offset < l1 and offset < l2 and a1[offset].equal?(a2[offset])
      offset += 1

      no_more_optimizations	= false
    end
 
    while offset < l1 and offset < l2 and a1[l1-1].equal?(a2[l2-1])
      l1 -= 1
      l2 -= 1

      no_more_optimizations	= false
    end
 
    if no_more_optimizations
      distance_fast_or_slow(a1, a2, threshold, options)
    else
      l1 -= offset
      l2 -= offset

      a1	= a1[offset, l1]
      a2	= a2[offset, l2]

      distance(a1, a2, threshold, options)
    end
  end

  def self.distance_fast_or_slow(a1, a2, threshold, options)	# :nodoc:
    if respond_to?(:distance_fast) and options[:force_slow]
      distance_fast(a1, a2, threshold)	# Implemented in C.
    else
      distance_slow(a1, a2, threshold)	# Implemented in Ruby.
    end
  end

  def self.distance_slow(a1, a2, threshold)	# :nodoc:
    crow	= (0..a1.size).to_a

    1.upto(a2.size) do |y|
      prow	= crow
      crow	= [y]

      1.upto(a1.size) do |x|
        crow[x]	= [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[x-1].equal?(a2[y-1]) ? 0 : 1)].min
      end

      # Stop analysing this sequence as soon as the best possible
      # result for this sequence is bigger than the best result so far.
      # (The minimum value in the next row will be equal to or greater
      # than the minimum value in this row.)

      return nil	if threshold and crow.min >= threshold
    end

    crow[-1]
  end

  module Util	# :nodoc:
    def self.pool(*args)
      # So we can compare pointers instead of objects (equal?() instead of ==()).

      pool	= {}

      args.collect do |arg|
        a	= []

        arg.each do |o|
          a << pool[o] ||= o
        end

        a
      end
    end
  end
end

begin
  require "levenshtein/levenshtein_fast"	# Compiled by RubyGems.
rescue LoadError
  begin
    require "levenshtein_fast"			# Compiled by the build script.
  rescue LoadError
    $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead."
  end
end