# Author:: David Fayram (mailto:dfayram@lensmen.net)
# Copyright:: Copyright (c) 2005 David Fayram II
# License:: LGPL
begin
raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
require 'gsl' # requires http://rb-gsl.rubyforge.org/
require 'classifier/extensions/vector_serialize'
$GSL = true
rescue LoadError
warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
require 'classifier/extensions/vector'
end
require 'classifier/lsi/word_list'
require 'classifier/lsi/content_node'
require 'classifier/lsi/summary'
module Classifier
# This class implements a Latent Semantic Indexer, which can search, classify and cluster
# data based on underlying semantic relations. For more information on the algorithms used,
# please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
class LSI
attr_reader :word_list
attr_accessor :auto_rebuild
# Create a fresh index.
# If you want to call #build_index manually, use
# Classifier::LSI.new :auto_rebuild => false
#
def initialize(options = {})
@auto_rebuild = true unless options[:auto_rebuild] == false
@word_list, @items = WordList.new, {}
@version, @built_at_version = 0, -1
end
# Returns true if the index needs to be rebuilt. The index needs
    # to be built after all information is added, but before you start
# using it for search, classification and cluster detection.
def needs_rebuild?
(@items.keys.size > 1) && (@version != @built_at_version)
end
    # Adds an item to the index. item is assumed to be a string, but
    # any item may be indexed so long as it responds to #to_s or if
    # you provide an optional block explaining how the indexer can
    # fetch fresh string data. This optional block is passed the item,
    # so the item itself may be nothing more than a reference, such as
    # a URL or file name.
    #
    # For example:
    #   lsi = Classifier::LSI.new
    #   lsi.add_item "This is just plain text"
    #   lsi.add_item("/home/me/filename.txt") { |x| File.read(x) }
    #   ar = ActiveRecordObject.find( :all )
    #   lsi.add_item(ar, *ar.categories) { |x| ar.content }
#
def add_item( item, *categories, &block )
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
@items[item] = ContentNode.new(clean_word_hash, *categories)
@version += 1
build_index if @auto_rebuild
end
    # A less flexible shorthand for add_item that assumes
    # you are passing in a string with no categories. item
    # will be duck typed via #to_s.
#
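    # For example, a minimal sketch:
    #   lsi = Classifier::LSI.new
    #   lsi << "This string is indexed with no categories"
    #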
def <<( item )
add_item item
end
    # Returns the categories for a given indexed item. You are free to add and remove
    # items from this as you see fit. Changing an item's categories does not invalidate the index.
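    # For example, assuming the item below has been indexed with a category:
    #   lsi.add_item "Dogs are pets", :animals
    #   lsi.categories_for "Dogs are pets"   #=> [:animals]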
def categories_for(item)
return [] unless @items[item]
return @items[item].categories
end
# Removes an item from the database, if it is indexed.
#
def remove_item( item )
      if @items.key? item
        @items.delete item
@version += 1
end
end
# Returns an array of items that are indexed.
def items
@items.keys
end
# This function rebuilds the index if needs_rebuild? returns true.
# For very large document spaces, this indexing operation may take some
# time to complete, so it may be wise to place the operation in another
# thread.
#
# As a rule, indexing will be fairly swift on modern machines until
# you have well over 500 documents indexed, or have an incredibly diverse
# vocabulary for your documents.
#
    # The optional parameter "cutoff" is a tuning parameter. When the index is
    # built, a certain number of singular values are discarded from the system.
    # The cutoff parameter tells the indexer what fraction of those values to
    # keep: the default of 0.75 retains the largest 75% of them. A value of 1
    # for cutoff means that no semantic analysis will take place, turning the
    # LSI class into a simple vector search engine.
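    # For example, a sketch of deferred indexing (docs is a hypothetical array
    # of strings; the 0.5 cutoff is illustrative, keeping half of the singular values):
    #   lsi = Classifier::LSI.new :auto_rebuild => false
    #   docs.each { |d| lsi.add_item d }
    #   lsi.build_index 0.5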
def build_index( cutoff=0.75 )
return unless needs_rebuild?
make_word_list
doc_list = @items.values
tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
if $GSL
tdm = GSL::Matrix.alloc(*tda).trans
ntdm = build_reduced_matrix(tdm, cutoff)
ntdm.size[1].times do |col|
vec = GSL::Vector.alloc( ntdm.column(col) ).row
doc_list[col].lsi_vector = vec
doc_list[col].lsi_norm = vec.normalize
end
else
tdm = Matrix.rows(tda).trans
ntdm = build_reduced_matrix(tdm, cutoff)
        # Each column of the reduced term-document matrix corresponds to one document.
        ntdm.column_size.times do |col|
          doc_list[col].lsi_vector = ntdm.column(col)
          doc_list[col].lsi_norm = ntdm.column(col).normalize
end
end
@built_at_version = @version
end
    # This method returns max_chunks entries, ordered by their average semantic rating.
    # Essentially, the total proximity of each entry to every other entry is
    # calculated, and the entries with the highest scores are returned.
#
# This can be used to build a summary service, or to provide more information about
# your dataset's general content. For example, if you were to use categorize on the
# results of this data, you could gather information on what your dataset is generally
# about.
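    # For example, a sketch of pulling the five most central items:
    #   lsi.highest_relative_content(5)  #=> up to 5 indexed items, most central first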
def highest_relative_content( max_chunks=10 )
return [] if needs_rebuild?
avg_density = Hash.new
      @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |sum, pair| sum + pair[1] } }
      avg_density.keys.sort_by { |item| avg_density[item] }.reverse[0..max_chunks-1]
end
# This function is the primitive that find_related and classify
# build upon. It returns an array of 2-element arrays. The first element
# of this array is a document, and the second is its "score", defining
# how "close" it is to other indexed items.
#
# These values are somewhat arbitrary, having to do with the vector space
# created by your content, so the magnitude is interpretable but not always
# meaningful between indexes.
#
# The parameter doc is the content to compare. If that content is not
# indexed, you can pass an optional block to define how to create the
# text data. See add_item for examples of how this works.
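    # For example (a sketch; scores shown are illustrative, not computed):
    #   lsi.proximity_array_for_content "Dogs are pets"
    #     #=> [["Dogs are pets", 0.95], ["Cats are pets", 0.62], ...]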
def proximity_array_for_content( doc, &block )
return [] if needs_rebuild?
content_node = node_for_content( doc, &block )
result =
@items.keys.collect do |item|
if $GSL
val = content_node.search_vector * @items[item].search_vector.col
else
val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
end
[item, val]
end
result.sort_by { |x| x[1] }.reverse
end
# Similar to proximity_array_for_content, this function takes similar
# arguments and returns a similar array. However, it uses the normalized
# calculated vectors instead of their full versions. This is useful when
# you're trying to perform operations on content that is much smaller than
# the text you're working with. search uses this primitive.
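    # For example, a sketch mirroring proximity_array_for_content:
    #   lsi.proximity_norms_for_content "dog"
    #     #=> [[item, normalized_score], ...] sorted best match first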
def proximity_norms_for_content( doc, &block )
return [] if needs_rebuild?
content_node = node_for_content( doc, &block )
result =
@items.keys.collect do |item|
if $GSL
val = content_node.search_norm * @items[item].search_norm.col
else
val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
end
[item, val]
end
result.sort_by { |x| x[1] }.reverse
end
# This function allows for text-based search of your index. Unlike other functions
# like find_related and classify, search only takes short strings. It will also ignore
# factors like repeated words. It is best for short, google-like search terms.
    # A search will first prioritize lexical relationships, then semantic ones.
#
# While this may seem backwards compared to the other functions that LSI supports,
# it is actually the same algorithm, just applied on a smaller document.
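    # For example, assuming items about pets have been indexed:
    #   lsi.search("dog", 3)  #=> up to three matching items, best match first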
def search( string, max_nearest=3 )
return [] if needs_rebuild?
carry = proximity_norms_for_content( string )
result = carry.collect { |x| x[0] }
return result[0..max_nearest-1]
end
# This function takes content and finds other documents
# that are semantically "close", returning an array of documents sorted
    # from most to least relevant.
    # max_nearest specifies the number of documents to return. A value of
    # 0 means that it returns all the indexed documents, sorted by relevance.
    #
    # This is particularly useful for identifying clusters in your document space.
    # For example, you may want to identify several "What's Related" items for weblog
# articles, or find paragraphs that relate to each other in an essay.
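    # For example, assuming "Dogs are pets" is indexed:
    #   lsi.find_related("Dogs are pets", 2)  #=> the two closest other items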
def find_related( doc, max_nearest=3, &block )
carry =
proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
result = carry.collect { |x| x[0] }
return result[0..max_nearest-1]
end
# This function uses a voting system to categorize documents, based on
# the categories of other documents. It uses the same logic as the
# find_related function to find related documents, then returns the
# most obvious category from this list.
#
    # cutoff signifies the fraction of indexed documents to consider when
    # classifying text. A cutoff of 1 means that every document in the index votes on
    # what category the document is in. This may not always make sense.
#
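    # For example, assuming items were added with categories such as :animals:
    #   lsi.add_item "Dogs are pets", :animals
    #   lsi.classify "This text is about dogs!"  #=> :animals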
def classify( doc, cutoff=0.30, &block )
icutoff = (@items.size * cutoff).round
carry = proximity_array_for_content( doc, &block )
carry = carry[0..icutoff-1]
votes = {}
carry.each do |pair|
categories = @items[pair[0]].categories
categories.each do |category|
votes[category] ||= 0.0
votes[category] += pair[1]
end
end
ranking = votes.keys.sort_by { |x| votes[x] }
return ranking[-1]
end
# Prototype, only works on indexed documents.
# I have no clue if this is going to work, but in theory
# it's supposed to.
def highest_ranked_stems( doc, count=3 )
raise "Requested stem ranking on non-indexed content!" unless @items[doc]
arr = node_for_content(doc).lsi_vector.to_a
top_n = arr.sort.reverse[0..count-1]
return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
end
private
def build_reduced_matrix( matrix, cutoff=0.75 )
# TODO: Check that M>=N on these dimensions! Transpose helps assure this
u, v, s = matrix.SV_decomp
# TODO: Better than 75% term, please. :\
s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
s.size.times do |ord|
s[ord] = 0.0 if s[ord] < s_cutoff
end
# Reconstruct the term document matrix, only with reduced rank
u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
end
def node_for_content(item, &block)
if @items[item]
return @items[item]
else
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
unless needs_rebuild?
cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
end
end
return cn
end
def make_word_list
@word_list = WordList.new
@items.each_value do |node|
node.word_hash.each_key { |key| @word_list.add_word key }
end
end
end
end