File: content_node.rb

package info (click to toggle)
ruby-classifier-reborn 2.2.0-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 1,424 kB
  • sloc: ruby: 2,021; makefile: 7
file content (100 lines) | stat: -rw-r--r-- 2,876 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Author::    David Fayram  (mailto:dfayram@lensmen.net)
# Copyright:: Copyright (c) 2005 David Fayram II
# License::   LGPL

module ClassifierReborn
  # This is an internal data structure class for the LSI node. Save for
  # raw_vector_with, it should be fairly straightforward to understand.
  # You should never have to use it directly.
  class ContentNode
    attr_accessor :raw_vector, :raw_norm,
                  :lsi_vector, :lsi_norm,
                  :categories

    attr_reader :word_hash
    # If text_proc is not specified, the source will be duck-typed
    # via source.to_s
    def initialize(word_hash, *categories)
      @categories = categories || []
      @word_hash = word_hash
      @lsi_norm, @lsi_vector = nil
    end

    # Use this to fetch the appropriate search vector.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Method to access the transposed search vector
    def transposed_search_vector
      search_vector.col
    end

    # Use this to fetch the appropriate search vector in normalized form.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Creates the raw vector out of word_hash using word_list as the
    # key for mapping the vector space.
    def raw_vector_with(word_list)
      if $GSL
        vec = GSL::Vector.alloc(word_list.size)
      else
        vec = Array.new(word_list.size, 0)
      end

      @word_hash.each_key do |word|
        vec[word_list[word]] = @word_hash[word] if word_list[word]
      end

      # Perform the scaling transform and force floating point arithmetic
      if $GSL
        sum = 0.0
        vec.each { |v| sum += v }
        total_words = sum
      else
        total_words = vec.reduce(0, :+).to_f
      end

      total_unique_words = 0

      if $GSL
        vec.each { |word| total_unique_words += 1 if word != 0.0 }
      else
        total_unique_words = vec.count { |word| word != 0 }
      end

      # Perform first-order association transform if this vector has more
      # then one word in it.
      if total_words > 1.0 && total_unique_words > 1
        weighted_total = 0.0
        # Cache calculations, this takes too long on large indexes
        cached_calcs = Hash.new do |hash, term|
          hash[term] = ((term / total_words) * Math.log(term / total_words))
        end

        vec.each do |term|
          weighted_total += cached_calcs[term] if term > 0.0
        end

        # Cache calculations, this takes too long on large indexes
        cached_calcs = Hash.new do |hash, val|
          hash[val] = Math.log(val + 1) / -weighted_total
        end

        vec.collect! do |val|
          cached_calcs[val]
        end
      end

      if $GSL
        @raw_norm   = vec.normalize
        @raw_vector = vec
      else
        @raw_norm   = Vector[*vec].normalize
        @raw_vector = Vector[*vec]
      end
    end
  end
end