File: summarizer.rb

package info (click to toggle)
ruby-classifier-reborn 2.2.0-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 1,424 kB
  • sloc: ruby: 2,021; makefile: 7
file content (33 lines) | stat: -rw-r--r-- 1,039 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License::   LGPL

module ClassifierReborn
  # Text summarization helpers. The input text is cut into chunks (sentences
  # or paragraphs), the chunks are indexed with ClassifierReborn::LSI, and the
  # chunks the index reports via +highest_relative_content+ are joined into a
  # single summary string.
  #
  # All methods are exposed through +module_function+, so they can be called
  # directly on the module (e.g. <tt>ClassifierReborn::Summarizer.summary(text)</tt>).
  module Summarizer
    module_function

    # Builds a sentence-level summary of +str+.
    #
    # str::       the text to summarize (must respond to +split+).
    # count::     passed to LSI#highest_relative_content; presumably the
    #             maximum number of sentences to keep.
    # separator:: string placed between the selected sentences.
    #
    # Returns the selected sentences, stripped and joined with +separator+.
    def summary(str, count = 10, separator = ' [...] ')
      perform_lsi split_sentences(str), count, separator
    end

    # Builds a paragraph-level summary of +str+.
    #
    # Same contract as #summary, except that the text is chunked with
    # #split_paragraphs and +count+ defaults to 1.
    def paragraph_summary(str, count = 1, separator = ' [...] ')
      perform_lsi split_paragraphs(str), count, separator
    end

    # Splits +str+ on '.', '!' and '?'.
    #
    # Because the pattern contains a capture group, the returned array also
    # contains each matched punctuation mark as its own element.
    def split_sentences(str)
      str.split(/([.!?])/) # TODO: make this less primitive
    end

    # Splits +str+ on blank-line paragraph breaks (LF LF, CR CR or CRLF CRLF).
    #
    # Because the pattern contains a capture group, the returned array also
    # contains each matched break sequence as its own element.
    def split_paragraphs(str)
      str.split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
    end

    # Indexes +chunks+ with a fresh LSI instance and joins the chunks it
    # selects.
    #
    # chunks::    array of strings, as produced by the split_* helpers.
    # count::     forwarded unchanged to LSI#highest_relative_content.
    # separator:: string placed between the selected chunks.
    #
    # Returns a String; selected chunks are stripped before being joined.
    def perform_lsi(chunks, count, separator)
      lsi = ClassifierReborn::LSI.new auto_rebuild: false

      chunks.each do |chunk|
        # Only chunks with at least two whitespace-separated words are indexed.
        # This skips blank chunks, single words, and the bare delimiters that
        # the capturing split patterns leave in the array.
        lsi << chunk if chunk.strip.split.size > 1
      end

      # The index is built explicitly because auto_rebuild is disabled above.
      lsi.build_index

      # The result is used as returned by the index; filtering it against
      # itself (as earlier code did) can never remove anything.
      lsi.highest_relative_content(count).map(&:strip).join(separator)
    end
  end
end