1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL
require "set"
# These are extensions to the String class to provide convenience
# methods for the Classifier package.
class String
  # Words ignored by #clean_word_hash / #word_hash. Membership is tested
  # against the lower-cased word *before* it is stemmed.
  # NOTE(review): a few entries ("sinc", "th", "didn") look like stems or
  # fragments rather than whole words -- confirm they are intentional.
  CORPUS_SKIP_WORDS = Set.new(%w[
    a again all along are also an and as at but by came can cant couldnt
    did didn didnt do doesnt dont ever first from have her here him how
    i if in into is isnt it itll just last least like most my new no not
    now of on or should sinc so some th than this that the their then
    those to told too true try until url us were when whether while with
    within yes you youll
  ])

  # Removes common punctuation symbols, returning a new string.
  # Every listed symbol is replaced by a single space; apostrophes and
  # hyphens are deleted outright so contractions and hyphenated words
  # collapse into one token.
  # E.g.,
  #   "Hello, world! It's well-known.".without_punctuation
  #   => "Hello  world  Its wellknown "
  def without_punctuation
    # String#tr treats a backslash as an escape for the following character,
    # so the set must contain an *escaped* backslash (two backslashes in the
    # actual string, written as four inside this single-quoted literal) for
    # the backslash itself to be replaced.
    tr(',?.!;:"@#$%^&*()_=+[]{}\\\\|<>/`~', " ").delete("'-")
  end

  # Returns a Hash of Symbol => Integer (missing keys read as 0).
  # Contains every entry of #clean_word_hash plus one entry per run of
  # non-word, non-whitespace characters ("symbols" such as "!" or ":-)"),
  # each mapped to the number of times it occurs in the string.
  def word_hash
    # Blank out word characters so only whitespace-separated symbol runs
    # remain. [[:word:]] (unlike \w) also covers non-ASCII letters/digits.
    symbol_tokens = gsub(/[[:word:]]/, " ").split
    clean_word_hash.merge(word_hash_for_symbols(symbol_tokens))
  end

  # Returns a Hash of Symbol => Integer built from the words only:
  # punctuation is stripped, words are lower-cased, skip words and words of
  # two characters or fewer are dropped, and the rest are stemmed.
  def clean_word_hash
    word_hash_for_words(gsub(/[^[:word:]\s]/, "").split)
  end

  private

  # Counts stemmed, interned words.
  # words:: Array of String tokens.
  # Returns a Hash (default value 0) of Symbol => Integer.
  # String#stem is not defined in this file; presumably it is supplied by a
  # stemmer library loaded elsewhere.
  def word_hash_for_words(words)
    words.each_with_object(Hash.new(0)) do |word, counts|
      token = word.downcase
      next if token.length <= 2 || CORPUS_SKIP_WORDS.include?(token)

      counts[token.stem.intern] += 1
    end
  end

  # Counts interned tokens verbatim (no case folding, filtering or stemming).
  # words:: Array of String tokens.
  # Returns a Hash (default value 0) of Symbol => Integer.
  def word_hash_for_symbols(words)
    words.each_with_object(Hash.new(0)) do |token, counts|
      counts[token.intern] += 1
    end
  end
end
|