import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
import time
from pattern.vector import Document, Model, KNN
from pattern.db import Datasheet
# Long documents contain lots of words.
# Models with lots of long documents can become slow,
# because calculating cosine similarity then takes a long time.
# Latent Semantic Analysis (LSA) is a statistical machine learning method,
# based on a matrix calculation called "singular value decomposition" (SVD).
# It discovers semantically related words across documents.
# It groups related words into "concepts".
# It then creates a concept vector for each document.
# This reduces the amount of data to work with (for example when clustering),
# and filters out noise, so that semantically related words come out stronger.
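# As a minimal sketch of the idea (the four toy sentences below are made up for
# illustration only; they are not part of the movie review data used further down):
toy = Model([
    Document("the cat purrs and the cat meows", type="pet"),
    Document("a kitten purrs at the sleeping cat", type="pet"),
    Document("the car engine roars on the highway", type="car"),
    Document("a truck engine stalls on the highway", type="car")
])
toy.reduce(2) # Reduce the 4 toy documents to 2 concepts.
# Each concept bundles related words ("cat", "kitten", "purrs" vs. "engine", "highway"):
for i in (0, 1):
    concept = toy.lsa.concepts[i]
    print sorted(concept, key=lambda f: abs(concept[f]), reverse=True)[:3]
print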
# We'll use the Pang & Lee corpus of movie reviews, included in the testing suite.
# Take 250 positive reviews and 250 negative reviews:
data = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora", "polarity-en-pang&lee1.csv")
data = Datasheet.load(data)
data = data[:250] + data[-250:]
# Build a model of movie reviews.
# Each document consists of the top 40 words in the movie review.
documents = []
for score, review in data:
    document = Document(review, stopwords=False, top=40, type=int(score) > 0)
    documents.append(document)
m = Model(documents)
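# A quick peek at what we built (assuming Document.type stores the label passed above,
# and Document.features lists the words kept, as used elsewhere in this example):
print "label of first document:", documents[0].type
print "features in first document:", len(documents[0].features)
print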
print "number of documents:", len(m)
print "number of features:", len(m.vector)
print "number of features (average):", sum(len(d.features) for d in m.documents) / float(len(m))
print
# A model with 6,337 different features may be too slow for some algorithms (e.g., hierarchical clustering).
# We'll reduce the document vectors to 10 concepts.
# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or negative.
# The details are not that important right now; just observe the accuracy.
# Naturally, we want the accuracy to stay about the same after LSA reduction,
# while hopefully decreasing the time needed to run.
t = time.time()
print "accuracy:", KNN.test(m, folds=10)[-1]
print "time:", time.time() - t
print
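# A short sketch of how a trained classifier would be used on new, unlabeled data.
# This assumes the generic Classifier API (KNN.train() + KNN.classify());
# the review string below is made up:
knn = KNN()
for document in m.documents:
    knn.train(document) # Each document's type (True/False) is used as the training label.
print "prediction:", knn.classify(Document("a funny and heartwarming movie with great acting"))
print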
# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print "LSA reduction..."
print
m.reduce(10)
t = time.time()
print "accuracy:", KNN.test(m, folds=10)[-1]
print "time:", time.time() - t
print
# Accuracy is about the same, but performance is better: 2x-3x faster,
# because each document is now a condensed "summary" of just 10 concepts
# instead of its original features.
# Let's take a closer look at the concepts.
# The concept vector for the first document:
print m.lsa.vectors[m[0].id]
print
# It is a dictionary of concept ids (instead of features).
# By itself, this is not very helpful.
# But we can look up the features "bundled" in each concept:
print len(m.lsa.concepts[0])
# That's a lot of words.
# In fact, each feature in the model has a weight for each of the ten concepts.
# To make it clearer, let's generate 100 concepts (i.e., semantic categories),
# and then examine the features with the highest score for a concept:
m.lsa = None
m.reduce(100)
for feature, weight in m.lsa.concepts[15].items(): # concept id=15
    if abs(weight) > 0.1:
        print feature
# Concept 15 = "sixth", "sense", "child", "dead", "willis", ...
# Concept 2 = "truman", "ventura", "ace", "carrey", ... Obviously about Jim Carrey movies.
# Not all concepts are equally easy to interpret,
# but the technique can be useful to discover synonym sets.
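# As a closing sketch: rank the features inside one concept by absolute weight
# to read it as a (rough) synonym set. The concept index (15) and the cutoff
# of 10 words are arbitrary choices for illustration:
concept = m.lsa.concepts[15]
for feature in sorted(concept, key=lambda f: abs(concept[f]), reverse=True)[:10]:
    print feature, round(concept[feature], 2)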