1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
|
"""
A simple example demonstrating Multinomial HMM
----------------------------------------------
The Multinomial HMM is a generalization of the Categorical HMM, with some key
differences:
- a Categorical__ (or generalized Bernoulli/multinoulli) distribution models an
outcome of a die with `n_features` possible values, i.e. it is a
generalization of the Bernoulli distribution where there are ``n_features``
categories instead of the binary success/failure outcome; a Categorical HMM
has the emission probabilities for each component parametrized by Categorical
distributions.
- a Multinomial__ distribution models the outcome of ``n_trials`` independent
rolls of die, each with ``n_features`` possible values; i.e.
- when ``n_trials = 1`` and ``n_features = 2``, it is a Bernoulli
distribution,
- when ``n_trials > 1`` and ``n_features = 2``, it is a Binomial
distribution,
- when ``n_trials = 1`` and ``n_features > 2``, it is a Categorical
distribution.
The emission probabilities for each component of a Multinomial HMM are
parameterized by Multinomial distributions.
__ https://en.wikipedia.org/wiki/Categorical_distribution
__ https://en.wikipedia.org/wiki/Multinomial_distribution
"""
import numpy as np
from hmmlearn import hmm
# For this example, we will model the stages of a conversation,
# where each sentence is "generated" with an underlying topic, "cat" or "dog"
states = ["cat", "dog"]
id2topic = dict(zip(range(len(states)), states))
# we are more likely to talk about cats first
start_probs = np.array([0.6, 0.4])
# For each topic, the probability of saying certain words can be modeled by
# a distribution over vocabulary associated with the categories
vocabulary = ["tail", "fetch", "mouse", "food"]
# if the topic is "cat", we are more likely to talk about "mouse"
# if the topic is "dog", we are more likely to talk about "fetch"
emission_probs = np.array([[0.25, 0.1, 0.4, 0.25],
[0.2, 0.5, 0.1, 0.2]])
# Also assume it's more likely to stay in a state than transition to the other
trans_mat = np.array([[0.8, 0.2], [0.2, 0.8]])
# Pretend that every sentence we speak only has a total of 5 words,
# i.e. we independently utter a word from the vocabulary 5 times per sentence
# we observe the following bag of words (BoW) for 8 sentences:
observations = [["tail", "mouse", "mouse", "food", "mouse"],
["food", "mouse", "mouse", "food", "mouse"],
["tail", "mouse", "mouse", "tail", "mouse"],
["food", "mouse", "food", "food", "tail"],
["tail", "fetch", "mouse", "food", "tail"],
["tail", "fetch", "fetch", "food", "fetch"],
["fetch", "fetch", "fetch", "food", "tail"],
["food", "mouse", "food", "food", "tail"],
["tail", "mouse", "mouse", "tail", "mouse"],
["fetch", "fetch", "fetch", "fetch", "fetch"]]
# Convert "sentences" to numbers:
vocab2id = dict(zip(vocabulary, range(len(vocabulary))))
def sentence2counts(sentence):
ans = []
for word, idx in vocab2id.items():
count = sentence.count(word)
ans.append(count)
return ans
X = []
for sentence in observations:
row = sentence2counts(sentence)
X.append(row)
data = np.array(X, dtype=int)
# pretend this is repeated, so we have more data to learn from:
lengths = [len(X)]*5
sequences = np.tile(data, (5,1))
# Set up model:
model = hmm.MultinomialHMM(n_components=len(states),
n_trials=len(observations[0]),
n_iter=50,
init_params='')
model.n_features = len(vocabulary)
model.startprob_ = start_probs
model.transmat_ = trans_mat
model.emissionprob_ = emission_probs
model.fit(sequences, lengths)
logprob, received = model.decode(sequences)
print("Topics discussed:")
print([id2topic[x] for x in received])
print("Learned emission probs:")
print(model.emissionprob_)
print("Learned transition matrix:")
print(model.transmat_)
# Try to reset and refit:
new_model = hmm.MultinomialHMM(n_components=len(states),
n_trials=len(observations[0]),
n_iter=50, init_params='ste')
new_model.fit(sequences, lengths)
logprob, received = new_model.decode(sequences)
print("\nNew Model")
print("Topics discussed:")
print([id2topic[x] for x in received])
print("Learned emission probs:")
print(new_model.emissionprob_)
print("Learned transition matrix:")
print(new_model.transmat_)
|