################################################################################
#
# This file is part of the General Hidden Markov Model Library,
# GHMM version __VERSION__, see http://ghmm.org
#
# file: mixture.py
# authors: Alexander Schliep
#
# Copyright (C) 1998-2004 Alexander Schliep
# Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln
# Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,
# Berlin
#
# Contact: schliep@ghmm.org
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Library General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Library General Public License for more details.
#
# You should have received a copy of the GNU Library General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#
# This file is version $Revision: 2258 $
# from $Date: 2009-04-22 04:18:46 -0400 (Wed, 22 Apr 2009) $
# last change by $Author: grunau $.
#
################################################################################
#!/usr/bin/python2.3
from ghmm import *
import numarray
import math
import getopt, sys

def Entropy(prob_dist):
    """ Return the entropy of the discrete probability distribution
        prob_dist, computed with the natural logarithm.
    """
    result = 0.0
    for i in range(len(prob_dist)):
        # We have to deal with zero probabilities:
        # lim_{p->0} p * log(p) = 0, so such terms contribute nothing.
        p = prob_dist[i]
        if p > 0.0:
            result -= p * math.log(p)
    return result
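
# Illustrative check (an addition for exposition, not part of the original
# module): a uniform distribution attains the maximal entropy log(n), while
# a point mass has entropy 0.
#
#   Entropy([0.5, 0.5])  ->  0.693147...  (== math.log(2))
#   Entropy([1.0, 0.0])  ->  0.0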

def sumlogs(a):
    """ Given a numarray.array a of log p_i, return log(sum p_i).

        Implemented with the log-sum-exp trick: the maximum m is factored
        out, so only differences (x - m) <= 0 are ever exponentiated.
        XXX should be coded in C; check whether it is part of numarray.
    """
    m = max(a)
    result = 0.0
    for x in a:
        if x >= m:  # only x == m is possible here: exp(0) == 1
            result += 1.0
        else:
            x = x - m
            if x < -1.0e-16:
                result += math.exp(x)  # XXX use approximation
            else:
                result += 1.0  # x is within 1.0e-16 of m, so exp(x) ~ 1
    result = math.log(result)
    result += m
    return result
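
# Illustrative check (an addition for exposition, not part of the original
# module): for p = (0.25, 0.25) both entries equal the maximum, each
# contributes 1.0, and log(2) + log(0.25) == log(0.5):
#
#   sumlogs(numarray.array([math.log(0.25), math.log(0.25)]))
#   ->  -0.693147...  (== math.log(0.5))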

def estimate_mixture(models, seqs, max_iter, eps, alpha=None):
    """ Given a Python list of models and a SequenceSet seqs,
        perform a nested EM to estimate maximum-likelihood
        parameters for the models and the mixture coefficients.

        The iteration stops after max_iter steps or if the
        improvement in log-likelihood is less than eps.

        alpha is a numarray of dimension len(models) containing
        the mixture coefficients. If alpha is not given, uniform
        values are chosen.

        Result: the models are changed in place. The return value
        is (l, alpha, P), where l is the final log-likelihood of
        seqs under the mixture, alpha is a numarray of dimension
        len(models) containing the mixture coefficients, and P is
        a (|sequences| x |models|)-matrix containing
        P[model j | sequence i].
    """
    iter = 1
    last_mixture_likelihood = -99999999.99
    # The (nr of seqs x nr of models)-matrix holding the likelihoods
    l = numarray.zeros((len(seqs), len(models)), numarray.Float)
    if alpha is None:  # uniform alpha
        logalpha = numarray.ones(len(models), numarray.Float) * \
                   math.log(1.0 / len(models))
    else:
        logalpha = numarray.log(alpha)
    print(logalpha, numarray.exp(logalpha))
    log_nrseqs = math.log(len(seqs))

    while 1:
        # Score all sequences with all models
        for i, m in enumerate(models):
            loglikelihood = m.loglikelihoods(seqs)
            # numarray slices: l[:,i] is the i-th column of l
            l[:,i] = numarray.array(loglikelihood)

        for i in range(len(seqs)):
            l[i] += logalpha  # l[i] = ( log(a_k * P[seq i | model k]) )

        mixture_likelihood = numarray.sum(numarray.sum(l))
        print("# iter %s joint likelihood = %f" % (iter, mixture_likelihood))

        improvement = mixture_likelihood - last_mixture_likelihood
        if iter > max_iter or improvement < eps:
            break

        # E-step: compute P[model j | seq i]
        for i in range(len(seqs)):
            seq_logprob = sumlogs(l[i])  # log sum_k a_k P[seq i | model k]
            l[i] -= seq_logprob  # l[i] = ( log P[model j | seq i] )
        l_exp = numarray.exp(l)  # XXX use approximation with table lookup

        # M-step for the mixture coefficients alpha
        for i in range(len(models)):
            logalpha[i] = sumlogs(l[:,i]) - log_nrseqs

        # M-step for the model parameters: reestimate each model with
        # the sequence weights set to P[model j | sequence i]
        for j, m in enumerate(models):
            for i in range(len(seqs)):
                seqs.setWeight(i, l_exp[i,j])
            m.baumWelch(seqs, 10, 0.0001)

        iter += 1
        last_mixture_likelihood = mixture_likelihood

    return (mixture_likelihood, numarray.exp(logalpha), l_exp)
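
# Usage sketch (file names are hypothetical; uses the same GHMM entry
# points as the __main__ block below):
#
#   models = HMMOpen.all('hmms.smo')
#   seqs = SequenceSet(Float(), 'seqs.sqd')
#   (logL, alpha, P) = estimate_mixture(models, seqs, 100, 0.001)
#   # alpha[j]: estimated weight of model j
#   # P[i,j]:   P[model j | sequence i]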

def decode_mixture(P, entropy_cutoff):
    """ Given P, a (|sequences| x |models|)-matrix where
        P_{ij} = P[model j | sequence i], return a list of size
        |sequences| containing j, the model maximizing
        P[model j | sequence i], for each sequence whose discrete
        distribution { P[ . | sequence i] } has an entropy below
        entropy_cutoff, and None otherwise.
    """
    nr_seqs = numarray.shape(P)[0]
    result = [None] * nr_seqs
    for i in range(nr_seqs):
        e = Entropy(P[i])
        if e < entropy_cutoff:
            result[i] = int(numarray.argmax(P[i]))
    return result
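
# Illustrative check (an addition for exposition, not part of the original
# module): with entropy_cutoff = 0.15, a confident posterior row such as
# (0.99, 0.01) (entropy ~ 0.056) is assigned to model 0, while an ambiguous
# row such as (0.6, 0.4) (entropy ~ 0.67) yields None.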

usage_info = """
mixture.py [options] hmms.smo seqs.sqd [hmms-reestimated.smo]

Estimate a mixture of HMMs from a file hmms.smo containing continuous
emission HMMs and a set of sequences of reals given in the file
seqs.sqd.

Running:

  -m iter  Maximal number of iterations (default is 100)
  -e eps   If the improvement in likelihood is below eps, the training
           is terminated (default is 0.001)

Post-analysis (the options are mutually exclusive):

  -p       Output the matrix p_{ij} = P[model j| sequence i] to the console
  -c       Cluster sequences. Assign each sequence i to the model maximizing
           P[model j| sequence i]. Outputs seq_id\tcluster_nr to the console
  -d ent   Decode mixture. If the entropy of { P[model j| sequence i] } is
           less than 'ent', sequence i is assigned to the model maximizing
           P[model j| sequence i]. Outputs seq_id\tcluster_nr to the console;
           cluster_nr is None if no assignment was possible

Example:

mixture.py -m 10 -e 0.1 -d 0.15 test2.smo test100.sqd reestimated.smo
"""

def usage():
    print(usage_info)

if __name__ == '__main__':
    # Default values
    max_iter = 100
    eps = 0.001
    output = None

    try:
        opts, args = getopt.getopt(sys.argv[1:], "m:e:pcd:", [])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for o, a in opts:
        if o == '-m':
            max_iter = int(a)
        if o == '-e':
            eps = float(a)
        if o == '-p':
            output = 'p_matrix'
        if o == '-c':
            output = 'cluster'
        if o == '-d':
            output = 'decode'
            entropy_cutoff = float(a)

    if len(args) != 3:
        usage()
        sys.exit(2)
    hmmsFileName = args[0]
    seqsFileName = args[1]
    outFileName = args[2]

    models = HMMOpen.all(hmmsFileName)
    print("# Read %d models from '%s'" % (len(models), hmmsFileName))
    seqs = SequenceSet(Float(), seqsFileName)
    print("# Read %d sequences from '%s'" % (len(seqs), seqsFileName))

    (ml, alpha, P) = estimate_mixture(models, seqs, max_iter, eps)

    if output == 'p_matrix':
        for i in range(len(seqs)):
            print('\t'.join(["%1.3f" % x for x in P[i]]))
    elif output is not None:
        if output == 'cluster':
            # A cutoff of len(models) exceeds the maximal possible entropy
            # log(len(models)), so every sequence gets assigned.
            assignment = decode_mixture(P, len(models))
        else:
            assignment = decode_mixture(P, entropy_cutoff)
        for i, c in enumerate(assignment):
            print("%s\t%s" % (str(i), str(c)))