File: gauss_mix.py

package info (click to toggle)
python-scipy 0.5.2-0.1
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 33,888 kB
  • ctags: 44,231
  • sloc: ansic: 156,256; cpp: 90,347; python: 89,604; fortran: 73,083; sh: 1,318; objc: 424; makefile: 342
file content (465 lines) | stat: -rw-r--r-- 16,562 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
# /usr/bin/python
# Last Change: Thu Nov 16 08:00 PM 2006 J

# Module to implement GaussianMixture class.

import numpy as N
from numpy.random import randn, rand
import numpy.linalg as lin
import densities
from misc import _MAX_DBL_DEV

# Right now, two main usages of a Gaussian Model are possible
#   - init a Gaussian Model with meta-parameters, and trains it
#   - set-up a Gaussian Model to sample it, draw ellipsoides 
#   of confidences. In this case, we would like to init it with
#   known values of parameters. This can be done with the class method 
#   fromval

# TODO:
#   - change bounds methods of GM class instanciations so that it cannot 
#   be used as long as w, mu and va are not set
#   - We have to use scipy now for chisquare pdf, so there may be other
#   methods to be used, ie for implementing random index.
#   - there is no check on internal state of the GM, that is does w, mu and va values
#   make sense (eg singular values)
#   - plot1d is still very rhough. There should be a sensible way to 
#   modify the result plot (maybe returns a dic with global pdf, component pdf and
#   fill matplotlib handles). Should be coherent with plot
class GmParamError:
    """Exception raised for errors in gmm params

    Attributes:
        expression -- input expression in which the error occurred
        message -- explanation of the error
    """
    def __init__(self, message):
        self.message    = message
    
    def __str__(self):
        return self.message

class GM:
    """Gaussian Mixture class. This is a simple container class
    to hold Gaussian Mixture parameters (weights, mean, etc...).
    It can also draw itself (confidence ellipses) and samples itself.
    """

    # I am not sure it is useful to have a spherical mode...
    _cov_mod    = ['diag', 'full']

    #===============================
    # Methods to construct a mixture
    #===============================
    def __init__(self, d, k, mode = 'diag'):
        """Init a Gaussian Mixture of k components, each component being a 
        d multi-variate Gaussian, with covariance matrix of style mode.
        
        If you want to build a Gaussian Mixture with knowns weights, means
        and variances, you can use GM.fromvalues method directly"""
        if mode not in self._cov_mod:
            raise GmParamError("mode %s not recognized" + str(mode))

        self.d      = d
        self.k      = k
        self.mode   = mode

        # Init to 0 all parameters, with the right dimensions.
        # Not sure this is useful in python from an efficiency POV ?
        self.w   = N.zeros(k)
        self.mu  = N.zeros((k, d))
        if mode == 'diag':
            self.va  = N.zeros((k, d))
        elif mode == 'full':
            self.va  = N.zeros((k * d, d))

        self.is_valid   = False

    def set_param(self, weights, mu, sigma):
        """Set parameters of the model. Args should
        be conformant with metparameters d and k given during
        initialisation"""
        k, d, mode  = check_gmm_param(weights, mu, sigma)
        if not k == self.k:
            raise GmParamError("Number of given components is %d, expected %d" 
                    % (k, self.k))
        if not d == self.d:
            raise GmParamError("Dimension of the given model is %d, expected %d" 
                    % (d, self.d))
        if not mode == self.mode and not d == 1:
            raise GmParamError("Given covariance mode is %s, expected %s"
                    % (mode, self.mode))
        self.w  = weights
        self.mu = mu
        self.va = sigma

        self.is_valid   = True

    @classmethod
    def fromvalues(cls, weights, mu, sigma):
        """This class method can be used to create a GM model
        directly from its parameters weights, mean and variance
        
        w, mu, va   = GM.gen_param(d, k)
        gm  = GM(d, k)
        gm.set_param(w, mu, va)

        and
        
        w, mu, va   = GM.gen_param(d, k)
        gm  = GM.fromvalue(w, mu, va)

        Are equivalent """
        k, d, mode  = check_gmm_param(weights, mu, sigma)
        res = cls(d, k, mode)
        res.set_param(weights, mu, sigma)
        return res
        
    #=====================================================
    # Fundamental facilities (sampling, confidence, etc..)
    #=====================================================
    def sample(self, nframes):
        """ Sample nframes frames from the model """
        if not self.is_valid:
            raise GmParamError("""Parameters of the model has not been 
                set yet, please set them using self.set_param()""")

        # State index (ie hidden var)
        S   = gen_rand_index(self.w, nframes)
        # standard gaussian
        X   = randn(nframes, self.d)        

        if self.mode == 'diag':
            X   = self.mu[S, :]  + X * N.sqrt(self.va[S,:])
        elif self.mode == 'full':
            # Faster:
            cho = N.zeros((self.k, self.va.shape[1], self.va.shape[1]))
            for i in range(self.k):
                # Using cholesky looks more stable than sqrtm; sqrtm is not
                # available in numpy anyway, only in scipy...
                cho[i]  = lin.cholesky(self.va[i*self.d:i*self.d+self.d,:])

            for s in range(self.k):
                tmpind      = N.where(S == s)[0]
                X[tmpind]   = N.dot(X[tmpind], cho[s].transpose()) + self.mu[s]
        else:
            raise GmParamError('cov matrix mode not recognized, this is a bug !')

        return X

    def conf_ellipses(self, *args, **kargs):
        """Returns a list of confidence ellipsoids describing the Gmm
        defined by mu and va. Check densities.gauss_ell for details

        Returns:
            -Xe:    a list of x coordinates for the ellipses (Xe[i] is
            the array containing x coordinates of the ith Gaussian)
            -Ye:    a list of y coordinates for the ellipses

        Example:
            Suppose we have w, mu and va as parameters for a mixture, then:
            
            gm      = GM(d, k)
            gm.set_param(w, mu, va)
            X       = gm.sample(1000)
            Xe, Ye  = gm.conf_ellipsoids()
            pylab.plot(X[:,0], X[:, 1], '.')
            for k in len(w):
                pylab.plot(Xe[k], Ye[k], 'r')
                
            Will plot samples X draw from the mixture model, and
            plot the ellipses of equi-probability from the mean with
            fixed level of confidence 0.39.  """
        if not self.is_valid:
            raise GmParamError("""Parameters of the model has not been 
                set yet, please set them using self.set_param()""")

        Xe  = []
        Ye  = []   
        if self.mode == 'diag':
            for i in range(self.k):
                xe, ye  = densities.gauss_ell(self.mu[i,:], self.va[i,:], 
                        *args, **kargs)
                Xe.append(xe)
                Ye.append(ye)
        elif self.mode == 'full':
            for i in range(self.k):
                xe, ye  = densities.gauss_ell(self.mu[i,:], 
                        self.va[i*self.d:i*self.d+self.d,:], 
                        *args, **kargs)
                Xe.append(xe)
                Ye.append(ye)

        return Xe, Ye
    
    def check_state(self):
        """
        """
        if not self.is_valid:
            raise GmParamError("""Parameters of the model has not been 
                set yet, please set them using self.set_param()""")

        if self.mode == 'full':
            raise NotImplementedError, "not implemented for full mode yet"
        
        # # How to check w: if one component is negligeable, what shall
        # # we do ?
        # M   = N.max(self.w)
        # m   = N.min(self.w)

        # maxc    = m / M

        # Check condition number for cov matrix
        cond    = N.zeros(self.k)
        ava     = N.absolute(self.va)
        for c in range(self.k):
            cond[c] = N.amax(ava[c,:]) / N.amin(ava[c,:])

        print cond

    def gen_param(self, d, nc, varmode = 'diag', spread = 1):
        """Generate valid parameters for a gaussian mixture model.
        d is the dimension, nc the number of components, and varmode
        the mode for cov matrices.

        This is a class method.

        Returns: w, mu, va
        """
        w   = abs(randn(nc))
        w   = w / sum(w, 0)

        mu  = spread * randn(nc, d)
        if varmode == 'diag':
            va  = abs(randn(nc, d))
        elif varmode == 'full':
            va  = randn(nc * d, d)
            for k in range(nc):
                va[k*d:k*d+d]   = N.dot( va[k*d:k*d+d], 
                    va[k*d:k*d+d].transpose())
        else:
            raise GmParamError('cov matrix mode not recognized')

        return w, mu, va

    gen_param = classmethod(gen_param)

    #=======================
    # Regularization methods
    #=======================
    def _regularize(self):
        raise NotImplemented("No regularization")

    #=================
    # Plotting methods
    #=================
    def plot(self, *args, **kargs):
        """Plot the ellipsoides directly for the model
        
        Returns a list of lines, so that their style can be modified. By default,
        the style is red color, and nolegend for all of them.
        
        Does not work for 1d"""
        if not self.is_valid:
            raise GmParamError("""Parameters of the model has not been 
                set yet, please set them using self.set_param()""")

        k       = self.k
        Xe, Ye  = self.conf_ellipses(*args, **kargs)
        try:
            import pylab as P
            return [P.plot(Xe[i], Ye[i], 'r', label='_nolegend_')[0] for i in range(k)]
            #for i in range(k):
            #    P.plot(Xe[i], Ye[i], 'r')
        except ImportError:
            raise GmParamError("matplotlib not found, cannot plot...")

    def plot1d(self, level = 0.5, fill = 0, gpdf = 0):
        """This function plots the pdfs of each component of the model. 
        If gpdf is 1, also plots the global pdf. If fill is 1, fill confidence
        areas using level argument as a level value
        
        Returns a dictionary h of plot handles so that their properties can
        be modified (eg color, label, etc...):
            - h['pdf'] is a list of lines, one line per component pdf
            - h['gpdf'] is the line for the global pdf
            - h['conf'] is a list of filling area
        """
        # This is not optimized at all, may be slow. Should not be
        # difficult to make much faster, but it is late, and I am lazy
        if not self.d == 1:
            raise GmParamError("the model is not one dimensional model")
        from scipy.stats import norm
        nrm     = norm(0, 1)
        pval    = N.sqrt(self.va[:,0]) * nrm.ppf((1+level)/2)

        # Compute reasonable min/max for the normal pdf
        mc  = 3
        std = N.sqrt(self.va[:,0])
        m   = N.amin(self.mu[:, 0] - mc * std)
        M   = N.amax(self.mu[:, 0] + mc * std)

        np  = 500
        x   = N.linspace(m, M, np)
        Yf  = N.zeros(np)
        Yt  = N.zeros(np)

        # Prepare the dic of plot handles to return
        ks  = ['pdf', 'conf', 'gpdf']
        hp  = dict((i,[]) for i in ks)
        try:
            import pylab as P
            for c in range(self.k):
                y   = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \
                        N.exp(-(x-self.mu[c][0])**2/(2*std[c]**2))
                Yt  += y
                h   = P.plot(x, y, 'r', label ='_nolegend_')
                hp['pdf'].extend(h)
                if fill:
                    #P.axvspan(-pval[c] + self.mu[c][0], pval[c] + self.mu[c][0], 
                    #        facecolor = 'b', alpha = 0.2)
                    id1 = -pval[c] + self.mu[c]
                    id2 = pval[c] + self.mu[c]
                    xc  = x[:, N.where(x>id1)[0]]
                    xc  = xc[:, N.where(xc<id2)[0]]
                    Yf  = self.w[c]/(N.sqrt(2*N.pi) * std[c]) * \
                            N.exp(-(xc-self.mu[c][0])**2/(2*std[c]**2))
                    xc  = N.concatenate(([xc[0]], xc, [xc[-1]]))
                    Yf  = N.concatenate(([0], Yf, [0]))
                    h   = P.fill(xc, Yf, 
                            facecolor = 'b', alpha = 0.1, label='_nolegend_')
                    hp['conf'].extend(h)
                    #P.fill([xc[0], xc[0], xc[-1], xc[-1]], 
                    #        [0, Yf[0], Yf[-1], 0], facecolor = 'b', alpha = 0.2)
            if gpdf:
                h           = P.plot(x, Yt, 'r:', label='_nolegend_')
                hp['gpdf']  = h
            return hp
        except ImportError:
            raise GmParamError("matplotlib not found, cannot plot...")

    # Syntactic sugar
    def __repr__(self):
        repr    = ""
        repr    += "Gaussian Mixture:\n"
        repr    += " -> %d dimensions\n" % self.d
        repr    += " -> %d components\n" % self.k
        repr    += " -> %s covariance \n" % self.mode
        if self.is_valid:
            repr    += "Has initial values"""
        else:
            repr    += "Has no initial values yet"""
        return repr

    def __str__(self):
        return self.__repr__()

# Function to generate a random index: this is kept outside any class,
# as the function can be useful for other
def gen_rand_index(p, n):
    """Generate a N samples vector containing random index between 1 
    and length(p), each index i with probability p(i)"""
    # TODO Check args here
    
    # TODO: check each value of inverse distribution is
    # different
    invcdf  = N.cumsum(p)
    uni     = rand(n)
    index   = N.zeros(n, dtype=int)

    # This one should be a bit faster
    for k in range(len(p)-1, 0, -1):
        blop        = N.where(N.logical_and(invcdf[k-1] <= uni, 
                    uni < invcdf[k]))
        index[blop] = k
        
    return index

def check_gmm_param(w, mu, va):
    """Check that w, mu and va are valid parameters for
    a mixture of gaussian: w should sum to 1, there should
    be the same number of component in each param, the variances
    should be positive definite, etc... 
    
    Params:
        w   = vector or list of weigths of the mixture (K elements)
        mu  = matrix: K * d
        va  = list of variances (vector K * d or square matrices Kd * d)

    returns:
        K   = number of components
        d   = dimension
        mode    = 'diag' if diagonal covariance, 'full' of full matrices
    """
        
    # Check that w is valid
    if N.fabs(N.sum(w, 0)  - 1) > _MAX_DBL_DEV:
        raise GmParamError('weight does not sum to 1')
    
    if not len(w.shape) == 1:
        raise GmParamError('weight is not a vector')

    # Check that mean and va have the same number of components
    K           = len(w)

    if N.ndim(mu) < 2:
        msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
        raise GmParamError(msg)
    if N.ndim(va) < 2:
        msg = """va should be a K,d / K *d, d matrix, and a row vector if
        only 1 diag comp"""
        raise GmParamError(msg)

    (Km, d)     = mu.shape
    (Ka, da)    = va.shape

    if not K == Km:
        msg = "not same number of component in mean and weights"
        raise GmParamError(msg)

    if not d == da:
        msg = "not same number of dimensions in mean and variances"
        raise GmParamError(msg)

    if Km == Ka:
        mode = 'diag'
    else:
        mode = 'full'
        if not Ka == Km*d:
            msg = "not same number of dimensions in mean and variances"
            raise GmParamError(msg)
        
    return K, d, mode
        
if __name__ == '__main__':
    # Meta parameters:
    #   - k = number of components
    #   - d = dimension
    #   - mode : mode of covariance matrices
    d       = 5
    k       = 4

    # Now, drawing a model
    mode    = 'full'
    nframes = 1e3

    # Build a model with random parameters
    w, mu, va   = GM.gen_param(d, k, mode, spread = 3)
    gm          = GM.fromvalues(w, mu, va)

    # Sample nframes frames  from the model
    X   = gm.sample(nframes)

    # Plot the data
    import pylab as P
    P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')

    # Real confidence ellipses with confidence level 
    level       = 0.50
    h           = gm.plot(level=level)

    # set the first ellipse label, which will appear in the legend
    h[0].set_label('confidence ell at level ' + str(level))

    P.legend(loc = 0)
    P.show()