1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301
|
#!python
#cython: boundscheck=False
#cython: cdivision=True
# MultivariateGaussianDistribution.pyx
# Contact: Jacob Schreiber <jmschreiber91@gmail.com>
import numpy
import scipy
from libc.stdlib cimport calloc
from libc.stdlib cimport free
from libc.string cimport memset
from scipy.linalg.cython_blas cimport dgemm
from ..utils cimport _log
from ..utils cimport mdot
from ..utils cimport ndarray_wrap_cpointer
from ..utils cimport _is_gpu_enabled
from ..utils cimport isnan
from ..utils import check_random_state
from libc.math cimport sqrt as csqrt
# Define some useful constants
DEF NEGINF = float("-inf")
DEF INF = float("inf")
DEF SQRT_2_PI = 2.50662827463
DEF LOG_2_PI = 1.83787706641
eps = 1e-8
cdef class MultivariateGaussianDistribution(MultivariateDistribution):
property parameters:
def __get__(self):
return [self.mu.tolist(), self.cov.tolist()]
def __set__(self, parameters):
self.mu = numpy.array(parameters[0])
self.cov = numpy.array(parameters[1])
def __cinit__(self, means=[], covariance=[], frozen=False):
"""
Take in the mean vector and the covariance matrix.
"""
self.name = "MultivariateGaussianDistribution"
self.frozen = frozen
self.mu = numpy.array(means, dtype='float64')
self._mu = <double*> self.mu.data
self.cov = numpy.array(covariance, dtype='float64')
self._cov = <double*> self.cov.data
_, self._log_det = numpy.linalg.slogdet(self.cov)
if self.mu.shape[0] != self.cov.shape[0]:
raise ValueError("mu shape is {} while covariance shape is {}".format(self.mu.shape[0], self.cov.shape[0]))
if self.cov.shape[0] != self.cov.shape[1]:
raise ValueError("covariance is not a square matrix, dimensions are ({}, {})".format(self.cov.shape[0], self.cov.shape[1]))
if self._log_det == NEGINF:
raise ValueError("covariance matrix is not invertible.")
d = self.mu.shape[0]
self.d = d
self._inv_dot_mu = <double*> calloc(d, sizeof(double))
self._mu_new = <double*> calloc(d, sizeof(double))
chol = scipy.linalg.cholesky(self.cov, lower=True)
self.inv_cov = scipy.linalg.solve_triangular(chol, numpy.eye(d), lower=True).T
self._inv_cov = <double*> self.inv_cov.data
mdot(self._mu, self._inv_cov, self._inv_dot_mu, 1, d, d)
self.column_sum = <double*> calloc(d*d, sizeof(double))
self.column_w_sum = <double*> calloc(d, sizeof(double))
self.pair_sum = <double*> calloc(d*d, sizeof(double))
self.pair_w_sum = <double*> calloc(d*d, sizeof(double))
def __reduce__(self):
"""Serialize the distribution for pickle."""
return self.__class__, (self.mu, self.cov, self.frozen)
def __dealloc__(self):
free(self._inv_dot_mu)
free(self._mu_new)
free(self.column_sum)
free(self.column_w_sum)
free(self.pair_sum)
free(self.pair_w_sum)
cdef void _log_probability(self, double* X, double* logp, int n) nogil:
cdef int i, j, d = self.d
cdef double* dot
if _is_gpu_enabled():
with gil:
import cupy
x = ndarray_wrap_cpointer(X, n*d).reshape(n, d)
x1 = cupy.array(x)
x2 = cupy.array(self.inv_cov)
dot_ndarray = cupy.dot(x1, x2).get()
dot = <double*> (<numpy.ndarray> dot_ndarray).data
else:
dot = <double*> calloc(n*d, sizeof(double))
mdot(X, self._inv_cov, dot, n, d, d)
for i in range(n):
logp[i] = 0
for j in range(d):
if isnan(X[i*d + j]):
logp[i] = self._log_probability_missing(X+i*d)
break
else:
logp[i] += (dot[i*d + j] - self._inv_dot_mu[j])**2
else:
logp[i] = -0.5 * (d * LOG_2_PI + logp[i]) - 0.5 * self._log_det
if not _is_gpu_enabled():
free(dot)
cdef double _log_probability_missing(self, double* X) nogil:
cdef double logp
with gil:
X_ndarray = ndarray_wrap_cpointer(X, self.d)
avail = ~numpy.isnan(X_ndarray)
if avail.sum() == 0:
return 0
a = numpy.ix_(avail, avail)
d1 = MultivariateGaussianDistribution(self.mu[avail], self.cov[a])
logp = d1.log_probability(X_ndarray[avail])
return logp
def sample(self, n=None, random_state=None):
random_state = check_random_state(random_state)
return random_state.multivariate_normal(self.parameters[0],
self.parameters[1], n)
cdef double _summarize(self, double* X, double* weights, int n,
int column_idx, int d) nogil:
"""Calculate sufficient statistics for a minibatch.
The sufficient statistics for a multivariate gaussian update is the sum of
each column, and the sum of the outer products of the vectors.
"""
cdef int i, j, k
cdef double x, w, sqrt_weight, w_sum = 0.0
cdef double* column_sum = <double*> calloc(d*d, sizeof(double))
cdef double* column_w_sum = <double*> calloc(d, sizeof(double))
cdef double* pair_sum
cdef double* pair_w_sum = <double*> calloc(d*d, sizeof(double))
cdef double* y = <double*> calloc(n*d, sizeof(double))
cdef double alpha = 1
cdef double beta = 0
for i in range(n):
w = weights[i]
w_sum += w
sqrt_weight = csqrt(w)
for j in range(d):
x = X[i*d + j]
if isnan(x):
y[i*d + j] = 0.
for k in range(d):
pair_w_sum[j*d + k] -= w
if not isnan(X[i*d + k]):
pair_w_sum[k*d + j] -= w
column_sum[k*d + j] -= X[i*d + k] * w
else:
y[i*d + j] = x * sqrt_weight
column_sum[j*d + j] += x * w
column_w_sum[j] += w
if _is_gpu_enabled():
with gil:
import cupy
x_ndarray = ndarray_wrap_cpointer(y, n*d).reshape(n, d)
x_gpu = cupy.array(x_ndarray, copy=False)
pair_sum_ndarray = cupy.dot(x_gpu.T, x_gpu).get()
for j in range(d):
self.column_w_sum[j] += column_w_sum[j]
for k in range(d):
self.pair_sum[j*d + k] += pair_sum_ndarray[j, k]
self.pair_w_sum[j*d + k] += pair_w_sum[j*d + k] + w_sum
self.column_sum[j*d + k] += column_sum[j*d + k]
else:
pair_sum = <double*> calloc(d*d, sizeof(double))
dgemm('N', 'T', &d, &d, &n, &alpha, y, &d, y, &d, &beta, pair_sum, &d)
with gil:
for j in range(d):
self.column_w_sum[j] += column_w_sum[j]
for k in range(d):
self.pair_sum[j*d + k] += pair_sum[j*d + k]
self.pair_w_sum[j*d + k] += pair_w_sum[j*d + k] + w_sum
self.column_sum[j*d + k] += column_sum[j*d + k]
free(pair_sum)
free(column_sum)
free(column_w_sum)
free(pair_w_sum)
free(y)
def from_summaries(self, inertia=0.0, min_covar=1e-5):
"""
Set the parameters of this Distribution to maximize the likelihood of
the given sample. Items holds some sort of sequence. If weights is
specified, it holds a sequence of value to weight each item by.
"""
cdef int d = self.d, i, j, k
cdef double* column_sum = self.column_sum
cdef double pair_sum
cdef double* mu = self._mu_new
cdef double cov
cdef numpy.ndarray chol
cdef double w_sum = 0.0
for i in range(self.d):
w_sum += self.column_w_sum[i]
# If no summaries stored or the summary is frozen, don't do anything.
if self.frozen == True or w_sum < 1e-7:
return
for i in range(d):
mu[i] = self.column_sum[i*d + i] / self.column_w_sum[i]
self._mu[i] = self._mu[i] * inertia + mu[i] * (1-inertia)
for j in range(d):
for k in range(d):
x_jk = self.pair_sum[j*d + k]
w_jk = self.pair_w_sum[j*d + k]
if j == k:
x_j = self.column_sum[j*d + j]
x_k = self.column_sum[k*d + k]
else:
x_j = self.column_sum[j*d + j] + self.column_sum[j*d + k]
x_k = self.column_sum[k*d + k] + self.column_sum[k*d + j]
cov = (x_jk - x_j*x_k/w_jk) / w_jk if w_jk > 0.0 else 0
self._cov[j*d + k] = self._cov[j*d + k] * inertia + cov * (1-inertia)
try:
chol = scipy.linalg.cholesky(self.cov, lower=True)
self.inv_cov = scipy.linalg.solve_triangular(chol, numpy.eye(d),
lower=True).T
except:
if self.cov.sum() == 0:
self.cov += numpy.eye(d) * min_covar
else:
min_eig = numpy.linalg.eig(self.cov)[0].min()
self.cov -= numpy.eye(d) * (min_eig - eps)
chol = scipy.linalg.cholesky(self.cov, lower=True)
self.inv_cov = scipy.linalg.solve_triangular(chol, numpy.eye(d),
lower=True).T
_, self._log_det = numpy.linalg.slogdet(self.cov)
self._inv_cov = <double*> self.inv_cov.data
mdot(self._mu, self._inv_cov, self._inv_dot_mu, 1, d, d)
self.clear_summaries()
def clear_summaries(self):
"""Clear the summary statistics stored in the object."""
memset(self.column_sum, 0, self.d*self.d*sizeof(double))
memset(self.column_w_sum, 0, self.d*sizeof(double))
memset(self.pair_sum, 0, self.d*self.d*sizeof(double))
memset(self.pair_w_sum, 0, self.d*self.d*sizeof(double))
@classmethod
def from_samples(cls, X, weights=None, **kwargs):
"""Fit a distribution to some data without pre-specifying it."""
distribution = cls.blank(X.shape[1])
distribution.fit(X, weights, **kwargs)
return distribution
@classmethod
def blank(cls, d=2):
mu = numpy.zeros(d)
cov = numpy.eye(d)
return cls(mu, cov)
|