/*!
\file fff_GMM.h
\brief Gaussian Mixture Models
\author Bertrand Thirion
\date 2004-2006
This library implements different kinds of GMM estimation techniques
*/
#ifndef fff_GMM
#define fff_GMM
#ifdef __cplusplus
extern "C" {
#endif
#include "fff_graphlib.h"
/* Plain Gaussian Mixture Model: k components over dim-dimensional features. */
typedef struct fff_GMM_{
long k; /* number of components */
long dim; /* feature dimension */
int prec_type; /* type of the precision parameterization
                  (see fff_GMM_new: 0 = full covariance,
                  1 = diagonal, 2 = homoscedastic diagonal) */
fff_matrix * means; /* component centers */
fff_matrix * precisions; /* component precision (inverse-covariance) parameters */
fff_vector * weights; /* mixture weights */
} fff_GMM_;
/* Bayesian GMM with diagonal covariance and conjugate normal-Wishart priors. */
typedef struct fff_Bayesian_GMM{
long k; /* number of components */
long dim; /* feature dimension */
/* it is assumed that
1) the covariance is diagonal, so that
precision matrices and centroid matrices do have the same size:
(k, dim),
where k is the number of components in the mixture
and dim is the number of dimensions of the data
All the matrices in the model have this size.
All the vectors have length k.
2) Conjugate normal-Wishart priors are used
*/
/* priors */
fff_matrix * prior_means; /* prior on the cluster centers */
fff_vector * prior_means_scale; /* scaling factor on the precision of the means */
fff_matrix * prior_precisions; /* prior precisions in the Wishart model */
fff_vector * prior_dof; /* prior degrees of freedom in the Wishart model */
fff_vector * prior_weights; /* prior on the weights in the Dirichlet model */
/* current estimates (same shapes/lengths as the corresponding priors) */
fff_vector * means_scale;
fff_matrix * means;
fff_matrix * precisions;
fff_vector * dof;
fff_vector * weights;
} fff_Bayesian_GMM;
/*!
\brief Constructor for the GMM structure
\param k : number of components
\param dim : the number of dimensions
\param prec_type: (int) the parameterization of the model
if prec_type==0 the model is assumed to be Gaussian,
heteroscedastic with full covariance.
if prec_type==1 the model is assumed to be Gaussian,
heteroscedastic and diagonal.
if prec_type==2 the model is assumed to be Gaussian,
homoscedastic, diagonal and with equal weights.
*/
extern fff_GMM_* fff_GMM_new( const long k, const long dim,const int prec_type );
/*!
\brief Destructor for the GMM structure
\param thisone the GMM to be deleted
*/
extern int fff_GMM_delete( fff_GMM_* thisone );
/*!
\brief Labelling checking algorithm
\param Label vector
\param k number of classes
This algorithm quickly checks that for each i in [0,..,k-1]
there exists an integer j so that Label[j] = i.
it returns 1 if yes, 0 if no
*/
extern int fff_clustering_OntoLabel(const fff_array * Label, const long k);
/*!
\brief GMM algorithm with selection of the number of clusters
\param X data matrix
\param Centers Cluster centers
\param Precision Cluster precisions
\param Weights Mixture Weights
\param Label data labels
\param nbclust number of clusters for which a mixture is searched.
\param maxiter maximum number of iterations
\param delta small constant for control of convergence
This function performs different initializations of the GMM clustering
algorithm with nbclust mixtures, and keeps the best one according
to a BIC criterion.
The optimal number of clusters is returned. It is assumed that Centers,
Precision, and Weights have been allocated for max(nbclust)
clusters.
*/
int fff_clustering_gmm_select( fff_matrix* Centers, fff_matrix* Precision, fff_vector *Weights, fff_array *Label, const fff_matrix* X, const fff_vector *nbclust, const int maxiter, const double delta);
/*!
\brief GMM algorithm
\param X data matrix
\param Centers Cluster centers
\param Precision Cluster precisions
\param Weights Mixture Weights
\param Label data labels
\param maxiter maximum number of iterations
\param delta small constant for control of convergence
\param ninit number of initializations
This function performs ninit initializations of the GMM clustering
algorithm, and keeps the best one. The average log-likelihood is
returned.
*/
extern double fff_clustering_gmm_ninit( fff_matrix* Centers, fff_matrix* Precision, fff_vector *Weights, fff_array *Label, const fff_matrix* X, const int maxiter, const double delta, const int ninit );
/*!
\brief GMM algorithm
\param X data matrix
\param Centers Cluster centers
\param Precision Cluster precisions
\param Weights Mixture Weights
\param Label data labels
\param maxiter maximum number of iterations
\param delta small constant for control of convergence
\param chunksize the number of features on which gmm is performed
\param verbose verbosity mode
This algorithm performs a GMM of the data X. Note that the data
and cluster matrices should be dimensioned as (nb items * feature
dimension) and (nb clusters * feature dimension).
The precision is written either in the form
- (nb clusters * sqf), where sqf is the square of the feature dimension.
- (nb clusters * feature dimension), then it is assumed that it codes for a
diagonal precision matrix, and an algorithm with diagonal
covariance is implemented.
- (1* sqf) then the Gaussians are assumed to be homoscedastic. Only
one diagonal variance is computed for all clusters. the weights of
the mixture are set (1/nb clusters)
chunksize has to be chosen in the interval [nb clusters, nb features]
chunksize = nb features is a standard choice,
but lowering chunksize will profit to computation time.
it is advisable to have chunksize<10^6
The metric used in the algorithm is Euclidean.
The returned Label vector is a hard membership function
computed after convergence.
The normalized log-likelihood is returned.
*/
extern double fff_clustering_gmm( fff_matrix* Centers, fff_matrix* Precision, fff_vector *Weights, fff_array *Label, const fff_matrix* X, const int maxiter, const double delta, const int chunksize, const int verbose );
/*!
\brief Evaluation of a GMM algorithm on a dataset
\param X data matrix
\param Centers Cluster centers
\param Precision Cluster precisions
\param Weights Mixture Weights
\param L average log-likelihood
This algorithm computes the average log-likelihood of a GMM on an
empirical dataset. This number is returned.
The algorithm adapts to the size of the precision matrix
- (k*(f*f)) precision is a full f*f matrix for all clusters
- (k*f) precision diagonal
- (1*f) precision diagonal and equal for all clusters
*/
extern double fff_gmm_mean_eval(double* L, const fff_matrix* X, const fff_matrix* Centers, const fff_matrix* Precision, const fff_vector* Weights);
/*!
\brief Relaxation of a previously estimated GMM on a dataset
\param X data matrix
\param Centers Cluster centers
\param Precision Cluster precisions
\param Weights Mixture Weights
\param LogLike log-likelihood of each data
\param Labels final labelling
This algorithm computes the average log-likelihood of a GMM on an
empirical dataset. This number is returned.
The algorithm adapts to the size of the precision matrix
- (k*(f*f)) precision is a full f*f matrix for all clusters
- (k*f) precision diagonal
- (1*f) precision diagonal and equal for all clusters
*/
extern int fff_gmm_relax( fff_vector* LogLike, fff_array* Labels, fff_matrix* Centers, fff_matrix* Precision, fff_vector* Weights, const fff_matrix* X, const int maxiter, const double delta);
/*!
\brief Evaluation of a GMM on a dataset and labelling of the dataset X
\param X data matrix
\param Centers Cluster centers
\param Precision Cluster precisions
\param Weights Mixture Weights
\param LogLike log-likelihood of each data
\param Labels final labelling
This algorithm computes the average log-likelihood of a GMM on an
empirical dataset. This number is returned.
The algorithm adapts to the size of the precision matrix
- (k*(f*f)) precision is a full f*f matrix for all clusters
- (k*f) precision diagonal
- (1*f) precision diagonal and equal for all clusters
*/
extern int fff_gmm_partition(fff_vector* LogLike, fff_array* Labels, const fff_matrix* X, const fff_matrix* Centers, const fff_matrix* Precision, const fff_vector* Weights);
/*!
\brief representation of GMM membership with a graph structure
\param G Weighted Graph
\param X data-defining matrix
\param Centers Cluster centers
\param Precision Cluster precisions
\param Weights Mixture Weights
This algorithm computes the probabilistic membership values
of the data X with respect to the GMM model and stores this
-presumably sparse- information in a graph structure.
If G->E = 0, nothing is done with the graph
Otherwise, the membership is coded in the graph
The number of edges is returned.
In practice, it is thus advised to call it once with an empty graph
to get the number of edges, then
to allocate the graph structure and recall the function.
*/
extern int fff_gmm_membership(fff_graph* G, const fff_matrix* X, const fff_matrix* Centers, const fff_matrix* Precision, const fff_vector* Weights);
/*!
\brief Shifting the points in X toward points of larger likelihood
\param X moving points
\param Centers Cluster centers
\param Precision Cluster precisions
\param Weights Mixture Weights
*/
extern int fff_gmm_shift(fff_matrix* X, const fff_matrix* Centers, const fff_matrix* Precision, const fff_vector* Weights);
/*!
\brief Implementation of a Bayesian GMM model based on the Variational Bayes Model.
\param BGMM the structure that contains the Bayesian GMM
\param Label final data labelling
\param X data matrix
\param maxiter Maximal number of iterations
\param delta minimal relative increment of the likelihood to declare convergence
The Model comes from Penny, 2001 (research report).
Given a dataset X, and priors about cluster centers and precisions,
the function computes a GMM model (Centers,Precision,Weights)
*/
extern double fff_VB_gmm_estimate(fff_Bayesian_GMM* BGMM, fff_array *Label, const fff_matrix* X, const int maxiter, const double delta);
/*!
\brief Constructor for the Bayesian GMM structure
\param k : number of components
\param dim : the number of dimensions
*/
extern fff_Bayesian_GMM* fff_BGMM_new( const long k, const long dim );
/*!
\brief Destructor for the Bayesian GMM structure
\param thisone the BGMM to be deleted
*/
extern int fff_BGMM_delete( fff_Bayesian_GMM* thisone );
/*
\brief instantiation of the prior of the Bayesian GMM
\param BG the Bayesian GMM to be instantiated
\param prior_means the prior of the means
\param prior_means_scale scaling factor on the precision of the means
\param prior_precision prior precision in the wishart model
\param prior_dof number of degrees of freedom in the Wishart model
\param prior_weights prior on the weights in the Dirichlet model
Note that all the parameters are copied
*/
extern int fff_BGMM_set_priors(fff_Bayesian_GMM* BG, const fff_matrix * prior_means, const fff_vector *prior_means_scale, const fff_matrix * prior_precision, const fff_vector* prior_dof, const fff_vector *prior_weights );
/*
\brief Estimation of the BGMM using a Gibbs sampling technique
\param membership the average membership variable across iterations
\param BG the BGMM to be estimated
\param X the data used in the estimation of the model
\param niter the number of iterations in the sampling
\param method choice of a quick(purely normal) technique
This function performs the estimation of the model.
The membership (hidden) variable is randomly resampled at each step,
while the parameters of the model are recomputed analytically
for the sake of speed.
The number of iterations is niter for both the burn-in
and the estimation periods.
The average data membership across iterations is given in membership.
The BGMM is re-instantiated with the average parameters
during the estimation period.
*/
extern int fff_BGMM_Gibbs_estimation(fff_matrix* membership, fff_Bayesian_GMM* BG, const fff_matrix *X, const int niter, const int method);
/*
\brief Sampling of the BGMM using on a grid
\param density the average density on the given grid
\param BG the BGMM to be estimated
\param X the data used in the estimation of the model
\param grid the grid used in the estimation of the model
\param niter the number of iterations in the sampling
\param method choice of a quick(purely normal) technique
*/
extern int fff_BGMM_Gibbs_sampling(fff_vector* density, fff_Bayesian_GMM* BG, const fff_matrix *X, const fff_matrix *grid, const int niter, const int method);
/*
\brief Reading out the BGMM structure
\param means current estimate of the cluster centers
\param means_scale current precision on the means matrices
\param precisions current precision estimate
\param dof current dof of the Wishart model
\param weights current weights of the components
\param BG BGMM model.
Note that all the matrices are copied.
*/
extern int fff_BGMM_get_model( fff_matrix * means, fff_vector * means_scale, fff_matrix * precisions, fff_vector* dof, fff_vector * weights, const fff_Bayesian_GMM* BG);
/*
\brief Writing the BGMM structure
\param BG BGMM model.
\param means current estimate of the cluster centers
\param means_scale current precision on the means matrices
\param precisions current precision estimate
\param dof current dof of the Wishart model
\param weights current weights of the components
Note that all the matrices are copied.
*/
extern int fff_BGMM_set_model( fff_Bayesian_GMM* BG, const fff_matrix * means, const fff_vector * means_scale, const fff_matrix * precisions, const fff_vector* dof, const fff_vector * weights );
/*!
\brief Sampling of a Bayesian GMM model based on the Variational Bayes.
\param density the posterior
\param BG the structure that contains the Bayesian GMM
\param grid the sampling grid for the posterior
*/
extern int fff_BGMM_sampling(fff_vector * density, fff_Bayesian_GMM* BG, const fff_matrix *grid);
/*!
\brief Sampling of a Bayesian GMM model
\param proba the posterior as a (nbitem, BG->k) matrix, where nbitem = X->size1
\param X the sampling grid for the posterior
\param BG the structure that contains the Bayesian GMM
*/
extern double fff_WNpval(fff_matrix * proba, const fff_matrix *X, const fff_Bayesian_GMM* BG);
#ifdef __cplusplus
}
#endif
#endif