1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
|
/*! \file : kmeans.c
*
*
* \brief K-means clustering
*
* \date Created : before 2005
* \date Last Modified : Time-stamp: <2014-03-01 12:51:00 antoine>
*
* \author R core team. Modified by A. Lucas for distance choice.
*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 2004 The R Development Core Team.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <R.h>
/*#include "modreg.h" */ /* for declarations for registration */
#include "kmeans.h"
#include "distance_T.h"
using namespace amap;
/** K-means clustering using Lloyd's algorithm with a selectable distance.
 *
 * \brief Assign every individual to its nearest centroid, then recompute
 * each centroid as the mean of its members, until no assignment changes
 * or the iteration budget is exhausted.
 *
 * Both matrices are addressed column-major by this function: element (i, c)
 * of \p x is x[i + c*n] and element (j, c) of \p cen is cen[j + c*k].
 *
 * \param x        matrix of size n x p: input data (read only here)
 * \param pn       number of individuals (*pn = n)
 * \param pp       number of observations per individual (*pp = p)
 * \param cen      matrix of size k x p: initial centroids on entry,
 *                 overwritten with the updated centroids
 * \param pk       number of centroids (*pk = k)
 * \param cl       output vector of size n: 1-based index of the cluster
 *                 each individual is assigned to. A value <= 0 means the
 *                 individual never received an assignment.
 * \param pmaxiter in: maximum number of iterations; out: iter + 1, where
 *                 iter is the 0-based index of the pass on which the
 *                 assignment stopped changing (so the value exceeds the
 *                 input budget when the loop ran out of iterations)
 * \param nc       output vector of size k: number of individuals in each
 *                 cluster, written during the centroid update step
 * \param wss      output vector of size k: for each cluster, the sum over
 *                 its members of the squared value returned by
 *                 distance_T<double>::distance_kms
 * \param method   distance selector forwarded to distance_kms; the
 *                 SPEARMAN and KENDALL values additionally trigger
 *                 opt.reset(p) before the first distance is computed
 */
void kmeans_Lloyd2(double *x, int *pn, int *pp, double *cen, int *pk, int *cl,
                   int *pmaxiter, int *nc, double *wss, int * method)
{
    /* n: nb of individuals
     * k: nb of clusters
     * p: number of observations by individual
     * x: matrix of size nxp
     * cen: matrix of size kxp
     */
    int n = *pn, k = *pk, p = *pp, maxiter = *pmaxiter;
    int iter, i, j, c, it, inew = 0;
    double best, dd;
    Rboolean updated;
    distance_T<double>::T_tri opt;
    /* Error flag handed to distance_kms; it is not inspected here.
     * Zero-initialised so an indeterminate value is never passed down. */
    int ierr[1] = {0};

    matrice<double> dataMatrice (x, n, p);
    matrice<double> centroidMatrice (cen, k, p);

    /* NOTE(review): presumably sizes the work buffers the rank-based
     * distances need -- confirm against distance_T.h. */
    if( (*method == distance_T<double>::SPEARMAN) ||
        (*method == distance_T<double>::KENDALL))
    {
        opt.reset(p);
    }

    /* -1 marks "not assigned yet"; it guarantees the first pass reports a
     * change for every point that finds a centre. */
    for(i = 0; i < n; i++) cl[i] = -1;

    for(iter = 0; iter < maxiter; iter++) {
        updated = FALSE;
        for(i = 0; i < n; i++) {
            /* find nearest centre for each point */
            best = R_PosInf;
            /* NOTE(review): inew is not reset per point, so a point for
             * which no distance compares below +Inf (e.g. NaN) inherits
             * the previous point's cluster, or 0 if it is the first. */
            for(j = 0; j < k; j++) {
                vecteur<double> dataI = dataMatrice.getRow(i);
                vecteur<double> centroidJ = centroidMatrice.getRow(j);
                dd = distance_T<double>::distance_kms(dataI, centroidJ, method, ierr, opt);
                if(dd < best) {
                    best = dd;
                    inew = j+1;   /* cluster labels are 1-based */
                }
            }
            if(cl[i] != inew) {
                updated = TRUE;
                cl[i] = inew;
            }
        }
        if(!updated) break;

        /* update each centre: accumulate member coordinates, then divide
         * by the member count */
        for(j = 0; j < k*p; j++) cen[j] = 0.0;
        for(j = 0; j < k; j++) nc[j] = 0;
        for(i = 0; i < n; i++) {
            it = cl[i] - 1;
            /* Skip points that received no valid cluster; indexing with a
             * negative value would write before the start of nc and cen. */
            if(it < 0) continue;
            nc[it]++;
            for(c = 0; c < p; c++) cen[it+c*k] += x[i+c*n];
        }
        /* j % k recovers the cluster index of a column-major cen entry.
         * An empty cluster gives 0.0 / 0 here (NaN under IEEE arithmetic). */
        for(j = 0; j < k*p; j++) cen[j] /= nc[j % k];
    }

    *pmaxiter = iter + 1;

    /* Within-cluster sum of squared distances. The accumulators are reset
     * first so the result does not depend on what the caller passed in. */
    for(j = 0; j < k; j++) wss[j] = 0.0;
    for(i = 0; i < n; i++) {
        it = cl[i] - 1;
        /* Unassigned points (e.g. maxiter <= 0 leaves every cl[i] at -1)
         * contribute nothing and must not be used as an index. */
        if(it < 0) continue;
        vecteur<double> dataI = dataMatrice.getRow(i);
        vecteur<double> centroidJ = centroidMatrice.getRow(it);
        dd = distance_T<double>::distance_kms(dataI, centroidJ, method, ierr, opt);
        wss[it] += dd * dd;
    }
}
|