File: kmeans.cpp

package info (click to toggle)
r-cran-amap 0.8-20-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 620 kB
  • sloc: ansic: 905; cpp: 492; fortran: 327; php: 81; sh: 17; makefile: 2
file content (128 lines) | stat: -rw-r--r-- 3,907 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/*! \file : kmeans.c
 * 
 *
 * \brief  K-means clustering 
 *
 * \date Created       : before 2005
 * \date Last Modified : Time-stamp: <2014-03-01 12:51:00 antoine>
 *
 * \author R core team. Modified by A. Lucas for distance choice.
 *
 *  R : A Computer Language for Statistical Data Analysis
 *  Copyright (C) 2004   The R Development Core Team.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <R.h>
/*#include "modreg.h" */ /* for declarations for registration */ 
#include "kmeans.h" 

#include "distance_T.h" 

using namespace amap;

/** K-means clustering using Lloyd algorithm.
 * \brief compute k-nearest centroid of our dataset.
 * \param x matrix of size nxp: input data
 * \param pn nb of individual (pn=n)
 * \param pp number of observation by individual (pp=p)
 * \param cen matrix of size k*p: centroids
 * \param pk number of centroids (k)
 * \param cl vector of flag of size n
 * \param pmaxiter integer: maximum iteration
 * \param nc vector of size k: number of individuals in cluster k.
 * \param wss vector of size k: sum of square in each cluster.
 * \param method: which method to use.
 */
void kmeans_Lloyd2(double *x, int *pn, int *pp, double *cen, int *pk, int *cl, 
		  int *pmaxiter, int *nc, double *wss, int * method)
{
  /* n: nb of individuals 
   * k: nb of clusters
   * p: number of abservations by individuals
   * x: matrix of size nxp
   * cen: matrix of size kxp
   */
  int n = *pn, k = *pk, p = *pp, maxiter = *pmaxiter;
  int iter, i, j, c, it, inew = 0;
  double best, dd;
  Rboolean updated;
  distance_T<double>::T_tri opt;
  int  ierr[1];
  //double * data_tri;
  //int * order_tri;
  //int * rank_tri;

  matrice<double> dataMatrice (x, n,p);
  matrice<double> centroidMatrice (cen, k, p);
    
  if( (*method == distance_T<double>::SPEARMAN) ||  (*method == distance_T<double>::KENDALL))
    {
      opt.reset(p);
    }

  for(i = 0; i < n; i++) cl[i] = -1;
  for(iter = 0; iter < maxiter; iter++) {
    updated = FALSE;
    for(i = 0; i < n; i++) {
      /* find nearest centre for each point */
      best = R_PosInf;
      for(j = 0; j < k; j++) {
	vecteur<double> dataI = dataMatrice.getRow(i);
	vecteur<double> centroidJ = centroidMatrice.getRow(j);
	dd = distance_T<double>::distance_kms(dataI, centroidJ,method,ierr,opt);
	/*printf("| %f",dd);
	 */
	if(dd < best) {
	  best = dd;
	  inew = j+1;
	}
      }
      if(cl[i] != inew) {
	updated = TRUE;
	cl[i] = inew;
      }
    }
    if(!updated) break;
    /* update each centre */
    for(j = 0; j < k*p; j++) cen[j] = 0.0;
    for(j = 0; j < k; j++) nc[j] = 0;
    for(i = 0; i < n; i++) {
      it = cl[i] - 1; nc[it]++;
      for(c = 0; c < p; c++) cen[it+c*k] += x[i+c*n];
    }
    for(j = 0; j < k*p; j++) cen[j] /= nc[j % k];
  }

  *pmaxiter = iter + 1;
  /*    for(j = 0; j < k; j++) wss[j] = 0.0; */
  for(i = 0; i < n; i++) {
    it = cl[i] - 1;

    vecteur<double> dataI = dataMatrice.getRow(i);
    vecteur<double> centroidJ = centroidMatrice.getRow(it);

    wss[it] = distance_T<double>::distance_kms(dataI, centroidJ,method,ierr,opt);
    wss[it] = wss[it] * (wss[it]) ;
    /*
      for(c = 0; c < p; c++) {
      tmp = x[i+n*c] - cen[it+k*c];
      wss[it] += tmp * tmp;
      }*/
  }


}