1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
|
#include "Clustering.h"
#include <limits>
#include <boost/foreach.hpp>
#include <map>
using namespace std;
namespace AG
{
namespace Clustering{
Vect operator+ (Vect &a, Vect &b)
{
Vect c = a;
for (int i=0; i<c.size(); i++) c[i] += b[i];
return c;
}
void operator+= (Vect &a, const Vect b)
{
for(int i=0; i<a.size(); i++) a[i] += b[i];
}
void operator *= (Vect &a, const float b)
{
for(int i=0; i<a.size(); i++) a[i] *= b;
}
void operator /= (Vect &a, const float b)
{
for(int i=0; i<a.size(); i++) a[i] /= b;
}
bool diameterInf(Vect x, double maxDist, double *limits)
{
int dim = x.size();
double *newLimits = new double[dim*2];
memcpy(newLimits, limits, sizeof(double)*dim*2);
bool bInside = true;
for (int d=0; d < dim; d++)
{
double dLow = limits[2*d];
double dHigh = limits[2*d+1];
if(dLow > x[d] || dHigh < x[d])
{
bInside = false;
break;
}
}
if(bInside) return true;
bool bOk = true;
for (int d=0; d < dim; d++)
{
double xd = x[d];
double dLow = min(xd, limits[d*2]);
double dHigh = max(xd, limits[d*2+1]);
if(dHigh - dLow > maxDist)
{
bOk = false;
break;
}
newLimits[2*d] = dLow;
newLimits[2*d+1] = dHigh;
}
if(bOk) memcpy(limits, newLimits, sizeof(double)*dim*2);
delete [] newLimits;
return bOk;
}
bool diameter2(Vect x, double maxDist, std::vector<Vect> clust, Vect sum, int count)
{
int dim = x.size();
maxDist = maxDist*maxDist;
// we keep the center of the cluster
Vect center = sum + x;
center /= count + 1;
double dist = 0;
for (int d=0; d < dim; d++) dist += (x[d]-center[d])*(x[d]-center[d]);
if(dist > maxDist) return false;
for (int i=0; i<clust.size(); i++)
{
double dist = 0;
for (int d=0; d<dim; d++) dist += (clust[i][d]-center[d])*(clust[i][d]-center[d]);
if(dist > maxDist) return false;
}
return true;
}
Clusters qt_clustering(VectorSpace & vs, double max_diameter, int minCount)
{
Clusters clusters(vs.size(), std::numeric_limits<index>::max()); // assign all the vectors to no cluster
int assignedCount = 0;
int clusterId = 0;
int count = vs.size();
int dim = vs[0].size();
while(assignedCount < vs.size())
{
std::vector < std::vector<Vect> > clusts;
std::vector < std::vector<int> > clustsIdx;
clusts.resize(vs.size());
clustsIdx.resize(vs.size());
double *limits = new double[dim*2];
int skipped = 0;
int docId_i = 0;
BOOST_FOREACH(Vect d_i, vs) // foreach document
{
if (clusters[docId_i] != std::numeric_limits<index>::max())
{
skipped++;
docId_i++;
continue; // sample taken already
}
for(int d=0; d<dim; d++)
{
limits[2*d] = d_i[d];
limits[2*d+1] = d_i[d];
}
std::vector<Vect> &clust = clusts[docId_i];
std::vector<int> &clustIdx = clustsIdx[docId_i];
clust.push_back(d_i);
clustIdx.push_back(docId_i);
for(index docId_j = 0; docId_j < vs.size(); docId_j++)
{
if(docId_j == docId_i) continue;
if (clusters[docId_j] != std::numeric_limits<index>::max()) continue; // sample taken already
if(diameterInf(vs[docId_j], max_diameter, limits))
{
clust.push_back(vs[docId_j]);
clustIdx.push_back(docId_j);
}
}
docId_i++;
} // foreach document
delete [] limits;
// find the cluster with the maximum count
int maxIndex = 0, maxCnt = 0;
for(int i=0; i<clustsIdx.size(); i++)
{
if(maxCnt < clustsIdx[i].size())
{
maxIndex = i;
maxCnt = clustsIdx[i].size();
}
}
std::cout << "maximum cluster: " << maxIndex << " with " << maxCnt << " samples"<< std::endl;
if(maxCnt < minCount) break;
for(int i=0; i<clustsIdx[maxIndex].size(); i++)
{
int index = clustsIdx[maxIndex][i];
clusters[index] = clusterId;
}
clusterId++;
assignedCount += maxCnt;
}
return clusters;
};
}; /* Clustering */
};
|