1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
|
/* Generalized single linkage clustering.
*
* Contents:
* 1. Single linkage clustering, generalized
* 2. Unit tests
* 3. Test driver
* 4. Example
*/
#include "esl_config.h"
#include <stdlib.h>
#include "easel.h"
#include "esl_cluster.h"
/*****************************************************************
* 1. Single linkage clustering, generalized
*****************************************************************/
/* Function: esl_cluster_SingleLinkage()
* Synopsis: Generalized single linkage clustering.
* Incept: SRE, Mon Jan 7 08:35:10 2008 [Janelia]
*
* Purpose: Given a set of vertices, cluster them by single-linkage
* clustering.
*
* The data describing each vertex is provided in an array
* starting at <base>, consisting of <n> vertices. Each
* vertex can be of any type (structure, scalar, pointer)
* so long as each vertex element is of fixed size <n>
* bytes.
*
* A pointer to the clustering function is provided in
* <(*linkfunc)()>, and a pointer to any necessary
* parameters for that function (for example, any
* thresholds) is provided in <param>.
*
* The <int (*linkfunc)()> must be written by the
* caller. It takes arguments <(void *v1, void *v2, void
* *param, int *ret_link)>: pointers to two vertices to
* test for linkage and a pointer to any necessary
* parameters, and it passes the answer <TRUE> (1) or
* <FALSE> (0) back in <*ret_link>. The <(*linkfunc)()>
* returns <eslOK> (0) on success, and a nonzero error code
* on failure (see <easel.h> for a list of Easel's error
* codes).
*
* The caller provides an allocated <workspace> with space
* for at least <2n> integers. (Allocation in the caller
* allows the caller to reuse memory and save
* allocation/free cycles, if it has many rounds of
* clustering to do.)
*
* The caller also provides allocated space in
* <assignments> for <n> integers which, upon successful
* return, contains assignments of the <0..n-1> vertices to
* <0..C-1> clusters. That is, if <assignments[42] = 1>,
* that means vertex 42 is assigned to cluster 1. The
* total number of clusters is returned in <ret_C>.
*
* The algorithm runs in $O(N)$ memory; importantly, it
* does not require a $O(N^2)$ adjacency matrix. Worst case
* time complexity is $O(N^2)$ (multiplied by any
* additional complexity in the <(*linkfunc()> itself), but
* the worst case (no links at all; <C=n> clusters) should
* be unusual. More typically, time scales as about $N \log
* N$. Best case is $N$, for a completely connected graph
* in which all vertices group into one cluster. (More
* precisely, best case complexity arises when vertex 0 is
* connected to all other <n-1> vertices.)
*
* Notes: I don't know if this algorithm is published. I
* haven't seen it in graph theory books, but that might
* be because it's so obvious that nobody's bothered.
*
* In brief, we're going to do a breadth-first search of the
* graph, and we're going to calculate links on the fly
* rather than precalculating them into a standard adjacency
* matrix.
*
* While working, we keep two stacks of maximum length N:
* a : list of vertices that are still unconnected.
* b : list of vertices that we've connected to
* in our current breadth level, but we haven't
* yet tested for other connections to a.
* The current length (number of elements in) a and b are
* kept in na, nb.
*
* We store our results in an array of length N:
* c : assigns each vertex to a component. for example
* c[4] = 1 means that vertex 4 is in component 1.
* nc is the number of components. Components
* are numbered from 0 to nc-1. We return c and nc
* to our caller.
*
* The algorithm is:
*
* Initialisation:
* a <-- all the vertices
* na <-- N
* b <-- empty set
* nb <-- 0
* nc <-- 0
*
* Then:
* while (a is not empty)
* pop a vertex off a, push onto b
* while (b is not empty)
* pop vertex v off b
* assign c[v] = nc
* for each vertex w in a:
* compare v,w. If w is linked to v, remove w
* from a, push onto b.
* nc++
* q.e.d.
*
* Args: base - pointer to array of n fixed-size vertices to be clustered.
* n - number of vertices
* size - size of each vertex element
* linkfunc - pointer to caller's function for defining linked pairs
* param - pointer to any data that needs to be provided to <(*linkfunc)>
* workspace - caller provides at least 2n*sizeof(int) of workspace
* assignments - RETURN: assignments to clusters (caller provides n*sizeof(int) space)
* ret_C - RETURN: number of clusters
*
* Returns: <eslOK> on success; <assignments[0..n-1]> contains cluster assigments
* <0..C-1> for each vertex, and <*ret_C> contains the number of clusters
* <C>
*
* Throws: status codes from the caller's <(*linkfunc)> on failure; in this case,
* the contents of <*assignments> is undefined, and <*ret_C> is 0.
*/
int
esl_cluster_SingleLinkage(void *base, size_t n, size_t size,
int (*linkfunc)(const void *, const void *, const void *, int *), void *param,
int *workspace, int *assignments, int *ret_C)
{
int na, *a = NULL; /* stack of available vertices (still unconnected) */
int nb, *b = NULL; /* stack of connected but unextended vertices */
int nc, *c = NULL; /* array of results: # clusters, assignments to clusters */
int v,w; /* indices of vertices */
int i; /* counter over the available list */
int do_link;
int status;
a = workspace;
b = workspace + n;
c = assignments;
for (v = 0; v < n; v++) a[v] = n-v-1; /* initialize by pushing all vertices onto available list (backwards) */
na = n;
nb = 0;
nc = 0;
while (na > 0) /* while vertices remain unexamined or unclustered: */
{
v = a[na-1]; na--; /* pop a vertex off a, */
b[nb] = v; nb++; /* and push it onto b */
while (nb > 0) /* while vertices remain unextended: */
{
v = b[nb-1]; nb--; /* pop vertex off b */
c[v] = nc; /* assign it to cluster nc */
for (i = na-1; i >= 0; i--) /* backwards, because of deletion/swapping we do*/
{
if ((status = (*linkfunc)( (char *) base + v*size, (char *) base + a[i]*size, param, &do_link)) != eslOK) goto ERROR;
if (do_link)
{
w = a[i]; a[i] = a[na-1]; na--; /* delete w from a */
b[nb] = w; nb++; /* and push it onto b */
}
}
}
nc++;
}
*ret_C = nc;
return eslOK;
ERROR:
*ret_C = 0;
return status;
}
/*------------------ end, single linkage clustering -------------*/
/*****************************************************************
* 2. Unit tests
*****************************************************************/
#ifdef eslCLUSTER_TESTDRIVE
#include <math.h>
static int
test_linkage_definition(const void *v1, const void *v2, const void *param, int *ret_link)
{
double a = *((double *) v1); /* you have to cast a void ptr before you can dereference it */
double b = *((double *) v2);
double threshold = *((double *) param);
*ret_link = ((fabs(a-b) <= threshold) ? TRUE : FALSE);
return eslOK;
}
static void
utest_singlelinkage(double *testdata, int n, double threshold, int *correct_assignment, int correct_C)
{
int *workspace;
int *assignment;
int C;
int v;
if ((workspace = malloc(sizeof(int) * n * 2)) == NULL) esl_fatal("allocation failed");
if ((assignment = malloc(sizeof(int) * n)) == NULL) esl_fatal("allocation failed");
if (esl_cluster_SingleLinkage(testdata, n, sizeof(double),
test_linkage_definition, &threshold,
workspace, assignment, &C) != eslOK) esl_fatal("single linkage clustering failed");
if (C != correct_C) esl_fatal("expected %d clusters, but got %d\n", correct_C, C);
for (v = 0; v < n; v++)
if (correct_assignment[v] != assignment[v])
esl_fatal("expected vertex %d to be in cluster %d, but it's in %d\n", v, correct_assignment[v], assignment[v]);
free(workspace);
free(assignment);
}
#endif /* eslCLUSTER_TESTDRIVE */
/*****************************************************************
* 3. Test driver
*****************************************************************/
#ifdef eslCLUSTER_TESTDRIVE
/* gcc -g -Wall -o test -I. -L. -DeslCLUSTER_TESTDRIVE esl_cluster.c -leasel -lm
*/
#include "esl_config.h"
#include <stdio.h>
#include <math.h>
#include "easel.h"
#include "esl_getopts.h"
#include "esl_cluster.h"
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "[-options]";
static char banner[] = "test driver for cluster module";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
double vertex[] = { 1.0, 2.0, 4.0, 5.0, 7.0, 8.0 };
int na1 = 3, a1[] = { 0, 0, 1, 1, 2, 2 }; /* correct answer when threshold = 1.5 */
int na2 = 6, a2[] = { 0, 1, 2, 3, 4, 5 }; /* correct answer when threshold < 1.0 */
int na3 = 1, a3[] = { 0, 0, 0, 0, 0, 0 }; /* correct answer when threshold > 2.0 */
int n = sizeof(vertex) / sizeof(double);
utest_singlelinkage(vertex, n, 1.5, a1, na1);
utest_singlelinkage(vertex, n, 0.5, a2, na2);
utest_singlelinkage(vertex, n, 2.5, a3, na3);
esl_getopts_Destroy(go);
return 0;
}
#endif /* eslCLUSTER_TESTDRIVE*/
/*****************************************************************
* 4. Example
*****************************************************************/
#ifdef eslCLUSTER_EXAMPLE
/*::cexcerpt::cluster_example::begin::*/
/* gcc -g -Wall -o example -I. -L. -DeslCLUSTER_EXAMPLE esl_cluster.c easel.c -lm */
#include "esl_config.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "easel.h"
#include "esl_cluster.h"
static int
my_linkage_definition(const void *v1, const void *v2, const void *param, int *ret_link)
{
double a = *((double *) v1); /* you have to cast a void ptr before you can dereference it */
double b = *((double *) v2);
double threshold = *((double *) param);
*ret_link = ((fabs(a-b) <= threshold) ? TRUE : FALSE);
return eslOK;
}
int
main(int argc, char **argv)
{
double vertex[] = { 1.0, 2.0, 4.0, 5.0, 7.0, 8.0 };
int n = sizeof(vertex) / sizeof(double);
double threshold = 1.5;
int *workspace;
int *assignment;
int C;
int v;
workspace = malloc(sizeof(int) * n * 2);
assignment = malloc(sizeof(int) * n);
esl_cluster_SingleLinkage(vertex, n, sizeof(double),
my_linkage_definition, &threshold,
workspace, assignment, &C);
printf("There are %d clusters.\n", C);
for (v = 0; v < n; v++) printf("vertex %d is in cluster %d\n", v, assignment[v]);
free(workspace);
free(assignment);
return 0;
}
/*::cexcerpt::cluster_example::end::*/
#endif /*eslCLUSTER_EXAMPLE*/
|