File: upgma2.cpp

package info (click to toggle)
muscle 3.52-2
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 1,196 kB
  • ctags: 1,763
  • sloc: cpp: 21,335; xml: 185; makefile: 104
file content (395 lines) | stat: -rw-r--r-- 10,414 bytes parent folder | download | duplicates (14)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
#include "muscle.h"
#include "tree.h"
#include "distcalc.h"

// UPGMA clustering in O(N^2) time and space.

#define	TRACE	0

#define	MIN(x, y)	((x) < (y) ? (x) : (y))
#define	MAX(x, y)	((x) > (y) ? (x) : (y))
#define	AVG(x, y)	(((x) + (y))/2)

static unsigned g_uLeafCount;
static unsigned g_uTriangleSize;
static unsigned g_uInternalNodeCount;
static unsigned g_uInternalNodeIndex;

// Triangular distance matrix is g_Dist, which is allocated
// as a one-dimensional vector of length g_uTriangleSize.
// TriangleSubscript(i,j) maps row,column=i,j to the subscript
// into this vector.
// Row / column coordinates are a bit messy.
// Initially they are leaf indexes 0..N-1.
// But each time we create a new node (=new cluster, new subtree),
// we re-use one of the two rows that become available (the children
// of the new node). This saves memory.
// We keep track of this through the g_uNodeIndex vector.
static dist_t *g_Dist;

// Distance to nearest neighbor in row i of distance matrix.
// Subscript is distance matrix row.
static dist_t *g_MinDist;

// Nearest neighbor to row i of distance matrix.
// Subscript is distance matrix row.
static unsigned *g_uNearestNeighbor;

// Node index of row i in distance matrix.
// Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes.
// Subscript is distance matrix row.
static unsigned *g_uNodeIndex;

// The following vectors are defined on internal nodes,
// subscripts are internal node index 0..N-2.
// For g_uLeft/Right, value is the node index 0 .. 2N-2
// because a child can be internal or leaf.
static unsigned *g_uLeft;
static unsigned *g_uRight;
static dist_t *g_Height;
static dist_t *g_LeftLength;
static dist_t *g_RightLength;

static inline unsigned TriangleSubscript(unsigned uIndex1, unsigned uIndex2)
	{
#if	DEBUG
	if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount)
		Quit("TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount);
#endif
	unsigned v;
	if (uIndex1 >= uIndex2)
		v = uIndex2 + (uIndex1*(uIndex1 - 1))/2;
	else
		v = uIndex1 + (uIndex2*(uIndex2 - 1))/2;
	assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2);
	return v;
	}

static void ListState()
	{
	Log("Dist matrix\n");
	Log("     ");
	for (unsigned i = 0; i < g_uLeafCount; ++i)
		{
		if (uInsane == g_uNodeIndex[i])
			continue;
		Log("  %5u", g_uNodeIndex[i]);
		}
	Log("\n");

	for (unsigned i = 0; i < g_uLeafCount; ++i)
		{
		if (uInsane == g_uNodeIndex[i])
			continue;
		Log("%5u  ", g_uNodeIndex[i]);
		for (unsigned j = 0; j < g_uLeafCount; ++j)
			{
			if (uInsane == g_uNodeIndex[j])
				continue;
			if (i == j)
				Log("       ");
			else
				{
				unsigned v = TriangleSubscript(i, j);
				Log("%5.2g  ", g_Dist[v]);
				}
			}
		Log("\n");
		}

	Log("\n");
	Log("    i   Node   NrNb      Dist\n");
	Log("-----  -----  -----  --------\n");
	for (unsigned i = 0; i < g_uLeafCount; ++i)
		{
		if (uInsane == g_uNodeIndex[i])
			continue;
		Log("%5u  %5u  %5u  %8.3f\n",
		  i,
		  g_uNodeIndex[i],
		  g_uNearestNeighbor[i],
		  g_MinDist[i]);
		}

	Log("\n");
	Log(" Node      L      R  Height  LLength  RLength\n");
	Log("-----  -----  -----  ------  -------  -------\n");
	for (unsigned i = 0; i <= g_uInternalNodeIndex; ++i)
		Log("%5u  %5u  %5u  %6.2g  %6.2g  %6.2g\n",
		  i,
		  g_uLeft[i],
		  g_uRight[i],
		  g_Height[i],
		  g_LeftLength[i],
		  g_RightLength[i]);
	}

void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage)
	{
	g_uLeafCount = DC.GetCount();

	g_uTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2;
	g_uInternalNodeCount = g_uLeafCount - 1;

	g_Dist = new dist_t[g_uTriangleSize];

	g_uNodeIndex = new unsigned[g_uLeafCount];
	g_uNearestNeighbor = new unsigned[g_uLeafCount];
	g_MinDist = new dist_t[g_uLeafCount];
	unsigned *Ids = new unsigned [g_uLeafCount];
	char **Names = new char *[g_uLeafCount];

	g_uLeft = new unsigned[g_uInternalNodeCount];
	g_uRight = new unsigned[g_uInternalNodeCount];
	g_Height = new dist_t[g_uInternalNodeCount];
	g_LeftLength = new dist_t[g_uInternalNodeCount];
	g_RightLength = new dist_t[g_uInternalNodeCount];

	for (unsigned i = 0; i < g_uLeafCount; ++i)
		{
		g_MinDist[i] = BIG_DIST;
		g_uNodeIndex[i] = i;
		g_uNearestNeighbor[i] = uInsane;
		Ids[i] = DC.GetId(i);
		Names[i] = strsave(DC.GetName(i));
		}

	for (unsigned i = 0; i < g_uInternalNodeCount; ++i)
		{
		g_uLeft[i] = uInsane;
		g_uRight[i] = uInsane;
		g_LeftLength[i] = BIG_DIST;
		g_RightLength[i] = BIG_DIST;
		g_Height[i] = BIG_DIST;
		}

// Compute initial NxN triangular distance matrix.
// Store minimum distance for each full (not triangular) row.
// Loop from 1, not 0, because "row" is 0, 1 ... i-1,
// so nothing to do when i=0.
	for (unsigned i = 1; i < g_uLeafCount; ++i)
		{
		dist_t *Row = g_Dist + TriangleSubscript(i, 0);
		DC.CalcDistRange(i, Row);
		for (unsigned j = 0; j < i; ++j)
			{
			const dist_t d = Row[j];
			if (d < g_MinDist[i])
				{
				g_MinDist[i] = d;
				g_uNearestNeighbor[i] = j;
				}
			if (d < g_MinDist[j])
				{
				g_MinDist[j] = d;
				g_uNearestNeighbor[j] = i;
				}
			}
		}

#if	TRACE
	Log("Initial state:\n");
	ListState();
#endif

	for (g_uInternalNodeIndex = 0; g_uInternalNodeIndex < g_uLeafCount - 1;
	  ++g_uInternalNodeIndex)
		{
#if	TRACE
		Log("\n");
		Log("Internal node index %5u\n", g_uInternalNodeIndex);
		Log("-------------------------\n");
#endif

	// Find nearest neighbors
		unsigned Lmin = uInsane;
		unsigned Rmin = uInsane;
		dist_t dtMinDist = BIG_DIST;
		for (unsigned j = 0; j < g_uLeafCount; ++j)
			{
			if (uInsane == g_uNodeIndex[j])
				continue;

			dist_t d = g_MinDist[j];
			if (d < dtMinDist)
				{
				dtMinDist = d;
				Lmin = j;
				Rmin = g_uNearestNeighbor[j];
				assert(uInsane != Rmin);
				assert(uInsane != g_uNodeIndex[Rmin]);
				}
			}

		assert(Lmin != uInsane);
		assert(Rmin != uInsane);
		assert(dtMinDist != BIG_DIST);

#if	TRACE
		Log("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n",
		  Lmin,
		  g_uNodeIndex[Lmin],
		  Rmin,
		  g_uNodeIndex[Rmin],
		  dtMinDist);
#endif

	// Compute distances to new node
	// New node overwrites row currently assigned to Lmin
		dist_t dtNewMinDist = BIG_DIST;
		unsigned uNewNearestNeighbor = uInsane;
		for (unsigned j = 0; j < g_uLeafCount; ++j)
			{
			if (j == Lmin || j == Rmin)
				continue;
			if (uInsane == g_uNodeIndex[j])
				continue;

			const unsigned vL = TriangleSubscript(Lmin, j);
			const unsigned vR = TriangleSubscript(Rmin, j);
			const dist_t dL = g_Dist[vL];
			const dist_t dR = g_Dist[vR];
			dist_t dtNewDist;

			switch (Linkage)
				{
			case LINKAGE_Avg:
				dtNewDist = AVG(dL, dR);
				break;

			case LINKAGE_Min:
				dtNewDist = MIN(dL, dR);
				break;

			case LINKAGE_Max:
				dtNewDist = MAX(dL, dR);
				break;

			case LINKAGE_Biased:
				dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR);
				break;

			default:
				Quit("UPGMA2: Invalid LINKAGE_%u", Linkage);
				}

		// Nasty special case.
		// If nearest neighbor of j is Lmin or Rmin, then make the new
		// node (which overwrites the row currently occupied by Lmin)
		// the nearest neighbor. This situation can occur when there are
		// equal distances in the matrix. If we don't make this fix,
		// the nearest neighbor pointer for j would become invalid.
		// (We don't need to test for == Lmin, because in that case
		// the net change needed is zero due to the change in row
		// numbering).
			if (g_uNearestNeighbor[j] == Rmin)
				g_uNearestNeighbor[j] = Lmin;

#if	TRACE
			Log("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n",
			  j, Lmin, dL, Rmin, dR, dtNewDist);
#endif
			g_Dist[vL] = dtNewDist;
			if (dtNewDist < dtNewMinDist)
				{
				dtNewMinDist = dtNewDist;
				uNewNearestNeighbor = j;
				}
			}

		assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist);
		assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor);

		const unsigned v = TriangleSubscript(Lmin, Rmin);
		const dist_t dLR = g_Dist[v];
		const dist_t dHeightNew = dLR/2;
		const unsigned uLeft = g_uNodeIndex[Lmin];
		const unsigned uRight = g_uNodeIndex[Rmin];
		const dist_t HeightLeft =
		  uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount];
		const dist_t HeightRight =
		  uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount];

		g_uLeft[g_uInternalNodeIndex] = uLeft;
		g_uRight[g_uInternalNodeIndex] = uRight;
		g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft;
		g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight;
		g_Height[g_uInternalNodeIndex] = dHeightNew;

	// Row for left child overwritten by row for new node
		g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex;
		g_uNearestNeighbor[Lmin] = uNewNearestNeighbor;
		g_MinDist[Lmin] = dtNewMinDist;

	// Delete row for right child
		g_uNodeIndex[Rmin] = uInsane;

#if	TRACE
		Log("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n",
		  g_uInternalNodeIndex, Lmin, Rmin);
		ListState();
#endif
		}

	unsigned uRoot = g_uLeafCount - 2;
	tree.Create(g_uLeafCount, uRoot, g_uLeft, g_uRight, g_LeftLength, g_RightLength,
	  Ids, Names);

#if	TRACE
	tree.LogMe();
#endif

	delete[] g_Dist;

	delete[] g_uNodeIndex;
	delete[] g_uNearestNeighbor;
	delete[] g_MinDist;
	delete[] g_Height;

	delete[] g_uLeft;
	delete[] g_uRight;
	delete[] g_LeftLength;
	delete[] g_RightLength;
	
	for (unsigned i = 0; i < g_uLeafCount; ++i)
		free(Names[i]);
	delete[] Names;
	delete[] Ids;
	}

class DistCalcTest : public DistCalc
	{
	virtual void CalcDistRange(unsigned i, dist_t Dist[]) const
		{
		static dist_t TestDist[5][5] =
			{
			0,  2, 14, 14, 20,
			2,  0, 14, 14, 20,
			14, 14,  0,  4, 20,
			14, 14,  4,  0, 20,
			20, 20, 20, 20,  0,
			};
		for (unsigned j = 0; j < i; ++j)
			Dist[j] = TestDist[i][j];
		}
	virtual unsigned GetCount() const
		{
		return 5;
		}
	virtual unsigned GetId(unsigned i) const
		{
		return i;
		}
	virtual const char *GetName(unsigned i) const
		{
		return "name";
		}
	};

void Test()
	{
	SetListFileName("c:\\tmp\\lobster.log", false);
	DistCalcTest DC;
	Tree tree;
	UPGMA2(DC, tree, LINKAGE_Avg);
	}