File: kd_split.cpp

package info (click to toggle)
mldemos 0.5.1-3
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 32,224 kB
  • ctags: 46,525
  • sloc: cpp: 306,887; ansic: 167,718; ml: 126; sh: 109; makefile: 2
file content (428 lines) | stat: -rw-r--r-- 17,306 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
//----------------------------------------------------------------------
// File:			kd_split.cpp
// Programmer:		Sunil Arya and David Mount
// Description:		Methods for splitting kd-trees
// Last modified:	01/04/05 (Version 1.0)
//----------------------------------------------------------------------
// Copyright (c) 1997-2005 University of Maryland and Sunil Arya and
// David Mount.  All Rights Reserved.
// 
// This software and related documentation is part of the Approximate
// Nearest Neighbor Library (ANN).  This software is provided under
// the provisions of the Lesser GNU Public License (LGPL).  See the
// file ../ReadMe.txt for further information.
// 
// The University of Maryland (U.M.) and the authors make no
// representations about the suitability or fitness of this software for
// any purpose.  It is provided "as is" without express or implied
// warranty.
//----------------------------------------------------------------------
// History:
//	Revision 0.1  03/04/98
//		Initial release
//	Revision 1.0  04/01/05
//----------------------------------------------------------------------

#include "kd_tree.h"					// kd-tree definitions
#include "kd_util.h"					// kd-tree utilities
#include "kd_split.h"					// splitting functions

//----------------------------------------------------------------------
//	Constants
//----------------------------------------------------------------------

const double ERR = 0.001;				// a small value
const double FS_ASPECT_RATIO = 3.0;		// maximum allowed aspect ratio
										// in fair split. Must be >= 2.

//----------------------------------------------------------------------
//	kd_split - Bentley's standard splitting routine for kd-trees
//		Find the dimension of the greatest spread, and split
//		just before the median point along this dimension.
//----------------------------------------------------------------------

void kd_split(
	ANNpointArray		pa,				// point array (permuted on return)
	ANNidxArray			pidx,			// point indices
	const ANNorthRect	&bnds,			// bounding rectangle for cell
	int					n,				// number of points
	int					dim,			// dimension of space
	int					&cut_dim,		// cutting dimension (returned)
	ANNcoord			&cut_val,		// cutting value (returned)
	int					&n_lo)			// num of points on low side (returned)
{
										// find dimension of maximum spread
	cut_dim = annMaxSpread(pa, pidx, n, dim);
	n_lo = n/2;							// median rank
										// split about median
	annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo);
}

//----------------------------------------------------------------------
//	midpt_split - midpoint splitting rule for box-decomposition trees
//
//		This is the simplest splitting rule that guarantees boxes
//		of bounded aspect ratio.  It simply cuts the box with the
//		longest side through its midpoint.  If there are ties, it
//		selects the dimension with the maximum point spread.
//
//		WARNING: This routine (while simple) doesn't seem to work
//		well in practice in high dimensions, because it tends to
//		generate a large number of trivial and/or unbalanced splits.
//		Either kd_split(), sl_midpt_split(), or fair_split() are
//		recommended, instead.
//----------------------------------------------------------------------

void midpt_split(
	ANNpointArray		pa,				// point array
	ANNidxArray			pidx,			// point indices (permuted on return)
	const ANNorthRect	&bnds,			// bounding rectangle for cell
	int					n,				// number of points
	int					dim,			// dimension of space
	int					&cut_dim,		// cutting dimension (returned)
	ANNcoord			&cut_val,		// cutting value (returned)
	int					&n_lo)			// num of points on low side (returned)
{
	int d;

	ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
	for (d = 1; d < dim; d++) {			// find length of longest box side
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
		if (length > max_length) {
			max_length = length;
		}
	}
	ANNcoord max_spread = -1;			// find long side with most spread
	for (d = 0; d < dim; d++) {
										// is it among longest?
		if (double(bnds.hi[d] - bnds.lo[d]) >= (1-ERR)*max_length) {
										// compute its spread
			ANNcoord spr = annSpread(pa, pidx, n, d);
			if (spr > max_spread) {		// is it max so far?
				max_spread = spr;
				cut_dim = d;
			}
		}
	}
										// split along cut_dim at midpoint
	cut_val = (bnds.lo[cut_dim] + bnds.hi[cut_dim]) / 2;
										// permute points accordingly
	int br1, br2;
	annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
	//------------------------------------------------------------------
	//	On return:		pa[0..br1-1] < cut_val
	//					pa[br1..br2-1] == cut_val
	//					pa[br2..n-1] > cut_val
	//
	//	We can set n_lo to any value in the range [br1..br2].
	//	We choose split so that points are most evenly divided.
	//------------------------------------------------------------------
	if (br1 > n/2) n_lo = br1;
	else if (br2 < n/2) n_lo = br2;
	else n_lo = n/2;
}

//----------------------------------------------------------------------
//	sl_midpt_split - sliding midpoint splitting rule
//
//		This is a modification of midpt_split, which has the nonsensical
//		name "sliding midpoint".  The idea is that we try to use the
//		midpoint rule, by bisecting the longest side.  If there are
//		ties, the dimension with the maximum spread is selected.  If,
//		however, the midpoint split produces a trivial split (no points
//		on one side of the splitting plane) then we slide the splitting
//		(maintaining its orientation) until it produces a nontrivial
//		split. For example, if the splitting plane is along the x-axis,
//		and all the data points have x-coordinate less than the x-bisector,
//		then the split is taken along the maximum x-coordinate of the
//		data points.
//
//		Intuitively, this rule cannot generate trivial splits, and
//		hence avoids midpt_split's tendency to produce trees with
//		a very large number of nodes.
//
//----------------------------------------------------------------------

void sl_midpt_split(
	ANNpointArray		pa,				// point array
	ANNidxArray			pidx,			// point indices (permuted on return)
	const ANNorthRect	&bnds,			// bounding rectangle for cell
	int					n,				// number of points
	int					dim,			// dimension of space
	int					&cut_dim,		// cutting dimension (returned)
	ANNcoord			&cut_val,		// cutting value (returned)
	int					&n_lo)			// num of points on low side (returned)
{
	int d;

	ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
	for (d = 1; d < dim; d++) {			// find length of longest box side
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
		if (length > max_length) {
			max_length = length;
		}
	}
	ANNcoord max_spread = -1;			// find long side with most spread
	for (d = 0; d < dim; d++) {
										// is it among longest?
		if ((bnds.hi[d] - bnds.lo[d]) >= (1-ERR)*max_length) {
										// compute its spread
			ANNcoord spr = annSpread(pa, pidx, n, d);
			if (spr > max_spread) {		// is it max so far?
				max_spread = spr;
				cut_dim = d;
			}
		}
	}
										// ideal split at midpoint
	ANNcoord ideal_cut_val = (bnds.lo[cut_dim] + bnds.hi[cut_dim])/2;

	ANNcoord min, max;
	annMinMax(pa, pidx, n, cut_dim, min, max);	// find min/max coordinates

	if (ideal_cut_val < min)			// slide to min or max as needed
		cut_val = min;
	else if (ideal_cut_val > max)
		cut_val = max;
	else
		cut_val = ideal_cut_val;

										// permute points accordingly
	int br1, br2;
	annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
	//------------------------------------------------------------------
	//	On return:		pa[0..br1-1] < cut_val
	//					pa[br1..br2-1] == cut_val
	//					pa[br2..n-1] > cut_val
	//
	//	We can set n_lo to any value in the range [br1..br2] to satisfy
	//	the exit conditions of the procedure.
	//
	//	if ideal_cut_val < min (implying br2 >= 1),
	//			then we select n_lo = 1 (so there is one point on left) and
	//	if ideal_cut_val > max (implying br1 <= n-1),
	//			then we select n_lo = n-1 (so there is one point on right).
	//	Otherwise, we select n_lo as close to n/2 as possible within
	//			[br1..br2].
	//------------------------------------------------------------------
	if (ideal_cut_val < min) n_lo = 1;
	else if (ideal_cut_val > max) n_lo = n-1;
	else if (br1 > n/2) n_lo = br1;
	else if (br2 < n/2) n_lo = br2;
	else n_lo = n/2;
}

//----------------------------------------------------------------------
//	fair_split - fair-split splitting rule
//
//		This is a compromise between the kd-tree splitting rule (which
//		always splits data points at their median) and the midpoint
//		splitting rule (which always splits a box through its center.
//		The goal of this procedure is to achieve both nicely balanced
//		splits, and boxes of bounded aspect ratio.
//
//		A constant FS_ASPECT_RATIO is defined. Given a box, those sides
//		which can be split so that the ratio of the longest to shortest
//		side does not exceed ASPECT_RATIO are identified.  Among these
//		sides, we select the one in which the points have the largest
//		spread. We then split the points in a manner which most evenly
//		distributes the points on either side of the splitting plane,
//		subject to maintaining the bound on the ratio of long to short
//		sides. To determine that the aspect ratio will be preserved,
//		we determine the longest side (other than this side), and
//		determine how narrowly we can cut this side, without causing the
//		aspect ratio bound to be exceeded (small_piece).
//
//		This procedure is more robust than either kd_split or midpt_split,
//		but is more complicated as well.  When point distribution is
//		extremely skewed, this degenerates to midpt_split (actually
//		1/3 point split), and when the points are most evenly distributed,
//		this degenerates to kd-split.
//----------------------------------------------------------------------

void fair_split(
	ANNpointArray		pa,				// point array
	ANNidxArray			pidx,			// point indices (permuted on return)
	const ANNorthRect	&bnds,			// bounding rectangle for cell
	int					n,				// number of points
	int					dim,			// dimension of space
	int					&cut_dim,		// cutting dimension (returned)
	ANNcoord			&cut_val,		// cutting value (returned)
	int					&n_lo)			// num of points on low side (returned)
{
	int d;
	ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
	cut_dim = 0;
	for (d = 1; d < dim; d++) {			// find length of longest box side
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
		if (length > max_length) {
			max_length = length;
			cut_dim = d;
		}
	}

	ANNcoord max_spread = 0;			// find legal cut with max spread
	cut_dim = 0;
	for (d = 0; d < dim; d++) {
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
										// is this side midpoint splitable
										// without violating aspect ratio?
		if (((double) max_length)*2.0/((double) length) <= FS_ASPECT_RATIO) {
										// compute spread along this dim
			ANNcoord spr = annSpread(pa, pidx, n, d);
			if (spr > max_spread) {		// best spread so far
				max_spread = spr;
				cut_dim = d;			// this is dimension to cut
			}
		}
	}

	max_length = 0;						// find longest side other than cut_dim
	for (d = 0; d < dim; d++) {
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
		if (d != cut_dim && length > max_length)
			max_length = length;
	}
										// consider most extreme splits
	ANNcoord small_piece = max_length / FS_ASPECT_RATIO;
	ANNcoord lo_cut = bnds.lo[cut_dim] + small_piece;// lowest legal cut
	ANNcoord hi_cut = bnds.hi[cut_dim] - small_piece;// highest legal cut

	int br1, br2;
										// is median below lo_cut ?
	if (annSplitBalance(pa, pidx, n, cut_dim, lo_cut) >= 0) {
		cut_val = lo_cut;				// cut at lo_cut
		annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
		n_lo = br1;
	}
										// is median above hi_cut?
	else if (annSplitBalance(pa, pidx, n, cut_dim, hi_cut) <= 0) {
		cut_val = hi_cut;				// cut at hi_cut
		annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
		n_lo = br2;
	}
	else {								// median cut preserves asp ratio
		n_lo = n/2;						// split about median
		annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo);
	}
}

//----------------------------------------------------------------------
//	sl_fair_split - sliding fair split splitting rule
//
//		Sliding fair split is a splitting rule that combines the
//		strengths of both fair split with sliding midpoint split.
//		Fair split tends to produce balanced splits when the points
//		are roughly uniformly distributed, but it can produce many
//		trivial splits when points are highly clustered.  Sliding
//		midpoint never produces trivial splits, and shrinks boxes
//		nicely if points are highly clustered, but it may produce
//		rather unbalanced splits when points are unclustered but not
//		quite uniform.
//
//		Sliding fair split is based on the theory that there are two
//		types of splits that are "good": balanced splits that produce
//		fat boxes, and unbalanced splits provided the cell with fewer
//		points is fat.
//
//		This splitting rule operates by first computing the longest
//		side of the current bounding box.  Then it asks which sides
//		could be split (at the midpoint) and still satisfy the aspect
//		ratio bound with respect to this side.	Among these, it selects
//		the side with the largest spread (as fair split would).	 It
//		then considers the most extreme cuts that would be allowed by
//		the aspect ratio bound.	 This is done by dividing the longest
//		side of the box by the aspect ratio bound.	If the median cut
//		lies between these extreme cuts, then we use the median cut.
//		If not, then consider the extreme cut that is closer to the
//		median.	 If all the points lie to one side of this cut, then
//		we slide the cut until it hits the first point.	 This may
//		violate the aspect ratio bound, but will never generate empty
//		cells.	However the sibling of every such skinny cell is fat,
//		and hence packing arguments still apply.
//
//----------------------------------------------------------------------

void sl_fair_split(
	ANNpointArray		pa,				// point array
	ANNidxArray			pidx,			// point indices (permuted on return)
	const ANNorthRect	&bnds,			// bounding rectangle for cell
	int					n,				// number of points
	int					dim,			// dimension of space
	int					&cut_dim,		// cutting dimension (returned)
	ANNcoord			&cut_val,		// cutting value (returned)
	int					&n_lo)			// num of points on low side (returned)
{
	int d;
	ANNcoord min, max;					// min/max coordinates
	int br1, br2;						// split break points

	ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
	cut_dim = 0;
	for (d = 1; d < dim; d++) {			// find length of longest box side
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
		if (length	> max_length) {
			max_length = length;
			cut_dim = d;
		}
	}

	ANNcoord max_spread = 0;			// find legal cut with max spread
	cut_dim = 0;
	for (d = 0; d < dim; d++) {
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
										// is this side midpoint splitable
										// without violating aspect ratio?
		if (((double) max_length)*2.0/((double) length) <= FS_ASPECT_RATIO) {
										// compute spread along this dim
			ANNcoord spr = annSpread(pa, pidx, n, d);
			if (spr > max_spread) {		// best spread so far
				max_spread = spr;
				cut_dim = d;			// this is dimension to cut
			}
		}
	}

	max_length = 0;						// find longest side other than cut_dim
	for (d = 0; d < dim; d++) {
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
		if (d != cut_dim && length > max_length)
			max_length = length;
	}
										// consider most extreme splits
	ANNcoord small_piece = max_length / FS_ASPECT_RATIO;
	ANNcoord lo_cut = bnds.lo[cut_dim] + small_piece;// lowest legal cut
	ANNcoord hi_cut = bnds.hi[cut_dim] - small_piece;// highest legal cut
										// find min and max along cut_dim
	annMinMax(pa, pidx, n, cut_dim, min, max);
										// is median below lo_cut?
	if (annSplitBalance(pa, pidx, n, cut_dim, lo_cut) >= 0) {
		if (max > lo_cut) {				// are any points above lo_cut?
			cut_val = lo_cut;			// cut at lo_cut
			annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
			n_lo = br1;					// balance if there are ties
		}
		else {							// all points below lo_cut
			cut_val = max;				// cut at max value
			annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
			n_lo = n-1;
		}
	}
										// is median above hi_cut?
	else if (annSplitBalance(pa, pidx, n, cut_dim, hi_cut) <= 0) {
		if (min < hi_cut) {				// are any points below hi_cut?
			cut_val = hi_cut;			// cut at hi_cut
			annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
			n_lo = br2;					// balance if there are ties
		}
		else {							// all points above hi_cut
			cut_val = min;				// cut at min value
			annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
			n_lo = 1;
		}
	}
	else {								// median cut is good enough
		n_lo = n/2;						// split about median
		annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo);
	}
}