import numpy as np
import pytest
from numpy.testing import assert_array_almost_equal
from sklearn.neighbors import KDTree
from sklearn.preprocessing import normalize
from umap import distances as dist
from umap.umap_ import (
    nearest_neighbors,
    smooth_knn_dist,
)

# ===================================================
# Nearest Neighbour Test cases
# ===================================================
# nearest_neighbors metric parameter validation
# -----------------------------------------------
def test_nn_bad_metric(nn_data):
    # 42 is not a valid metric specification and should be rejected
    with pytest.raises(ValueError):
        nearest_neighbors(nn_data, 10, 42, {}, False, np.random)

def test_nn_bad_metric_sparse_data(sparse_nn_data):
    # "seuclidean" is not supported for sparse input, so this should raise
    with pytest.raises(ValueError):
        nearest_neighbors(
            sparse_nn_data,
            10,
            "seuclidean",
            {},
            False,
            np.random,
        )

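# For reference in reading the positional calls throughout this file: assuming
# the umap-learn API at the time of writing, the signature is
#     nearest_neighbors(X, n_neighbors, metric, metric_kwds, angular,
#                       random_state, low_memory=...)
# and it returns a (knn_indices, knn_dists, ...) triple, whose third element
# the tests below discard.
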
# -------------------------------------------------
# Utility functions for Nearest Neighbour
# -------------------------------------------------
def knn(indices, nn_data):  # pragma: no cover
    """Fraction of the true 10 nearest neighbors recovered in ``indices``."""
    tree = KDTree(nn_data)
    true_indices = tree.query(nn_data, 10, return_distance=False)
    num_correct = 0.0
    for i in range(nn_data.shape[0]):
        num_correct += np.sum(np.in1d(true_indices[i], indices[i]))
    return num_correct / (nn_data.shape[0] * 10)

def smooth_knn(nn_data, local_connectivity=1.0):
    knn_indices, knn_dists, _ = nearest_neighbors(
        nn_data, 10, "euclidean", {}, False, np.random
    )
    sigmas, rhos = smooth_knn_dist(
        knn_dists, 10.0, local_connectivity=local_connectivity
    )
    shifted_dists = knn_dists - rhos[:, np.newaxis]
    shifted_dists[shifted_dists < 0.0] = 0.0
    vals = np.exp(-(shifted_dists / sigmas[:, np.newaxis]))
    norms = np.sum(vals, axis=1)
    return norms

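# A note on the invariant these norms should satisfy (assuming the default
# bandwidth of 1 in smooth_knn_dist): sigma_i is binary-searched so that the
# non-self neighbors contribute sum_{j>=1} exp(-max(d_ij - rho_i, 0) / sigma_i)
# of approximately log2(k). The self-neighbor at j = 0 sits at distance 0 and
# contributes exp(0) == 1, so with k == 10 the row sums computed above should
# land near 1 + log2(10) ~= 4.32, which is what the smooth_knn_dist tests at
# the end of this file assert.
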
@pytest.mark.skip()
def test_nn_descent_neighbor_accuracy(nn_data):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        nn_data, 10, "euclidean", {}, False, np.random
    )
    percent_correct = knn(knn_indices, nn_data)
    assert (
        percent_correct >= 0.85
    ), "NN-descent did not get 85% accuracy on nearest neighbors"

@pytest.mark.skip()
def test_nn_descent_neighbor_accuracy_low_memory(nn_data):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        nn_data, 10, "euclidean", {}, False, np.random, low_memory=True
    )
    percent_correct = knn(knn_indices, nn_data)
    assert (
        percent_correct >= 0.89
    ), "NN-descent did not get 89% accuracy on nearest neighbors"

@pytest.mark.skip()
def test_angular_nn_descent_neighbor_accuracy(nn_data):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        nn_data, 10, "cosine", {}, True, np.random
    )
    angular_data = normalize(nn_data, norm="l2")
    percent_correct = knn(knn_indices, angular_data)
    assert (
        percent_correct >= 0.85
    ), "NN-descent did not get 85% accuracy on nearest neighbors"

@pytest.mark.skip()
def test_sparse_nn_descent_neighbor_accuracy(sparse_nn_data):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        sparse_nn_data, 20, "euclidean", {}, False, np.random
    )
    percent_correct = knn(knn_indices, sparse_nn_data.todense())
    assert (
        percent_correct >= 0.75
    ), "Sparse NN-descent did not get 75% accuracy on nearest neighbors"

@pytest.mark.skip()
def test_sparse_nn_descent_neighbor_accuracy_low_memory(
    sparse_nn_data,
):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        sparse_nn_data, 20, "euclidean", {}, False, np.random, low_memory=True
    )
    percent_correct = knn(knn_indices, sparse_nn_data.todense())
    assert (
        percent_correct >= 0.85
    ), "Sparse NN-descent did not get 85% accuracy on nearest neighbors"

@pytest.mark.skip()
def test_nn_descent_neighbor_accuracy_callable_metric(nn_data):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        nn_data, 10, dist.euclidean, {}, False, np.random
    )
    percent_correct = knn(knn_indices, nn_data)
    assert (
        percent_correct >= 0.95
    ), "NN-descent did not get 95% accuracy on nearest neighbors with callable metric"

@pytest.mark.skip()
def test_sparse_angular_nn_descent_neighbor_accuracy(
    sparse_nn_data,
):  # pragma: no cover
    knn_indices, knn_dists, _ = nearest_neighbors(
        sparse_nn_data, 20, "cosine", {}, True, np.random
    )
    angular_data = normalize(sparse_nn_data, norm="l2").toarray()
    percent_correct = knn(knn_indices, angular_data)
    assert (
        percent_correct >= 0.90
    ), "Sparse NN-descent did not get 90% accuracy on nearest neighbors"

def test_smooth_knn_dist_l1norms(nn_data):
    norms = smooth_knn(nn_data)
    assert_array_almost_equal(
        norms,
        1.0 + np.log2(10) * np.ones(norms.shape[0]),
        decimal=3,
        err_msg="Smooth knn-dists does not give expected norms",
    )

def test_smooth_knn_dist_l1norms_w_connectivity(nn_data):
    norms = smooth_knn(nn_data, local_connectivity=1.75)
    assert_array_almost_equal(
        norms,
        1.0 + np.log2(10) * np.ones(norms.shape[0]),
        decimal=3,
        err_msg="Smooth knn-dists does not give expected "
        "norms for local_connectivity=1.75",
    )