File: test_pairwise_distances_reduction.py

package info (click to toggle)
scikit-learn 1.2.1%2Bdfsg-1
links: PTS, VCS
area: main
in suites: bookworm
size: 23,280 kB
sloc: python: 184,491; cpp: 5,783; ansic: 854; makefile: 307; sh: 45; javascript: 1
file content (1228 lines) | stat: -rw-r--r-- 41,336 bytes
import itertools
import re
import warnings
from collections import defaultdict

import numpy as np
import pytest
import threadpoolctl
from math import log10, floor
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cdist

from sklearn.metrics._pairwise_distances_reduction import (
    BaseDistancesReductionDispatcher,
    ArgKmin,
    RadiusNeighbors,
    sqeuclidean_row_norms,
)

from sklearn.metrics import euclidean_distances
from sklearn.utils.fixes import sp_version, parse_version
from sklearn.utils._testing import (
    assert_array_equal,
    assert_allclose,
    create_memmap_backed_data,
)

# Common supported metric between scipy.spatial.distance.cdist
# and BaseDistanceReductionDispatcher.
# This allows constructing tests to check consistency of results
# of concrete BaseDistanceReductionDispatcher on some metrics using APIs
# from scipy and numpy.
CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS = [
    "braycurtis",
    "canberra",
    "chebyshev",
    "cityblock",
    "euclidean",
    "minkowski",
    "seuclidean",
]


def _get_metric_params_list(metric: str, n_features: int, seed: int = 1):
    """Return list of dummy DistanceMetric kwargs for tests."""

    # Distinguishing on cases not to compute unneeded datastructures.
    rng = np.random.RandomState(seed)

    if metric == "minkowski":
        minkowski_kwargs = [dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)]
        if sp_version >= parse_version("1.8.0.dev0"):
            # TODO: remove the test once we no longer support scipy < 1.8.0.
            # Recent scipy versions accept weights in the Minkowski metric directly:
            # type: ignore
            minkowski_kwargs.append(dict(p=3, w=rng.rand(n_features)))

        return minkowski_kwargs

    # TODO: remove this case for "wminkowski" once we no longer support scipy < 1.8.0.
    if metric == "wminkowski":
        weights = rng.random_sample(n_features)
        weights /= weights.sum()
        wminkowski_kwargs = [dict(p=1.5, w=weights)]
        if sp_version < parse_version("1.8.0.dev0"):
            # wminkowski was removed in scipy 1.8.0 but should work for previous
            # versions.
            wminkowski_kwargs.append(dict(p=3, w=rng.rand(n_features)))
        return wminkowski_kwargs

    if metric == "seuclidean":
        return [dict(V=rng.rand(n_features))]

    # Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric.
    # In those cases, no kwargs is needed.
    return [{}]


def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices, rtol=1e-7):
    assert_array_equal(
        ref_indices,
        indices,
        err_msg="Query vectors have different neighbors' indices",
    )
    assert_allclose(
        ref_dist,
        dist,
        err_msg="Query vectors have different neighbors' distances",
        rtol=rtol,
    )


def relative_rounding(scalar, n_significant_digits):
    """Round a scalar to a number of significant digits relatively to its value."""
    if scalar == 0:
        return 0.0
    magnitude = int(floor(log10(abs(scalar)))) + 1
    return round(scalar, n_significant_digits - magnitude)


def test_relative_rounding():

    assert relative_rounding(0, 1) == 0.0
    assert relative_rounding(0, 10) == 0.0
    assert relative_rounding(0, 123456) == 0.0

    assert relative_rounding(123456789, 0) == 0
    assert relative_rounding(123456789, 2) == 120000000
    assert relative_rounding(123456789, 3) == 123000000
    assert relative_rounding(123456789, 10) == 123456789
    assert relative_rounding(123456789, 20) == 123456789

    assert relative_rounding(1.23456789, 2) == 1.2
    assert relative_rounding(1.23456789, 3) == 1.23
    assert relative_rounding(1.23456789, 10) == 1.23456789

    assert relative_rounding(123.456789, 3) == 123.0
    assert relative_rounding(123.456789, 9) == 123.456789
    assert relative_rounding(123.456789, 10) == 123.456789


def assert_argkmin_results_quasi_equality(
    ref_dist,
    dist,
    ref_indices,
    indices,
    rtol=1e-4,
):
    """Assert that argkmin results are valid up to:
      - relative tolerance on computed distance values
      - permutations of indices for distances values that differ up to
        a precision level

    To be used for testing neighbors queries on float32 datasets: we
    accept neighbors rank swaps only if they are caused by small
    rounding errors on the distance computations.
    """
    is_sorted = lambda a: np.all(a[:-1] <= a[1:])

    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)

    assert (
        ref_dist.shape == dist.shape == ref_indices.shape == indices.shape
    ), "Arrays of results have various shapes."

    n_queries, n_neighbors = ref_dist.shape

    # Asserting equality results one row at a time
    for query_idx in range(n_queries):
        ref_dist_row = ref_dist[query_idx]
        dist_row = dist[query_idx]

        assert is_sorted(
            ref_dist_row
        ), f"Reference distances aren't sorted on row {query_idx}"
        assert is_sorted(dist_row), f"Distances aren't sorted on row {query_idx}"

        assert_allclose(ref_dist_row, dist_row, rtol=rtol)

        ref_indices_row = ref_indices[query_idx]
        indices_row = indices[query_idx]

        # Grouping indices by distances using sets on a rounded distances up
        # to a given number of decimals of significant digits derived from rtol.
        reference_neighbors_groups = defaultdict(set)
        effective_neighbors_groups = defaultdict(set)

        for neighbor_rank in range(n_neighbors):
            rounded_dist = relative_rounding(
                ref_dist_row[neighbor_rank],
                n_significant_digits=n_significant_digits,
            )
            reference_neighbors_groups[rounded_dist].add(ref_indices_row[neighbor_rank])
            effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank])

        # Asserting equality of groups (sets) for each distance
        msg = (
            f"Neighbors indices for query {query_idx} are not matching "
            f"when rounding distances at {n_significant_digits} significant digits "
            f"derived from rtol={rtol:.1e}"
        )
        for rounded_distance in reference_neighbors_groups.keys():
            assert (
                reference_neighbors_groups[rounded_distance]
                == effective_neighbors_groups[rounded_distance]
            ), msg


def assert_radius_neighbors_results_equality(
    ref_dist, dist, ref_indices, indices, radius
):
    # We get arrays of arrays and we need to check for individual pairs
    for i in range(ref_dist.shape[0]):
        assert (ref_dist[i] <= radius).all()
        assert_array_equal(
            ref_indices[i],
            indices[i],
            err_msg=f"Query vector #{i} has different neighbors' indices",
        )
        assert_allclose(
            ref_dist[i],
            dist[i],
            err_msg=f"Query vector #{i} has different neighbors' distances",
            rtol=1e-7,
        )


def assert_radius_neighbors_results_quasi_equality(
    ref_dist,
    dist,
    ref_indices,
    indices,
    radius,
    rtol=1e-4,
):
    """Assert that radius neighborhood results are valid up to:
      - relative tolerance on computed distance values
      - permutations of indices for distances values that differ up to
        a precision level
      - missing or extra last elements if their distance is
        close to the radius

    To be used for testing neighbors queries on float32 datasets: we
    accept neighbors rank swaps only if they are caused by small
    rounding errors on the distance computations.

    Input arrays must be sorted w.r.t distances.
    """
    is_sorted = lambda a: np.all(a[:-1] <= a[1:])

    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)

    assert (
        len(ref_dist) == len(dist) == len(ref_indices) == len(indices)
    ), "Arrays of results have various lengths."

    n_queries = len(ref_dist)

    # Asserting equality of results one vector at a time
    for query_idx in range(n_queries):

        ref_dist_row = ref_dist[query_idx]
        dist_row = dist[query_idx]

        assert is_sorted(
            ref_dist_row
        ), f"Reference distances aren't sorted on row {query_idx}"
        assert is_sorted(dist_row), f"Distances aren't sorted on row {query_idx}"

        # Vectors' lengths might be different due to small
        # numerical differences of distance w.r.t the `radius` threshold.
        largest_row = ref_dist_row if len(ref_dist_row) > len(dist_row) else dist_row

        # For the longest distances vector, we check that last extra elements
        # that aren't present in the other vector are all in: [radius ± rtol]
        min_length = min(len(ref_dist_row), len(dist_row))
        last_extra_elements = largest_row[min_length:]
        if last_extra_elements.size > 0:
            assert np.all(radius - rtol <= last_extra_elements <= radius + rtol), (
                f"The last extra elements ({last_extra_elements}) aren't in [radius ±"
                f" rtol]=[{radius} ± {rtol}]"
            )

        # We truncate the neighbors results list on the smallest length to
        # be able to compare them, ignoring the elements checked above.
        ref_dist_row = ref_dist_row[:min_length]
        dist_row = dist_row[:min_length]

        assert_allclose(ref_dist_row, dist_row, rtol=rtol)

        ref_indices_row = ref_indices[query_idx]
        indices_row = indices[query_idx]

        # Grouping indices by distances using sets on a rounded distances up
        # to a given number of significant digits derived from rtol.
        reference_neighbors_groups = defaultdict(set)
        effective_neighbors_groups = defaultdict(set)

        for neighbor_rank in range(min_length):
            rounded_dist = relative_rounding(
                ref_dist_row[neighbor_rank],
                n_significant_digits=n_significant_digits,
            )
            reference_neighbors_groups[rounded_dist].add(ref_indices_row[neighbor_rank])
            effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank])

        # Asserting equality of groups (sets) for each distance
        msg = (
            f"Neighbors indices for query {query_idx} are not matching "
            f"when rounding distances at {n_significant_digits} significant digits "
            f"derived from rtol={rtol:.1e}"
        )
        for rounded_distance in reference_neighbors_groups.keys():
            assert (
                reference_neighbors_groups[rounded_distance]
                == effective_neighbors_groups[rounded_distance]
            ), msg


ASSERT_RESULT = {
    # In the case of 64bit, we test for exact equality of the results rankings
    # and standard tolerance levels for the computed distance values.
    #
    # XXX: Note that in the future we might be interested in using quasi equality
    # checks also for float64 data (with a larger number of significant digits)
    # as the tests could be unstable because of numerically tied distances on
    # some datasets (e.g. uniform grids).
    (ArgKmin, np.float64): assert_argkmin_results_equality,
    (
        RadiusNeighbors,
        np.float64,
    ): assert_radius_neighbors_results_equality,
    # In the case of 32bit, indices can be permuted due to small difference
    # in the computations of their associated distances, hence we test equality of
    # results up to valid permutations.
    (ArgKmin, np.float32): assert_argkmin_results_quasi_equality,
    (
        RadiusNeighbors,
        np.float32,
    ): assert_radius_neighbors_results_quasi_equality,
}


def test_assert_argkmin_results_quasi_equality():

    rtol = 1e-7
    eps = 1e-7
    _1m = 1.0 - eps
    _1p = 1.0 + eps

    _6_1m = 6.1 - eps
    _6_1p = 6.1 + eps

    ref_dist = np.array(
        [
            [1.2, 2.5, _6_1m, 6.1, _6_1p],
            [_1m, _1m, 1, _1p, _1p],
        ]
    )
    ref_indices = np.array(
        [
            [1, 2, 3, 4, 5],
            [6, 7, 8, 9, 10],
        ]
    )

    # Sanity check: compare the reference results to themselves.
    assert_argkmin_results_quasi_equality(
        ref_dist, ref_dist, ref_indices, ref_indices, rtol
    )

    # Apply valid permutation on indices: the last 3 points are
    # all very close to one another so we accept any permutation
    # on their rankings.
    assert_argkmin_results_quasi_equality(
        np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
        np.array([[1.2, 2.5, 6.1, 6.1, 6.1]]),
        np.array([[1, 2, 3, 4, 5]]),
        np.array([[1, 2, 4, 5, 3]]),
        rtol=rtol,
    )
    # All points are have close distances so any ranking permutation
    # is valid for this query result.
    assert_argkmin_results_quasi_equality(
        np.array([[_1m, _1m, 1, _1p, _1p]]),
        np.array([[_1m, _1m, 1, _1p, _1p]]),
        np.array([[6, 7, 8, 9, 10]]),
        np.array([[6, 9, 7, 8, 10]]),
        rtol=rtol,
    )

    # Apply invalid permutation on indices: permuting the ranks
    # of the 2 nearest neighbors is invalid because the distance
    # values are too different.
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_argkmin_results_quasi_equality(
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1, 2, 3, 4, 5]]),
            np.array([[2, 1, 3, 4, 5]]),
            rtol=rtol,
        )

    # Indices aren't properly sorted w.r.t their distances
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_argkmin_results_quasi_equality(
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1, 2, 3, 4, 5]]),
            np.array([[2, 1, 4, 5, 3]]),
            rtol=rtol,
        )

    # Distances aren't properly sorted
    msg = "Distances aren't sorted on row 0"
    with pytest.raises(AssertionError, match=msg):
        assert_argkmin_results_quasi_equality(
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[2.5, 1.2, _6_1m, 6.1, _6_1p]]),
            np.array([[1, 2, 3, 4, 5]]),
            np.array([[2, 1, 4, 5, 3]]),
            rtol=rtol,
        )


def test_assert_radius_neighbors_results_quasi_equality():

    rtol = 1e-7
    eps = 1e-7
    _1m = 1.0 - eps
    _1p = 1.0 + eps

    _6_1m = 6.1 - eps
    _6_1p = 6.1 + eps

    ref_dist = [
        np.array([1.2, 2.5, _6_1m, 6.1, _6_1p]),
        np.array([_1m, 1, _1p, _1p]),
    ]

    ref_indices = [
        np.array([1, 2, 3, 4, 5]),
        np.array([6, 7, 8, 9]),
    ]

    # Sanity check: compare the reference results to themselves.
    assert_radius_neighbors_results_quasi_equality(
        ref_dist,
        ref_dist,
        ref_indices,
        ref_indices,
        radius=6.1,
        rtol=rtol,
    )

    # Apply valid permutation on indices
    assert_radius_neighbors_results_quasi_equality(
        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
        np.array([np.array([1, 2, 3, 4, 5])]),
        np.array([np.array([1, 2, 4, 5, 3])]),
        radius=6.1,
        rtol=rtol,
    )
    assert_radius_neighbors_results_quasi_equality(
        np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
        np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
        np.array([np.array([6, 7, 8, 9, 10])]),
        np.array([np.array([6, 9, 7, 8, 10])]),
        radius=6.1,
        rtol=rtol,
    )

    # Apply invalid permutation on indices
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1, 2, 3, 4, 5])]),
            np.array([np.array([2, 1, 3, 4, 5])]),
            radius=6.1,
            rtol=rtol,
        )

    # Having extra last elements is valid if they are in: [radius ± rtol]
    assert_radius_neighbors_results_quasi_equality(
        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
        np.array([np.array([1.2, 2.5, _6_1m, 6.1])]),
        np.array([np.array([1, 2, 3, 4, 5])]),
        np.array([np.array([1, 2, 3, 4])]),
        radius=6.1,
        rtol=rtol,
    )

    # Having extra last elements is invalid if they are lesser than radius - rtol
    msg = re.escape(
        "The last extra elements ([6.]) aren't in [radius ± rtol]=[6.1 ± 1e-07]"
    )
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, 6])]),
            np.array([np.array([1.2, 2.5])]),
            np.array([np.array([1, 2, 3])]),
            np.array([np.array([1, 2])]),
            radius=6.1,
            rtol=rtol,
        )

    # Indices aren't properly sorted w.r.t their distances
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1, 2, 3, 4, 5])]),
            np.array([np.array([2, 1, 4, 5, 3])]),
            radius=6.1,
            rtol=rtol,
        )

    # Distances aren't properly sorted
    msg = "Distances aren't sorted on row 0"
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1, 2, 3, 4, 5])]),
            np.array([np.array([2, 1, 4, 5, 3])]),
            radius=6.1,
            rtol=rtol,
        )


def test_pairwise_distances_reduction_is_usable_for():
    rng = np.random.RandomState(0)
    X = rng.rand(100, 10)
    Y = rng.rand(100, 10)
    X_csr = csr_matrix(X)
    Y_csr = csr_matrix(Y)
    metric = "manhattan"

    # Must be usable for all possible pair of {dense, sparse} datasets
    assert BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y_csr, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(X, Y_csr, metric)

    assert BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.float64), Y.astype(np.float64), metric
    )

    assert BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.float32), Y.astype(np.float32), metric
    )

    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.int64), Y.astype(np.int64), metric
    )

    assert not BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric="pyfunc")
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.float32), Y, metric
    )
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X, Y.astype(np.int32), metric
    )

    # F-ordered arrays are not supported
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        np.asfortranarray(X), Y, metric
    )

    # We prefer not to use those implementations for fused sparse-dense when
    # metric="(sq)euclidean" because it's not yet the most efficient one on
    # all configurations of datasets.
    # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669  # noqa
    # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
    # using sparse-dense routines for matrix-vector multiplications.
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X_csr, Y, metric="euclidean"
    )
    assert BaseDistancesReductionDispatcher.is_usable_for(
        X_csr, Y_csr, metric="sqeuclidean"
    )
    assert BaseDistancesReductionDispatcher.is_usable_for(
        X_csr, Y_csr, metric="euclidean"
    )

    # CSR matrices without non-zeros elements aren't currently supported
    # TODO: support CSR matrices without non-zeros elements
    X_csr_0_nnz = csr_matrix(X * 0)
    assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_0_nnz, Y, metric)

    # CSR matrices with int64 indices and indptr (e.g. large nnz, or large n_features)
    # aren't supported as of now.
    # See: https://github.com/scikit-learn/scikit-learn/issues/23653
    # TODO: support CSR matrices with int64 indices and indptr
    X_csr_int64 = csr_matrix(X)
    X_csr_int64.indices = X_csr_int64.indices.astype(np.int64)
    assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_int64, Y, metric)


def test_argkmin_factory_method_wrong_usages():
    rng = np.random.RandomState(1)
    X = rng.rand(100, 10)
    Y = rng.rand(100, 10)
    k = 5
    metric = "euclidean"

    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float32 and Y.dtype=float64"
    )
    with pytest.raises(ValueError, match=msg):
        ArgKmin.compute(X=X.astype(np.float32), Y=Y, k=k, metric=metric)

    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float64 and Y.dtype=int32"
    )
    with pytest.raises(ValueError, match=msg):
        ArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric)

    with pytest.raises(ValueError, match="k == -1, must be >= 1."):
        ArgKmin.compute(X=X, Y=Y, k=-1, metric=metric)

    with pytest.raises(ValueError, match="k == 0, must be >= 1."):
        ArgKmin.compute(X=X, Y=Y, k=0, metric=metric)

    with pytest.raises(ValueError, match="Unrecognized metric"):
        ArgKmin.compute(X=X, Y=Y, k=k, metric="wrong metric")

    with pytest.raises(
        ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
    ):
        ArgKmin.compute(X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric)

    with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
        ArgKmin.compute(X=np.asfortranarray(X), Y=Y, k=k, metric=metric)

    # A UserWarning must be raised in this case.
    unused_metric_kwargs = {"p": 3}

    message = r"Some metric_kwargs have been passed \({'p': 3}\) but"

    with pytest.warns(UserWarning, match=message):
        ArgKmin.compute(
            X=X, Y=Y, k=k, metric=metric, metric_kwargs=unused_metric_kwargs
        )

    # A UserWarning must be raised in this case.
    metric_kwargs = {
        "p": 3,  # unused
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }

    message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'"

    with pytest.warns(UserWarning, match=message):
        ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)

    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
    }
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)

    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)


def test_radius_neighbors_factory_method_wrong_usages():
    rng = np.random.RandomState(1)
    X = rng.rand(100, 10)
    Y = rng.rand(100, 10)
    radius = 5
    metric = "euclidean"

    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float32 and Y.dtype=float64"
    )
    with pytest.raises(
        ValueError,
        match=msg,
    ):
        RadiusNeighbors.compute(
            X=X.astype(np.float32), Y=Y, radius=radius, metric=metric
        )

    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float64 and Y.dtype=int32"
    )
    with pytest.raises(
        ValueError,
        match=msg,
    ):
        RadiusNeighbors.compute(X=X, Y=Y.astype(np.int32), radius=radius, metric=metric)

    with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."):
        RadiusNeighbors.compute(X=X, Y=Y, radius=-1, metric=metric)

    with pytest.raises(ValueError, match="Unrecognized metric"):
        RadiusNeighbors.compute(X=X, Y=Y, radius=radius, metric="wrong metric")

    with pytest.raises(
        ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
    ):
        RadiusNeighbors.compute(
            X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric
        )

    with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
        RadiusNeighbors.compute(
            X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric
        )

    unused_metric_kwargs = {"p": 3}

    # A UserWarning must be raised in this case.
    message = r"Some metric_kwargs have been passed \({'p': 3}\) but"

    with pytest.warns(UserWarning, match=message):
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=unused_metric_kwargs
        )

    # A UserWarning must be raised in this case.
    metric_kwargs = {
        "p": 3,  # unused
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }

    message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'"

    with pytest.warns(UserWarning, match=message):
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
        )

    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
        )

    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
    }
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
        )


@pytest.mark.parametrize(
    "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)]
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_chunk_size_agnosticism(
    global_random_seed,
    Dispatcher,
    n_samples_X,
    n_samples_Y,
    dtype,
    n_features=100,
):
    """Check that results do not depend on the chunk size."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread

    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    ref_dist, ref_indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=256,  # default
        metric="manhattan",
        return_distance=True,
        **compute_parameters,
    )

    dist, indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=41,
        metric="manhattan",
        return_distance=True,
        **compute_parameters,
    )

    ASSERT_RESULT[(Dispatcher, dtype)](
        ref_dist, dist, ref_indices, indices, **check_parameters
    )


@pytest.mark.parametrize(
    "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)]
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_n_threads_agnosticism(
    global_random_seed,
    Dispatcher,
    n_samples_X,
    n_samples_Y,
    dtype,
    n_features=100,
):
    """Check that results do not depend on the number of threads."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread

    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    ref_dist, ref_indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=25,  # make sure we use multiple threads
        return_distance=True,
        **compute_parameters,
    )

    with threadpoolctl.threadpool_limits(limits=1, user_api="openmp"):
        dist, indices = Dispatcher.compute(
            X,
            Y,
            parameter,
            chunk_size=25,
            return_distance=True,
            **compute_parameters,
        )

    ASSERT_RESULT[(Dispatcher, dtype)](
        ref_dist, dist, ref_indices, indices, **check_parameters
    )


@pytest.mark.parametrize(
    "Dispatcher, dtype",
    [
        (ArgKmin, np.float64),
        (RadiusNeighbors, np.float32),
        (ArgKmin, np.float32),
        (RadiusNeighbors, np.float64),
    ],
)
def test_format_agnosticism(
    global_random_seed,
    Dispatcher,
    dtype,
):
    """Check that results do not depend on the format (dense, sparse) of the input."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    n_samples, n_features = 100, 100

    X = rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples, n_features).astype(dtype) * spread

    X_csr = csr_matrix(X)
    Y_csr = csr_matrix(Y)

    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    dist_dense, indices_dense = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=50,
        return_distance=True,
        **compute_parameters,
    )

    for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)):
        if _X is X and _Y is Y:
            continue
        dist, indices = Dispatcher.compute(
            _X,
            _Y,
            parameter,
            chunk_size=50,
            return_distance=True,
            **compute_parameters,
        )
        ASSERT_RESULT[(Dispatcher, dtype)](
            dist_dense,
            dist,
            indices_dense,
            indices,
            **check_parameters,
        )


@pytest.mark.parametrize(
    "n_samples_X, n_samples_Y", [(100, 100), (100, 500), (500, 100)]
)
@pytest.mark.parametrize(
    "metric",
    ["euclidean", "minkowski", "manhattan", "infinity", "seuclidean", "haversine"],
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_strategies_consistency(
    global_random_seed,
    Dispatcher,
    metric,
    n_samples_X,
    n_samples_Y,
    dtype,
    n_features=10,
):
    """Check that the results do not depend on the strategy used."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread

    # Haversine distance only accepts 2D data
    if metric == "haversine":
        X = np.ascontiguousarray(X[:, :2])
        Y = np.ascontiguousarray(Y[:, :2])

    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    dist_par_X, indices_par_X = Dispatcher.compute(
        X,
        Y,
        parameter,
        metric=metric,
        # Taking the first
        metric_kwargs=_get_metric_params_list(
            metric, n_features, seed=global_random_seed
        )[0],
        # To be sure to use parallelization
        chunk_size=n_samples_X // 4,
        strategy="parallel_on_X",
        return_distance=True,
        **compute_parameters,
    )

    dist_par_Y, indices_par_Y = Dispatcher.compute(
        X,
        Y,
        parameter,
        metric=metric,
        # Taking the first
        metric_kwargs=_get_metric_params_list(
            metric, n_features, seed=global_random_seed
        )[0],
        # To be sure to use parallelization
        chunk_size=n_samples_Y // 4,
        strategy="parallel_on_Y",
        return_distance=True,
        **compute_parameters,
    )

    ASSERT_RESULT[(Dispatcher, dtype)](
        dist_par_X, dist_par_Y, indices_par_X, indices_par_Y, **check_parameters
    )


# "Concrete Dispatchers"-specific tests

# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("n_features", [50, 500])
@pytest.mark.parametrize("translation", [0, 1e6])
@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_pairwise_distances_argkmin(
    global_random_seed,
    n_features,
    translation,
    metric,
    strategy,
    dtype,
    n_samples=100,
    k=10,
):
    # TODO: can we easily fix this discrepancy?
    edge_cases = [
        (np.float32, "chebyshev", 1000000.0),
        (np.float32, "cityblock", 1000000.0),
    ]
    if (dtype, metric, translation) in edge_cases:
        pytest.xfail("Numerical differences lead to small differences in results.")

    rng = np.random.RandomState(global_random_seed)
    spread = 1000
    X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread

    X_csr = csr_matrix(X)
    Y_csr = csr_matrix(Y)

    # Haversine distance only accepts 2D data
    if metric == "haversine":
        X = np.ascontiguousarray(X[:, :2])
        Y = np.ascontiguousarray(Y[:, :2])

    metric_kwargs = _get_metric_params_list(metric, n_features)[0]

    # Reference for argkmin results
    if metric == "euclidean":
        # Compare to scikit-learn GEMM optimized implementation
        dist_matrix = euclidean_distances(X, Y)
    else:
        dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)
    # Taking argkmin (indices of the k smallest values)
    argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k]
    # Getting the associated distances
    argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float64)
    for row_idx in range(argkmin_indices_ref.shape[0]):
        argkmin_distances_ref[row_idx] = dist_matrix[
            row_idx, argkmin_indices_ref[row_idx]
        ]

    for _X, _Y in [(X, Y), (X_csr, Y_csr)]:
        argkmin_distances, argkmin_indices = ArgKmin.compute(
            _X,
            _Y,
            k,
            metric=metric,
            metric_kwargs=metric_kwargs,
            return_distance=True,
            # So as to have more than a chunk, forcing parallelism.
            chunk_size=n_samples // 4,
            strategy=strategy,
        )

        ASSERT_RESULT[(ArgKmin, dtype)](
            argkmin_distances,
            argkmin_distances_ref,
            argkmin_indices,
            argkmin_indices_ref,
        )


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("n_features", [50, 500])
@pytest.mark.parametrize("translation", [0, 1e6])
@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_pairwise_distances_radius_neighbors(
    global_random_seed,
    n_features,
    translation,
    metric,
    strategy,
    dtype,
    n_samples=100,
):
    rng = np.random.RandomState(global_random_seed)
    spread = 1000
    radius = spread * np.log(n_features)
    X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread

    metric_kwargs = _get_metric_params_list(
        metric, n_features, seed=global_random_seed
    )[0]

    # Reference for argkmin results
    if metric == "euclidean":
        # Compare to scikit-learn GEMM optimized implementation
        dist_matrix = euclidean_distances(X, Y)
    else:
        dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)

    # Getting the neighbors for a given radius
    neigh_indices_ref = []
    neigh_distances_ref = []

    for row in dist_matrix:
        ind = np.arange(row.shape[0])[row <= radius]
        dist = row[ind]

        sort = np.argsort(dist)
        ind, dist = ind[sort], dist[sort]

        neigh_indices_ref.append(ind)
        neigh_distances_ref.append(dist)

    neigh_distances, neigh_indices = RadiusNeighbors.compute(
        X,
        Y,
        radius,
        metric=metric,
        metric_kwargs=metric_kwargs,
        return_distance=True,
        # So as to have more than a chunk, forcing parallelism.
        chunk_size=n_samples // 4,
        strategy=strategy,
        sort_results=True,
    )

    ASSERT_RESULT[(RadiusNeighbors, dtype)](
        neigh_distances, neigh_distances_ref, neigh_indices, neigh_indices_ref, radius
    )


@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("metric", ["manhattan", "euclidean"])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_memmap_backed_data(
    metric,
    Dispatcher,
    dtype,
):
    """Check that the results do not depend on the datasets writability."""
    rng = np.random.RandomState(0)
    spread = 100
    n_samples, n_features = 128, 10
    X = rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples, n_features).astype(dtype) * spread

    # Create read only datasets
    X_mm, Y_mm = create_memmap_backed_data([X, Y])

    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    ref_dist, ref_indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        metric=metric,
        return_distance=True,
        **compute_parameters,
    )

    dist_mm, indices_mm = Dispatcher.compute(
        X_mm,
        Y_mm,
        parameter,
        metric=metric,
        return_distance=True,
        **compute_parameters,
    )

    ASSERT_RESULT[(Dispatcher, dtype)](
        ref_dist, dist_mm, ref_indices, indices_mm, **check_parameters
    )


@pytest.mark.parametrize("n_samples", [100, 1000])
@pytest.mark.parametrize("n_features", [5, 10, 100])
@pytest.mark.parametrize("num_threads", [1, 2, 8])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_sqeuclidean_row_norms(
    global_random_seed,
    n_samples,
    n_features,
    num_threads,
    dtype,
):
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples, n_features).astype(dtype) * spread

    X_csr = csr_matrix(X)

    sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2
    sq_row_norm = sqeuclidean_row_norms(X, num_threads=num_threads)

    sq_row_norm_csr = sqeuclidean_row_norms(X_csr, num_threads=num_threads)

    assert_allclose(sq_row_norm_reference, sq_row_norm)
    assert_allclose(sq_row_norm_reference, sq_row_norm_csr)

    with pytest.raises(ValueError):
        X = np.asfortranarray(X)
        sqeuclidean_row_norms(X, num_threads=num_threads)